├── .gitignore ├── LICENSE ├── README.md ├── api.py ├── chat_templates ├── alpaca.jinja ├── chatml.jinja └── place_your_templates_here.txt ├── pyvenv.cfg ├── screenshots ├── webui_screenshot.png ├── webui_screenshot1.png ├── webui_screenshot2.png ├── webui_screenshot3.png └── webui_screenshot4.png ├── serve_chat_completions.py ├── serve_completions.py ├── setup.py ├── sliced_llama.py ├── sliced_llama_exl2.py ├── sliced_llama_server.py ├── start.bat └── webui ├── LICENSE-sse.js ├── favicon.ico ├── index.css ├── index.html ├── index.js ├── sse.js └── webui-dragndrop-test.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 silphendio 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sliced_llama 2 | SlicedLlama is AI text generation software powered by [exllamav2](https://github.com/turboderp/exllamav2). 3 | It runs large language models (LLMs) and offers an interface to generate text, adjust model parameters, and modify the model on the fly by rearranging layers. 4 | A webUI is included, but the server is also compatible with other LLM GUIs. 5 | 6 |  7 | 8 | 9 | ## Features 10 | - Text Completion WebUI 11 | - partly OpenAI-compatible API (this is a work in progress) 12 | - Layer Slicing: Basically instant Franken-self-merges. You don't even need to reload the model (just the cache). 13 | - Top Logprobs: See the top probabilities for each chosen token. This might help with adjusting sampler parameters. 14 | 15 | 16 | ## Installation 17 | - Make sure Python is installed, along with a CUDA- or ROCm-compatible GPU driver (ROCm is Linux only).
18 | - Clone or download this repository. 19 | - Use the setup script. This creates a venv and installs dependencies. 20 | ```bash 21 | git clone --depth=1 https://github.com/silphendio/sliced_llama 22 | cd sliced_llama 23 | python ./setup.py 24 | ``` 25 | 26 | DISCLAIMER: I haven't tested it on Windows at all. 27 | 28 | ## Usage 29 | On Linux, just run it with 30 | ```bash 31 | ./sliced_llama_server.py 32 | ``` 33 | On Windows, click `start.bat` instead. (It invokes `.venv\Scripts\python sliced_llama_server.py`) 34 | 35 | This starts the inference server and the webUI. There, you can load models, adjust parameters, and run inference. 36 | You can also use command line arguments, e.g.: 37 | ``` 38 | ./sliced_llama_server.py --model ~/path/to/llm-model-exl2/ --context-size 2048 --slices "0-24, 8-32" 39 | ``` 40 | Currently only exl2 models are supported. You can get them from [huggingface](https://huggingface.co/models?sort=trending&search=exl2). 41 | Make sure that the model fits into VRAM, with some extra memory depending on `(context size)² * (number of layers)`, where context size is the number of tokens the model can remember. 42 | 43 | The WebUI currently only supports text completion, so you need to do the prompt formatting yourself. Each model has its preferred prompt format, so look it up. 44 | 45 | ## Compatibility with other apps: 46 | As an alternative to the webUI, the server also works with OpenAI-compatible GUIs like [Mikupad](https://github.com/lmg-anon/mikupad) or [SillyTavern](https://github.com/SillyTavern/SillyTavern). 47 | 48 | 49 | - For SillyTavern, select chat completion, and use `http://127.0.0.1:57593/v1` as the custom endpoint. 50 | This will not give you many options, but if you change parameters in the WebUI, the inference server should remember them. 51 | You can select different chat templates in the WebUI. You can add more to the `chat_templates` folder.
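You can also call the API from your own scripts. Below is a minimal Python sketch (it assumes the `requests` package is installed and a model is already loaded; the model name is a placeholder, since the server uses whatever model is currently loaded). Because chat completion currently only works with streaming, it reads the response as server-sent events:

```python
import json
import requests  # assumed dependency: pip install requests

# stream a chat completion from the local server (default address 127.0.0.1:57593)
with requests.post(
    "http://127.0.0.1:57593/v1/chat/completions",
    json={
        "model": "whatever-is-loaded",  # required by the schema, but not used to pick a model
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data: "):  # skip SSE comments / keep-alive pings
            chunk = json.loads(line[len(b"data: "):])
            print(chunk["choices"][0]["delta"]["content"] or "", end="", flush=True)
```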
52 | 53 | ## TODO / missing features 54 | In no particular order: 55 | - configuration file 56 | - LoRA support 57 | - Classifier Free Guidance 58 | - OpenAI API: 59 | - chat completion currently only works with streaming 60 | - `presency_penalty` and `frequency_penalty` aren't supported 61 | - authentication 62 | - usage statistics 63 | - compatibility with TabbyAPI (For better SillyTavern integration) 64 | - merging different models together 65 | - different merging methods 66 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pydantic import BaseModel, Field 3 | 4 | 5 | class GenerationInput(BaseModel): 6 | prompt: str 7 | 8 | 9 | # OpenAI API 10 | 11 | 12 | 13 | # OpenAI Chat API 14 | 15 | class ChatMessage(BaseModel): 16 | content: str | None = None 17 | # tool_calls # not implemented 18 | role : str = "assistant" 19 | name : str | None = None 20 | 21 | 22 | class ChatRequest(BaseModel): 23 | messages: list[ChatMessage] = [] 24 | model: str 25 | frequency_penalty: float = 0.0 26 | logit_bias: dict[str, float] = {} 27 | logprobs: bool = False 28 | top_logprobs: int = 0 29 | max_tokens: int | None = None 30 | n: int = 1 31 | presence_penalty: float | None = None 32 | #response_format 33 | seed: int | None = None 34 | stop: list[str] | str | None = None 35 | stream: bool = False 36 | temperature: float = 1.0 37 | top_p: float = 1.0 38 | #tools 39 | #tool_choice 40 | #user 41 | 42 | ## OAI chat completions 43 | class TopLogprob(BaseModel): 44 | token: str 45 | logprob: float 46 | bytes: list[int] | None = None 47 | 48 | class ChatCompletionLogProb(BaseModel): 49 | token: str 50 | logprob: float 51 | bytes: list[int] | None = None 52 | top_logprobs: list[TopLogprob] 53 | 54 | class ChatCompletionLogProbs(BaseModel): 55 | content: list[ChatCompletionLogProb] = [] 56 | 57 | class ChatCompletionChoice(BaseModel): 58 | finish_reason: str | None = Field(default=None, description='"stop", "Length", or null') 59 | index : int = Field(default=0, description="n > 1 (multiple choices).") 60 | message: ChatMessage 61 | logprobs: ChatCompletionLogProbs | None = None 62 | 63 | class CompletionUsage(BaseModel): 64 | completion_tokens: int 65 | prompt_tokens: int 66 | total_tokens: int 67 | 68 | class ChatCompletion(BaseModel): 69 | id: str = "1234" 70 | choices: list[ChatCompletionChoice] 71 | created: int = 1234 72 | model: str 73 | system_fingerprint: str = "N.A." 74 | object: str = "chat.completion" 75 | usage: CompletionUsage | None # TODO 76 | 77 | class ChatCompletionChunkChoice(BaseModel): 78 | finish_reason: str | None = Field(default=None, description='"stop", "Length", or null') 79 | index : int = Field(default=0, description="n > 1 (multiple choices).") 80 | delta: ChatMessage 81 | logprobs: ChatCompletionLogProbs | None = None 82 | 83 | 84 | class ChatCompletionChunk(BaseModel): 85 | id: str = "1234" 86 | choices: list[ChatCompletionChunkChoice] 87 | created: int = 1234 88 | model: str 89 | system_fingerprint: str = "N.A." 
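    # object type reported to clients (the upstream OpenAI API marks streamed chunks as "chat.completion.chunk")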
90 | object: str = "chat.completion" 91 | 92 | 93 | ## OAI completions 94 | class CompletionRequest(BaseModel): 95 | model: str | None = None 96 | prompt: str 97 | stream: bool = Field(default = False) 98 | best_of: int | None = None 99 | echo: bool = False 100 | logit_bias: dict[int,int] | None = None 101 | logprobs: int | None = None 102 | max_tokens: int | None = None 103 | n: int = Field(default=1, description="This will be ignored. Batch processing is not yet implemented.") 104 | frequency_penalty: float | None = None 105 | presence_penalty: float | None = None 106 | seed: int | None = None 107 | stop: str | list[str] | None = None 108 | stream: bool = False 109 | suffix: str = "" 110 | temperature: float | None = None 111 | top_p: float | None = None 112 | user: str | None = None 113 | 114 | 115 | 116 | class CompletionLogProbs(BaseModel): 117 | text_offset: list[int] = [] 118 | token_logprobs: list[float] = [] 119 | tokens: list[str] = [] 120 | top_logprobs: list[dict[str, float]] | None = [] 121 | 122 | class CompletionChoice(BaseModel): 123 | finish_reason: str | None = 'length' # or 'stop 124 | index: int = 0 125 | logprobs: CompletionLogProbs | None = None 126 | text: str = "" 127 | 128 | class Completion(BaseModel): 129 | id: str = "1234" 130 | choices: list[CompletionChoice] 131 | created: int = 1234 132 | model: str 133 | system_fingerprint: str = "this fingerprint does not exist" 134 | object: str = "chat.completion" 135 | usage: CompletionUsage | None = None # TODO: add this & make it mandatory! 136 | 137 | # embeddings 138 | 139 | class EmbeddingsRequest(BaseModel): 140 | input: str | list[str] 141 | model: str | None = None 142 | encoding_format: str = 'float' 143 | user: str = "user1234" 144 | 145 | class Embeddings(BaseModel): 146 | index: int = 0 147 | embedding: list 148 | object: str = 'embedding' 149 | 150 | class PromptRequest: 151 | prompt: str 152 | class ValueResponse(BaseModel): 153 | value: int 154 | 155 | # tabbyAPI stuff 156 | class TabbyTokenEncodeRequest(BaseModel): 157 | add_bos_token: bool = True 158 | encode_special_tokens: bool = True 159 | decode_special_tokens: bool = True 160 | text: str 161 | 162 | class TabbyTokenDecodeRequest(BaseModel): 163 | add_bos_token: bool = True 164 | encode_special_tokens: bool = True 165 | decode_special_tokens: bool = True 166 | tokens: list[int] 167 | 168 | class ModelCardParameters(BaseModel): 169 | """Represents model card parameters.""" 170 | 171 | # Safe to do this since it's guaranteed to fetch a max seq len 172 | # from model_container 173 | max_seq_len: int | None = None 174 | rope_scale: float = 1.0 175 | rope_alpha: float = 1.0 176 | cache_mode: str = "FP16" 177 | prompt_template: str | None = None 178 | num_experts_per_token: int | None = None 179 | #use_cfg: Optional[bool] = None 180 | #draft: Optional["ModelCard"] = None 181 | 182 | 183 | 184 | class ModelInfo(BaseModel): 185 | """Represents a single model card.""" 186 | 187 | id: str = "not available" 188 | object: str = "model" 189 | created: int = 1234 # no idea what time this represents 190 | owned_by: str = "no idea" 191 | #logging: Optional[LogPreferences] = None 192 | parameters: ModelCardParameters | None = None 193 | available_layers: int | None = None 194 | used_layers: list[int] | None = None 195 | 196 | 197 | 198 | 199 | class LoadModelRequest(BaseModel): 200 | name: str | None = Field(default=None, description = "This is actually the path to the folder containing the safetensor and config files. 
The awkward name is for compatibility with TabbyAPI.") 201 | cache_mode: str = Field(default="FP8", description="use FP8 to save memory, or FP16 for better accuracy") 202 | max_seq_len: int | None = None 203 | layer_list: list[int] | None = None 204 | 205 | 206 | 207 | class LayerRearrangeRequest(BaseModel): 208 | layer_list: list[int] 209 | 210 | # other stuff 211 | class ModelLoadResponse(BaseModel): 212 | success: bool = True 213 | 214 | class LayerRearrangeResponse(BaseModel): 215 | success: bool = True 216 | 217 | class TemplatesResponse(BaseModel): 218 | object: str = "list" 219 | data: list[str] = [] 220 | 221 | class LoadTemplateRequest(BaseModel): 222 | name: str -------------------------------------------------------------------------------- /chat_templates/alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} 30 | -------------------------------------------------------------------------------- /chat_templates/chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} 3 | -------------------------------------------------------------------------------- /chat_templates/place_your_templates_here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/chat_templates/place_your_templates_here.txt -------------------------------------------------------------------------------- /pyvenv.cfg: -------------------------------------------------------------------------------- 1 | home = /usr/bin 2 | include-system-site-packages = false 3 | version = 3.11.6 4 | executable = /usr/bin/python3.11 5 | command = /usr/bin/python3 -m venv /home/silphendio/Documents/code/ai/sliced_llama 6 | -------------------------------------------------------------------------------- /screenshots/webui_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot1.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot2.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot3.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot4.png -------------------------------------------------------------------------------- /serve_chat_completions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from fastapi.encoders import jsonable_encoder 4 | from sse_starlette import EventSourceResponse 5 | from api import * 6 | from sliced_llama import SlicedLLama 7 | from fastapi import APIRouter, HTTPException 8 | import jinja2 9 | import os 10 | 11 | template_folders = [] 12 | 13 | llm : SlicedLLama 14 | 15 | jinja2_env = jinja2.Environment() 16 | default_template_string = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" 17 | chatml_template = jinja2_env.from_string(default_template_string) 18 | chatml_start_str = '<|im_start|>assistant\n' 19 | chatml_stop_str = '<|im_end|>' 20 | 21 | router = APIRouter() 22 | 23 | @router.post("/v1/chat/completions", response_model=ChatCompletion) 24 | async def oai_chat_completions(req: ChatRequest) -> ChatCompletion | EventSourceResponse: 25 | print("request received: ", req.model_dump()) 26 | print("----------------------------------") 27 | 28 | prompt = chatml_template.render(req.model_dump()) 29 | prompt += chatml_start_str 30 | print(prompt) 31 | print("#######################") 32 | print("prompt in ChatML format:\n" + prompt + "\n") 33 | 34 | # update gen settings (TODO: do this properly?)
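    # (every field of the request is merged into the shared sampler settings object,
    # so values like temperature and top_p persist for subsequent requests as well)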
35 | llm.gen_settings.__dict__ = dict(llm.gen_settings.__dict__, **req.model_dump()) 36 | 37 | # handle parameters 38 | stop_str = req.stop 39 | if stop_str is str: stop_str = [stop_str] 40 | if stop_str is None: stop_str = [] 41 | stop_str += [chatml_stop_str] 42 | 43 | 44 | if req.stream: 45 | async def event_generator(): 46 | ret_logprobs = ChatCompletionLogProbs() if req.logprobs and req.logprobs > 0 else None 47 | text = '' 48 | finish_reason = None 49 | if not req.max_tokens: req.max_tokens = 99999999 50 | llm.start_stream(prompt, stop_strings=stop_str, logprobs=req.logprobs) 51 | try: 52 | for i in range(req.max_tokens): 53 | ret = llm.stream_next_token() 54 | text += ret.chunk 55 | print(ret.chunk, end='') 56 | if ret.eos: 57 | finish_reason = "stop" 58 | break 59 | 60 | # log probs 61 | if ret_logprobs: 62 | ret_logprobs.content = ChatCompletionLogProb( 63 | text_offset = [0], 64 | logprob = ret.logprob, 65 | token = ret.chunk, 66 | top_logprobs = [ TopLogprob(token=tok, logprob=lp) for tok,lp in ret.top_logprobs.items()], 67 | ) 68 | 69 | stream_chunk_data = ChatCompletionChunk( 70 | model = llm.model_name, 71 | choices = [ChatCompletionChunkChoice( 72 | finish_reason=finish_reason, 73 | delta=ChatMessage(content=ret.chunk), 74 | logprobs=ret_logprobs 75 | )] 76 | ) 77 | yield json.dumps(jsonable_encoder(stream_chunk_data)) # remove class names, fix braces and quotes 78 | await asyncio.sleep(0) 79 | except asyncio.CancelledError: 80 | print(" - STREAM CANCELLED") 81 | 82 | return EventSourceResponse(event_generator()) 83 | 84 | else: # not streaming 85 | raise NotImplementedError # TODO 86 | 87 | # chat templates 88 | 89 | # file utils: these work with folders too 90 | def find_file(name: str, search_path: list[str] = [], ext: str = "") -> str | None: 91 | if not name.endswith(ext): 92 | name += ext 93 | if os.path.exists(name): 94 | return name 95 | for folder in search_path: 96 | if not os.path.isdir(folder): continue 97 | for filename in os.listdir(folder): 98 | if name == filename: 99 | return os.path.join(folder, filename) 100 | 101 | def list_files(search_path: list[str], ext: str = ""): 102 | results = [] 103 | for folder in search_path: 104 | if not os.path.isdir(folder): continue 105 | for filename in os.listdir(folder): 106 | if filename.endswith(ext): 107 | results.append(filename[:-len(ext)]) 108 | return results 109 | 110 | 111 | @router.get("/v1/templates") 112 | @router.get("/v1/template/list") 113 | async def get_templates() -> TemplatesResponse: 114 | print("template_folders", template_folders) 115 | return TemplatesResponse(data= list_files(template_folders, ".jinja")) 116 | 117 | @router.post("/v1/template/switch") 118 | async def switch_template(req: LoadTemplateRequest): 119 | global chatml_template 120 | print(req.name, template_folders) 121 | file_path = find_file(req.name, template_folders, ".jinja") 122 | if file_path == None: 123 | raise HTTPException(status_code=400, detail="template not found") 124 | 125 | with open(file_path, encoding="utf-8") as f: 126 | template_str = f.read() 127 | chatml_template = jinja2_env.from_string(template_str) 128 | 129 | 130 | @router.post("/v1/template/unload", description="This sets the template string to chatml") 131 | def unload_template(): 132 | global chatml_template 133 | chatml_template = jinja2_env.from_string(default_template_string) 134 | -------------------------------------------------------------------------------- /serve_completions.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from fastapi.encoders import jsonable_encoder 4 | from sse_starlette import EventSourceResponse 5 | from api import * 6 | from sliced_llama import SlicedLLama 7 | from fastapi import APIRouter 8 | 9 | llm : SlicedLLama 10 | 11 | router = APIRouter() 12 | 13 | @router.post("/v1/completions", response_model=CompletionChoice) 14 | async def oai_completions(req: CompletionRequest) -> CompletionChoice | EventSourceResponse: 15 | print("request received: ", req) 16 | 17 | # update gen settings (TODO: do this properly?) 18 | llm.gen_settings.__dict__ = dict(llm.gen_settings.__dict__, **req.model_dump()) 19 | 20 | # handle parameters 21 | stop_str = req.stop 22 | if stop_str is str: stop_str = [stop_str] 23 | if stop_str is None: stop_str = [] 24 | 25 | 26 | if req.stream: 27 | async def event_generator(): 28 | ret_logprobs = CompletionLogProbs() if req.logprobs and req.logprobs > 0 else None 29 | text = '' 30 | finish_reason = None 31 | if not req.max_tokens: req.max_tokens = 99999999 32 | llm.start_stream(req.prompt, stop_strings=stop_str, logprobs=req.logprobs) 33 | try: 34 | for i in range(req.max_tokens): 35 | ret = llm.stream_next_token() 36 | text += ret.chunk 37 | print(ret.chunk, end='') 38 | if ret.eos: 39 | finish_reason = "stop" 40 | break 41 | 42 | # log probs 43 | if ret_logprobs: 44 | ret_logprobs = CompletionLogProbs( 45 | text_offset = [0], 46 | token_logprobs = [ret.logprob], 47 | tokens = [ret.chunk], 48 | top_logprobs = [ret.top_logprobs], 49 | ) 50 | 51 | stream_chunk_data = Completion( 52 | model = llm.model_name, 53 | choices = [CompletionChoice(finish_reason=finish_reason, text=ret.chunk, logprobs=ret_logprobs)] 54 | ) 55 | yield json.dumps(jsonable_encoder(stream_chunk_data)) # remove class names, fix braces and quotes 56 | await asyncio.sleep(0) 57 | except asyncio.CancelledError: 58 | print(" - STREAM CANCELLED") 59 | 60 | return EventSourceResponse(event_generator()) 61 | 62 | else: # not streaming 63 | text = '' 64 | finish_reason = "length" 65 | ret_logprobs = CompletionLogProbs() if req.logprobs and req.logprobs > 0 else None 66 | if not req.max_tokens: req.max_tokens = 99999999 67 | llm.start_stream(req.prompt) 68 | for _ in range(req.max_tokens): 69 | ret = llm.stream_next_token() 70 | 71 | # logprobs 72 | if ret_logprobs: 73 | ret_logprobs.text_offset += [len(text)], 74 | ret_logprobs.token_logprobs += [ret.logprob], 75 | ret_logprobs.tokens = [ret.chunk], 76 | ret_logprobs.top_logprobs = [ret.top_logprobs], 77 | 78 | text += ret.chunk 79 | if ret.eos: 80 | finish_reason = "stop" 81 | break 82 | 83 | # logprobs 84 | 85 | return Completion( 86 | model = llm.model_name, 87 | choices = [CompletionChoice(finish_reason=finish_reason, text=text, logprobs=ret_logprobs)] 88 | ) 89 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from subprocess import run 3 | import sys 4 | import os 5 | 6 | # create venv, try different python commands 7 | python_cmds = ["python", "python3", "py"] 8 | print("creating virtual environment...") 9 | for cmd in python_cmds: 10 | if os.path.isdir(".venv"): 11 | break 12 | try: 13 | run([cmd, "-m", "venv", ".venv"]) 14 | except: pass 15 | if not os.path.isdir(".venv"): 16 | print("failed to create virtual environment.") 17 | 18 | 19 | # determine platform 20 | exl2_version 
= "0.1.5" 21 | torch_str = "torch2.3.1" 22 | torch_str_fa_linux = "torch2.3" 23 | 24 | fa_version = "2.5.9.post1" 25 | fa_gpu_str = "" 26 | 27 | py_str = "cp" + ''.join(sys.version.split(".")[:2]) 28 | 29 | gpu_str = "" # CUDA or ROCm versions 30 | os_str = "" #"win_amd64" or "linux_x86_64" 31 | 32 | # check for CUDA or ROCm 33 | try: 34 | a = run("nvidia-smi", capture_output=True) 35 | if b'CUDA Version: 12' in a.stdout: 36 | gpu_str = "cu121" 37 | fa_gpu_str = "cu122" 38 | elif b'CUDA Version: 11' in a.stdout: 39 | gpu_str = "cu118" 40 | except: 41 | try: 42 | a = run("rocminfo", capture_output=True) 43 | major_version = int(a.stdout.decode('utf-8').split('Runtime Version')[1].strip().split('.')[0]) 44 | 45 | if major_version == 5: 46 | gpu_str = "rocm5.6" 47 | elif major_version >= 6: 48 | gpu_str = "rocm6.0" 49 | except: 50 | print("no CUDA or ROCm found") 51 | 52 | # determine OS 53 | if sys.platform == 'win32': 54 | os_str = "win_amd64" 55 | if sys.platform == 'linux': 56 | os_str = "linux_x86_64" 57 | 58 | 59 | ## specify requirenments 60 | requirements = [ 61 | "fastapi", 62 | "sse-starlette", 63 | "uvicorn", 64 | "tokenizers", 65 | "setuptools", 66 | ] 67 | 68 | # exllamav2 69 | if os_str and gpu_str: 70 | requirements.append(f"https://github.com/turboderp/exllamav2/releases/download/v{exl2_version}/exllamav2-{exl2_version}+{gpu_str}.{torch_str}-{py_str}-{py_str}-{os_str}.whl") 71 | else: 72 | requirements.append("exllamav2") 73 | 74 | # flash_attn 75 | if gpu_str.startswith("cu") : 76 | if os_str == "win_amd64": 77 | requirements.append(f"https://github.com/bdashore3/flash-attention/releases/download/v{fa_version}/flash_attn-{fa_version}+{fa_gpu_str}{torch_str}cxx11abiFALSE-{py_str}-{py_str}-{os_str}.whl") 78 | 79 | # linux 80 | if os_str == "linux_x86_64": 81 | requirements.append(f"https://github.com/bdashore3/flash-attention/releases/download/v{fa_version}/flash_attn-{fa_version}+{fa_gpu_str}{torch_str_fa_linux}cxx11abiFALSE-{py_str}-{py_str}-{os_str}.whl") 82 | # there is no fallback for flash attention, because it's optional 83 | 84 | 85 | # install requirements 86 | 87 | pip_cmd = "./.venv/Scripts/pip" if sys.platform == 'win32' else "./.venv/bin/pip" 88 | 89 | # pytorch 90 | if gpu_str.startswith("cu") or os_str == "linux_x86_64": 91 | run([pip_cmd, "install", "torch", "--index-url", f"https://download.pytorch.org/whl/{gpu_str}"]) 92 | 93 | for req in requirements: 94 | run([pip_cmd, "install", req]) 95 | -------------------------------------------------------------------------------- /sliced_llama.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | import os 3 | 4 | import api 5 | 6 | 7 | class StreamTokenReturn: 8 | chunk: str = "" 9 | eos: bool = True 10 | logprob: float = 0.0 11 | top_logprobs: dict[str, float] = {} 12 | 13 | from exllamav2.generator import ExLlamaV2Sampler 14 | 15 | GenSettings = ExLlamaV2Sampler.Settings 16 | 17 | class SlicedLLama: 18 | model_name: str = "" 19 | layers: int = 0 20 | is_streaming: bool 21 | gen_settings : GenSettings 22 | 23 | def __init__(self): 24 | raise NotImplementedError 25 | 26 | def load_model(path: str, cache_type: str = "FP16", max_seq_len : int|None = None): 27 | raise NotImplementedError 28 | 29 | def unload_model(): 30 | raise NotImplementedError 31 | 32 | def rearrange_layers(self, layer_list: list[int]): 33 | raise NotImplementedError 34 | 35 | def start_stream(self, prompt: str, stop_strings: list[str]=[], logprobs = 0, **kwargs): 36 | raise 
NotImplementedError 37 | 38 | def stream_next_token(self) -> StreamTokenReturn: 39 | raise NotImplementedError 40 | 41 | def stop_stream(self): 42 | raise NotImplementedError 43 | 44 | 45 | def generate_response(self, max_length: int = 9999999, stop_strings: list[str] = []) -> str: 46 | raise NotImplementedError 47 | 48 | def tokenize(self, text: str, add_bos_token: bool=True, 49 | encode_special_tokens: bool=True, decode_special_tokens: bool=True) -> list[int]: 50 | raise NotImplementedError 51 | 52 | def decode_tokens(self, tokens: list[int], add_bos_token: bool=True, 53 | encode_special_tokens: bool=True, decode_special_tokens: bool=True) -> str: 54 | raise NotImplementedError 55 | 56 | def create_embeddings(self, text) -> list[float]: 57 | raise NotImplementedError 58 | 59 | def count_tokens(self, text) -> int: 60 | return len(self.tokenize(text)) 61 | -------------------------------------------------------------------------------- /sliced_llama_exl2.py: -------------------------------------------------------------------------------- 1 | import gc 2 | from exllamav2 import * 3 | from exllamav2.generator import * 4 | from exllamav2.module import ExLlamaV2Module 5 | import sys, torch 6 | from sliced_llama import SlicedLLama, StreamTokenReturn 7 | from typing import Generator 8 | import os 9 | from copy import copy 10 | import math 11 | 12 | from urllib.parse import urlparse # to allow loading from path urls 13 | 14 | #from streaming2 import ExLlamaV2StreamingGenerator2 15 | 16 | 17 | class SlicedLLamaExl2(SlicedLLama): 18 | config: ExLlamaV2Config 19 | model: ExLlamaV2 20 | tokenizer: ExLlamaV2Tokenizer 21 | generator: ExLlamaV2StreamingGenerator 22 | gen_settings: ExLlamaV2Sampler.Settings 23 | is_streaming: bool = False 24 | original_modules: list[ExLlamaV2Module] # for repeated layer rearranging 25 | available_layers: int = 0 26 | cache : ExLlamaV2Cache | ExLlamaV2Cache_8bit 27 | 28 | # extra config 29 | n_logprobs: int = 0 30 | 31 | def __init__(self): 32 | pass 33 | 34 | def load_model(self, model_path: str, cache_type: str = "FP16", max_seq_len : int|None = None): 35 | if hasattr(self, 'model'): 36 | self.model.modules = self.original_modules 37 | self.model.unload() 38 | self.cache = None 39 | 40 | try: model_path = urlparse(model_path).path 41 | except: pass 42 | 43 | self.config = ExLlamaV2Config() 44 | self.config.model_dir = model_path 45 | self.config.prepare() 46 | if max_seq_len: 47 | self.config.max_seq_len = max_seq_len 48 | self.model = ExLlamaV2(self.config) 49 | self.original_modules = self.model.modules 50 | self.available_layers = self.config.num_hidden_layers 51 | 52 | if cache_type == "FP16": 53 | self.cache = ExLlamaV2Cache(self.model, lazy = True) 54 | else: 55 | self.cache = ExLlamaV2Cache_8bit(self.model, lazy = True) 56 | 57 | self.model.load_autosplit(self.cache) 58 | 59 | 60 | self.tokenizer = ExLlamaV2Tokenizer(self.config) 61 | self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer) 62 | self.gen_settings = ExLlamaV2Sampler.Settings() 63 | 64 | 65 | def rearrange_layers(self, layer_list: list[int]): 66 | if len(layer_list) < 1: raise ValueError 67 | if any([x < 0 or x >= self.available_layers for x in layer_list]): 68 | print("Layer index out of range!") 69 | raise ValueError 70 | # modules arangement: [embedding, [...layers], rms-norm, head] 71 | # where each layer is attention, mlp 72 | self.model.modules = self.original_modules[:1] 73 | for i, idx in enumerate(layer_list): 74 | self.model.modules += 
[copy(self.original_modules[idx*2 + 1])] 75 | self.model.modules[-1].layer_idx = i # use different cache for copied layer 76 | self.model.modules += [copy(self.original_modules[idx*2 + 2])] 77 | self.model.modules += self.original_modules[-2:] 78 | self.model.head_layer_idx = len(self.model.modules) -1 79 | self.model.config.num_hidden_layers = len(layer_list) 80 | self.model.last_kv_layer_idx = len(self.model.modules) -4 81 | 82 | # reload cache 83 | print("deleting old cache...") 84 | Cache = type(self.cache) 85 | del self.cache 86 | print("creating new cache...") 87 | self.model.cache_map = {} 88 | self.model.set_cache_map() 89 | self.cache = Cache(self.model) 90 | self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer) 91 | self.generator.set_stop_conditions([self.tokenizer.eos_token_id]) 92 | print("layers sucessfully rearranged!") 93 | ids = [id(x) for x in self.cache.key_states] 94 | 95 | 96 | def start_stream(self, prompt: str, stop_strings: list[str]=[], logprobs = 0, **kwargs): 97 | print(self.gen_settings.__dict__) 98 | text_ids = self.tokenizer.encode(prompt, add_bos = True) 99 | #max_length = min(max_length, self.model.config.max_seq_len - text_ids.size(0)) # TODO: rope scaling?? 100 | 101 | self.generator.set_stop_conditions(stop_strings) 102 | self.generator.begin_stream(text_ids, self.gen_settings) 103 | self.is_streaming = True 104 | self.n_logprobs = logprobs if logprobs else 0 105 | self.generator.return_logits = True 106 | 107 | def stream_next_token(self) -> StreamTokenReturn: 108 | ret = StreamTokenReturn() 109 | if not self.is_streaming: 110 | return ret 111 | chunk, eos, token, logits = self.generator.stream() 112 | ret.eos = eos 113 | ret.chunk = chunk 114 | 115 | if self.n_logprobs > 0: 116 | ret.top_logprobs = {} 117 | if token.numel() == 0: 118 | return ret 119 | 120 | token = token.flatten()[0] 121 | 122 | logprobs = logits[0].log_softmax(-1) 123 | ret.logprob = float(logprobs[token]) 124 | top_logprobs = logprobs.topk(self.n_logprobs) 125 | 126 | for (prob, tok) in zip(*top_logprobs): 127 | tok_str = self.tokenizer.decode(tok.reshape(1,1))[0] 128 | # sometimes, two tokens have the same string 129 | if tok_str in ret.top_logprobs: # adding logits together is complicated 130 | ret.top_logprobs[tok_str] = math.log(math.exp(float(prob)) + 131 | math.exp(ret.top_logprobs[tok_str])) 132 | else: 133 | ret.top_logprobs[tok_str] = float(prob) 134 | 135 | return ret 136 | 137 | def stop_stream(self): 138 | self.is_streaming = False 139 | 140 | def tokenize(self, text): 141 | return self.tokenizer.encode(text, add_bos = True).flatten().tolist() 142 | 143 | def generate_response(self, max_length: int = 9999999, stop_strings: list = []) -> str: 144 | text = "" 145 | self.start_stream() 146 | for i in range(max_length): 147 | chunk, eos, _ = self.generator.stream() 148 | text += chunk 149 | if eos: return text 150 | return text 151 | 152 | 153 | def create_embeddings(self, text) -> list[float]: 154 | ids = self.tokenize(text) 155 | return my_sliced_llama.model.forward(ids, return_last_state=True)[1].flatten().tolist() 156 | 157 | def decode_tokens(self, tokens: list[int], add_bos_token: bool=True, 158 | encode_special_tokens: bool=True, decode_special_tokens: bool=True) -> str: 159 | self.tokenizer.decode(torch.tensor(tokens)) 160 | 161 | # test code 162 | 163 | def create_layer_list(slices_str: str) -> list[int]: 164 | try: 165 | layers = [] 166 | for s in slices_str.split(','): 167 | a,b = s.split('-') 168 | layers += list(range(int(a), 
int(b))) 169 | return layers 170 | 171 | except ValueError: 172 | return [] 173 | 174 | if __name__ == "__main__": 175 | my_sliced_llama : SlicedLLama = SlicedLLamaExl2() 176 | 177 | my_sliced_llama.load_model('TinyLlama-1.1B-Chat-v1.0-3.0bpw-h6-exl2') 178 | print("load #1") 179 | import time 180 | time.sleep(3.0) 181 | my_sliced_llama.load_model('TinyLlama-1.1B-Chat-v1.0-3.0bpw-h6-exl2') 182 | print("load #2") 183 | 184 | layer_list = create_layer_list('0-14, 8-22') 185 | 186 | finish_reason = "length" 187 | text = "" 188 | max_tokens = 512 189 | my_sliced_llama.start_stream("Once upon a time") 190 | for i in range(max_tokens): 191 | chunk, eos, _ = my_sliced_llama.stream_next_token() 192 | text += chunk 193 | print(chunk, end='') 194 | if eos: 195 | finish_reason = "stop" 196 | break 197 | 198 | print("--------------------") 199 | 200 | layer_list = create_layer_list('0-22') 201 | 202 | finish_reason = "length" 203 | text = "" 204 | max_tokens = 512 205 | my_sliced_llama.start_stream("Once upon a time") 206 | for i in range(max_tokens): 207 | chunk, eos, _ = my_sliced_llama.stream_next_token() 208 | text += chunk 209 | print(chunk, end='') 210 | if eos: 211 | finish_reason = "stop" 212 | break 213 | 214 | print("--------------------") 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /sliced_llama_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c '"`dirname $0`/.venv/bin/python" "$0" "$@"' 2 | 3 | # run `uvicorn main:app --reload` 4 | import argparse 5 | import webbrowser 6 | import uvicorn 7 | from contextlib import asynccontextmanager 8 | 9 | from fastapi import FastAPI, HTTPException 10 | 11 | from fastapi.middleware.cors import CORSMiddleware 12 | 13 | 14 | from fastapi.staticfiles import StaticFiles 15 | 16 | from api import * 17 | import os 18 | 19 | from sliced_llama import SlicedLLama 20 | from sliced_llama_exl2 import SlicedLLamaExl2 21 | import serve_completions as compl 22 | import serve_chat_completions as chat_compl 23 | 24 | import sys 25 | basedir_path = os.path.abspath(os.path.dirname(sys.argv[0])) 26 | 27 | 28 | llm : SlicedLLama = SlicedLLamaExl2() 29 | server_host = '127.0.0.1' 30 | server_port = 57593 31 | 32 | compl.llm = llm 33 | chat_compl.llm = llm 34 | chat_compl.template_folders = [os.path.join(basedir_path, "chat_templates")] 35 | 36 | 37 | # server stuff 38 | app = FastAPI() 39 | app.add_middleware( 40 | CORSMiddleware, 41 | allow_origins=["*"], 42 | allow_credentials=True, 43 | allow_methods=["*"], 44 | allow_headers=["*"], 45 | ) 46 | app.include_router(compl.router) 47 | app.include_router(chat_compl.router) 48 | 49 | 50 | @app.get("/v1/models") 51 | def get_models(): 52 | return { 53 | "object": "list", 54 | "data": [ 55 | { 56 | "id": llm.model_name, 57 | "object": "model", 58 | "created": 0, # too lazy to look up a real number 59 | "owned_by": "dunno" 60 | }, 61 | ] 62 | } 63 | 64 | 65 | # tabbyAPI stuff 66 | @app.post('/v1/token/encode') 67 | def tabby_encode_token(req: TabbyTokenEncodeRequest) -> list[int]: 68 | return llm.tokenize(req.text) 69 | 70 | @app.post('/v1/token/decode') 71 | def tabby_decode_token(req: TabbyTokenDecodeRequest) -> str: 72 | return llm.decode_tokens(req.tokens) 73 | 74 | # TODO 75 | @app.get('/v1/internal/model/info') 76 | def get_model_info() -> ModelInfo: 77 | print(llm) 78 | model_card = ModelInfo() 79 | model_card.id = llm.model_name 80 | model_card.available_layers = llm.available_layers 81 | 
return model_card 82 | 83 | @app.post('/v1/model/load') 84 | def load_model(req: LoadModelRequest) -> ModelLoadResponse: 85 | if req.name: 86 | try: 87 | llm.load_model(req.name, req.cache_mode, max_seq_len=req.max_seq_len) 88 | 89 | # use folder name as model name 90 | if req.name[-1] in "/\\": 91 | req.name = req.name[:-1] 92 | llm.model_name = os.path.basename(req.name) 93 | return ModelLoadResponse(success=True) 94 | except Exception as e: 95 | raise HTTPException(status_code=400, detail=str(e)) 96 | if req.layer_list: 97 | try: 98 | llm.rearrange_layers(req.layer_list) 99 | except Exception as e: 100 | raise HTTPException(status_code=400, detail=str(e)) 101 | 102 | 103 | # layer slicing 104 | @app.post('/v1/rearrange_layers') 105 | def rearrange_layers(req: LayerRearrangeRequest): 106 | try: 107 | llm.rearrange_layers(req.layer_list) 108 | except Exception as e: 109 | raise HTTPException(status_code=400, detail=str(e)) 110 | 111 | 112 | # from koboldcpp 113 | @app.get('api/extra/tokencount') 114 | def kobold_cpp_tokencount(req): 115 | """for compatability reasons only. Please don't use this""" 116 | return { 'value': llm.tokenizer.count_tokens(req.prompt)} 117 | 118 | 119 | # Barebones WebUI 120 | app.mount("/", StaticFiles(directory= os.path.join(basedir_path, "webui"), html=True)) 121 | 122 | 123 | 124 | def create_layer_list(slices_str: str) -> list[int]: 125 | try: 126 | layers = [] 127 | for s in slices_str.split(','): 128 | a,b = s.split('-') 129 | layers += list(range(int(a), int(b))) 130 | return layers 131 | 132 | except ValueError: 133 | return [] 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("-m", "--model", help="path to exl2 folder") 138 | parser.add_argument("-c", "--cache-mode", help="FP8 or FP16", default="FP16") 139 | parser.add_argument("-p", "--port", help="A unique number below 65535", type=int, default=server_port) 140 | parser.add_argument("-s", "--slices", help="layers to use, e.g.: '0-24, 8-32'") 141 | parser.add_argument("-l", "--max-seq-len", "--context-size", help="context size, a lower number saves VRAM", type=int) 142 | args = parser.parse_args() 143 | 144 | if args.model != None: 145 | load_model(LoadModelRequest(name=args.model, cache_mode=args.cache_mode, max_seq_len=args.max_seq_len)) 146 | if args.slices != None: 147 | layer_list = create_layer_list(args.slices) 148 | llm.rearrange_layers(layer_list) 149 | 150 | 151 | server_port = args.port 152 | 153 | webbrowser.open(server_host + ":" + str(server_port)) 154 | 155 | uvicorn.run( 156 | app, 157 | host=server_host, 158 | port=server_port, 159 | log_level="info", 160 | ) 161 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | .venv\Scripts\python sliced_llama_server.py -------------------------------------------------------------------------------- /webui/LICENSE-sse.js: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /webui/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/webui/favicon.ico -------------------------------------------------------------------------------- /webui/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-size: 1rem; 3 | display: flex; 4 | min-height: 100vh; 5 | flex-direction: column; 6 | flex: 1; 7 | margin: 0 auto; 8 | max-width: 60rem; 9 | padding: 0 0.5rem; 10 | } 11 | select, input, button { 12 | padding: 0.3rem; 13 | margin: 0.3rem; 14 | } 15 | 16 | details { 17 | border: 1px solid; 18 | border-radius: 5px; 19 | padding: 1rem; 20 | } 21 | summary { 22 | font-weight: bold; 23 | } 24 | 25 | 26 | 27 | #tooltip { 28 | position: fixed; 29 | display: inline-block; 30 | background-color: bisque; 31 | z-index: 1; 32 | padding: 5px; 33 | margin: 0 auto; 34 | } 35 | 36 | .token:hover { 37 | background-color:bisque; 38 | } 39 | .token_prob { 40 | background-color:white; 41 | text-align: center; 42 | margin: 5px; 43 | padding: 2px; 44 | border: thin solid; 45 | white-space: nowrap; 46 | float: left; 47 | } 48 | 49 | #completions_area { 50 | border: 1px solid; 51 | border-radius: 5px; 52 | margin-bottom: 1rem; 53 | padding: 1rem; 54 | resize: both; 55 | overflow: auto; 56 | border: thin solid; 57 | white-space: pre-wrap; 58 | 59 | } 60 | 61 | #model_info { 62 | border: thin solid; 63 | padding: 0.2rem; 64 | margin: 0.2rem; 65 | display: block; 66 | } 67 | 68 | @media (prefers-color-scheme: dark) { 69 | #tooltip { 70 | background-color: indigo; 71 | } 72 | .token:hover { 73 | background-color: indigo; 74 | } 75 | .token_prob { 76 | background-color: indigo; 77 | } 78 | } -------------------------------------------------------------------------------- /webui/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |Or drag it here...
33 |