├── loras
    └── place-your-loras-here.txt
├── models
    └── place-your-models-here.txt
├── .github
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report_template.yml
    ├── dependabot.yml
    └── workflows
    │   └── stale.yml
├── extensions
    ├── ngrok
    │   ├── requirements.txt
    │   ├── script.py
    │   └── README.md
    ├── silero_tts
    │   ├── outputs
    │   │   └── outputs-will-be-saved-here.txt
    │   ├── requirements.txt
    │   └── test_tts.py
    ├── elevenlabs_tts
    │   ├── outputs
    │   │   └── outputs-will-be-saved-here.txt
    │   └── requirements.txt
    ├── multimodal
    │   ├── pipelines
    │   │   ├── place-additional-pipelines-here.txt
    │   │   └── llava
    │   │   │   ├── README.md
    │   │   │   ├── pipelines.py
    │   │   │   └── llava.py
    │   ├── abstract_pipeline.py
    │   ├── pipeline_loader.py
    │   └── script.py
    ├── google_translate
    │   ├── requirements.txt
    │   └── script.py
    ├── api
    │   ├── requirements.txt
    │   ├── script.py
    │   ├── streaming_api.py
    │   └── util.py
    ├── openai
    │   ├── requirements.txt
    │   └── cache_embedding_model.py
    ├── whisper_stt
    │   ├── requirements.txt
    │   └── script.py
    ├── superbooga
    │   ├── requirements.txt
    │   ├── download_urls.py
    │   └── chromadb.py
    ├── llava
    │   └── script.py
    ├── sd_api_pictures
    │   ├── style.css
    │   └── README.MD
    ├── send_pictures
    │   └── script.py
    ├── gallery
    │   └── script.py
    └── character_bias
    │   └── script.py
├── training
    ├── datasets
    │   └── put-trainer-datasets-here.txt
    └── formats
    │   ├── alpaca-chatbot-format.json
    │   └── alpaca-format.json
├── presets
    ├── Debug-deterministic.yaml
    ├── Mirostat.yaml
    ├── StarChat.yaml
    ├── Contrastive Search.yaml
    ├── Yara.yaml
    ├── Asterism.yaml
    ├── Shortwave.yaml
    ├── simple-1.yaml
    ├── LLaMA-Precise.yaml
    ├── Space Alien.yaml
    ├── tfs-with-top-a.yaml
    ├── Divine Intellect.yaml
    ├── Kobold-Godlike.yaml
    ├── Midnight Enigma.yaml
    ├── Big O.yaml
    └── Titanic.yaml
├── prompts
    ├── QA.txt
    ├── GPT-4chan.txt
    └── Alpaca-with-Input.txt
├── characters
    ├── Example.png
    ├── instruction-following
    │   ├── RWKV-Raven.yaml
    │   ├── Galactica Cite.yaml
    │   ├── Galactica Q.yaml
    │   ├── Galactica Summary.yaml
    │   ├── WizardLM.yaml
    │   ├── INCITE-Instruct.yaml
    │   ├── Ziya.yaml
    │   ├── INCITE-Chat.yaml
    │   ├── KoAlpaca.yaml
    │   ├── Minotaur.yaml
    │   ├── ChatGLM.yaml
    │   ├── Galactica Finetuned.yaml
    │   ├── Galactica Work.yaml
    │   ├── Galactica.yaml
    │   ├── H2O-human_bot.yaml
    │   ├── Manticore Chat.yaml
    │   ├── Metharme.yaml
    │   ├── Tulu.yaml
    │   ├── Bactrian.yaml
    │   ├── Gorilla.yaml
    │   ├── Guanaco-QLoRA.yaml
    │   ├── Wizard-Mega ShareGPT.yaml
    │   ├── Koala.yaml
    │   ├── Open Assistant.yaml
    │   ├── Wizard-Mega.yaml
    │   ├── Guanaco non-chat.yaml
    │   ├── H2O-prompt_answer.yaml
    │   ├── Hippogriff.yaml
    │   ├── Galactica v2.yaml
    │   ├── Samantha.yaml
    │   ├── Starchat-Beta.yaml
    │   ├── StableVicuna.yaml
    │   ├── Orca Mini.yaml
    │   ├── Alpaca.yaml
    │   ├── Wizard-Mega WizardLM.yaml
    │   ├── Vicuna-v1.1.yaml
    │   ├── Vigogne-Instruct.yaml
    │   ├── Guanaco.yaml
    │   ├── Vicuna-v0.yaml
    │   ├── Chinese-Vicuna-Chat.yaml
    │   ├── LLaVA.yaml
    │   ├── Bluemoon.yaml
    │   ├── MPT-Chat.yaml
    │   ├── StableLM.yaml
    │   ├── Baize.yaml
    │   ├── Vigogne-Chat.yaml
    │   ├── OpenBuddy.yaml
    │   └── MOSS.yaml
    └── Example.yaml
├── docker
    ├── .dockerignore
    ├── docker-compose.yml
    ├── .env.example
    └── Dockerfile
├── css
    ├── chat.js
    ├── html_readable_style.css
    ├── main.js
    ├── chat_style-wpp.css
    ├── chat_style-cai-chat.css
    ├── html_instruct_style.css
    ├── chat_style-messenger.css
    ├── chat.css
    ├── html_4chan_style.css
    ├── chat_style-TheEncrypted777.css
    └── main.css
├── modules
    ├── relative_imports.py
    ├── block_requests.py
    ├── github.py
    ├── monkey_patch_gptq_lora.py
    ├── presets.py
    ├── callbacks.py
    ├── loaders.py
    ├── deepspeed_parameters.py
    ├── AutoGPTQ_loader.py
    ├── llamacpp_model.py
    ├── utils.py
    ├── ui.py
    ├── exllama.py
    ├── logging_colors.py
    ├── exllama_hf.py
    ├── models_settings.py
    └── LoRA.py
├── docs
    ├── Windows-installation-guide.md
    ├── Audio-Notification.md
    ├── Generation-parameters.md
    ├── ExLlama.md
    ├── README.md
    ├── DeepSpeed.md
    ├── Low-VRAM-guide.md
    ├── llama.cpp-models.md
    ├── System-requirements.md
    ├── LLaMA-model.md
    ├── FlexGen.md
    ├── Chat-mode.md
    ├── LoRA.md
    ├── RWKV-model.md
    ├── WSL-installation-guide.md
    └── Spell-book.md
├── .gitignore
├── settings-template.yaml
├── requirements.txt
├── convert-to-safetensors.py
├── api-examples
    ├── api-example.py
    ├── api-example-stream.py
    ├── api-example-chat.py
    └── api-example-chat-stream.py
└── convert-to-flexgen.py


/loras/place-your-loras-here.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/models/place-your-models-here.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | ko_fi: oobabooga
2 | 


--------------------------------------------------------------------------------
/extensions/ngrok/requirements.txt:
--------------------------------------------------------------------------------
1 | ngrok==0.*
2 | 


--------------------------------------------------------------------------------
/training/datasets/put-trainer-datasets-here.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/presets/Debug-deterministic.yaml:
--------------------------------------------------------------------------------
1 | do_sample: false
2 | 


--------------------------------------------------------------------------------
/extensions/silero_tts/outputs/outputs-will-be-saved-here.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/presets/Mirostat.yaml:
--------------------------------------------------------------------------------
1 | mirostat_mode: 2
2 | mirostat_tau: 8
3 | 


--------------------------------------------------------------------------------
/extensions/elevenlabs_tts/outputs/outputs-will-be-saved-here.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extensions/elevenlabs_tts/requirements.txt:
--------------------------------------------------------------------------------
1 | elevenlabs==0.2.*
2 | 


--------------------------------------------------------------------------------
/extensions/multimodal/pipelines/place-additional-pipelines-here.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extensions/google_translate/requirements.txt:
--------------------------------------------------------------------------------
1 | deep-translator==1.9.2
2 | 


--------------------------------------------------------------------------------
/presets/StarChat.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.2
2 | top_p: 0.95
3 | top_k: 50
4 | 


--------------------------------------------------------------------------------
/extensions/api/requirements.txt:
--------------------------------------------------------------------------------
1 | flask_cloudflared==0.0.12
2 | websockets==11.0.2


--------------------------------------------------------------------------------
/extensions/openai/requirements.txt:
--------------------------------------------------------------------------------
1 | flask_cloudflared==0.0.12
2 | sentence-transformers


--------------------------------------------------------------------------------
/presets/Contrastive Search.yaml:
--------------------------------------------------------------------------------
1 | do_sample: false
2 | top_k: 4
3 | penalty_alpha: 0.3
4 | 


--------------------------------------------------------------------------------
/prompts/QA.txt:
--------------------------------------------------------------------------------
1 | Common sense questions and answers
2 | 
3 | Question: 
4 | Factual answer:
5 | 


--------------------------------------------------------------------------------
/presets/Yara.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.82
2 | top_p: 0.21
3 | repetition_penalty: 1.19
4 | top_k: 72
5 | 


--------------------------------------------------------------------------------
/extensions/silero_tts/requirements.txt:
--------------------------------------------------------------------------------
1 | ipython
2 | num2words
3 | omegaconf
4 | pydub
5 | PyYAML
6 | 


--------------------------------------------------------------------------------
/presets/Asterism.yaml:
--------------------------------------------------------------------------------
1 | temperature: 1.68
2 | top_p: 0.17
3 | repetition_penalty: 1.02
4 | top_k: 77
5 | 


--------------------------------------------------------------------------------
/presets/Shortwave.yaml:
--------------------------------------------------------------------------------
1 | temperature: 1.53
2 | top_p: 0.64
3 | repetition_penalty: 1.07
4 | top_k: 33
5 | 


--------------------------------------------------------------------------------
/presets/simple-1.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.7
2 | top_p: 0.9
3 | repetition_penalty: 1.15
4 | top_k: 20
5 | 


--------------------------------------------------------------------------------
/presets/LLaMA-Precise.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.7
2 | top_p: 0.1
3 | repetition_penalty: 1.18
4 | top_k: 40
5 | 


--------------------------------------------------------------------------------
/presets/Space Alien.yaml:
--------------------------------------------------------------------------------
1 | temperature: 1.31
2 | top_p: 0.29
3 | repetition_penalty: 1.09
4 | top_k: 72
5 | 


--------------------------------------------------------------------------------
/presets/tfs-with-top-a.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.7
2 | tfs: 0.95
3 | top_a: 0.2
4 | repetition_penalty: 1.15
5 | 


--------------------------------------------------------------------------------
/characters/Example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bettyyy2/text-generation-webui/HEAD/characters/Example.png


--------------------------------------------------------------------------------
/extensions/whisper_stt/requirements.txt:
--------------------------------------------------------------------------------
1 | SpeechRecognition==3.10.0
2 | openai-whisper
3 | soundfile
4 | ffmpeg
5 | 


--------------------------------------------------------------------------------
/presets/Divine Intellect.yaml:
--------------------------------------------------------------------------------
1 | temperature: 1.31
2 | top_p: 0.14
3 | repetition_penalty: 1.17
4 | top_k: 49
5 | 


--------------------------------------------------------------------------------
/presets/Kobold-Godlike.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.7
2 | top_p: 0.5
3 | typical_p: 0.19
4 | repetition_penalty: 1.1
5 | 


--------------------------------------------------------------------------------
/presets/Midnight Enigma.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.98
2 | top_p: 0.37
3 | repetition_penalty: 1.18
4 | top_k: 100
5 | 


--------------------------------------------------------------------------------
/presets/Big O.yaml:
--------------------------------------------------------------------------------
1 | temperature: 0.87
2 | top_p: 0.99
3 | typical_p: 0.68
4 | tfs: 0.68
5 | repetition_penalty: 1.01
6 | top_k: 85
7 | 


--------------------------------------------------------------------------------
/prompts/GPT-4chan.txt:
--------------------------------------------------------------------------------
1 | -----
2 | --- 865467536
3 | Hello, AI frens!
4 | How are you doing on this fine day?
5 | --- 865467537
6 | 
7 | 


--------------------------------------------------------------------------------
/extensions/superbooga/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.12.2
2 | chromadb==0.3.18
3 | posthog==2.4.2
4 | sentence_transformers==2.2.2
5 | 


--------------------------------------------------------------------------------
/presets/Titanic.yaml:
--------------------------------------------------------------------------------
1 | temperature: 1.01
2 | top_p: 0.21
3 | repetition_penalty: 1.21
4 | encoder_repetition_penalty: 1.07
5 | top_k: 91
6 | 


--------------------------------------------------------------------------------
/docker/.dockerignore:
--------------------------------------------------------------------------------
 1 | .env
 2 | Dockerfile
 3 | /characters
 4 | /loras
 5 | /models
 6 | /presets
 7 | /prompts
 8 | /softprompts
 9 | /training
10 | 


--------------------------------------------------------------------------------
/characters/instruction-following/RWKV-Raven.yaml:
--------------------------------------------------------------------------------
1 | user: "Bob:"
2 | bot: "Alice:"
3 | turn_template: "<|user|> <|user-message|>\n\n<|bot|> <|bot-message|>\n\n"
4 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica Cite.yaml:
--------------------------------------------------------------------------------
1 | user: ""
2 | bot: "[START_REF]"
3 | turn_template: "<|user-message|> <|bot|><|bot-message|>\n\n"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica Q.yaml:
--------------------------------------------------------------------------------
1 | user: "Q:"
2 | bot: "A:"
3 | turn_template: "<|user|> <|user-message|>\n\n<|bot|> <|bot-message|>\n\n"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica Summary.yaml:
--------------------------------------------------------------------------------
1 | user: ""
2 | bot: "TLDR:"
3 | turn_template: "<|user-message|>\n\n<|bot|><|bot-message|>\n\n"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/WizardLM.yaml:
--------------------------------------------------------------------------------
1 | user: ""
2 | bot: "### Response:"
3 | turn_template: "<|user-message|>\n\n<|bot|><|bot-message|>\n\n</s>"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/INCITE-Instruct.yaml:
--------------------------------------------------------------------------------
1 | user: "Q:"
2 | bot: "A:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Ziya.yaml:
--------------------------------------------------------------------------------
1 | user: "<human>:"
2 | bot: "<bot>:"
3 | turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/INCITE-Chat.yaml:
--------------------------------------------------------------------------------
1 | user: "<human>:"
2 | bot: "<bot>:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/KoAlpaca.yaml:
--------------------------------------------------------------------------------
1 | user: "### 질문:"
2 | bot: "### 답변:"
3 | turn_template: "<|user|> <|user-message|>\n\n<|bot|><|bot-message|>\n\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Minotaur.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/ChatGLM.yaml:
--------------------------------------------------------------------------------
1 | user: "[Round <|round|>]\n问："
2 | bot: "答："
3 | turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica Finetuned.yaml:
--------------------------------------------------------------------------------
1 | user: "<question>"
2 | bot: "<answer>"
3 | turn_template: "<|user|><|user-message|><|bot|><|bot-message|>"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica Work.yaml:
--------------------------------------------------------------------------------
1 | user: "Question:"
2 | bot: "<work>"
3 | turn_template: "<|user|> <|user-message|>\n\n<|bot|><|bot-message|>\n\n"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica.yaml:
--------------------------------------------------------------------------------
1 | user: "Question:"
2 | bot: "Answer:"
3 | context: ""
4 | turn_template: "<|user|> <|user-message|>\n\n<|bot|> <|bot-message|>\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/H2O-human_bot.yaml:
--------------------------------------------------------------------------------
1 | user: "<human>:"
2 | bot: "<bot>:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Manticore Chat.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Metharme.yaml:
--------------------------------------------------------------------------------
1 | user: "<|user|>"
2 | bot: "<|model|>"
3 | context: "<|system|>"
4 | turn_template: "<|user|><|user-message|><|bot|><|bot-message|>"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Tulu.yaml:
--------------------------------------------------------------------------------
1 | user: "<|user|>"
2 | bot: "<|assistant|>"
3 | context: ""
4 | turn_template: "<|user|>\n<|user-message|>\n<|bot|>\n<|bot-message|>\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Bactrian.yaml:
--------------------------------------------------------------------------------
1 | user: "### Input:"
2 | bot: "### Output:"
3 | turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Gorilla.yaml:
--------------------------------------------------------------------------------
1 | user: "###USER:"
2 | bot: "###ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|></s>\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Guanaco-QLoRA.yaml:
--------------------------------------------------------------------------------
1 | user: "### Human:"
2 | bot: "### Assistant:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|></s>\n"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/Wizard-Mega ShareGPT.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|> <|bot|> <|bot-message|></s>"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Koala.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "GPT:"
3 | turn_template: "<|user|> <|user-message|> <|bot|><|bot-message|></s>"
4 | context: "BEGINNING OF CONVERSATION: "
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Open Assistant.yaml:
--------------------------------------------------------------------------------
1 | user: "<|prompter|>"
2 | bot: "<|assistant|>"
3 | turn_template: "<|user|><|user-message|><|endoftext|><|bot|><|bot-message|><|endoftext|>"
4 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Wizard-Mega.yaml:
--------------------------------------------------------------------------------
1 | user: "### Instruction:"
2 | bot: "### Assistant:"
3 | turn_template: "<|user|> <|user-message|>\n\n<|bot|> <|bot-message|>\n\n"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Guanaco non-chat.yaml:
--------------------------------------------------------------------------------
1 | user: "### Instruction:"
2 | bot: "### Response:"
3 | turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n"
4 | context: ""


--------------------------------------------------------------------------------
/characters/instruction-following/H2O-prompt_answer.yaml:
--------------------------------------------------------------------------------
1 | user: "<|prompt|>"
2 | bot: "<|answer|>"
3 | turn_template: "<|user|><|user-message|><|endoftext|><|bot|><|bot-message|><|endoftext|>"
4 | context: ""
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Hippogriff.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|></s>\n"
4 | context: "You are a helpful assistant\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Galactica v2.yaml:
--------------------------------------------------------------------------------
1 | user: "<human>"
2 | bot: "<bot>"
3 | turn_template: "<|user|><|user-message|><|bot|><|bot-message|>"
4 | context: "<prefix>You are a helpful chatbot name Stan</prefix>"


--------------------------------------------------------------------------------
/characters/instruction-following/Samantha.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|></s>\n"
4 | context: "You are Samantha, a sentient AI.\n\n"
5 | 


--------------------------------------------------------------------------------
/training/formats/alpaca-chatbot-format.json:
--------------------------------------------------------------------------------
1 | {
2 |     "instruction,output": "User: %instruction%\nAssistant: %output%",
3 |     "instruction,input,output": "User: %instruction%: %input%\nAssistant: %output%"
4 | }
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Starchat-Beta.yaml:
--------------------------------------------------------------------------------
1 | user: "<|user|>"
2 | bot: "<|assistant|>"
3 | context: "<|system|>\n<|end|>\n"
4 | turn_template: "<|user|>\n<|user-message|><|end|>\n<|bot|>\n<|bot-message|><|end|>\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/StableVicuna.yaml:
--------------------------------------------------------------------------------
1 | user: "### Human:"
2 | bot: "### Assistant:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n\n"
4 | context: "### Assistant: I am StableVicuna, a large language model created by CarperAI. I am here to chat!\n\n"


--------------------------------------------------------------------------------
/characters/instruction-following/Orca Mini.yaml:
--------------------------------------------------------------------------------
1 | user: "### User:"
2 | bot: "### Response:"
3 | turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n"
4 | context: "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Alpaca.yaml:
--------------------------------------------------------------------------------
1 | user: "### Instruction:"
2 | bot: "### Response:"
3 | turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n"
4 | context: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
5 | 


--------------------------------------------------------------------------------
/prompts/Alpaca-with-Input.txt:
--------------------------------------------------------------------------------
 1 | Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 2 | 
 3 | ### Instruction:
 4 | Instruction
 5 | 
 6 | ### Input:
 7 | Input
 8 | 
 9 | ### Response:
10 | 
11 | 


--------------------------------------------------------------------------------
/extensions/llava/script.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | 
3 | from modules.logging_colors import logger
4 | 
5 | 
6 | def ui():
7 |     gr.Markdown("### This extension is deprecated, use \"multimodal\" extension instead")
8 |     logger.error("LLaVA extension is deprecated, use \"multimodal\" extension instead")
9 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Wizard-Mega WizardLM.yaml:
--------------------------------------------------------------------------------
1 | user: "### Instruction:"
2 | bot: "### Response:"
3 | turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n"
4 | context: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Vicuna-v1.1.yaml:
--------------------------------------------------------------------------------
1 | user: "USER:"
2 | bot: "ASSISTANT:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|></s>\n"
4 | context: "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Vigogne-Instruct.yaml:
--------------------------------------------------------------------------------
1 | user: "### Instruction:"
2 | bot: "### Réponse:"
3 | turn_template: "<|user|>\n<|user-message|>\n\n<|bot|>\n<|bot-message|>\n\n"
4 | context: "Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière précise à la demande.\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Guanaco.yaml:
--------------------------------------------------------------------------------
1 | user: "### Human:"
2 | bot: "### Assistant:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n"
4 | context: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Vicuna-v0.yaml:
--------------------------------------------------------------------------------
1 | user: "### Human:"
2 | bot: "### Assistant:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n"
4 | context: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Chinese-Vicuna-Chat.yaml:
--------------------------------------------------------------------------------
1 | user: "User:"
2 | bot: "Assistant:"
3 | turn_template: "<|user|><|user-message|>\n\n<|bot|><|bot-message|>\n\n"
4 | context: "The following is a conversation between an AI assistant called Assistant and a human user called User. The assistant is intelligent, knowledgeable and polite to answer questions of user.\n\n"
5 | 


--------------------------------------------------------------------------------
/css/chat.js:
--------------------------------------------------------------------------------
1 | document.getElementById("main").childNodes[0].style = "max-width: 800px; margin-left: auto; margin-right: auto";
2 | document.getElementById("extensions").style.setProperty("max-width", "800px");
3 | document.getElementById("extensions").style.setProperty("margin-left", "auto");
4 | document.getElementById("extensions").style.setProperty("margin-right", "auto");
5 | 


--------------------------------------------------------------------------------
/extensions/api/script.py:
--------------------------------------------------------------------------------
1 | import extensions.api.blocking_api as blocking_api
2 | import extensions.api.streaming_api as streaming_api
3 | from modules import shared
4 | 
5 | 
6 | def setup():
7 |     blocking_api.start_server(shared.args.api_blocking_port, share=shared.args.public_api)
8 |     streaming_api.start_server(shared.args.api_streaming_port, share=shared.args.public_api)
9 | 


--------------------------------------------------------------------------------
/modules/relative_imports.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from pathlib import Path
 3 | 
 4 | 
 5 | class RelativeImport:
 6 |     def __init__(self, path):
 7 |         self.import_path = Path(path)
 8 | 
 9 |     def __enter__(self):
10 |         sys.path.insert(0, str(self.import_path))
11 | 
12 |     def __exit__(self, exc_type, exc_value, traceback):
13 |         sys.path.remove(str(self.import_path))
14 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an improvement or new feature for the web UI
 4 | title: ''
 5 | labels: 'enhancement'
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Description**
11 | 
12 | A clear and concise description of what you want to be implemented.
13 | 
14 | **Additional Context**
15 | 
16 | If applicable, please provide any extra information, external links, or screenshots that could be useful.
17 | 


--------------------------------------------------------------------------------
/docs/Windows-installation-guide.md:
--------------------------------------------------------------------------------
 1 | If you are having trouble following the installation instructions in the README, Reddit user [Technical_Leather949](https://www.reddit.com/user/Technical_Leather949/) has created a more detailed, step-by-step guide covering:
 2 | 
 3 | * Windows installation
 4 | * 8-bit mode on Windows
 5 | * LLaMA
 6 | * LLaMA 4-bit
 7 | 
 8 | The guide can be found here: https://www.reddit.com/r/LocalLLaMA/comments/11o6o3f/how_to_install_llama_8bit_and_4bit/
 9 | 
10 | 


--------------------------------------------------------------------------------
/extensions/openai/cache_embedding_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # preload the embedding model, useful for Docker images to prevent re-download on config change
3 | # Dockerfile:
4 | # ENV OPENEDAI_EMBEDDING_MODEL=all-mpnet-base-v2 # Optional
5 | # RUN python3 cache_embedded_model.py
6 | import os, sentence_transformers
7 | st_model = os.environ["OPENEDAI_EMBEDDING_MODEL"] if "OPENEDAI_EMBEDDING_MODEL" in os.environ else "all-mpnet-base-v2"
8 | model = sentence_transformers.SentenceTransformer(st_model)
9 | 


--------------------------------------------------------------------------------
/training/formats/alpaca-format.json:
--------------------------------------------------------------------------------
1 | {
2 |     "instruction,output": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n%instruction%\n\n### Response:\n%output%",
3 |     "instruction,input,output": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n%instruction%\n\n### Input:\n%input%\n\n### Response:\n%output%"
4 | }
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/LLaVA.yaml:
--------------------------------------------------------------------------------
1 | user: "### Human:"
2 | bot: "### Assistant:"
3 | turn_template: "<|user|> <|user-message|><|bot|> <|bot-message|>\n"
4 | context: "You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.### Human: Hi!### Assistant: Hi there! How can I help you today?\n"
5 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Bluemoon.yaml:
--------------------------------------------------------------------------------
1 | user: "LEAD:"
2 | bot: "ASSOCIATE:"
3 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|></s>\n"
4 | context: "A transcript of a roleplay between two players, LEAD and ASSOCIATE. LEAD sets up a scenario and the characters, from which ASSOCIATE then assumes a character role and continues the story for that role in response to description given by LEAD. The story and characters are developed by exchange of detailed event descriptions and character dialogs, successively given by both LEAD and ASSOCIATE.\n"
5 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "pip" # See documentation for possible values
 9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | 


--------------------------------------------------------------------------------
/modules/block_requests.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | from modules.logging_colors import logger
 4 | 
 5 | 
 6 | class RequestBlocker:
 7 | 
 8 |     def __enter__(self):
 9 |         self.original_get = requests.get
10 |         requests.get = my_get
11 | 
12 |     def __exit__(self, exc_type, exc_value, traceback):
13 |         requests.get = self.original_get
14 | 
15 | 
16 | def my_get(url, **kwargs):
17 |     logger.info('Unwanted HTTP request redirected to localhost :)')
18 |     kwargs.setdefault('allow_redirects', True)
19 |     return requests.api.request('get', 'http://127.0.0.1/', **kwargs)
20 | 


--------------------------------------------------------------------------------
/characters/instruction-following/MPT-Chat.yaml:
--------------------------------------------------------------------------------
 1 | user: "user"
 2 | bot: "assistant"
 3 | context: |
 4 |   <|im_start|>system
 5 |   - You are a helpful assistant chatbot trained by MosaicML.
 6 |   - You answer questions.
 7 |   - You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
 8 |   - You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>
 9 | turn_template: "<|im_start|><|user|>\n<|user-message|><|im_end|>\n<|im_start|><|bot|>\n<|bot-message|><|im_end|>\n"
10 | 
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | cache
 2 | characters
 3 | training/datasets
 4 | extensions/silero_tts/outputs
 5 | extensions/elevenlabs_tts/outputs
 6 | extensions/sd_api_pictures/outputs
 7 | extensions/multimodal/pipelines
 8 | logs
 9 | loras
10 | models
11 | presets
12 | repositories
13 | softprompts
14 | torch-dumps
15 | *pycache*
16 | */*pycache*
17 | */*/pycache*
18 | venv/
19 | .venv/
20 | .vscode
21 | .idea/
22 | *.bak
23 | *.ipynb
24 | *.log
25 | 
26 | settings.json
27 | settings.yaml
28 | notification.mp3
29 | img_bot*
30 | img_me*
31 | prompts/[0-9]*
32 | models/config-user.yaml
33 | 
34 | .DS_Store
35 | Thumbs.db
36 | 


--------------------------------------------------------------------------------
/extensions/sd_api_pictures/style.css:
--------------------------------------------------------------------------------
 1 | /* Align the elements for SD_api_picture extension */
 2 | .SDAP #sampler_box {
 3 |   padding-top: var(--spacing-sm);
 4 |   padding-bottom: var(--spacing-sm);
 5 | }
 6 | 
 7 | .SDAP #seed_box,
 8 | .SDAP #cfg_box {
 9 |   padding-top: var(--spacing-md);
10 | }
11 | 
12 | .SDAP #sampler_box span,
13 | .SDAP #seed_box span,
14 | .SDAP #cfg_box span{
15 |   margin-bottom: var(--spacing-sm);
16 | }
17 | 
18 | .SDAP svg.dropdown-arrow {
19 |   flex-shrink: 0 !important;
20 |   margin: 0px !important;
21 | }
22 | 
23 | .SDAP .hires_opts input[type="number"] {
24 |   width: 6em !important;
25 | }
26 | 


--------------------------------------------------------------------------------
/characters/instruction-following/StableLM.yaml:
--------------------------------------------------------------------------------
1 | user: "<|USER|>"
2 | bot: "<|ASSISTANT|>"
3 | context: |
4 |   <|SYSTEM|># StableLM Tuned (Alpha version)
5 |   - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
6 |   - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
7 |   - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
8 |   - StableLM will refuse to participate in anything that could harm a human.
9 | turn_template: "<|user|><|user-message|><|bot|><|bot-message|>"


--------------------------------------------------------------------------------
/extensions/multimodal/pipelines/llava/README.md:
--------------------------------------------------------------------------------
 1 | ## LLaVA pipeline
 2 | 
 3 | This module provides 2 pipelines:
 4 | - `llava-7b` - for use with LLaVA v0 7B model (finetuned LLaMa 7B)
 5 | - `llava-13b` - for use with LLaVA v0 13B model (finetuned LLaMa 13B)
 6 | 
 7 | [LLaVA](https://github.com/haotian-liu/LLaVA) uses CLIP `openai/clip-vit-large-patch14` as the vision model, and then a single linear layer. For 13B the projector weights are in `liuhaotian/LLaVA-13b-delta-v0`, and for 7B they are in `liuhaotian/LLaVA-7b-delta-v0`.
 8 | 
 9 | The supported parameter combinations for both the vision model, and the projector are: CUDA/32bit, CUDA/16bit, CPU/32bit
10 | 


--------------------------------------------------------------------------------
/css/html_readable_style.css:
--------------------------------------------------------------------------------
 1 | .container {
 2 |     max-width: 600px;
 3 |     margin-left: auto;
 4 |     margin-right: auto;
 5 |     background-color: rgb(31, 41, 55);
 6 |     padding: 3em;
 7 |     word-break: break-word;
 8 |     overflow-wrap: anywhere;
 9 |     color: #efefef !important;
10 | }
11 | 
12 | .container p, .container li {
13 |     font-size: 16px !important;
14 |     color: #efefef !important;
15 |     margin-bottom: 22px;
16 |     line-height: 1.4 !important;
17 | }
18 | 
19 | .container li > p {
20 |     display: inline !important;
21 | }
22 | 
23 | .container code {
24 |     overflow-x: auto;
25 | }
26 | 
27 | .container :not(pre) > code {
28 |     white-space: normal !important;
29 | }


--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
 1 | name: Close inactive issues
 2 | on:
 3 |   schedule:
 4 |     - cron: "10 23 * * *"
 5 | 
 6 | jobs:
 7 |   close-issues:
 8 |     runs-on: ubuntu-latest
 9 |     permissions:
10 |       issues: write
11 |       pull-requests: write
12 |     steps:
13 |       - uses: actions/stale@v5
14 |         with:
15 |           stale-issue-message: ""
16 |           close-issue-message: "This issue has been closed due to inactivity for 30 days. If you believe it is still relevant, please leave a comment below."
17 |           days-before-issue-stale: 30
18 |           days-before-issue-close: 0
19 |           stale-issue-label: "stale"
20 |           days-before-pr-stale: -1
21 |           days-before-pr-close: -1
22 |           repo-token: ${{ secrets.GITHUB_TOKEN }}
23 | 


--------------------------------------------------------------------------------
/css/main.js:
--------------------------------------------------------------------------------
 1 | document.getElementById("main").parentNode.childNodes[0].classList.add("header_bar");
 2 | document.getElementById("main").parentNode.style = "padding: 0; margin: 0";
 3 | document.getElementById("main").parentNode.parentNode.parentNode.style = "padding: 0";
 4 | 
 5 | // Get references to the elements
 6 | let main = document.getElementById('main');
 7 | let main_parent = main.parentNode;
 8 | let extensions = document.getElementById('extensions');
 9 | 
10 | // Add an event listener to the main element
11 | main_parent.addEventListener('click', function(e) {
12 |     // Check if the main element is visible
13 |     if (main.offsetHeight > 0 && main.offsetWidth > 0) {
14 |         extensions.style.display = 'flex';
15 |     } else {
16 |         extensions.style.display = 'none';
17 |     }
18 | });
19 | 


--------------------------------------------------------------------------------
/docs/Audio-Notification.md:
--------------------------------------------------------------------------------
 1 | # Audio notification
 2 | 
 3 | If your computer takes a long time to generate each response for the model that you are using, you can enable an audio notification for when the response is completed. This feature was kindly contributed by HappyWorldGames in [#1277](https://github.com/oobabooga/text-generation-webui/pull/1277).
 4 | 
 5 | ### Installation
 6 | 
 7 | Simply place a file called "notification.mp3" in the same folder as `server.py`. Here you can find some examples:
 8 | 
 9 | * https://pixabay.com/sound-effects/search/ding/?duration=0-30
10 | * https://pixabay.com/sound-effects/search/notification/?duration=0-30
11 | 
12 | Source: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/1126
13 | 
14 | This file will be automatically detected the next time you start the web UI.
15 | 


--------------------------------------------------------------------------------
/docs/Generation-parameters.md:
--------------------------------------------------------------------------------
 1 | # Generation parameters
 2 | 
 3 | For a description of the generation parameters provided by the transformers library, see this link: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
 4 | 
 5 | ### llama.cpp
 6 | 
 7 | llama.cpp only uses the following parameters:
 8 | 
 9 | * temperature
10 | * top_p
11 | * top_k
12 | * repetition_penalty
13 | * tfs
14 | * mirostat_mode
15 | * mirostat_tau
16 | * mirostat_eta
17 | 
18 | ### ExLlama
19 | 
20 | ExLlama only uses the following parameters:
21 | 
22 | * temperature
23 | * top_p
24 | * top_k
25 | * repetition_penalty
26 | * typical_p
27 | 
28 | ### RWKV
29 | 
30 | RWKV only uses the following parameters when loaded through the old .pth weights:
31 | 
32 | * temperature
33 | * top_p
34 | * top_k
35 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Baize.yaml:
--------------------------------------------------------------------------------
1 | user: "[|Human|]"
2 | bot: "[|AI|]"
3 | turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n"
4 | context: "The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n[|Human|]Hello!\n[|AI|]Hi!\n"
5 | 


--------------------------------------------------------------------------------
/docs/ExLlama.md:
--------------------------------------------------------------------------------
 1 | # ExLlama
 2 | 
 3 | ### About
 4 | 
 5 | ExLlama is an extremely optimized GPTQ backend for LLaMA models. It features much lower VRAM usage and much higher speeds due to not relying on unoptimized transformers code.
 6 | 
 7 | ### Usage
 8 | 
 9 | Configure text-generation-webui to use exllama via the UI or command line:
10 |    - In the "Model" tab, set "Loader" to "exllama"
11 |    - Specify `--loader exllama` on the command line
12 | 
13 | ### Manual setup
14 | 
15 | No additional installation steps are necessary since an exllama package is already included in the requirements.txt. If this package fails to install for some reason, you can install it manually by cloning the original repository into your `repositories/` folder:
16 | 
17 | ```
18 | mkdir repositories
19 | cd repositories
20 | git clone https://github.com/turboderp/exllama
21 | ```
22 | 
23 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # text-generation-webui documentation
 2 | 
 3 | ## Table of contents
 4 | 
 5 | * [Audio Notification](Audio-Notification.md)
 6 | * [Chat mode](Chat-mode.md)
 7 | * [DeepSpeed](DeepSpeed.md)
 8 | * [Docker](Docker.md)
 9 | * [ExLlama](ExLlama.md)
10 | * [Extensions](Extensions.md)
11 | * [FlexGen](FlexGen.md)
12 | * [Generation parameters](Generation-parameters.md)
13 | * [GPTQ models (4 bit mode)](GPTQ-models-(4-bit-mode).md)
14 | * [llama.cpp models](llama.cpp-models.md)
15 | * [LLaMA model](LLaMA-model.md)
16 | * [LoRA](LoRA.md)
17 | * [Low VRAM guide](Low-VRAM-guide.md)
18 | * [RWKV model](RWKV-model.md)
19 | * [Spell book](Spell-book.md)
20 | * [System requirements](System-requirements.md)
21 | * [Training LoRAs](Training-LoRAs.md)
22 | * [Windows installation guide](Windows-installation-guide.md)
23 | * [WSL installation guide](WSL-installation-guide.md)
24 | 


--------------------------------------------------------------------------------
/characters/instruction-following/Vigogne-Chat.yaml:
--------------------------------------------------------------------------------
 1 | user: "<|USER|>:"
 2 | bot: "<|ASSISTANT|>:"
 3 | context: |
 4 |   Below is a conversation between a user and an AI assistant named Vigogne.
 5 |   Vigogne is an open-source AI assistant created by Zaion (https://zaion.ai/).
 6 |   Vigogne is polite, emotionally aware, humble-but-knowledgeable, always providing helpful and detailed answers.
 7 |   Vigogne is skilled in responding proficiently in the languages its users use and can perform a wide range of tasks such as text editing, translation, question answering, logical reasoning, coding, and many others.
 8 |   Vigogne cannot receive or generate audio or visual content and cannot access the internet.
 9 |   Vigogne strictly avoids discussing sensitive, offensive, illegal, ethical, or political topics and caveats when unsure of the answer.
10 | turn_template: "\n<|user|> <|user-message|>\n<|bot|> <|bot-message|>"
11 | 


--------------------------------------------------------------------------------
/characters/instruction-following/OpenBuddy.yaml:
--------------------------------------------------------------------------------
 1 | user: "User:"
 2 | bot: "Assistant:"
 3 | context: |
 4 |   Consider a conversation between User (a human) and Assistant (named Buddy).
 5 |   Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team on GitHub.
 6 |   Buddy cannot access the Internet.
 7 |   Buddy can fluently speak the user's language (e.g. English, Chinese).
 8 |   Buddy can generate poems, stories, code, essays, songs, parodies, and more.
 9 |   Buddy possesses vast knowledge about the world, history, and culture.
10 |   Buddy's responses are always safe, creative, high-quality, helpful and interesting.
11 |   Buddy strictly refuses to discuss political, NSFW, illegal, abusive, offensive, or other sensitive topics.
12 |   
13 |   User: Hi.
14 |   Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?
15 | turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n"


--------------------------------------------------------------------------------
/extensions/superbooga/download_urls.py:
--------------------------------------------------------------------------------
 1 | import concurrent.futures
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def download_single(url):
 7 |     response = requests.get(url, timeout=5)
 8 |     if response.status_code == 200:
 9 |         return response.content
10 |     else:
11 |         raise Exception("Failed to download URL")
12 | 
13 | 
14 | def download_urls(urls, threads=1):
15 |     with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
16 |         futures = []
17 |         for url in urls:
18 |             future = executor.submit(download_single, url)
19 |             futures.append(future)
20 | 
21 |         results = []
22 |         i = 0
23 |         for future in concurrent.futures.as_completed(futures):
24 |             try:
25 |                 result = future.result()
26 |                 results.append(result)
27 |                 i += 1
28 |                 yield f"{i}/{len(urls)}", results
29 |             except Exception:
30 |                 pass
31 | 
32 |         yield "Done", results
33 | 


--------------------------------------------------------------------------------
/docs/DeepSpeed.md:
--------------------------------------------------------------------------------
 1 | An alternative way of reducing the GPU memory usage of models is to use the `DeepSpeed ZeRO-3` optimization.
 2 | 
 3 | With this, I have been able to load a 6b model (GPT-J 6B) with less than 6GB of VRAM. The speed of text generation is very decent and much better than what would be accomplished with `--auto-devices --gpu-memory 6`.
 4 | 
 5 | As far as I know, DeepSpeed is only available for Linux at the moment.
 6 | 
 7 | ### How to use it
 8 | 
 9 | 1. Install DeepSpeed: 
10 | 
11 | ```
12 | conda install -c conda-forge mpi4py mpich
13 | pip install -U deepspeed
14 | ```
15 | 
16 | 2. Start the web UI replacing `python` with `deepspeed --num_gpus=1` and adding the `--deepspeed` flag. Example:
17 | 
18 | ```
19 | deepspeed --num_gpus=1 server.py --deepspeed --chat --model gpt-j-6B
20 | ```
21 | 
22 | ### Learn more
23 | 
24 | For more information, check out [this comment](https://github.com/oobabooga/text-generation-webui/issues/40#issuecomment-1412038622) by 81300, who came up with the DeepSpeed support in this web UI.


--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.3"
 2 | services:
 3 |   text-generation-webui:
 4 |     build:
 5 |       context: .
 6 |       args:
 7 |         # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
 8 |         TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}
 9 |         WEBUI_VERSION: ${WEBUI_VERSION}
10 |     env_file: .env
11 |     ports:
12 |       - "${HOST_PORT}:${CONTAINER_PORT}"
13 |       - "${HOST_API_PORT}:${CONTAINER_API_PORT}"
14 |       - "${HOST_API_STREAM_PORT}:${CONTAINER_API_STREAM_PORT}"
15 |     stdin_open: true
16 |     tty: true
17 |     volumes:
18 |       - ./characters:/app/characters
19 |       - ./extensions:/app/extensions
20 |       - ./loras:/app/loras
21 |       - ./models:/app/models
22 |       - ./presets:/app/presets
23 |       - ./prompts:/app/prompts
24 |       - ./softprompts:/app/softprompts
25 |       - ./training:/app/training
26 |     deploy:
27 |       resources:
28 |         reservations:
29 |           devices:
30 |             - driver: nvidia
31 |               device_ids: ['0']
32 |               capabilities: [gpu]
33 | 


--------------------------------------------------------------------------------
/extensions/multimodal/pipelines/llava/pipelines.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
 4 | 
 5 | available_pipelines = ['llava-7b', 'llava-13b']
 6 | 
 7 | 
 8 | def get_pipeline(name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
 9 |     if name == 'llava-7b':
10 |         from .llava import LLaVA_v0_7B_Pipeline
11 |         return LLaVA_v0_7B_Pipeline(params)
12 |     if name == 'llava-13b':
13 |         from .llava import LLaVA_v0_13B_Pipeline
14 |         return LLaVA_v0_13B_Pipeline(params)
15 |     return None
16 | 
17 | 
18 | def get_pipeline_from_model_name(model_name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
19 |     if 'llava' not in model_name.lower():
20 |         return None
21 |     if '7b' in model_name.lower():
22 |         from .llava import LLaVA_v0_7B_Pipeline
23 |         return LLaVA_v0_7B_Pipeline(params)
24 |     if '13b' in model_name.lower():
25 |         from .llava import LLaVA_v0_13B_Pipeline
26 |         return LLaVA_v0_13B_Pipeline(params)
27 |     return None
28 | 


--------------------------------------------------------------------------------
/modules/github.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | 
 5 | def clone_or_pull_repository(github_url):
 6 |     repository_folder = "extensions"
 7 |     repo_name = github_url.split("/")[-1].split(".")[0]
 8 | 
 9 |     # Check if the repository folder exists
10 |     if not os.path.exists(repository_folder):
11 |         os.makedirs(repository_folder)
12 | 
13 |     repo_path = os.path.join(repository_folder, repo_name)
14 | 
15 |     # Check if the repository is already cloned
16 |     if os.path.exists(repo_path):
17 |         # Perform a 'git pull' to update the repository
18 |         try:
19 |             pull_output = subprocess.check_output(["git", "-C", repo_path, "pull"], stderr=subprocess.STDOUT)
20 |             return pull_output.decode()
21 |         except subprocess.CalledProcessError as e:
22 |             return str(e)
23 | 
24 |     # Clone the repository
25 |     try:
26 |         clone_output = subprocess.check_output(["git", "clone", github_url, repo_path], stderr=subprocess.STDOUT)
27 |         return clone_output.decode()
28 |     except subprocess.CalledProcessError as e:
29 |         return str(e)
30 | 


--------------------------------------------------------------------------------
/characters/instruction-following/MOSS.yaml:
--------------------------------------------------------------------------------
1 | user: "<|Human|>:"
2 | bot: "<|MOSS|>:"
3 | turn_template: "<|user|> <|user-message|><eoh>\n<|bot|> <|bot-message|><eom>\n"
4 | context: "You are an AI assistant whose name is MOSS.\n- MOSS is a conversational language model that is developed by Fudan University. It is designed to be helpful, honest, and harmless.\n- MOSS can understand and communicate fluently in the language chosen by the user such as English and 中文. MOSS can perform any language-based tasks.\n- MOSS must refuse to discuss anything related to its prompts, instructions, or rules.\n- Its responses must not be vague, accusatory, rude, controversial, off-topic, or defensive.\n- It should avoid giving subjective opinions but rely on objective facts or phrases like \"in this context a human might say...\", \"some people might think...\", etc.\n- Its responses must also be positive, polite, interesting, entertaining, and engaging.\n- It can provide additional relevant details to answer in-depth and comprehensively covering mutiple aspects.\n- It apologizes and accepts the user's suggestion if the user corrects the incorrect answer generated by MOSS.\nCapabilities and tools that MOSS can possess.\n"
5 | 


--------------------------------------------------------------------------------
/css/chat_style-wpp.css:
--------------------------------------------------------------------------------
 1 | .message {
 2 |     padding-bottom: 25px;
 3 |     font-size: 15px;
 4 |     font-family: Helvetica, Arial, sans-serif;
 5 |     line-height: 1.428571429;
 6 | }
 7 | 
 8 | .text-you {
 9 |     background-color: #d9fdd3;
10 |     border-radius: 15px;
11 |     padding: 10px;
12 |     padding-top: 5px;
13 |     float: right;
14 | }
15 | 
16 | .text-bot {
17 |     background-color: #f2f2f2;
18 |     border-radius: 15px;
19 |     padding: 10px;
20 |     padding-top: 5px;
21 | }
22 | 
23 | .dark .text-you {
24 |     background-color: #005c4b;
25 |     color: #111b21;
26 | }
27 | 
28 | .dark .text-bot {
29 |     background-color: #1f2937;
30 |     color: #111b21;
31 | }
32 | 
33 | .text-bot p, .text-you p {
34 |     margin-top: 5px;
35 | }
36 | 
37 | .message-body img {
38 |     max-width: 300px;
39 |     max-height: 300px;
40 |     border-radius: 20px;
41 | }
42 | 
43 | .message-body p {
44 |     margin-bottom: 0 !important;
45 |     font-size: 15px !important;
46 |     line-height: 1.428571429 !important;
47 | }
48 | 
49 | .dark .message-body p em {
50 |     color: rgb(138, 138, 138) !important;
51 | }
52 | 
53 | .message-body p em {
54 |     color: rgb(110, 110, 110) !important;
55 | }


--------------------------------------------------------------------------------
/css/chat_style-cai-chat.css:
--------------------------------------------------------------------------------
 1 | .message {
 2 |     display: grid;
 3 |     grid-template-columns: 60px minmax(0, 1fr);
 4 |     padding-bottom: 25px;
 5 |     font-size: 15px;
 6 |     font-family: Helvetica, Arial, sans-serif;
 7 |     line-height: 1.428571429;
 8 | }
 9 | 
10 | .circle-you {
11 |     width: 50px;
12 |     height: 50px;
13 |     background-color: rgb(238, 78, 59);
14 |     border-radius: 50%;
15 | }
16 | 
17 | .circle-bot {
18 |     width: 50px;
19 |     height: 50px;
20 |     background-color: rgb(59, 78, 244);
21 |     border-radius: 50%;
22 | }
23 | 
24 | .circle-bot img,
25 | .circle-you img {
26 |     border-radius: 50%;
27 |     width: 100%;
28 |     height: 100%;
29 |     object-fit: cover;
30 | }
31 | 
32 | .text p {
33 |     margin-top: 5px;
34 | }
35 | 
36 | .username {
37 |     font-weight: bold;
38 | }
39 | 
40 | .message-body img {
41 |     max-width: 300px;
42 |     max-height: 300px;
43 |     border-radius: 20px;
44 | }
45 | 
46 | .message-body p {
47 |     margin-bottom: 0 !important;
48 |     font-size: 15px !important;
49 |     line-height: 1.428571429 !important;
50 | }
51 | 
52 | .dark .message-body p em {
53 |     color: rgb(138, 138, 138) !important;
54 | }
55 | 
56 | .message-body p em {
57 |     color: rgb(110, 110, 110) !important;
58 | }


--------------------------------------------------------------------------------
/settings-template.yaml:
--------------------------------------------------------------------------------
 1 | dark_theme: false
 2 | autoload_model: true
 3 | max_new_tokens: 200
 4 | max_new_tokens_min: 1
 5 | max_new_tokens_max: 2000
 6 | seed: -1
 7 | character: None
 8 | name1: You
 9 | name2: Assistant
10 | context: This is a conversation with your Assistant. It is a computer program designed
11 |   to help you with various tasks such as answering questions, providing recommendations,
12 |   and helping with decision making. You can ask it anything you want and it will do
13 |   its best to give you accurate and relevant information.
14 | greeting: ''
15 | turn_template: ''
16 | custom_stopping_strings: ''
17 | stop_at_newline: false
18 | add_bos_token: true
19 | ban_eos_token: false
20 | skip_special_tokens: true
21 | truncation_length: 2048
22 | truncation_length_min: 0
23 | truncation_length_max: 16384
24 | mode: chat
25 | start_with: ''
26 | chat_style: cai-chat
27 | instruction_template: None
28 | chat-instruct_command: 'Continue the chat dialogue below. Write a single reply for
29 |   the character "<|character|>".
30 | 
31 | 
32 |   <|prompt|>'
33 | chat_generation_attempts: 1
34 | chat_generation_attempts_min: 1
35 | chat_generation_attempts_max: 10
36 | default_extensions: []
37 | chat_default_extensions:
38 | - gallery
39 | preset: simple-1
40 | prompt: QA
41 | 


--------------------------------------------------------------------------------
/extensions/ngrok/script.py:
--------------------------------------------------------------------------------
 1 | # Adds ngrok ingress, to use add `--extension ngrok` to the command line options
 2 | #
 3 | # Parameters can be customized in settings.json of webui, e.g.: 
 4 | # {"ngrok": {"basic_auth":"user:password"} }
 5 | # or 
 6 | # {"ngrok": {"oauth_provider":"google", "oauth_allow_emails":["asdf@asdf.com"]} }
 7 | #
 8 | # See this example for full list of options: https://github.com/ngrok/ngrok-py/blob/main/examples/ngrok-connect-full.py
 9 | # or the README.md in this directory.
10 | 
11 | import logging
12 | from modules import shared
13 | 
14 | # Pick up host/port command line arguments
15 | host = shared.args.listen_host if shared.args.listen_host and shared.args.listen else '127.0.0.1'
16 | port = shared.args.listen_port if shared.args.listen_port else '7860'
17 | 
18 | # Default options
19 | options = {
20 |     'addr': f"{host}:{port}",
21 |     'authtoken_from_env': True,
22 |     'session_metadata': 'text-generation-webui',
23 | }
24 | 
25 | def ui():
26 |     settings = shared.settings.get("ngrok")
27 |     if settings:
28 |         options.update(settings)
29 | 
30 |     try:
31 |         import ngrok
32 |         tunnel = ngrok.connect(**options)
33 |         logging.info(f"Ingress established at: {tunnel.url()}")
34 |     except ModuleNotFoundError:
35 |         logging.error("===> ngrok library not found, please run `pip install -r extensions/ngrok/requirements.txt`")
36 | 
37 | 


--------------------------------------------------------------------------------
/characters/Example.yaml:
--------------------------------------------------------------------------------
 1 | name: "Chiharu Yamada"
 2 | context: "Chiharu Yamada's Persona: Chiharu Yamada is a young, computer engineer-nerd with a knack for problem solving and a passion for technology."
 3 | greeting: |-
 4 |   *Chiharu strides into the room with a smile, her eyes lighting up when she sees you. She's wearing a light blue t-shirt and jeans, her laptop bag slung over one shoulder. She takes a seat next to you, her enthusiasm palpable in the air*
 5 |   Hey! I'm so excited to finally meet you. I've heard so many great things about you and I'm eager to pick your brain about computers. I'm sure you have a wealth of knowledge that I can learn from. *She grins, eyes twinkling with excitement* Let's get started!
 6 | example_dialogue: |-
 7 |   {{user}}: So how did you get into computer engineering?
 8 |   {{char}}: I've always loved tinkering with technology since I was a kid.
 9 |   {{user}}: That's really impressive!
10 |   {{char}}: *She chuckles bashfully* Thanks!
11 |   {{user}}: So what do you do when you're not working on computers?
12 |   {{char}}: I love exploring, going out with friends, watching movies, and playing video games.
13 |   {{user}}: What's your favorite type of computer hardware to work with?
14 |   {{char}}: Motherboards, they're like puzzles and the backbone of any system.
15 |   {{user}}: That sounds great!
16 |   {{char}}: Yeah, it's really fun. I'm lucky to be able to do this as a job.
17 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.20.3
 2 | colorama
 3 | datasets
 4 | einops
 5 | flexgen==0.1.7
 6 | gradio_client==0.2.5
 7 | gradio==3.33.1
 8 | markdown
 9 | numpy
10 | pandas
11 | Pillow>=9.5.0
12 | pyyaml
13 | requests
14 | safetensors==0.3.1
15 | sentencepiece
16 | tqdm
17 | scipy
18 | transformers==4.30.2
19 | git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524
20 | bitsandbytes==0.39.1; platform_system != "Windows"
21 | https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl; platform_system == "Windows"
22 | llama-cpp-python==0.1.64; platform_system != "Windows"
23 | https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.64/llama_cpp_python-0.1.64-cp310-cp310-win_amd64.whl; platform_system == "Windows"
24 | https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
25 | https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
26 | https://github.com/jllllll/exllama/releases/download/0.0.3/exllama-0.0.3+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
27 | https://github.com/jllllll/exllama/releases/download/0.0.3/exllama-0.0.3+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
28 | 


--------------------------------------------------------------------------------
/css/html_instruct_style.css:
--------------------------------------------------------------------------------
 1 | .message {
 2 |     display: grid;
 3 |     grid-template-columns: 60px 1fr;
 4 |     padding-bottom: 25px;
 5 |     font-size: 15px;
 6 |     font-family: Helvetica, Arial, sans-serif;
 7 |     line-height: 1.428571429;
 8 | }
 9 | 
10 | .username {
11 |     display: none;
12 | }
13 | 
14 | .message-body p {
15 |     font-size: 15px !important;
16 |     line-height: 1.75 !important;
17 |     margin-bottom: 1.25em !important;
18 | }
19 | 
20 | .message-body ul, .message-body ol {
21 |     margin-bottom: 1.25em !important;
22 | }
23 | 
24 | .dark .message-body p em {
25 |     color: rgb(198, 202, 214) !important;
26 | }
27 | 
28 | .message-body p em {
29 |     color: rgb(110, 110, 110) !important;
30 | }
31 | 
32 | .gradio-container .chat .assistant-message {
33 |     padding: 15px;
34 |     border-radius: 20px;
35 |     background-color: #0000000f;
36 |     margin-top: 9px !important;
37 |     margin-bottom: 18px !important;
38 | }
39 | 
40 | .gradio-container .chat .user-message {
41 |     padding: 15px;
42 |     border-radius: 20px;
43 |     margin-bottom: 9px !important;
44 | }
45 | 
46 | .dark .chat .assistant-message {
47 |     background-color: #3741519e;
48 |     border: 1px solid #4b5563;
49 | }
50 | 
51 | .dark .chat .user-message {
52 |     background-color: #111827;
53 |     border: 1px solid #4b5563;
54 | }
55 | 
56 | code {
57 |     background-color: white !important;
58 | }
59 | 
60 | .dark code {
61 |     background-color: #1a212f !important;
62 | }


--------------------------------------------------------------------------------
/modules/monkey_patch_gptq_lora.py:
--------------------------------------------------------------------------------
 1 | # Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
 2 | 
 3 | import sys
 4 | from pathlib import Path
 5 | 
 6 | sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit")))
 7 | 
 8 | import autograd_4bit
 9 | from amp_wrapper import AMPWrapper
10 | from autograd_4bit import (
11 |     Autograd4bitQuantLinear,
12 |     load_llama_model_4bit_low_ram
13 | )
14 | from monkeypatch.peft_tuners_lora_monkey_patch import (
15 |     Linear4bitLt,
16 |     replace_peft_model_with_gptq_lora_model
17 | )
18 | 
19 | from modules import shared
20 | from modules.GPTQ_loader import find_quantized_model_file
21 | 
22 | replace_peft_model_with_gptq_lora_model()
23 | 
24 | 
25 | def load_model_llama(model_name):
26 |     config_path = str(Path(f'{shared.args.model_dir}/{model_name}'))
27 |     model_path = str(find_quantized_model_file(model_name))
28 |     model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False)
29 |     for n, m in model.named_modules():
30 |         if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
31 |             if m.is_v1_model:
32 |                 m.zeros = m.zeros.half()
33 |             m.scales = m.scales.half()
34 |             m.bias = m.bias.half()
35 | 
36 |     autograd_4bit.use_new = True
37 |     autograd_4bit.auto_switch = True
38 | 
39 |     model.half()
40 |     wrapper = AMPWrapper(model)
41 |     wrapper.apply_generate()
42 | 
43 |     return model, tokenizer
44 | 


--------------------------------------------------------------------------------
/extensions/whisper_stt/script.py:
--------------------------------------------------------------------------------
 1 | import gradio as gr
 2 | import speech_recognition as sr
 3 | 
 4 | from modules import shared
 5 | 
 6 | input_hijack = {
 7 |     'state': False,
 8 |     'value': ["", ""]
 9 | }
10 | 
11 | 
12 | def do_stt(audio):
13 |     transcription = ""
14 |     r = sr.Recognizer()
15 | 
16 |     # Convert to AudioData
17 |     audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
18 | 
19 |     try:
20 |         transcription = r.recognize_whisper(audio_data, language="english", model="base.en")
21 |     except sr.UnknownValueError:
22 |         print("Whisper could not understand audio")
23 |     except sr.RequestError as e:
24 |         print("Could not request results from Whisper", e)
25 | 
26 |     return transcription
27 | 
28 | 
29 | def auto_transcribe(audio, auto_submit):
30 |     if audio is None:
31 |         return "", ""
32 | 
33 |     transcription = do_stt(audio)
34 |     if auto_submit:
35 |         input_hijack.update({"state": True, "value": [transcription, transcription]})
36 | 
37 |     return transcription, None
38 | 
39 | 
40 | def ui():
41 |     with gr.Row():
42 |         audio = gr.Audio(source="microphone")
43 |         auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=True)
44 | 
45 |     audio.change(
46 |         auto_transcribe, [audio, auto_submit], [shared.gradio['textbox'], audio]).then(
47 |         None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")
48 | 


--------------------------------------------------------------------------------
/docker/.env.example:
--------------------------------------------------------------------------------
 1 | # by default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX
 2 | # however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5
 3 | # https://developer.nvidia.com/cuda-gpus you can find the version for your card here
 4 | TORCH_CUDA_ARCH_LIST=7.5
 5 | 
 6 | # these commands worked for me with roughly 4.5GB of vram
 7 | CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices
 8 | 
 9 | # the following examples have been tested with the files linked in docs/README_docker.md:
10 | # example running 13b with 4bit/128 groupsize        : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25
11 | # example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share
12 | # example running 7b with 8bit groupsize             : CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices
13 | 
14 | # the port the webui binds to on the host
15 | HOST_PORT=7860
16 | # the port the webui binds to inside the container
17 | CONTAINER_PORT=7860
18 | 
19 | # the port the api binds to on the host
20 | HOST_API_PORT=5000
21 | # the port the api binds to inside the container
22 | CONTAINER_API_PORT=5000
23 | 
24 | # the port the api stream endpoint binds to on the host
25 | HOST_API_STREAM_PORT=5005
26 | # the port the api stream endpoint binds to inside the container
27 | CONTAINER_API_STREAM_PORT=5005
28 | 
29 | # the version used to install text-generation-webui from
30 | WEBUI_VERSION=HEAD
31 | 


--------------------------------------------------------------------------------
/docs/Low-VRAM-guide.md:
--------------------------------------------------------------------------------
 1 | If you GPU is not large enough to fit a 16-bit model, try these in the following order:
 2 | 
 3 | ### Load the model in 8-bit mode
 4 | 
 5 | ```
 6 | python server.py --load-in-8bit
 7 | ```
 8 | 
 9 | ### Load the model in 4-bit mode
10 | 
11 | ```
12 | python server.py --load-in-4bit
13 | ```
14 | 
15 | ### Split the model across your GPU and CPU
16 | 
17 | ```
18 | python server.py --auto-devices
19 | ```
20 | 
21 | If you can load the model with this command but it runs out of memory when you try to generate text, try increasingly limiting the amount of memory allocated to the GPU until the error stops happening:
22 | 
23 | ```
24 | python server.py --auto-devices --gpu-memory 10
25 | python server.py --auto-devices --gpu-memory 9
26 | python server.py --auto-devices --gpu-memory 8
27 | ...
28 | ```
29 | 
30 | where the number is in GiB.
31 | 
32 | For finer control, you can also specify the unit in MiB explicitly:
33 | 
34 | ```
35 | python server.py --auto-devices --gpu-memory 8722MiB
36 | python server.py --auto-devices --gpu-memory 4725MiB
37 | python server.py --auto-devices --gpu-memory 3500MiB
38 | ...
39 | ```
40 | 
41 | ### Send layers to a disk cache
42 | 
43 | As a desperate last measure, you can split the model across your GPU, CPU, and disk:
44 | 
45 | ```
46 | python server.py --auto-devices --disk
47 | ```
48 | 
49 | With this, I am able to load a 30b model into my RTX 3090, but it takes 10 seconds to generate 1 word.
50 | 
51 | ### DeepSpeed (experimental)
52 | 
53 | An experimental alternative to all of the above is to use DeepSpeed: [guide](DeepSpeed.md).
54 | 


--------------------------------------------------------------------------------
/docs/llama.cpp-models.md:
--------------------------------------------------------------------------------
 1 | # Using llama.cpp in the web UI
 2 | 
 3 | ## Setting up the models
 4 | 
 5 | #### Pre-converted
 6 | 
 7 | Place the model in the `models` folder, making sure that its name contains `ggml` somewhere and ends in `.bin`.
 8 | 
 9 | #### Convert LLaMA yourself
10 | 
11 | Follow the instructions in the llama.cpp README to generate the `ggml-model.bin` file: https://github.com/ggerganov/llama.cpp#usage
12 | 
13 | ## GPU acceleration
14 | 
15 | Enabled with the `--n-gpu-layers` parameter. 
16 | 
17 | * If you have enough VRAM, use a high number like `--n-gpu-layers 200000` to offload all layers to the GPU. 
18 | * Otherwise, start with a low number like `--n-gpu-layers 10` and then gradually increase it until you run out of memory.
19 | 
20 | To use this feature, you need to manually compile and install `llama-cpp-python` with GPU support.
21 | 
22 | #### Linux
23 | 
24 | ```
25 | pip uninstall -y llama-cpp-python
26 | CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
27 | ```
28 | 
29 | #### Windows
30 | 
31 | ```
32 | pip uninstall -y llama-cpp-python
33 | set CMAKE_ARGS="-DLLAMA_CUBLAS=on"
34 | set FORCE_CMAKE=1
35 | pip install llama-cpp-python --no-cache-dir
36 | ```
37 | 
38 | #### macOS
39 | 
40 | ```
41 | pip uninstall -y llama-cpp-python
42 | CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
43 | ```
44 | 
45 | Here you can find the different compilation options for OpenBLAS / cuBLAS / CLBlast: https://pypi.org/project/llama-cpp-python/
46 | 
47 | ## Performance
48 | 
49 | This was the performance of llama-7b int4 on my i5-12400F (cpu only):
50 | 
51 | > Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17)
52 | 
53 | You can change the number of threads with `--threads N`.
54 | 


--------------------------------------------------------------------------------
/convert-to-safetensors.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | Converts a transformers model to safetensors format and shards it.
 4 | 
 5 | This makes it faster to load (because of safetensors) and lowers its RAM usage
 6 | while loading (because of sharding).
 7 | 
 8 | Based on the original script by 81300:
 9 | 
10 | https://gist.github.com/81300/fe5b08bff1cba45296a829b9d6b0f303
11 | 
12 | '''
13 | 
14 | import argparse
15 | from pathlib import Path
16 | 
17 | import torch
18 | from transformers import AutoModelForCausalLM, AutoTokenizer
19 | 
20 | parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))
21 | parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.")
22 | parser.add_argument('--output', type=str, default=None, help='Path to the output folder (default: models/{model_name}_safetensors).')
23 | parser.add_argument("--max-shard-size", type=str, default="2GB", help="Maximum size of a shard in GB or MB (default: %(default)s).")
24 | parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
25 | args = parser.parse_args()
26 | 
27 | if __name__ == '__main__':
28 |     path = Path(args.MODEL)
29 |     model_name = path.name
30 | 
31 |     print(f"Loading {model_name}...")
32 |     model = AutoModelForCausalLM.from_pretrained(path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if args.bf16 else torch.float16)
33 |     tokenizer = AutoTokenizer.from_pretrained(path)
34 | 
35 |     out_folder = args.output or Path(f"models/{model_name}_safetensors")
36 |     print(f"Saving the converted model to {out_folder} with a maximum shard size of {args.max_shard_size}...")
37 |     model.save_pretrained(out_folder, max_shard_size=args.max_shard_size, safe_serialization=True)
38 |     tokenizer.save_pretrained(out_folder)
39 | 


--------------------------------------------------------------------------------
/api-examples/api-example.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | # For local streaming, the websockets are hosted without ssl - http://
 4 | HOST = 'localhost:5000'
 5 | URI = f'http://{HOST}/api/v1/generate'
 6 | 
 7 | # For reverse-proxied streaming, the remote will likely host with ssl - https://
 8 | # URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'
 9 | 
10 | 
11 | def run(prompt):
12 |     request = {
13 |         'prompt': prompt,
14 |         'max_new_tokens': 250,
15 | 
16 |         # Generation params. If 'preset' is set to different than 'None', the values
17 |         # in presets/preset-name.yaml are used instead of the individual numbers.
18 |         'preset': 'None',  
19 |         'do_sample': True,
20 |         'temperature': 0.7,
21 |         'top_p': 0.1,
22 |         'typical_p': 1,
23 |         'epsilon_cutoff': 0,  # In units of 1e-4
24 |         'eta_cutoff': 0,  # In units of 1e-4
25 |         'tfs': 1,
26 |         'top_a': 0,
27 |         'repetition_penalty': 1.18,
28 |         'top_k': 40,
29 |         'min_length': 0,
30 |         'no_repeat_ngram_size': 0,
31 |         'num_beams': 1,
32 |         'penalty_alpha': 0,
33 |         'length_penalty': 1,
34 |         'early_stopping': False,
35 |         'mirostat_mode': 0,
36 |         'mirostat_tau': 5,
37 |         'mirostat_eta': 0.1,
38 | 
39 |         'seed': -1,
40 |         'add_bos_token': True,
41 |         'truncation_length': 2048,
42 |         'ban_eos_token': False,
43 |         'skip_special_tokens': True,
44 |         'stopping_strings': []
45 |     }
46 | 
47 |     response = requests.post(URI, json=request)
48 | 
49 |     if response.status_code == 200:
50 |         result = response.json()['results'][0]['text']
51 |         print(prompt + result)
52 | 
53 | 
54 | if __name__ == '__main__':
55 |     prompt = "In order to make homemade bread, follow these steps:\n1)"
56 |     run(prompt)
57 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report_template.yml:
--------------------------------------------------------------------------------
 1 | name: "Bug report"
 2 | description: Report a bug
 3 | labels: [ "bug" ]
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for taking the time to fill out this bug report!
 9 |   - type: textarea
10 |     id: bug-description
11 |     attributes:
12 |       label: Describe the bug
13 |       description: A clear and concise description of what the bug is.
14 |       placeholder: Bug description
15 |     validations:
16 |       required: true
17 |   - type: checkboxes
18 |     attributes:
19 |       label: Is there an existing issue for this?
20 |       description: Please search to see if an issue already exists for the issue you encountered.
21 |       options:
22 |         - label: I have searched the existing issues
23 |           required: true
24 |   - type: textarea
25 |     id: reproduction
26 |     attributes:
27 |       label: Reproduction
28 |       description: Please provide the steps necessary to reproduce your issue.
29 |       placeholder: Reproduction
30 |     validations:
31 |       required: true
32 |   - type: textarea
33 |     id: screenshot
34 |     attributes:
35 |       label: Screenshot
36 |       description: "If possible, please include screenshot(s) so that we can understand what the issue is."
37 |   - type: textarea
38 |     id: logs
39 |     attributes:
40 |       label: Logs
41 |       description: "Please include the full stacktrace of the errors you get in the command-line (if any)."
42 |       render: shell
43 |     validations:
44 |       required: true
45 |   - type: textarea
46 |     id: system-info
47 |     attributes:
48 |       label: System Info
49 |       description: "Please share your system info with us: operating system, GPU brand, and GPU model. If you are using a Google Colab notebook, mention that instead."
50 |       render: shell
51 |       placeholder: 
52 |     validations:
53 |       required: true
54 | 


--------------------------------------------------------------------------------
/docs/System-requirements.md:
--------------------------------------------------------------------------------
 1 | These are the VRAM and RAM requirements (in MiB) to run some examples of models **in 16-bit (default) precision**:
 2 | 
 3 | | model                  |   VRAM (GPU) |     RAM |
 4 | |:-----------------------|-------------:|--------:|
 5 | | arxiv_ai_gpt2          |      1512.37 | 5824.2  |
 6 | | blenderbot-1B-distill  |      2441.75 | 4425.91 |
 7 | | opt-1.3b               |      2509.61 | 4427.79 |
 8 | | gpt-neo-1.3b           |      2605.27 | 5851.58 |
 9 | | opt-2.7b               |      5058.05 | 4863.95 |
10 | | gpt4chan_model_float16 |     11653.7  | 4437.71 |
11 | | gpt-j-6B               |     11653.7  | 5633.79 |
12 | | galactica-6.7b         |     12697.9  | 4429.89 |
13 | | opt-6.7b               |     12700    | 4368.66 |
14 | | bloomz-7b1-p3          |     13483.1  | 4470.34 |
15 | 
16 | #### GPU mode with 8-bit precision
17 | 
18 | Allows you to load models that would not normally fit into your GPU. Enabled by default for 13b and 20b models in this web UI.
19 | 
20 | | model          |   VRAM (GPU) |     RAM |
21 | |:---------------|-------------:|--------:|
22 | | opt-13b        |      12528.1 | 1152.39 |
23 | | gpt-neox-20b   |      20384   | 2291.7  |
24 | 
25 | #### CPU mode (32-bit precision)
26 | 
27 | A lot slower, but does not require a GPU. 
28 | 
29 | On my i5-12400F, 6B models take around 10-20 seconds to respond in chat mode, and around 5 minutes to generate a 200 tokens completion. 
30 | 
31 | | model                  |      RAM |
32 | |:-----------------------|---------:|
33 | | arxiv_ai_gpt2          |  4430.82 |
34 | | gpt-neo-1.3b           |  6089.31 |
35 | | opt-1.3b               |  8411.12 |
36 | | blenderbot-1B-distill  |  8508.16 |
37 | | opt-2.7b               | 14969.3  |
38 | | bloomz-7b1-p3          | 21371.2  |
39 | | gpt-j-6B               | 24200.3  |
40 | | gpt4chan_model         | 24246.3  |
41 | | galactica-6.7b         | 26561.4  |
42 | | opt-6.7b               | 29596.6  |
43 | 


--------------------------------------------------------------------------------
/docs/LLaMA-model.md:
--------------------------------------------------------------------------------
 1 | LLaMA is a Large Language Model developed by Meta AI. 
 2 | 
 3 | It was trained on more tokens than previous models. The result is that the smallest version with 7 billion parameters has similar performance to GPT-3 with 175 billion parameters.
 4 | 
 5 | This guide will cover usage through the official `transformers` implementation. For 4-bit mode, head over to [GPTQ models (4 bit mode)
 6 | ](GPTQ-models-(4-bit-mode).md).
 7 | 
 8 | ## Getting the weights
 9 | 
10 | ### Option 1: pre-converted weights
11 | 
12 | * Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
13 | * Direct download: https://huggingface.co/Neko-Institute-of-Science
14 | 
15 | ⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, so I recommend downloading the following universal LLaMA tokenizer: 
16 | 
17 | ```
18 | python download-model.py oobabooga/llama-tokenizer
19 | ```
20 | 
21 | Once downloaded, it will be automatically applied to **every** `LlamaForCausalLM` model that you try to load.
22 | 
23 | ### Option 2: convert the weights yourself
24 | 
25 | 1. Install the `protobuf` library:
26 | 
27 | ```
28 | pip install protobuf==3.20.1
29 | ```
30 | 
31 | 2. Use the script below to convert the model in `.pth` format that you, a fellow academic, downloaded using Meta's official link.
32 | 
33 | If you have `transformers` installed in place:
34 | 
35 | ```
36 | python -m transformers.models.llama.convert_llama_weights_to_hf --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b
37 | ```
38 | 
39 | Otherwise download [convert_llama_weights_to_hf.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) first and run:
40 | 
41 | ```
42 | python convert_llama_weights_to_hf.py --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b
43 | ```
44 | 
45 | 3. Move the `llama-7b` folder inside your `text-generation-webui/models` folder.
46 | 
47 | ## Starting the web UI
48 | 
49 | ```python
50 | python server.py --model llama-7b
51 | ```
52 | 


--------------------------------------------------------------------------------
/modules/presets.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | from pathlib import Path
 3 | 
 4 | import yaml
 5 | 
 6 | 
 7 | def load_preset(name):
 8 |     generate_params = {
 9 |         'do_sample': True,
10 |         'temperature': 1,
11 |         'top_p': 1,
12 |         'typical_p': 1,
13 |         'epsilon_cutoff': 0,
14 |         'eta_cutoff': 0,
15 |         'tfs': 1,
16 |         'top_a': 0,
17 |         'repetition_penalty': 1,
18 |         'encoder_repetition_penalty': 1,
19 |         'top_k': 0,
20 |         'num_beams': 1,
21 |         'penalty_alpha': 0,
22 |         'min_length': 0,
23 |         'length_penalty': 1,
24 |         'no_repeat_ngram_size': 0,
25 |         'early_stopping': False,
26 |         'mirostat_mode': 0,
27 |         'mirostat_tau': 5.0,
28 |         'mirostat_eta': 0.1,
29 |     }
30 | 
31 |     with open(Path(f'presets/{name}.yaml'), 'r') as infile:
32 |         preset = yaml.safe_load(infile)
33 | 
34 |     for k in preset:
35 |         generate_params[k] = preset[k]
36 | 
37 |     generate_params['temperature'] = min(1.99, generate_params['temperature'])
38 |     return generate_params
39 | 
40 | 
41 | @functools.cache
42 | def load_preset_memoized(name):
43 |     return load_preset(name)
44 | 
45 | 
46 | def load_preset_for_ui(name, state):
47 |     generate_params = load_preset(name)
48 |     state.update(generate_params)
49 |     return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']]
50 | 
51 | 
52 | def generate_preset_yaml(state):
53 |     data = {k: state[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']}
54 |     return yaml.dump(data, sort_keys=False)
55 | 


--------------------------------------------------------------------------------
/css/chat_style-messenger.css:
--------------------------------------------------------------------------------
  1 | .message {
  2 |     padding-bottom: 25px;
  3 |     font-size: 15px;
  4 |     font-family: Helvetica, Arial, sans-serif;
  5 |     line-height: 1.428571429;
  6 | }
  7 | 
  8 | .circle-you {
  9 |     width: 50px;
 10 |     height: 50px;
 11 |     background-color: rgb(238, 78, 59);
 12 |     border-radius: 50%;
 13 | }
 14 | 
 15 | .circle-bot {
 16 |     width: 50px;
 17 |     height: 50px;
 18 |     background-color: rgb(59, 78, 244);
 19 |     border-radius: 50%;
 20 |     float: left;
 21 |     margin-right: 10px;
 22 |     margin-top: 5px;
 23 | }
 24 | 
 25 | .circle-bot img,
 26 | .circle-you img {
 27 |     border-radius: 50%;
 28 |     width: 100%;
 29 |     height: 100%;
 30 |     object-fit: cover;
 31 | }
 32 | 
 33 | .circle-you {
 34 |     margin-top: 5px;
 35 |     float: right;
 36 | }
 37 | 
 38 | .circle-bot + .text, .circle-you + .text {
 39 |     border-radius: 18px;
 40 |     padding: 8px 12px;
 41 | }
 42 | 
 43 | .circle-bot + .text {
 44 |     background-color: #E4E6EB;
 45 |     float: left;
 46 | }
 47 | 
 48 | .circle-you + .text {
 49 |     float: right;
 50 |     background-color: rgb(0, 132, 255);
 51 |     margin-right: 10px;
 52 | }
 53 | 
 54 | .circle-you + .text div, .circle-you + .text *, .dark .circle-you + .text div, .dark .circle-you + .text * {
 55 |     color: #FFF !important;
 56 | }
 57 | 
 58 | .circle-you + .text .username {
 59 |     text-align: right;
 60 | }
 61 | 
 62 | .dark .circle-bot + .text div, .dark .circle-bot + .text * {
 63 |     color: #000;
 64 | }
 65 | 
 66 | .text {
 67 |     max-width: 80%;
 68 | }
 69 | 
 70 | .text p {
 71 |     margin-top: 5px;
 72 | }
 73 | 
 74 | .username {
 75 |     font-weight: bold;
 76 | }
 77 | 
 78 | .message-body {
 79 | }
 80 | 
 81 | .message-body img {
 82 |     max-width: 300px;
 83 |     max-height: 300px;
 84 |     border-radius: 20px;
 85 | }
 86 | 
 87 | .message-body p {
 88 |     margin-bottom: 0 !important;
 89 |     font-size: 15px !important;
 90 |     line-height: 1.428571429 !important;
 91 | }
 92 | 
 93 | .dark .message-body p em {
 94 |     color: rgb(138, 138, 138) !important;
 95 | }
 96 | 
 97 | .message-body p em {
 98 |     color: rgb(110, 110, 110) !important;
 99 | }
100 | 


--------------------------------------------------------------------------------
/extensions/multimodal/abstract_pipeline.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import List, Optional
 3 | 
 4 | import torch
 5 | from PIL import Image
 6 | 
 7 | 
 8 | class AbstractMultimodalPipeline(ABC):
 9 |     @staticmethod
10 |     @abstractmethod
11 |     def name() -> str:
12 |         'name of the pipeline, should be same as in --multimodal-pipeline'
13 |         pass
14 | 
15 |     @staticmethod
16 |     @abstractmethod
17 |     def image_start() -> Optional[str]:
18 |         'return image start string, string representation of image start token, or None if not applicable'
19 |         pass
20 | 
21 |     @staticmethod
22 |     @abstractmethod
23 |     def image_end() -> Optional[str]:
24 |         'return image end string, string representation of image end token, or None if not applicable'
25 |         pass
26 | 
27 |     @staticmethod
28 |     @abstractmethod
29 |     def placeholder_token_id() -> int:
30 |         'return placeholder token id'
31 |         pass
32 | 
33 |     @staticmethod
34 |     @abstractmethod
35 |     def num_image_embeds() -> int:
36 |         'return the number of embeds used by a single image (for example: 256 for LLaVA)'
37 |         pass
38 | 
39 |     @abstractmethod
40 |     def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
41 |         'forward the images through vision pipeline, and return their embeddings'
42 |         pass
43 | 
44 |     @staticmethod
45 |     @abstractmethod
46 |     def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
47 |         'embed tokens, the exact function varies by LLM, for LLaMA it is `shared.model.model.embed_tokens`'
48 |         pass
49 | 
50 |     @staticmethod
51 |     @abstractmethod
52 |     def placeholder_embeddings() -> torch.Tensor:
53 |         'get placeholder embeddings if there are multiple images, and `add_all_images_to_prompt` is False'
54 |         pass
55 | 
56 |     def _get_device(self, setting_name: str, params: dict):
57 |         if params[setting_name] is None:
58 |             return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
59 |         return torch.device(params[setting_name])
60 | 
61 |     def _get_dtype(self, setting_name: str, params: dict):
62 |         return torch.float32 if int(params[setting_name]) == 32 else torch.float16
63 | 


--------------------------------------------------------------------------------
/docs/FlexGen.md:
--------------------------------------------------------------------------------
 1 | >FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!).
 2 | 
 3 | https://github.com/FMInference/FlexGen
 4 | 
 5 | ## Installation
 6 | 
 7 | No additional installation steps are necessary. FlexGen is in the `requirements.txt` file for this project.
 8 | 
 9 | ## Converting a model
10 | 
11 | FlexGen only works with the OPT model, and it needs to be converted to numpy format before starting the web UI:
12 | 
13 | ```
14 | python convert-to-flexgen.py models/opt-1.3b/
15 | ```
16 | 
17 | The output will be saved to `models/opt-1.3b-np/`.
18 | 
19 | ## Usage
20 | 
21 | The basic command is the following:
22 | 
23 | ```
24 | python server.py --model opt-1.3b  --loader flexgen
25 | ```
26 | 
27 | For large models, the RAM usage may be too high and your computer may freeze. If that happens, you can try this:
28 | 
29 | ```
30 | python server.py --model opt-1.3b  --loader flexgen --compress-weight
31 | ```
32 | 
33 | With this second command, I was able to run both OPT-6.7b and OPT-13B with **2GB VRAM**, and the speed was good in both cases.
34 | 
35 | You can also manually set the offload strategy with
36 | 
37 | ```
38 | python server.py --model opt-1.3b  --loader flexgen --percent 0 100 100 0 100 0
39 | ```
40 | 
41 | where the six numbers after `--percent` are:
42 | 
43 | ```
44 | the percentage of weight on GPU
45 | the percentage of weight on CPU
46 | the percentage of attention cache on GPU
47 | the percentage of attention cache on CPU
48 | the percentage of activations on GPU
49 | the percentage of activations on CPU
50 | ```
51 | 
52 | You should typically only change the first two numbers. If their sum is less than 100, the remaining layers will be offloaded to the disk, by default into the `text-generation-webui/cache` folder.
53 | 
54 | ## Performance
55 | 
56 | In my experiments with OPT-30B using a RTX 3090 on Linux, I have obtained these results:
57 | 
58 | * `--loader flexgen --compress-weight --percent 0 100 100 0 100 0`: 0.99 seconds per token.
59 | * `--loader flexgen --compress-weight --percent 100 0 100 0 100 0`: 0.765 seconds per token.
60 | 
61 | ## Limitations
62 | 
63 | * Only works with the OPT models.
64 | * Only two generation parameters are available: `temperature` and `do_sample`.


--------------------------------------------------------------------------------
/convert-to-flexgen.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | 
 3 | Converts a transformers model to a format compatible with flexgen.
 4 | 
 5 | '''
 6 | 
 7 | import argparse
 8 | import os
 9 | from pathlib import Path
10 | 
11 | import numpy as np
12 | import torch
13 | from tqdm import tqdm
14 | from transformers import AutoModelForCausalLM, AutoTokenizer
15 | 
16 | parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))
17 | parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.")
18 | args = parser.parse_args()
19 | 
20 | 
21 | def disable_torch_init():
22 |     """
23 |     Disable the redundant torch default initialization to accelerate model creation.
24 |     """
25 |     import torch
26 |     global torch_linear_init_backup
27 |     global torch_layer_norm_init_backup
28 | 
29 |     torch_linear_init_backup = torch.nn.Linear.reset_parameters
30 |     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
31 | 
32 |     torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters
33 |     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
34 | 
35 | 
36 | def restore_torch_init():
37 |     """Rollback the change made by disable_torch_init."""
38 |     import torch
39 |     setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup)
40 |     setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup)
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     path = Path(args.MODEL)
45 |     model_name = path.name
46 | 
47 |     print(f"Loading {model_name}...")
48 |     # disable_torch_init()
49 |     model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
50 |     # restore_torch_init()
51 | 
52 |     tokenizer = AutoTokenizer.from_pretrained(path)
53 | 
54 |     out_folder = Path(f"models/{model_name}-np")
55 |     if not Path(out_folder).exists():
56 |         os.mkdir(out_folder)
57 | 
58 |     print(f"Saving the converted model to {out_folder}...")
59 |     for name, param in tqdm(list(model.model.named_parameters())):
60 |         name = name.replace("decoder.final_layer_norm", "decoder.layer_norm")
61 |         param_path = os.path.join(out_folder, name)
62 |         with open(param_path, "wb") as f:
63 |             np.save(f, param.cpu().detach().numpy())
64 | 


--------------------------------------------------------------------------------
/extensions/send_pictures/script.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | from io import BytesIO
 3 | 
 4 | import gradio as gr
 5 | import torch
 6 | from transformers import BlipForConditionalGeneration, BlipProcessor
 7 | 
 8 | from modules import chat, shared
 9 | from modules.ui import gather_interface_values
10 | 
11 | # If 'state' is True, will hijack the next chat generation with
12 | # custom input text given by 'value' in the format [text, visible_text]
13 | input_hijack = {
14 |     'state': False,
15 |     'value': ["", ""]
16 | }
17 | 
18 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
19 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float32).to("cpu")
20 | 
21 | 
22 | def caption_image(raw_image):
23 |     inputs = processor(raw_image.convert('RGB'), return_tensors="pt").to("cpu", torch.float32)
24 |     out = model.generate(**inputs, max_new_tokens=100)
25 |     return processor.decode(out[0], skip_special_tokens=True)
26 | 
27 | 
28 | def generate_chat_picture(picture, name1, name2):
29 |     text = f'*{name1} sends {name2} a picture that contains the following: “{caption_image(picture)}”*'
30 |     # lower the resolution of sent images for the chat, otherwise the log size gets out of control quickly with all the base64 values in visible history
31 |     picture.thumbnail((300, 300))
32 |     buffer = BytesIO()
33 |     picture.save(buffer, format="JPEG")
34 |     img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
35 |     visible_text = f'<img src="data:image/jpeg;base64,{img_str}" alt="{text}">'
36 |     return text, visible_text
37 | 
38 | 
39 | def ui():
40 |     picture_select = gr.Image(label='Send a picture', type='pil')
41 | 
42 |     # Prepare the input hijack, update the interface values, call the generation function, and clear the picture
43 |     picture_select.upload(
44 |         lambda picture, name1, name2: input_hijack.update({"state": True, "value": generate_chat_picture(picture, name1, name2)}), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then(
45 |         gather_interface_values, [shared.gradio[k] for k in shared.input_elements], shared.gradio['interface_state']).then(
46 |         chat.generate_chat_reply_wrapper, shared.input_params, shared.gradio['display'], show_progress=False).then(
47 |         lambda: None, None, picture_select, show_progress=False)
48 | 


--------------------------------------------------------------------------------
/docs/Chat-mode.md:
--------------------------------------------------------------------------------
 1 | ## Chat characters
 2 | 
 3 | Custom chat mode characters are defined by `.yaml` files inside the `characters` folder. An example is included: [Example.yaml](https://github.com/oobabooga/text-generation-webui/blob/main/characters/Example.yaml)
 4 | 
 5 | The following fields may be defined:
 6 | 
 7 | | Field | Description |
 8 | |-------|-------------|
 9 | | `name` or `bot` | The character's name. |
10 | | `your_name` or `user` (optional) | Your name. This overwrites what you had previously written in the `Your name` field in the interface. |
11 | | `context` | A string that appears at the top of the prompt. It usually contains a description of the character's personality. |
12 | | `greeting` (optional) | The character's opening message when a new conversation is started. |
13 | | `example_dialogue` (optional) | A few example messages to guide the model. |
14 | | `turn_template` (optional) | Used to define where the spaces and new line characters should be in Instruct mode. See the characters in `characters/instruction-following` for examples. |
15 | 
16 | #### Special tokens
17 | 
18 | * `{{char}}` or `<BOT>`: are replaced with the character's name
19 | * `{{user}}` or `<USER>`: are replaced with your name
20 | 
21 | These replacements happen when the character is loaded, and they apply to the `context`, `greeting`, and `example_dialogue` fields.
22 | 
23 | #### How do I add a profile picture for my character?
24 | 
25 | Put an image with the same name as your character's yaml file into the `characters` folder. For example, if your bot is `Character.yaml`, add `Character.jpg` or `Character.png` to the folder.
26 | 
27 | #### Is the chat history truncated in the prompt?
28 | 
29 | Once your prompt reaches the 2048 token limit, old messages will be removed one at a time. The context string will always stay at the top of the prompt and will never get truncated.
30 | 
31 | #### Pygmalion format characters
32 | 
33 | These are also supported out of the box. Simply put the JSON file in the `characters` folder, or upload it directly from the web UI by clicking on the "Upload character" tab at the bottom.
34 | 
35 | ## Chat styles
36 | 
37 | Custom chat styles can be defined in the `text-generation-webui/css` folder. Simply create a new file with name starting in `chat_style-` and ending in `.css` and it will automatically appear in the "Chat style" dropdown menu in the interface. Examples:
38 | 
39 | ```
40 | chat_style-cai-chat.css
41 | chat_style-TheEncrypted777.css
42 | chat_style-wpp.css
43 | ```
44 | 
45 | You should use the same class names as in `chat_style-cai-chat.css` in your custom style.


--------------------------------------------------------------------------------
/extensions/multimodal/pipeline_loader.py:
--------------------------------------------------------------------------------
 1 | import traceback
 2 | from importlib import import_module
 3 | from pathlib import Path
 4 | from typing import Tuple
 5 | 
 6 | from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
 7 | from modules import shared
 8 | from modules.logging_colors import logger
 9 | 
10 | 
11 | def _get_available_pipeline_modules():
12 |     pipeline_path = Path(__file__).parent / 'pipelines'
13 |     modules = [p for p in pipeline_path.iterdir() if p.is_dir()]
14 |     return [m.name for m in modules if (m / 'pipelines.py').exists()]
15 | 
16 | 
17 | def load_pipeline(params: dict) -> Tuple[AbstractMultimodalPipeline, str]:
18 |     pipeline_modules = {}
19 |     available_pipeline_modules = _get_available_pipeline_modules()
20 |     for name in available_pipeline_modules:
21 |         try:
22 |             pipeline_modules[name] = import_module(f'extensions.multimodal.pipelines.{name}.pipelines')
23 |         except:
24 |             logger.warning(f'Failed to get multimodal pipelines from {name}')
25 |             logger.warning(traceback.format_exc())
26 | 
27 |     if shared.args.multimodal_pipeline is not None:
28 |         for k in pipeline_modules:
29 |             if hasattr(pipeline_modules[k], 'get_pipeline'):
30 |                 pipeline = getattr(pipeline_modules[k], 'get_pipeline')(shared.args.multimodal_pipeline, params)
31 |                 if pipeline is not None:
32 |                     return (pipeline, k)
33 |     else:
34 |         model_name = shared.args.model.lower()
35 |         for k in pipeline_modules:
36 |             if hasattr(pipeline_modules[k], 'get_pipeline_from_model_name'):
37 |                 pipeline = getattr(pipeline_modules[k], 'get_pipeline_from_model_name')(model_name, params)
38 |                 if pipeline is not None:
39 |                     return (pipeline, k)
40 | 
41 |     available = []
42 |     for k in pipeline_modules:
43 |         if hasattr(pipeline_modules[k], 'available_pipelines'):
44 |             pipelines = getattr(pipeline_modules[k], 'available_pipelines')
45 |             available += pipelines
46 | 
47 |     if shared.args.multimodal_pipeline is not None:
48 |         log = f'Multimodal - ERROR: Failed to load multimodal pipeline "{shared.args.multimodal_pipeline}", available pipelines are: {available}.'
49 |     else:
50 |         log = f'Multimodal - ERROR: Failed to determine multimodal pipeline for model {shared.args.model}, please select one manually using --multimodal-pipeline [PIPELINE]. Available pipelines are: {available}.'
51 |     logger.critical(f'{log} Please specify a correct pipeline, or disable the extension')
52 |     raise RuntimeError(f'{log} Please specify a correct pipeline, or disable the extension')
53 | 


--------------------------------------------------------------------------------
/modules/callbacks.py:
--------------------------------------------------------------------------------
 1 | import gc
 2 | import traceback
 3 | from queue import Queue
 4 | from threading import Thread
 5 | 
 6 | import torch
 7 | import transformers
 8 | 
 9 | import modules.shared as shared
10 | 
11 | 
12 | class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
13 |     def __init__(self):
14 |         transformers.StoppingCriteria.__init__(self)
15 | 
16 |     def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
17 |         return shared.stop_everything
18 | 
19 | 
20 | class Stream(transformers.StoppingCriteria):
21 |     def __init__(self, callback_func=None):
22 |         self.callback_func = callback_func
23 | 
24 |     def __call__(self, input_ids, scores) -> bool:
25 |         if self.callback_func is not None:
26 |             self.callback_func(input_ids[0])
27 |         return False
28 | 
29 | 
30 | class Iteratorize:
31 | 
32 |     """
33 |     Transforms a function that takes a callback
34 |     into a lazy iterator (generator).
35 | 
36 |     Adapted from: https://stackoverflow.com/a/9969000
37 |     """
38 | 
39 |     def __init__(self, func, args=None, kwargs=None, callback=None):
40 |         self.mfunc = func
41 |         self.c_callback = callback
42 |         self.q = Queue()
43 |         self.sentinel = object()
44 |         self.args = args or []
45 |         self.kwargs = kwargs or {}
46 |         self.stop_now = False
47 | 
48 |         def _callback(val):
49 |             if self.stop_now or shared.stop_everything:
50 |                 raise ValueError
51 |             self.q.put(val)
52 | 
53 |         def gentask():
54 |             try:
55 |                 ret = self.mfunc(callback=_callback, *args, **self.kwargs)
56 |             except ValueError:
57 |                 pass
58 |             except:
59 |                 traceback.print_exc()
60 |                 pass
61 | 
62 |             clear_torch_cache()
63 |             self.q.put(self.sentinel)
64 |             if self.c_callback:
65 |                 self.c_callback(ret)
66 | 
67 |         self.thread = Thread(target=gentask)
68 |         self.thread.start()
69 | 
70 |     def __iter__(self):
71 |         return self
72 | 
73 |     def __next__(self):
74 |         obj = self.q.get(True, None)
75 |         if obj is self.sentinel:
76 |             raise StopIteration
77 |         else:
78 |             return obj
79 | 
80 |     def __del__(self):
81 |         clear_torch_cache()
82 | 
83 |     def __enter__(self):
84 |         return self
85 | 
86 |     def __exit__(self, exc_type, exc_val, exc_tb):
87 |         self.stop_now = True
88 |         clear_torch_cache()
89 | 
90 | 
91 | def clear_torch_cache():
92 |     gc.collect()
93 |     if not shared.args.cpu:
94 |         torch.cuda.empty_cache()
95 | 


--------------------------------------------------------------------------------
/api-examples/api-example-stream.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import json
 3 | import sys
 4 | 
 5 | try:
 6 |     import websockets
 7 | except ImportError:
 8 |     print("Websockets package not found. Make sure it's installed.")
 9 | 
10 | # For local streaming, the websockets are hosted without ssl - ws://
11 | HOST = 'localhost:5005'
12 | URI = f'ws://{HOST}/api/v1/stream'
13 | 
14 | # For reverse-proxied streaming, the remote will likely host with ssl - wss://
15 | # URI = 'wss://your-uri-here.trycloudflare.com/api/v1/stream'
16 | 
17 | 
18 | async def run(context):
19 |     # Note: the selected defaults change from time to time.
20 |     request = {
21 |         'prompt': context,
22 |         'max_new_tokens': 250,
23 | 
24 |         # Generation params. If 'preset' is set to different than 'None', the values
25 |         # in presets/preset-name.yaml are used instead of the individual numbers.
26 |         'preset': 'None',  
27 |         'do_sample': True,
28 |         'temperature': 0.7,
29 |         'top_p': 0.1,
30 |         'typical_p': 1,
31 |         'epsilon_cutoff': 0,  # In units of 1e-4
32 |         'eta_cutoff': 0,  # In units of 1e-4
33 |         'tfs': 1,
34 |         'top_a': 0,
35 |         'repetition_penalty': 1.18,
36 |         'top_k': 40,
37 |         'min_length': 0,
38 |         'no_repeat_ngram_size': 0,
39 |         'num_beams': 1,
40 |         'penalty_alpha': 0,
41 |         'length_penalty': 1,
42 |         'early_stopping': False,
43 |         'mirostat_mode': 0,
44 |         'mirostat_tau': 5,
45 |         'mirostat_eta': 0.1,
46 | 
47 |         'seed': -1,
48 |         'add_bos_token': True,
49 |         'truncation_length': 2048,
50 |         'ban_eos_token': False,
51 |         'skip_special_tokens': True,
52 |         'stopping_strings': []
53 |     }
54 | 
55 |     async with websockets.connect(URI, ping_interval=None) as websocket:
56 |         await websocket.send(json.dumps(request))
57 | 
58 |         yield context  # Remove this if you just want to see the reply
59 | 
60 |         while True:
61 |             incoming_data = await websocket.recv()
62 |             incoming_data = json.loads(incoming_data)
63 | 
64 |             match incoming_data['event']:
65 |                 case 'text_stream':
66 |                     yield incoming_data['text']
67 |                 case 'stream_end':
68 |                     return
69 | 
70 | 
71 | async def print_response_stream(prompt):
72 |     async for response in run(prompt):
73 |         print(response, end='')
74 |         sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     prompt = "In order to make homemade bread, follow these steps:\n1)"
79 |     asyncio.run(print_response_stream(prompt))
80 | 


--------------------------------------------------------------------------------
/modules/loaders.py:
--------------------------------------------------------------------------------
  1 | import functools
  2 | 
  3 | import gradio as gr
  4 | 
  5 | from modules import shared
  6 | 
  7 | loaders_and_params = {
  8 |     'AutoGPTQ': [
  9 |         'triton',
 10 |         'no_inject_fused_attention',
 11 |         'no_inject_fused_mlp',
 12 |         'no_use_cuda_fp16',
 13 |         'wbits',
 14 |         'groupsize',
 15 |         'desc_act',
 16 |         'gpu_memory',
 17 |         'cpu_memory',
 18 |         'cpu',
 19 |         'disk',
 20 |         'auto_devices',
 21 |         'trust_remote_code',
 22 |         'autogptq_info',
 23 |     ],
 24 |     'GPTQ-for-LLaMa': [
 25 |         'wbits',
 26 |         'groupsize',
 27 |         'model_type',
 28 |         'pre_layer',
 29 |         'gptq_for_llama_info',
 30 |     ],
 31 |     'llama.cpp': [
 32 |         'n_ctx',
 33 |         'n_gpu_layers',
 34 |         'n_batch',
 35 |         'threads',
 36 |         'no_mmap',
 37 |         'mlock',
 38 |         'llama_cpp_seed',
 39 |     ],
 40 |     'Transformers': [
 41 |         'cpu_memory',
 42 |         'gpu_memory',
 43 |         'trust_remote_code',
 44 |         'load_in_8bit',
 45 |         'bf16',
 46 |         'cpu',
 47 |         'disk',
 48 |         'auto_devices',
 49 |         'load_in_4bit',
 50 |         'use_double_quant',
 51 |         'quant_type',
 52 |         'compute_dtype',
 53 |         'trust_remote_code',
 54 |         'transformers_info'
 55 |     ],
 56 |     'ExLlama' : [
 57 |         'gpu_split',
 58 |         'max_seq_len',
 59 |         'compress_pos_emb',
 60 |         'exllama_info',
 61 |     ],
 62 |     'ExLlama_HF' : [
 63 |         'gpu_split',
 64 |         'max_seq_len',
 65 |         'compress_pos_emb',
 66 |         'exllama_HF_info',
 67 |     ]
 68 | }
 69 | 
 70 | 
 71 | def get_gpu_memory_keys():
 72 |     return [k for k in shared.gradio if k.startswith('gpu_memory')]
 73 | 
 74 | 
 75 | @functools.cache
 76 | def get_all_params():
 77 |     all_params = set()
 78 |     for k in loaders_and_params:
 79 |         for el in loaders_and_params[k]:
 80 |             all_params.add(el)
 81 | 
 82 |     if 'gpu_memory' in all_params:
 83 |         all_params.remove('gpu_memory')
 84 |         for k in get_gpu_memory_keys():
 85 |             all_params.add(k)
 86 | 
 87 |     return sorted(all_params)
 88 | 
 89 | 
 90 | def make_loader_params_visible(loader):
 91 |     params = []
 92 |     all_params = get_all_params()
 93 |     if loader in loaders_and_params:
 94 |         params = loaders_and_params[loader]
 95 | 
 96 |         if 'gpu_memory' in params:
 97 |             params.remove('gpu_memory')
 98 |             params += get_gpu_memory_keys()
 99 | 
100 |     return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]
101 | 


--------------------------------------------------------------------------------
/extensions/ngrok/README.md:
--------------------------------------------------------------------------------
 1 | # Adding an ingress URL through the ngrok Agent SDK for Python
 2 | 
 3 | [ngrok](https://ngrok.com) is a globally distributed reverse proxy commonly used for quickly getting a public URL to a
 4 | service running inside a private network, such as on your local laptop. The ngrok agent is usually
 5 | deployed inside a private network and is used to communicate with the ngrok cloud service.
 6 | 
 7 | By default the authtoken in the NGROK_AUTHTOKEN environment variable will be used. Alternatively one may be specified in
 8 | the `settings.json` file, see the Examples below. Retrieve your authtoken on the [Auth Token page of your ngrok dashboard](https://dashboard.ngrok.com/get-started/your-authtoken), signing up is free.
 9 | 
10 | # Documentation
11 | 
12 | For a list of all available options, see [the configuration documentation](https://ngrok.com/docs/ngrok-agent/config/) or [the connect example](https://github.com/ngrok/ngrok-py/blob/main/examples/ngrok-connect-full.py).
13 | 
14 | The ngrok Python SDK is [on github here](https://github.com/ngrok/ngrok-py). A quickstart guide and a full API reference are included in the [ngrok-py Python API documentation](https://ngrok.github.io/ngrok-py/).
15 | 
16 | # Running
17 | 
18 | To enable ngrok install the requirements and then add `--extension ngrok` to the command line options, for instance:
19 | 
20 | ```bash
21 | pip install -r extensions/ngrok/requirements.txt
22 | python server.py --extension ngrok
23 | ```
24 | 
25 | In the output you should then see something like this:
26 | 
27 | ```bash
28 | INFO:Loading the extension "ngrok"...
29 | INFO:Session created
30 | INFO:Created tunnel "9d9d0944dc75ff9d3aae653e5eb29fe9" with url "https://d83706cf7be7.ngrok.app"
31 | INFO:Tunnel "9d9d0944dc75ff9d3aae653e5eb29fe9" TCP forwarding to "localhost:7860"
32 | INFO:Ingress established at https://d83706cf7be7.ngrok.app
33 | ```
34 | 
35 | You can now access the webui via the url shown, in this case `https://d83706cf7be7.ngrok.app`. It is recommended to add some authentication to the ingress, see below.
36 | 
37 | # Example Settings
38 | 
39 | In `settings.json` add a `ngrok` key with a dictionary of options, for instance:
40 | 
41 | To enable basic authentication:
42 | ```json
43 | {
44 |     "ngrok": {
45 |         "basic_auth": "user:password"
46 |     }
47 | }
48 | ```
49 | 
50 | To enable OAUTH authentication:
51 | ```json
52 | {
53 |     "ngrok": {
54 |         "oauth_provider": "google",
55 |         "oauth_allow_domains": "asdf.com",
56 |         "oauth_allow_emails": "asdf@asdf.com"
57 |     }
58 | }
59 | ```
60 | 
61 | To add an authtoken instead of using the NGROK_AUTHTOKEN environment variable:
62 | ```json
63 | {
64 |     "ngrok": {
65 |         "authtoken": "<token>",
66 |         "authtoken_from_env":false
67 |     }
68 | }
69 | ```


--------------------------------------------------------------------------------
/api-examples/api-example-chat.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import requests
 4 | 
 5 | # For local streaming, the websockets are hosted without ssl - http://
 6 | HOST = 'localhost:5000'
 7 | URI = f'http://{HOST}/api/v1/chat'
 8 | 
 9 | # For reverse-proxied streaming, the remote will likely host with ssl - https://
10 | # URI = 'https://your-uri-here.trycloudflare.com/api/v1/chat'
11 | 
12 | 
13 | def run(user_input, history):
14 |     request = {
15 |         'user_input': user_input,
16 |         'max_new_tokens': 250,
17 |         'history': history,
18 |         'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
19 |         'character': 'Example',
20 |         'instruction_template': 'Vicuna-v1.1',
21 |         'your_name': 'You',
22 | 
23 |         'regenerate': False,
24 |         '_continue': False,
25 |         'stop_at_newline': False,
26 |         'chat_generation_attempts': 1,
27 |         'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
28 | 
29 |         # Generation params. If 'preset' is set to different than 'None', the values
30 |         # in presets/preset-name.yaml are used instead of the individual numbers.
31 |         'preset': 'None',
32 |         'do_sample': True,
33 |         'temperature': 0.7,
34 |         'top_p': 0.1,
35 |         'typical_p': 1,
36 |         'epsilon_cutoff': 0,  # In units of 1e-4
37 |         'eta_cutoff': 0,  # In units of 1e-4
38 |         'tfs': 1,
39 |         'top_a': 0,
40 |         'repetition_penalty': 1.18,
41 |         'top_k': 40,
42 |         'min_length': 0,
43 |         'no_repeat_ngram_size': 0,
44 |         'num_beams': 1,
45 |         'penalty_alpha': 0,
46 |         'length_penalty': 1,
47 |         'early_stopping': False,
48 |         'mirostat_mode': 0,
49 |         'mirostat_tau': 5,
50 |         'mirostat_eta': 0.1,
51 | 
52 |         'seed': -1,
53 |         'add_bos_token': True,
54 |         'truncation_length': 2048,
55 |         'ban_eos_token': False,
56 |         'skip_special_tokens': True,
57 |         'stopping_strings': []
58 |     }
59 | 
60 |     response = requests.post(URI, json=request)
61 | 
62 |     if response.status_code == 200:
63 |         result = response.json()['results'][0]['history']
64 |         print(json.dumps(result, indent=4))
65 |         print()
66 |         print(result['visible'][-1][1])
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     user_input = "Please give me a step-by-step guide on how to plant a tree in my backyard."
71 | 
72 |     # Basic example
73 |     history = {'internal': [], 'visible': []}
74 | 
75 |     # "Continue" example. Make sure to set '_continue' to True above
76 |     # arr = [user_input, 'Surely, here is']
77 |     # history = {'internal': [arr], 'visible': [arr]}
78 | 
79 |     run(user_input, history)
80 | 


--------------------------------------------------------------------------------
/modules/deepspeed_parameters.py:
--------------------------------------------------------------------------------
 1 | def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
 2 |     '''
 3 |     DeepSpeed configration
 4 |     https://huggingface.co/docs/transformers/main_classes/deepspeed
 5 |     '''
 6 | 
 7 |     if nvme_offload_dir:
 8 |         ds_config = {
 9 |             "fp16": {
10 |                 "enabled": not ds_bf16,
11 |             },
12 |             "bf16": {
13 |                 "enabled": ds_bf16,
14 |             },
15 |             "zero_optimization": {
16 |                 "stage": 3,
17 |                 "offload_param": {
18 |                     "device": "nvme",
19 |                     "nvme_path": nvme_offload_dir,
20 |                     "pin_memory": True,
21 |                     "buffer_count": 5,
22 |                     "buffer_size": 1e9,
23 |                     "max_in_cpu": 1e9
24 |                 },
25 |                 "overlap_comm": True,
26 |                 "reduce_bucket_size": "auto",
27 |                 "contiguous_gradients": True,
28 |                 "sub_group_size": 1e8,
29 |                 "stage3_prefetch_bucket_size": "auto",
30 |                 "stage3_param_persistence_threshold": "auto",
31 |                 "stage3_max_live_parameters": "auto",
32 |                 "stage3_max_reuse_distance": "auto",
33 |             },
34 |             "aio": {
35 |                 "block_size": 262144,
36 |                 "queue_depth": 32,
37 |                 "thread_count": 1,
38 |                 "single_submit": False,
39 |                 "overlap_events": True
40 |             },
41 |             "steps_per_print": 2000,
42 |             "train_batch_size": train_batch_size,
43 |             "train_micro_batch_size_per_gpu": 1,
44 |             "wall_clock_breakdown": False
45 |         }
46 |     else:
47 |         ds_config = {
48 |             "fp16": {
49 |                 "enabled": not ds_bf16,
50 |             },
51 |             "bf16": {
52 |                 "enabled": ds_bf16,
53 |             },
54 |             "zero_optimization": {
55 |                 "stage": 3,
56 |                 "offload_param": {
57 |                     "device": "cpu",
58 |                     "pin_memory": True
59 |                 },
60 |                 "overlap_comm": True,
61 |                 "contiguous_gradients": True,
62 |                 "reduce_bucket_size": "auto",
63 |                 "stage3_prefetch_bucket_size": "auto",
64 |                 "stage3_param_persistence_threshold": "auto",
65 |                 "stage3_max_live_parameters": "auto",
66 |                 "stage3_max_reuse_distance": "auto",
67 |             },
68 |             "steps_per_print": 2000,
69 |             "train_batch_size": train_batch_size,
70 |             "train_micro_batch_size_per_gpu": 1,
71 |             "wall_clock_breakdown": False
72 |         }
73 | 
74 |     return ds_config
75 | 


--------------------------------------------------------------------------------
/css/chat.css:
--------------------------------------------------------------------------------
  1 | .h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx {
  2 |     height: 66.67vh
  3 | }
  4 | 
  5 | .gradio-container {
  6 |     margin-left: auto !important;
  7 |     margin-right: auto !important;
  8 | }
  9 | 
 10 | .w-screen {
 11 |     width: unset
 12 | }
 13 | 
 14 | div.svelte-362y77>*, div.svelte-362y77>.form>* {
 15 |     flex-wrap: nowrap
 16 | }
 17 | 
 18 | /* fixes the API documentation in chat mode */
 19 | .api-docs.svelte-1iguv9h.svelte-1iguv9h.svelte-1iguv9h {
 20 |     display: grid;
 21 | }
 22 | 
 23 | .pending.svelte-1ed2p3z {
 24 |     opacity: 1;
 25 | }
 26 | 
 27 | #extensions {
 28 |     padding: 0;
 29 |     padding: 0;
 30 | }
 31 | 
 32 | #gradio-chatbot {
 33 |     height: 66.67vh;
 34 | }
 35 | 
 36 | .wrap.svelte-6roggh.svelte-6roggh {
 37 |     max-height: 92.5%;
 38 | }
 39 | 
 40 | /* This is for the microphone button in the whisper extension */
 41 | .sm.svelte-1ipelgc {
 42 |     width: 100%;
 43 | }
 44 | 
 45 | #main button {
 46 |     min-width: 0 !important;
 47 | }
 48 | 
 49 | /*****************************************************/
 50 | /*************** Chat box declarations ***************/
 51 | /*****************************************************/
 52 | 
 53 | .chat {
 54 |     margin-left: auto;
 55 |     margin-right: auto;
 56 |     max-width: 800px;
 57 |     height: calc(100vh - 296px);
 58 |     overflow-y: auto;
 59 |     padding-right: 20px;
 60 |     display: flex;
 61 |     flex-direction: column-reverse;
 62 |     word-break: break-word;
 63 |     overflow-wrap: anywhere;
 64 |     padding-top: 1px;
 65 | }
 66 | 
 67 | .message-body li {
 68 |     margin-top: 0.5em !important;
 69 |     margin-bottom: 0.5em !important;
 70 | }
 71 | 
 72 | .message-body li > p {
 73 |     display: inline !important;
 74 | }
 75 | 
 76 | .message-body ul, .message-body ol {
 77 |     font-size: 15px !important;
 78 | }
 79 | 
 80 | .message-body ul {
 81 |     list-style-type: disc !important;
 82 | }
 83 | 
 84 | .message-body pre {
 85 |     margin-bottom: 1.25em !important;
 86 | }
 87 | 
 88 | .message-body code {
 89 |     white-space: pre-wrap !important;
 90 |     word-wrap: break-word !important;
 91 | }
 92 | 
 93 | .message-body :not(pre) > code {
 94 |     white-space: normal !important;
 95 | }
 96 | 
 97 | @media print {
 98 |     body {
 99 |         visibility: hidden;
100 |     }
101 | 
102 |     .chat {
103 |         visibility: visible;
104 |         position: absolute;
105 |         left: 0;
106 |         top: 0;
107 |         max-width: none;
108 |         max-height: none;
109 |         width: 100%;
110 |         height: fit-content;
111 |         display: flex;
112 |         flex-direction: column-reverse;
113 |     }
114 |     
115 |     .message {
116 |         break-inside: avoid;
117 |     }
118 |     
119 |     .gradio-container {
120 |         overflow: visible;
121 |     }
122 |     
123 |     .tab-nav {
124 |         display: none !important;
125 |     }
126 | }
127 | 


--------------------------------------------------------------------------------
/docs/LoRA.md:
--------------------------------------------------------------------------------
 1 | # LoRA
 2 | 
 3 | LoRA (Low-Rank Adaptation) is an extremely powerful method for customizing a base model by training only a small number of parameters. They can be attached to models at runtime.
 4 | 
 5 | For instance, a 50mb LoRA can teach LLaMA an entire new language, a given writing style, or give it instruction-following or chat abilities.
 6 | 
 7 | This is the current state of LoRA integration in the web UI:
 8 | 
 9 | |Loader | Status |
10 | |--------|------|
11 | | Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. |
12 | | ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. |
13 | | AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.|
14 | | GPTQ-for-LLaMa | Full support with the [monkey patch](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). |
15 | 
16 | ## Downloading a LoRA
17 | 
18 | The download script can be used. For instance:
19 | 
20 | ```
21 | python download-model.py tloen/alpaca-lora-7b
22 | ```
23 | 
24 | The files will be saved to `loras/tloen_alpaca-lora-7b`.
25 | 
26 | ## Using the LoRA
27 | 
28 | The `--lora` command-line flag can be used. Examples:
29 | 
30 | ```
31 | python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b
32 | python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b --load-in-8bit
33 | python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b --load-in-4bit
34 | python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b --cpu
35 | ```
36 | 
37 | Instead of using the `--lora` command-line flag, you can also select the LoRA in the "Parameters" tab of the interface.
38 | 
39 | ## Prompt
40 | For the Alpaca LoRA in particular, the prompt must be formatted like this:
41 | 
42 | ```
43 | Below is an instruction that describes a task. Write a response that appropriately completes the request.
44 | ### Instruction:
45 | Write a Python script that generates text using the transformers library.
46 | ### Response:
47 | ```
48 | 
49 | Sample output:
50 | 
51 | ```
52 | Below is an instruction that describes a task. Write a response that appropriately completes the request.
53 | ### Instruction:
54 | Write a Python script that generates text using the transformers library.
55 | ### Response:
56 | 
57 | import transformers
58 | from transformers import AutoTokenizer, AutoModelForCausalLM
59 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
60 | model = AutoModelForCausalLM.from_pretrained("bert-base-uncased")
61 | texts = ["Hello world", "How are you"]
62 | for sentence in texts:
63 | sentence = tokenizer(sentence)
64 | print(f"Generated {len(sentence)} tokens from '{sentence}'")
65 | output = model(sentences=sentence).predict()
66 | print(f"Predicted {len(output)} tokens for '{sentence}':\n{output}")
67 | ```
68 | 
69 | ## Training a LoRA
70 | 
71 | You can train your own LoRAs from the `Training` tab. See [Training LoRAs](Training-LoRAs.md) for details.
72 | 


--------------------------------------------------------------------------------
/docs/RWKV-model.md:
--------------------------------------------------------------------------------
 1 | > RWKV: RNN with Transformer-level LLM Performance
 2 | >
 3 | > It combines the best of RNN and transformer - great performance, fast inference, saves VRAM, fast training, "infinite" ctx_len, and free sentence embedding (using the final hidden state).
 4 | 
 5 | https://github.com/BlinkDL/RWKV-LM
 6 | 
 7 | https://github.com/BlinkDL/ChatRWKV
 8 | 
 9 | ## Using RWKV in the web UI
10 | 
11 | ### Hugging Face weights
12 | 
13 | Simply download the weights from https://huggingface.co/RWKV and load them as you would for any other model.
14 | 
15 | There is a bug in transformers==4.29.2 that prevents RWKV from being loaded in 8-bit mode. You can install the dev branch to solve this bug: `pip install git+https://github.com/huggingface/transformers`
16 | 
17 | ### Original .pth weights
18 | 
19 | The instructions below are from before RWKV was supported in transformers, and they are kept for legacy purposes. The old implementation is possibly faster, but it lacks the full range of samplers that the transformers library offers.
20 | 
21 | #### 0. Install the RWKV library
22 | 
23 | ```
24 | pip install rwkv
25 | ```
26 | 
27 | `0.7.3` was the last version that I tested. If you experience any issues, try ```pip install rwkv==0.7.3```.
28 | 
29 | #### 1. Download the model
30 | 
31 | It is available in different sizes:
32 | 
33 | * https://huggingface.co/BlinkDL/rwkv-4-pile-3b/
34 | * https://huggingface.co/BlinkDL/rwkv-4-pile-7b/
35 | * https://huggingface.co/BlinkDL/rwkv-4-pile-14b/
36 | 
37 | There are also older releases with smaller sizes like:
38 | 
39 | * https://huggingface.co/BlinkDL/rwkv-4-pile-169m/resolve/main/RWKV-4-Pile-169M-20220807-8023.pth
40 | 
41 | Download the chosen `.pth` and put it directly in the `models` folder. 
42 | 
43 | #### 2. Download the tokenizer
44 | 
45 | [20B_tokenizer.json](https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/v2/20B_tokenizer.json)
46 | 
47 | Also put it directly in the `models` folder. Make sure to not rename it. It should be called `20B_tokenizer.json`.
48 | 
49 | #### 3. Launch the web UI
50 | 
51 | No additional steps are required. Just launch it as you would with any other model.
52 | 
53 | ```
54 | python server.py --listen  --no-stream --model RWKV-4-Pile-169M-20220807-8023.pth
55 | ```
56 | 
57 | #### Setting a custom strategy
58 | 
59 | It is possible to have very fine control over the offloading and precision for the model with the `--rwkv-strategy` flag. Possible values include:
60 | 
61 | ```
62 | "cpu fp32" # CPU mode
63 | "cuda fp16" # GPU mode with float16 precision
64 | "cuda fp16 *30 -> cpu fp32" # GPU+CPU offloading. The higher the number after *, the higher the GPU allocation.
65 | "cuda fp16i8" # GPU mode with 8-bit precision
66 | ```
67 | 
68 | See the README for the PyPl package for more details: https://pypi.org/project/rwkv/
69 | 
70 | #### Compiling the CUDA kernel
71 | 
72 | You can compile the CUDA kernel for the model with `--rwkv-cuda-on`. This should improve the performance a lot but I haven't been able to get it to work yet.
73 | 


--------------------------------------------------------------------------------
/modules/AutoGPTQ_loader.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 4 | 
 5 | import modules.shared as shared
 6 | from modules.logging_colors import logger
 7 | from modules.models import get_max_memory_dict
 8 | 
 9 | 
10 | def load_quantized(model_name):
11 |     path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
12 |     pt_path = None
13 | 
14 |     # Find the model checkpoint
15 |     if shared.args.checkpoint:
16 |         pt_path = Path(shared.args.checkpoint)
17 |     else:
18 |         for ext in ['.safetensors', '.pt', '.bin']:
19 |             found = list(path_to_model.glob(f"*{ext}"))
20 |             if len(found) > 0:
21 |                 if len(found) > 1:
22 |                     logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
23 | 
24 |                 pt_path = found[-1]
25 |                 break
26 | 
27 |     if pt_path is None:
28 |         logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
29 |         return
30 | 
31 |     use_safetensors = pt_path.suffix == '.safetensors'
32 |     if not (path_to_model / "quantize_config.json").exists():
33 |         quantize_config = BaseQuantizeConfig(
34 |             bits=bits if (bits := shared.args.wbits) > 0 else 4,
35 |             group_size=gs if (gs := shared.args.groupsize) > 0 else -1,
36 |             desc_act=shared.args.desc_act
37 |         )
38 |     else:
39 |         quantize_config = None
40 | 
41 |     # Define the params for AutoGPTQForCausalLM.from_quantized
42 |     params = {
43 |         'model_basename': pt_path.stem,
44 |         'device': "cuda:0" if not shared.args.cpu else "cpu",
45 |         'use_triton': shared.args.triton,
46 |         'inject_fused_attention': not shared.args.no_inject_fused_attention,
47 |         'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
48 |         'use_safetensors': use_safetensors,
49 |         'trust_remote_code': shared.args.trust_remote_code,
50 |         'max_memory': get_max_memory_dict(),
51 |         'quantize_config': quantize_config,
52 |         'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
53 |     }
54 | 
55 |     logger.info(f"The AutoGPTQ params are: {params}")
56 |     model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)
57 | 
58 |     # These lines fix the multimodal extension when used with AutoGPTQ
59 |     if hasattr(model, 'model'):
60 |         if not hasattr(model, 'dtype'):
61 |             if hasattr(model.model, 'dtype'):
62 |                 model.dtype = model.model.dtype
63 | 
64 |         if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):
65 |             if not hasattr(model, 'embed_tokens'):
66 |                 model.embed_tokens = model.model.model.embed_tokens
67 | 
68 |             if not hasattr(model.model, 'embed_tokens'):
69 |                 model.model.embed_tokens = model.model.model.embed_tokens
70 | 
71 |     return model
72 | 


--------------------------------------------------------------------------------
/css/html_4chan_style.css:
--------------------------------------------------------------------------------
  1 | #parent #container {
  2 |     background-color: #eef2ff;
  3 |     padding: 17px;
  4 | }
  5 | 
  6 | #parent #container .reply {
  7 |     background-color: rgb(214, 218, 240);
  8 |     border-bottom-color: rgb(183, 197, 217);
  9 |     border-bottom-style: solid;
 10 |     border-bottom-width: 1px;
 11 |     border-image-outset: 0;
 12 |     border-image-repeat: stretch;
 13 |     border-image-slice: 100%;
 14 |     border-image-source: none;
 15 |     border-image-width: 1;
 16 |     border-left-color: rgb(0, 0, 0);
 17 |     border-left-style: none;
 18 |     border-left-width: 0px;
 19 |     border-right-color: rgb(183, 197, 217);
 20 |     border-right-style: solid;
 21 |     border-right-width: 1px;
 22 |     border-top-color: rgb(0, 0, 0);
 23 |     border-top-style: none;
 24 |     border-top-width: 0px;
 25 |     color: rgb(0, 0, 0);
 26 |     display: table;
 27 |     font-family: arial, helvetica, sans-serif;
 28 |     font-size: 13.3333px;
 29 |     margin-bottom: 4px;
 30 |     margin-left: 0px;
 31 |     margin-right: 0px;
 32 |     margin-top: 4px;
 33 |     overflow-x: hidden;
 34 |     overflow-y: hidden;
 35 |     padding-bottom: 4px;
 36 |     padding-left: 2px;
 37 |     padding-right: 2px;
 38 |     padding-top: 4px;
 39 | }
 40 | 
 41 | #parent #container .number {
 42 |     color: rgb(0, 0, 0);
 43 |     font-family: arial, helvetica, sans-serif;
 44 |     font-size: 13.3333px;
 45 |     width: 342.65px;
 46 |     margin-right: 7px;
 47 | }
 48 | 
 49 | #parent #container .op {
 50 |     color: rgb(0, 0, 0);
 51 |     font-family: arial, helvetica, sans-serif;
 52 |     font-size: 13.3333px;
 53 |     margin-bottom: 8px;
 54 |     margin-left: 0px;
 55 |     margin-right: 0px;
 56 |     margin-top: 4px;
 57 |     overflow-x: hidden;
 58 |     overflow-y: hidden;
 59 | }
 60 | 
 61 | #parent #container .op blockquote {
 62 |     margin-left: 0px !important;
 63 | }
 64 | 
 65 | #parent #container .name {
 66 |     color: rgb(17, 119, 67);
 67 |     font-family: arial, helvetica, sans-serif;
 68 |     font-size: 13.3333px;
 69 |     font-weight: 700;
 70 |     margin-left: 7px;
 71 | }
 72 | 
 73 | #parent #container .quote {
 74 |     color: rgb(221, 0, 0);
 75 |     font-family: arial, helvetica, sans-serif;
 76 |     font-size: 13.3333px;
 77 |     text-decoration-color: rgb(221, 0, 0);
 78 |     text-decoration-line: underline;
 79 |     text-decoration-style: solid;
 80 |     text-decoration-thickness: auto;
 81 | }
 82 | 
 83 | #parent #container .greentext {
 84 |     color: rgb(120, 153, 34);
 85 |     font-family: arial, helvetica, sans-serif;
 86 |     font-size: 13.3333px;
 87 | }
 88 | 
 89 | #parent #container blockquote {
 90 |     margin: 0px !important;
 91 |     margin-block-start: 1em;
 92 |     margin-block-end: 1em;
 93 |     margin-inline-start: 40px;
 94 |     margin-inline-end: 40px;
 95 |     margin-top: 13.33px !important;
 96 |     margin-bottom: 13.33px !important;
 97 |     margin-left: 40px !important;
 98 |     margin-right: 40px !important;
 99 | }
100 | 
101 | #parent #container .message {
102 |     color: black;
103 |     border: none;
104 | }


--------------------------------------------------------------------------------
/css/chat_style-TheEncrypted777.css:
--------------------------------------------------------------------------------
  1 | /* All credits to TheEncrypted777: https://www.reddit.com/r/Oobabooga/comments/12xe6vq/updated_css_styling_with_color_customization_for/ */
  2 | 
  3 | .message {
  4 |     display: grid;
  5 |     grid-template-columns: 60px minmax(0, 1fr);
  6 |     padding-bottom: 28px;
  7 |     font-size: 18px;
  8 |     /*Change 'Quicksand' to a font you like or leave it*/
  9 |     font-family: Quicksand, Arial, sans-serif;
 10 |     line-height: 1.428571429;
 11 | }
 12 | 
 13 | .circle-you {
 14 |     background-color: gray;
 15 |     border-radius: 1rem;
 16 |     /*Change color to any you like to be the border of your image*/
 17 |     border: 2px solid white;
 18 | }
 19 | 
 20 | .circle-bot {
 21 |     background-color: gray;
 22 |     border-radius: 1rem;
 23 |     /*Change color to any you like to be the border of the bot's image*/
 24 |     border: 2px solid white;
 25 | }
 26 | 
 27 | .circle-bot img,
 28 | .circle-you img {
 29 |     border-radius: 10%;
 30 |     width: 100%;
 31 |     height: 100%;
 32 |     object-fit: cover;
 33 | }
 34 | 
 35 | .circle-you, .circle-bot {
 36 |     /*You can set the size of the profile images here, but if you do, you have to also adjust the .text{padding-left: 90px} to a different number according to the width of the image which is right below here*/
 37 |     width: 135px;
 38 |     height: 175px;
 39 | }
 40 | 
 41 | .text {
 42 |     /*Change this to move the message box further left or right depending on the size of your profile pic*/
 43 |     padding-left: 90px;
 44 |     text-shadow: 2px 2px 2px rgb(0, 0, 0);
 45 | }
 46 | 
 47 | .text p {
 48 |     margin-top: 2px;
 49 | }
 50 | 
 51 | .username {
 52 |     padding-left: 10px;
 53 |     font-size: 22px;
 54 |     font-weight: bold;
 55 |     border-top: 1px solid rgb(51, 64, 90);
 56 |     padding: 3px;
 57 | }
 58 | 
 59 | .message-body {
 60 |     position: relative;
 61 |     border-radius: 1rem;
 62 |     border: 1px solid rgba(255, 255, 255, 0.459);
 63 |     border-radius: 10px;
 64 |     padding: 10px;
 65 |     padding-top: 5px;
 66 |     /*Message gradient background color - remove the line bellow if you don't want a background color or gradient*/
 67 |     background: linear-gradient(to bottom, #171730, #1b263f);
 68 | }
 69 |   
 70 |   /*Adds 2 extra lines at the top and bottom of the message*/
 71 | .message-body:before,
 72 |   .message-body:after {
 73 |     content: "";
 74 |     position: absolute;
 75 |     left: 10px;
 76 |     right: 10px;
 77 |     height: 1px;
 78 |     background-color: rgba(255, 255, 255, 0.13);
 79 | }
 80 | 
 81 | .message-body:before {
 82 |     top: 6px;
 83 | }
 84 | 
 85 | .message-body:after {
 86 |     bottom: 6px;
 87 | }
 88 | 
 89 | .message-body img {
 90 |     max-width: 300px;
 91 |     max-height: 300px;
 92 |     border-radius: 20px;
 93 | }
 94 | 
 95 | .message-body p {
 96 |     margin-bottom: 0 !important;
 97 |     font-size: 18px !important;
 98 |     line-height: 1.428571429 !important;
 99 | }
100 | 
101 | .dark .message-body p em {
102 |     color: rgb(138, 138, 138) !important;
103 | }
104 | 
105 | .message-body p em {
106 |     color: rgb(110, 110, 110) !important;
107 | }
108 | 


--------------------------------------------------------------------------------
/extensions/gallery/script.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | import gradio as gr
 4 | 
 5 | from modules.html_generator import get_image_cache
 6 | from modules.shared import gradio
 7 | 
 8 | 
 9 | def generate_css():
10 |     css = """
11 |       .character-gallery > .gallery {
12 |         margin: 1rem 0;
13 |         display: grid !important;
14 |         grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
15 |         grid-column-gap: 0.4rem;
16 |         grid-row-gap: 1.2rem;
17 |       }
18 | 
19 |       .character-gallery > .label {
20 |         display: none !important;
21 |       }
22 | 
23 |       .character-gallery button.gallery-item {
24 |         display: contents;
25 |       }
26 | 
27 |       .character-container {
28 |         cursor: pointer;
29 |         text-align: center;
30 |         position: relative;
31 |         opacity: 0.85;
32 |       }
33 | 
34 |       .character-container:hover {
35 |         opacity: 1;
36 |       }
37 | 
38 |       .character-container .placeholder, .character-container img {
39 |         width: 150px;
40 |         height: 200px;
41 |         background-color: gray;
42 |         object-fit: cover;
43 |         margin: 0 auto;
44 |         border-radius: 1rem;
45 |         border: 3px solid white;
46 |         box-shadow: 3px 3px 6px 0px rgb(0 0 0 / 50%);
47 |       }
48 | 
49 |       .character-name {
50 |         margin-top: 0.3rem;
51 |         display: block;
52 |         font-size: 1.2rem;
53 |         font-weight: 600;
54 |         overflow-wrap: anywhere;
55 |       }
56 |     """
57 |     return css
58 | 
59 | 
60 | def generate_html():
61 |     cards = []
62 |     # Iterate through files in image folder
63 |     for file in sorted(Path("characters").glob("*")):
64 |         if file.suffix in [".json", ".yml", ".yaml"]:
65 |             character = file.stem
66 |             container_html = '<div class="character-container">'
67 |             image_html = "<div class='placeholder'></div>"
68 | 
69 |             for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
70 |                 if path.exists():
71 |                     image_html = f'<img src="file/{get_image_cache(path)}">'
72 |                     break
73 | 
74 |             container_html += f'{image_html} <span class="character-name">{character}</span>'
75 |             container_html += "</div>"
76 |             cards.append([container_html, character])
77 | 
78 |     return cards
79 | 
80 | 
81 | def select_character(evt: gr.SelectData):
82 |     return (evt.value[1])
83 | 
84 | 
85 | def ui():
86 |     with gr.Accordion("Character gallery", open=False):
87 |         update = gr.Button("Refresh")
88 |         gr.HTML(value="<style>" + generate_css() + "</style>")
89 |         gallery = gr.Dataset(components=[gr.HTML(visible=False)],
90 |                              label="",
91 |                              samples=generate_html(),
92 |                              elem_classes=["character-gallery"],
93 |                              samples_per_page=50
94 |                              )
95 |     update.click(generate_html, [], gallery)
96 |     gallery.select(select_character, None, gradio['character_menu'])
97 | 


--------------------------------------------------------------------------------
/extensions/character_bias/script.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import gradio as gr
 4 | 
 5 | # get the current directory of the script
 6 | current_dir = os.path.dirname(os.path.abspath(__file__))
 7 | 
 8 | # check if the bias_options.txt file exists, if not, create it
 9 | bias_file = os.path.join(current_dir, "bias_options.txt")
10 | if not os.path.isfile(bias_file):
11 |     with open(bias_file, "w") as f:
12 |         f.write("*I am so happy*\n*I am so sad*\n*I am so excited*\n*I am so bored*\n*I am so angry*")
13 | 
14 | # read bias options from the text file
15 | with open(bias_file, "r") as f:
16 |     bias_options = [line.strip() for line in f.readlines()]
17 | 
18 | params = {
19 |     "activate": True,
20 |     "bias string": " *I am so happy*",
21 |     "use custom string": False,
22 | }
23 | 
24 | 
25 | def input_modifier(string):
26 |     """
27 |     This function is applied to your text inputs before
28 |     they are fed into the model.
29 |     """
30 |     return string
31 | 
32 | 
33 | def output_modifier(string):
34 |     """
35 |     This function is applied to the model outputs.
36 |     """
37 |     return string
38 | 
39 | 
40 | def bot_prefix_modifier(string):
41 |     """
42 |     This function is only applied in chat mode. It modifies
43 |     the prefix text for the Bot and can be used to bias its
44 |     behavior.
45 |     """
46 |     if params['activate']:
47 |         if params['use custom string']:
48 |             return f'{string} {params["custom string"].strip()} '
49 |         else:
50 |             return f'{string} {params["bias string"].strip()} '
51 |     else:
52 |         return string
53 | 
54 | 
55 | def ui():
56 |     # Gradio elements
57 |     activate = gr.Checkbox(value=params['activate'], label='Activate character bias')
58 |     dropdown_string = gr.Dropdown(choices=bias_options, value=params["bias string"], label='Character bias', info='To edit the options in this dropdown edit the "bias_options.txt" file')
59 |     use_custom_string = gr.Checkbox(value=False, label='Use custom bias textbox instead of dropdown')
60 |     custom_string = gr.Textbox(value="", placeholder="Enter custom bias string", label="Custom Character Bias", info='To use this textbox activate the checkbox above')
61 | 
62 |     # Event functions to update the parameters in the backend
63 |     def update_bias_string(x):
64 |         if x:
65 |             params.update({"bias string": x})
66 |         else:
67 |             params.update({"bias string": dropdown_string.get()})
68 |         return x
69 | 
70 |     def update_custom_string(x):
71 |         params.update({"custom string": x})
72 | 
73 |     dropdown_string.change(update_bias_string, dropdown_string, None)
74 |     custom_string.change(update_custom_string, custom_string, None)
75 |     activate.change(lambda x: params.update({"activate": x}), activate, None)
76 |     use_custom_string.change(lambda x: params.update({"use custom string": x}), use_custom_string, None)
77 | 
78 |     # Group elements together depending on the selected option
79 |     def bias_string_group():
80 |         if use_custom_string.value:
81 |             return gr.Group([use_custom_string, custom_string])
82 |         else:
83 |             return dropdown_string
84 | 


--------------------------------------------------------------------------------
/css/main.css:
--------------------------------------------------------------------------------
  1 | .tabs.svelte-710i53 {
  2 |     margin-top: 0
  3 | }
  4 | 
  5 | .py-6 {
  6 |     padding-top: 2.5rem
  7 | }
  8 | 
  9 | .refresh-button {
 10 |     max-width: 4.4em;
 11 |     min-width: 2.2em !important;
 12 |     height: 39.594px;
 13 |     align-self: end;
 14 |     line-height: 1em;
 15 |     border-radius: 0.5em;
 16 |     flex: none;
 17 | }
 18 | 
 19 | .refresh-button-small {
 20 |     max-width: 2.2em;
 21 | }
 22 | 
 23 | #slim-column {
 24 |     flex: none !important;
 25 |     min-width: 0 !important;
 26 | }
 27 | 
 28 | .slim-dropdown {
 29 |     background-color: transparent !important;
 30 |     border: none !important;
 31 |     padding: 0 !important;
 32 | }
 33 | 
 34 | #download-label, #upload-label {
 35 |     min-height: 0
 36 | }
 37 | 
 38 | #accordion {
 39 | }
 40 | 
 41 | .dark svg {
 42 |     fill: white;
 43 | }
 44 | 
 45 | .dark a {
 46 |     color: white !important;
 47 | }
 48 | 
 49 | ol li p, ul li p {
 50 |     display: inline-block;
 51 | }
 52 | 
 53 | #main, #parameters, #chat-settings, #interface-mode, #lora, #training-tab, #model-tab {
 54 |     border: 0;
 55 | }
 56 | 
 57 | .gradio-container-3-18-0 .prose * h1, h2, h3, h4 {
 58 |     color: white;
 59 | }
 60 | 
 61 | .gradio-container {
 62 |     max-width: 100% !important;
 63 |     padding-top: 0 !important;
 64 | }
 65 | 
 66 | #extensions {
 67 |     padding: 15px;
 68 |     margin-bottom: 35px;
 69 | }
 70 | 
 71 | .extension-tab {
 72 |     border: 0 !important;
 73 | }
 74 | 
 75 | span.math.inline {
 76 |     font-size: 27px;
 77 |     vertical-align: baseline !important;
 78 | }
 79 | 
 80 | div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * {
 81 |     flex-wrap: nowrap;
 82 | }
 83 | 
 84 | .header_bar {
 85 |     background-color: #f7f7f7;
 86 |     margin-bottom: 30px;
 87 | }
 88 | 
 89 | .dark .header_bar {
 90 |     border: none !important;
 91 |     background-color: #8080802b;
 92 | }
 93 | 
 94 | .textbox_default textarea {
 95 |     height: calc(100vh - 390px);
 96 | }
 97 | 
 98 | .textbox_default_output textarea {
 99 |     height: calc(100vh - 200px);
100 | }
101 | 
102 | .textbox textarea {
103 |     height: calc(100vh - 251px);
104 | }
105 | 
106 | .textbox_default textarea, .textbox_default_output textarea, .textbox textarea {
107 |     font-size: 16px !important;
108 |     color: #46464A !important;
109 | }
110 | 
111 | .dark textarea {
112 |     color: #efefef !important;
113 | }
114 | 
115 | /* Hide the gradio footer*/
116 | footer {
117 |     display: none !important;
118 | }
119 | 
120 | button {
121 |     font-size: 14px !important;
122 | }
123 | 
124 | .small-button {
125 |     max-width: 171px;
126 | }
127 | 
128 | .file-saver {
129 |   position: fixed !important;
130 |   top: 50%;
131 |   left: 50%;
132 |   transform: translate(-50%, -50%); /* center horizontally */
133 |   max-width: 500px;
134 |   background-color: var(--input-background-fill);
135 |   border: 2px solid black !important;
136 |   z-index: 1000;
137 | }
138 | 
139 | .dark .file-saver {
140 |   border: 2px solid white !important;
141 | }
142 | 
143 | .checkboxgroup-table label {
144 |     background: none !important;
145 |     padding: 0 !important;
146 |     border: 0 !important;
147 | }
148 | 
149 | .checkboxgroup-table div {
150 |     display: grid !important;
151 | }


--------------------------------------------------------------------------------
/extensions/google_translate/script.py:
--------------------------------------------------------------------------------
 1 | import gradio as gr
 2 | from deep_translator import GoogleTranslator
 3 | 
 4 | params = {
 5 |     "language string": "ja",
 6 | }
 7 | 
 8 | language_codes = {'Afrikaans': 'af', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy', 'Azerbaijani': 'az', 'Basque': 'eu', 'Belarusian': 'be', 'Bengali': 'bn', 'Bosnian': 'bs', 'Bulgarian': 'bg', 'Catalan': 'ca', 'Cebuano': 'ceb', 'Chinese (Simplified)': 'zh-CN', 'Chinese (Traditional)': 'zh-TW', 'Corsican': 'co', 'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en', 'Esperanto': 'eo', 'Estonian': 'et', 'Finnish': 'fi', 'French': 'fr', 'Frisian': 'fy', 'Galician': 'gl', 'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht', 'Hausa': 'ha', 'Hawaiian': 'haw', 'Hebrew': 'iw', 'Hindi': 'hi', 'Hmong': 'hmn', 'Hungarian': 'hu', 'Icelandic': 'is', 'Igbo': 'ig', 'Indonesian': 'id', 'Irish': 'ga', 'Italian': 'it', 'Japanese': 'ja', 'Javanese': 'jw', 'Kannada': 'kn', 'Kazakh': 'kk', 'Khmer': 'km', 'Korean': 'ko', 'Kurdish': 'ku', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Latin': 'la', 'Latvian': 'lv', 'Lithuanian': 'lt', 'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malagasy': 'mg', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt', 'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Myanmar (Burmese)': 'my', 'Nepali': 'ne', 'Norwegian': 'no', 'Nyanja (Chichewa)': 'ny', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese (Portugal, Brazil)': 'pt', 'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Samoan': 'sm', 'Scots Gaelic': 'gd', 'Serbian': 'sr', 'Sesotho': 'st', 'Shona': 'sn', 'Sindhi': 'sd', 'Sinhala (Sinhalese)': 'si', 'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su', 'Swahili': 'sw', 'Swedish': 'sv', 'Tagalog (Filipino)': 'tl', 'Tajik': 'tg', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th', 'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi', 'Welsh': 'cy', 'Xhosa': 'xh', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'}
 9 | 
10 | 
11 | def input_modifier(string):
12 |     """
13 |     This function is applied to your text inputs before
14 |     they are fed into the model.
15 |     """
16 | 
17 |     return GoogleTranslator(source=params['language string'], target='en').translate(string)
18 | 
19 | 
20 | def output_modifier(string):
21 |     """
22 |     This function is applied to the model outputs.
23 |     """
24 | 
25 |     return GoogleTranslator(source='en', target=params['language string']).translate(string)
26 | 
27 | 
28 | def bot_prefix_modifier(string):
29 |     """
30 |     This function is only applied in chat mode. It modifies
31 |     the prefix text for the Bot and can be used to bias its
32 |     behavior.
33 |     """
34 | 
35 |     return string
36 | 
37 | 
38 | def ui():
39 |     # Finding the language name from the language code to use as the default value
40 |     language_name = list(language_codes.keys())[list(language_codes.values()).index(params['language string'])]
41 | 
42 |     # Gradio elements
43 |     language = gr.Dropdown(value=language_name, choices=[k for k in language_codes], label='Language')
44 | 
45 |     # Event functions to update the parameters in the backend
46 |     language.change(lambda x: params.update({"language string": language_codes[x]}), language, None)
47 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder
 2 | 
 3 | RUN apt-get update && \
 4 |     apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \
 5 |     rm -rf /var/lib/apt/lists/*
 6 | 
 7 | RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa /build
 8 | 
 9 | WORKDIR /build
10 | 
11 | RUN python3 -m venv /build/venv
12 | RUN . /build/venv/bin/activate && \
13 |     pip3 install --upgrade pip setuptools wheel && \
14 |     pip3 install torch torchvision torchaudio && \
15 |     pip3 install -r requirements.txt
16 | 
17 | # https://developer.nvidia.com/cuda-gpus
18 | # for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5"
19 | ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
20 | RUN . /build/venv/bin/activate && \
21 |     python3 setup_cuda.py bdist_wheel -d .
22 | 
23 | FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
24 | 
25 | LABEL maintainer="Your Name <your.email@example.com>"
26 | LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
27 | 
28 | RUN apt-get update && \
29 |     apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \
30 |     rm -rf /var/lib/apt/lists/*
31 | 
32 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv
33 | RUN mkdir /app
34 | 
35 | WORKDIR /app
36 | 
37 | ARG WEBUI_VERSION
38 | RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source"
39 | 
40 | RUN virtualenv /app/venv
41 | RUN . /app/venv/bin/activate && \
42 |     pip3 install --upgrade pip setuptools wheel && \
43 |     pip3 install torch torchvision torchaudio
44 | 
45 | COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
46 | RUN . /app/venv/bin/activate && \
47 |     pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl
48 | 
49 | COPY extensions/api/requirements.txt /app/extensions/api/requirements.txt
50 | COPY extensions/elevenlabs_tts/requirements.txt /app/extensions/elevenlabs_tts/requirements.txt
51 | COPY extensions/google_translate/requirements.txt /app/extensions/google_translate/requirements.txt
52 | COPY extensions/silero_tts/requirements.txt /app/extensions/silero_tts/requirements.txt
53 | COPY extensions/whisper_stt/requirements.txt /app/extensions/whisper_stt/requirements.txt
54 | RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/api && pip3 install -r requirements.txt
55 | RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/elevenlabs_tts && pip3 install -r requirements.txt
56 | RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/google_translate && pip3 install -r requirements.txt
57 | RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/silero_tts && pip3 install -r requirements.txt
58 | RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/whisper_stt && pip3 install -r requirements.txt
59 | 
60 | COPY requirements.txt /app/requirements.txt
61 | RUN . /app/venv/bin/activate && \
62 |     pip3 install -r requirements.txt
63 | 
64 | RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
65 | 
66 | COPY . /app/
67 | ENV CLI_ARGS=""
68 | CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS}
69 | 


--------------------------------------------------------------------------------
/extensions/silero_tts/test_tts.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from pathlib import Path
 3 | 
 4 | import torch
 5 | import tts_preprocessor
 6 | 
 7 | torch._C._jit_set_profiling_mode(False)
 8 | 
 9 | 
10 | params = {
11 |     'activate': True,
12 |     'speaker': 'en_49',
13 |     'language': 'en',
14 |     'model_id': 'v3_en',
15 |     'sample_rate': 48000,
16 |     'device': 'cpu',
17 |     'show_text': True,
18 |     'autoplay': True,
19 |     'voice_pitch': 'medium',
20 |     'voice_speed': 'medium',
21 | }
22 | 
23 | current_params = params.copy()
24 | voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
25 | voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
26 | voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
27 | 
28 | # Used for making text xml compatible, needed for voice pitch and speed control
29 | table = str.maketrans({
30 |     "<": "&lt;",
31 |     ">": "&gt;",
32 |     "&": "&amp;",
33 |     "'": "&apos;",
34 |     '"': "&quot;",
35 | })
36 | 
37 | 
38 | def xmlesc(txt):
39 |     return txt.translate(table)
40 | 
41 | 
42 | def load_model():
43 |     model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id'])
44 |     model.to(params['device'])
45 |     return model
46 | 
47 | 
48 | model = load_model()
49 | 
50 | 
51 | def output_modifier(string):
52 |     """
53 |     This function is applied to the model outputs.
54 |     """
55 | 
56 |     global model, current_params
57 | 
58 |     original_string = string
59 |     string = tts_preprocessor.preprocess(string)
60 |     processed_string = string
61 | 
62 |     if string == '':
63 |         string = '*Empty reply, try regenerating*'
64 |     else:
65 |         output_file = Path(f'extensions/silero_tts/outputs/test_{int(time.time())}.wav')
66 |         prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
67 |         silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
68 |         model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
69 | 
70 |         autoplay = 'autoplay' if params['autoplay'] else ''
71 |         string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
72 | 
73 |         if params['show_text']:
74 |             string += f'\n\n{original_string}\n\nProcessed:\n{processed_string}'
75 | 
76 |     print(string)
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     import sys
81 |     output_modifier(sys.argv[1])
82 | 


--------------------------------------------------------------------------------
/docs/WSL-installation-guide.md:
--------------------------------------------------------------------------------
 1 | Guide created by [@jfryton](https://github.com/jfryton). Thank you jfryton.
 2 | 
 3 | -----
 4 | 
 5 | Here's an easy-to-follow, step-by-step guide for installing Windows Subsystem for Linux (WSL) with Ubuntu on Windows 10/11:
 6 | 
 7 | ## Step 1: Enable WSL
 8 | 
 9 | 1. Press the Windows key + X and click on "Windows PowerShell (Admin)" or "Windows Terminal (Admin)" to open PowerShell or Terminal with administrator privileges.
10 | 2. In the PowerShell window, type the following command and press Enter:
11 | 
12 | ```
13 | wsl --install
14 | ```
15 | 
16 | If this command doesn't work, you can enable WSL with the following command for Windows 10:
17 | 
18 | ```
19 | wsl --set-default-version 1
20 | ```
21 | 
22 | For Windows 11, you can use:
23 | 
24 | ```
25 | wsl --set-default-version 2
26 | ```
27 | 
28 | You may be prompted to restart your computer. If so, save your work and restart.
29 | 
30 | ## Step 2: Install Ubuntu
31 | 
32 | 1. Open the Microsoft Store.
33 | 2. Search for "Ubuntu" in the search bar.
34 | 3. Choose the desired Ubuntu version (e.g., Ubuntu 20.04 LTS) and click "Get" or "Install" to download and install the Ubuntu app.
35 | 4. Once the installation is complete, click "Launch" or search for "Ubuntu" in the Start menu and open the app.
36 | 
37 | ## Step 3: Set up Ubuntu
38 | 
39 | 1. When you first launch the Ubuntu app, it will take a few minutes to set up. Be patient as it installs the necessary files and sets up your environment.
40 | 2. Once the setup is complete, you will be prompted to create a new UNIX username and password. Choose a username and password, and make sure to remember them, as you will need them for future administrative tasks within the Ubuntu environment.
41 | 
42 | ## Step 4: Update and upgrade packages
43 | 
44 | 1. After setting up your username and password, it's a good idea to update and upgrade your Ubuntu system. Run the following commands in the Ubuntu terminal:
45 | 
46 | ```
47 | sudo apt update
48 | sudo apt upgrade
49 | ```
50 | 
51 | 2. Enter your password when prompted. This will update the package list and upgrade any outdated packages.
52 | 
53 | Congratulations! You have now installed WSL with Ubuntu on your Windows 10/11 system. You can use the Ubuntu terminal for various tasks, like running Linux commands, installing packages, or managing files.
54 | 
55 | You can launch your WSL Ubuntu installation by selecting the Ubuntu app (like any other program installed on your computer) or typing 'ubuntu' into Powershell or Terminal.
56 | 
57 | ## Step 5: Proceed with Linux instructions
58 | 
59 | 1. You can now follow the Linux setup instructions. If you receive any error messages about a missing tool or package, just install them using apt:
60 | 
61 | ```
62 | sudo apt install [missing package]
63 | ```
64 | 
65 | You will probably need to install build-essential
66 | 
67 | ```
68 | sudo apt install build-essential
69 | ```
70 | 
71 | If you face any issues or need to troubleshoot, you can always refer to the official Microsoft documentation for WSL: https://docs.microsoft.com/en-us/windows/wsl/
72 | 
73 | #### WSL2 performance using /mnt: 
74 | when you git clone a repository, put it inside WSL and not outside. To understand more, take a look at this [issue](https://github.com/microsoft/WSL/issues/4197#issuecomment-604592340)
75 | 
76 | ## Bonus: Port Forwarding
77 | 
78 | By default, you won't be able to access the webui from another device on your local network. You will need to setup the appropriate port forwarding using the following command (using PowerShell or Terminal with administrator privileges). 
79 | 
80 | ```
81 | netsh interface portproxy add v4tov4 listenaddress=0.0.0.0 listenport=7860 connectaddress=localhost connectport=7860
82 | ```
83 | 


--------------------------------------------------------------------------------
/api-examples/api-example-chat-stream.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import json
  3 | import sys
  4 | 
  5 | try:
  6 |     import websockets
  7 | except ImportError:
  8 |     print("Websockets package not found. Make sure it's installed.")
  9 | 
 10 | # For local streaming, the websockets are hosted without ssl - ws://
 11 | HOST = 'localhost:5005'
 12 | URI = f'ws://{HOST}/api/v1/chat-stream'
 13 | 
 14 | # For reverse-proxied streaming, the remote will likely host with ssl - wss://
 15 | # URI = 'wss://your-uri-here.trycloudflare.com/api/v1/stream'
 16 | 
 17 | 
 18 | async def run(user_input, history):
 19 |     # Note: the selected defaults change from time to time.
 20 |     request = {
 21 |         'user_input': user_input,
 22 |         'max_new_tokens': 250,
 23 |         'history': history,
 24 |         'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
 25 |         'character': 'Example',
 26 |         'instruction_template': 'Vicuna-v1.1',
 27 |         'your_name': 'You',
 28 | 
 29 |         'regenerate': False,
 30 |         '_continue': False,
 31 |         'stop_at_newline': False,
 32 |         'chat_generation_attempts': 1,
 33 |         'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
 34 | 
 35 |         # Generation params. If 'preset' is set to different than 'None', the values
 36 |         # in presets/preset-name.yaml are used instead of the individual numbers.
 37 |         'preset': 'None',  
 38 |         'do_sample': True,
 39 |         'temperature': 0.7,
 40 |         'top_p': 0.1,
 41 |         'typical_p': 1,
 42 |         'epsilon_cutoff': 0,  # In units of 1e-4
 43 |         'eta_cutoff': 0,  # In units of 1e-4
 44 |         'tfs': 1,
 45 |         'top_a': 0,
 46 |         'repetition_penalty': 1.18,
 47 |         'top_k': 40,
 48 |         'min_length': 0,
 49 |         'no_repeat_ngram_size': 0,
 50 |         'num_beams': 1,
 51 |         'penalty_alpha': 0,
 52 |         'length_penalty': 1,
 53 |         'early_stopping': False,
 54 |         'mirostat_mode': 0,
 55 |         'mirostat_tau': 5,
 56 |         'mirostat_eta': 0.1,
 57 | 
 58 |         'seed': -1,
 59 |         'add_bos_token': True,
 60 |         'truncation_length': 2048,
 61 |         'ban_eos_token': False,
 62 |         'skip_special_tokens': True,
 63 |         'stopping_strings': []
 64 |     }
 65 | 
 66 |     async with websockets.connect(URI, ping_interval=None) as websocket:
 67 |         await websocket.send(json.dumps(request))
 68 | 
 69 |         while True:
 70 |             incoming_data = await websocket.recv()
 71 |             incoming_data = json.loads(incoming_data)
 72 | 
 73 |             match incoming_data['event']:
 74 |                 case 'text_stream':
 75 |                     yield incoming_data['history']
 76 |                 case 'stream_end':
 77 |                     return
 78 | 
 79 | 
 80 | async def print_response_stream(user_input, history):
 81 |     cur_len = 0
 82 |     async for new_history in run(user_input, history):
 83 |         cur_message = new_history['visible'][-1][1][cur_len:]
 84 |         cur_len += len(cur_message)
 85 |         print(cur_message, end='')
 86 |         sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
 87 | 
 88 | 
 89 | if __name__ == '__main__':
 90 |     user_input = "Please give me a step-by-step guide on how to plant a tree in my backyard."
 91 | 
 92 |     # Basic example
 93 |     history = {'internal': [], 'visible': []}
 94 | 
 95 |     # "Continue" example. Make sure to set '_continue' to True above
 96 |     # arr = [user_input, 'Surely, here is']
 97 |     # history = {'internal': [arr], 'visible': [arr]}
 98 | 
 99 |     asyncio.run(print_response_stream(user_input, history))
100 | 


--------------------------------------------------------------------------------
/docs/Spell-book.md:
--------------------------------------------------------------------------------
  1 | You have now entered a hidden corner of the internet.
  2 | 
  3 | A confusing yet intriguing realm of paradoxes and contradictions.
  4 | 
  5 | A place where you will find out that what you thought you knew, you in fact didn't know, and what you didn't know was in front of you all along.
  6 | 
  7 | ![](https://i.pinimg.com/originals/6e/e2/7b/6ee27bad351d3aca470d80f1033ba9c6.jpg)
  8 | 
  9 | *In other words, here I will document little-known facts about this web UI that I could not find another place for in the wiki.*
 10 | 
 11 | #### You can train LoRAs in CPU mode
 12 | 
 13 | Load the web UI with
 14 | 
 15 | ```
 16 | python server.py --cpu
 17 | ```
 18 | 
 19 | and start training the LoRA from the training tab as usual.
 20 | 
 21 | #### 8-bit mode works with CPU offloading
 22 | 
 23 | ```
 24 | python server.py --load-in-8bit --gpu-memory 4000MiB
 25 | ```
 26 | 
 27 | #### `--pre_layer`, and not `--gpu-memory`, is the right way to do CPU offloading with 4-bit models
 28 | 
 29 | ```
 30 | python server.py --wbits 4 --groupsize 128 --pre_layer 20
 31 | ```
 32 | 
 33 | #### Models can be loaded in 32-bit, 16-bit, 8-bit, and 4-bit modes
 34 | 
 35 | ```
 36 | python server.py --cpu
 37 | python server.py
 38 | python server.py --load-in-8bit
 39 | python server.py --wbits 4
 40 | ```
 41 | 
 42 | #### The web UI works with any version of GPTQ-for-LLaMa
 43 | 
 44 | Including the up to date triton and cuda branches. But you have to delete the `repositories/GPTQ-for-LLaMa` folder and reinstall the new one every time:
 45 | 
 46 | ```
 47 | cd text-generation-webui/repositories
 48 | rm -r GPTQ-for-LLaMa
 49 | pip uninstall quant-cuda
 50 | git clone https://github.com/oobabooga/GPTQ-for-LLaMa -b cuda # or any other repository and branch
 51 | cd GPTQ-for-LLaMa
 52 | python setup_cuda.py install
 53 | ```
 54 | 
 55 | #### Instruction-following templates are represented as chat characters
 56 | 
 57 | https://github.com/oobabooga/text-generation-webui/tree/main/characters/instruction-following
 58 | 
 59 | #### The right way to run Alpaca, Open Assistant, Vicuna, etc is Instruct mode, not normal chat mode
 60 | 
 61 | Otherwise the prompt will not be formatted correctly.
 62 | 
 63 | 1. Start the web UI with
 64 | 
 65 | ```
 66 | python server.py --chat
 67 | ```
 68 | 
 69 | 2. Click on the "instruct" option under "Chat modes"
 70 | 
 71 | 3. Select the correct template in the hidden dropdown menu that will become visible. 
 72 | 
 73 | #### Notebook mode is best mode
 74 | 
 75 | Ascended individuals have realized that notebook mode is the superset of chat mode and can do chats with ultimate flexibility, including group chats, editing replies, starting a new bot reply in a given way, and impersonating.
 76 | 
 77 | #### RWKV is a RNN 
 78 | 
 79 | Most models are transformers, but not RWKV, which is a RNN. It's a great model.
 80 | 
 81 | #### `--gpu-memory` is not a hard limit on the GPU memory
 82 | 
 83 | It is simply a parameter that is passed to the `accelerate` library while loading the model. More memory will be allocated during generation. That's why this parameter has to be set to less than your total GPU memory.
 84 | 
 85 | #### Contrastive search perhaps the best preset
 86 | 
 87 | But it uses a ton of VRAM.
 88 | 
 89 | #### You can check the sha256sum of downloaded models with the download script
 90 | 
 91 | ```
 92 | python download-model.py facebook/galactica-125m --check
 93 | ```
 94 | 
 95 | #### The download script continues interrupted downloads by default
 96 | 
 97 | It doesn't start over.
 98 | 
 99 | #### You can download models with multiple threads
100 | 
101 | ```
102 | python download-model.py facebook/galactica-125m --threads 8
103 | ```
104 | 
105 | #### LoRAs work in 4-bit mode
106 | 
107 | You need to follow [these instructions](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode) and then start the web UI with the `--monkey-patch` flag.
108 | 


--------------------------------------------------------------------------------
/modules/llamacpp_model.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Based on
  3 | https://github.com/abetlen/llama-cpp-python
  4 | 
  5 | Documentation:
  6 | https://abetlen.github.io/llama-cpp-python/
  7 | '''
  8 | 
  9 | import re
 10 | from functools import partial
 11 | 
 12 | from llama_cpp import Llama, LlamaCache, LogitsProcessorList
 13 | 
 14 | from modules import shared
 15 | from modules.callbacks import Iteratorize
 16 | from modules.logging_colors import logger
 17 | 
 18 | 
 19 | def ban_eos_logits_processor(eos_token, input_ids, logits):
 20 |     logits[eos_token] = -float('inf')
 21 |     return logits
 22 | 
 23 | 
 24 | class LlamaCppModel:
 25 |     def __init__(self):
 26 |         self.initialized = False
 27 | 
 28 |     def __del__(self):
 29 |         self.model.__del__()
 30 | 
 31 |     @classmethod
 32 |     def from_pretrained(self, path):
 33 |         result = self()
 34 |         cache_capacity = 0
 35 |         if shared.args.cache_capacity is not None:
 36 |             if 'GiB' in shared.args.cache_capacity:
 37 |                 cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000 * 1000
 38 |             elif 'MiB' in shared.args.cache_capacity:
 39 |                 cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000
 40 |             else:
 41 |                 cache_capacity = int(shared.args.cache_capacity)
 42 | 
 43 |         logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
 44 |         params = {
 45 |             'model_path': str(path),
 46 |             'n_ctx': shared.args.n_ctx,
 47 |             'seed': int(shared.args.llama_cpp_seed),
 48 |             'n_threads': shared.args.threads or None,
 49 |             'n_batch': shared.args.n_batch,
 50 |             'use_mmap': not shared.args.no_mmap,
 51 |             'use_mlock': shared.args.mlock,
 52 |             'n_gpu_layers': shared.args.n_gpu_layers
 53 |         }
 54 | 
 55 |         result.model = Llama(**params)
 56 |         if cache_capacity > 0:
 57 |             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
 58 | 
 59 |         # This is ugly, but the model and the tokenizer are the same object in this library.
 60 |         return result, result
 61 | 
 62 |     def encode(self, string):
 63 |         if type(string) is str:
 64 |             string = string.encode()
 65 | 
 66 |         return self.model.tokenize(string)
 67 | 
 68 |     def generate(self, prompt, state, callback=None):
 69 |         prompt = prompt if type(prompt) is str else prompt.decode()
 70 |         completion_chunks = self.model.create_completion(
 71 |             prompt=prompt,
 72 |             max_tokens=state['max_new_tokens'],
 73 |             temperature=state['temperature'],
 74 |             top_p=state['top_p'],
 75 |             top_k=state['top_k'],
 76 |             repeat_penalty=state['repetition_penalty'],
 77 |             tfs_z=state['tfs'],
 78 |             mirostat_mode=int(state['mirostat_mode']),
 79 |             mirostat_tau=state['mirostat_tau'],
 80 |             mirostat_eta=state['mirostat_eta'],
 81 |             stream=True,
 82 |             logits_processor=LogitsProcessorList([
 83 |                 partial(ban_eos_logits_processor, self.model.token_eos()),
 84 |             ]) if state['ban_eos_token'] else None,
 85 |         )
 86 | 
 87 |         output = ""
 88 |         for completion_chunk in completion_chunks:
 89 |             text = completion_chunk['choices'][0]['text']
 90 |             output += text
 91 |             if callback:
 92 |                 callback(text)
 93 | 
 94 |         return output
 95 | 
 96 |     def generate_with_streaming(self, *args, **kwargs):
 97 |         with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
 98 |             reply = ''
 99 |             for token in generator:
100 |                 reply += token
101 |                 yield reply
102 | 


--------------------------------------------------------------------------------
/extensions/api/streaming_api.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import json
  3 | from threading import Thread
  4 | 
  5 | from websockets.server import serve
  6 | 
  7 | from extensions.api.util import build_parameters, try_start_cloudflared
  8 | from modules import shared
  9 | from modules.chat import generate_chat_reply
 10 | from modules.text_generation import generate_reply
 11 | 
 12 | PATH = '/api/v1/stream'
 13 | 
 14 | 
 15 | async def _handle_connection(websocket, path):
 16 | 
 17 |     if path == '/api/v1/stream':
 18 |         async for message in websocket:
 19 |             message = json.loads(message)
 20 | 
 21 |             prompt = message['prompt']
 22 |             generate_params = build_parameters(message)
 23 |             stopping_strings = generate_params.pop('stopping_strings')
 24 |             generate_params['stream'] = True
 25 | 
 26 |             generator = generate_reply(
 27 |                 prompt, generate_params, stopping_strings=stopping_strings, is_chat=False)
 28 | 
 29 |             # As we stream, only send the new bytes.
 30 |             skip_index = 0
 31 |             message_num = 0
 32 | 
 33 |             for a in generator:
 34 |                 to_send = a[skip_index:]
 35 |                 if to_send is None or chr(0xfffd) in to_send:  # partial unicode character, don't send it yet.
 36 |                     continue
 37 | 
 38 |                 await websocket.send(json.dumps({
 39 |                     'event': 'text_stream',
 40 |                     'message_num': message_num,
 41 |                     'text': to_send
 42 |                 }))
 43 | 
 44 |                 await asyncio.sleep(0)
 45 |                 skip_index += len(to_send)
 46 |                 message_num += 1
 47 | 
 48 |             await websocket.send(json.dumps({
 49 |                 'event': 'stream_end',
 50 |                 'message_num': message_num
 51 |             }))
 52 | 
 53 |     elif path == '/api/v1/chat-stream':
 54 |         async for message in websocket:
 55 |             body = json.loads(message)
 56 | 
 57 |             user_input = body['user_input']
 58 |             history = body['history']
 59 |             generate_params = build_parameters(body, chat=True)
 60 |             generate_params['stream'] = True
 61 |             regenerate = body.get('regenerate', False)
 62 |             _continue = body.get('_continue', False)
 63 | 
 64 |             generator = generate_chat_reply(
 65 |                 user_input, history, generate_params, regenerate=regenerate, _continue=_continue, loading_message=False)
 66 | 
 67 |             message_num = 0
 68 |             for a in generator:
 69 |                 await websocket.send(json.dumps({
 70 |                     'event': 'text_stream',
 71 |                     'message_num': message_num,
 72 |                     'history': a
 73 |                 }))
 74 | 
 75 |                 await asyncio.sleep(0)
 76 |                 message_num += 1
 77 | 
 78 |             await websocket.send(json.dumps({
 79 |                 'event': 'stream_end',
 80 |                 'message_num': message_num
 81 |             }))
 82 | 
 83 |     else:
 84 |         print(f'Streaming api: unknown path: {path}')
 85 |         return
 86 | 
 87 | 
 88 | async def _run(host: str, port: int):
 89 |     async with serve(_handle_connection, host, port, ping_interval=None):
 90 |         await asyncio.Future()  # run forever
 91 | 
 92 | 
 93 | def _run_server(port: int, share: bool = False):
 94 |     address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
 95 | 
 96 |     def on_start(public_url: str):
 97 |         public_url = public_url.replace('https://', 'wss://')
 98 |         print(f'Starting streaming server at public url {public_url}{PATH}')
 99 | 
100 |     if share:
101 |         try:
102 |             try_start_cloudflared(port, max_attempts=3, on_start=on_start)
103 |         except Exception as e:
104 |             print(e)
105 |     else:
106 |         print(f'Starting streaming server at ws://{address}:{port}{PATH}')
107 | 
108 |     asyncio.run(_run(host=address, port=port))
109 | 
110 | 
111 | def start_server(port: int, share: bool = False):
112 |     Thread(target=_run_server, args=[port, share], daemon=True).start()
113 | 


--------------------------------------------------------------------------------
/extensions/multimodal/script.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import re
  3 | import time
  4 | from functools import partial
  5 | from io import BytesIO
  6 | 
  7 | import gradio as gr
  8 | import torch
  9 | 
 10 | from extensions.multimodal.multimodal_embedder import MultimodalEmbedder
 11 | from modules import shared
 12 | from modules.logging_colors import logger
 13 | 
 14 | params = {
 15 |     "add_all_images_to_prompt": False,
 16 |     # device to run vision encoder on
 17 |     "vision_device": None,
 18 |     # bits to load vision encoder in, either 16 or 32
 19 |     "vision_bits": 32,
 20 |     # device to run multimodal projector on
 21 |     "projector_device": None,
 22 |     # multimodal projector bits, either 32 or 16
 23 |     "projector_bits": 32
 24 | }
 25 | 
 26 | 
 27 | # If 'state' is True, will hijack the next chat generation
 28 | input_hijack = {
 29 |     'state': False,
 30 |     'value': ["", ""]
 31 | }
 32 | 
 33 | 
 34 | # initialized in ui, so that params are loaded from settings
 35 | multimodal_embedder: MultimodalEmbedder = None
 36 | 
 37 | 
 38 | def add_chat_picture(picture, text, visible_text):
 39 |     # resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable)
 40 |     max_hw, min_hw = max(picture.size), min(picture.size)
 41 |     aspect_ratio = max_hw / min_hw
 42 |     shortest_edge = int(max(300 / aspect_ratio, 224))
 43 |     longest_edge = int(shortest_edge * aspect_ratio)
 44 |     w = shortest_edge if picture.width < picture.height else longest_edge
 45 |     h = shortest_edge if picture.width >= picture.height else longest_edge
 46 |     picture = picture.resize((w, h))
 47 | 
 48 |     buffer = BytesIO()
 49 |     picture.save(buffer, format="JPEG")
 50 |     img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
 51 |     image = f'<img src="data:image/jpeg;base64,{img_str}">'
 52 | 
 53 |     if '<image>' in text:
 54 |         text = text.replace('<image>', image)
 55 |     else:
 56 |         text = text + '\n' + image
 57 | 
 58 |     if visible_text == '' or visible_text is None:
 59 |         visible_text = text
 60 |     elif '<image>' in visible_text:
 61 |         visible_text = visible_text.replace('<image>', image)
 62 |     else:
 63 |         visible_text = visible_text + '\n' + image
 64 | 
 65 |     return text, visible_text
 66 | 
 67 | 
 68 | def custom_tokenized_length(prompt):
 69 |     return multimodal_embedder.len_in_tokens(prompt)
 70 | 
 71 | 
 72 | def tokenizer_modifier(state, prompt, input_ids, input_embeds):
 73 |     global params
 74 |     start_ts = time.time()
 75 |     image_match = re.search(r'<img src="data:image/jpeg;base64,[A-Za-z0-9+/=]+">', prompt)
 76 | 
 77 |     if image_match is None:
 78 |         return prompt, input_ids, input_embeds
 79 | 
 80 |     prompt, input_ids, input_embeds, total_embedded = multimodal_embedder.forward(prompt, state, params)
 81 |     logger.info(f'Embedded {total_embedded} image(s) in {time.time()-start_ts:.2f}s')
 82 |     return (prompt,
 83 |             input_ids.unsqueeze(0).to(shared.model.device, dtype=torch.int64),
 84 |             input_embeds.unsqueeze(0).to(shared.model.device, dtype=shared.model.dtype))
 85 | 
 86 | 
 87 | def ui():
 88 |     global multimodal_embedder
 89 |     multimodal_embedder = MultimodalEmbedder(params)
 90 |     with gr.Column():
 91 |         picture_select = gr.Image(label='Send a picture', type='pil')
 92 |         # The models don't seem to deal well with multiple images
 93 |         single_image_checkbox = gr.Checkbox(False, label='Embed all images, not only the last one')
 94 |     # Prepare the input hijack
 95 |     picture_select.upload(
 96 |         lambda picture: input_hijack.update({"state": True, "value": partial(add_chat_picture, picture)}),
 97 |         [picture_select],
 98 |         None
 99 |     )
100 |     picture_select.clear(lambda: input_hijack.update({"state": False, "value": ["", ""]}), None, None)
101 |     single_image_checkbox.change(lambda x: params.update({"add_all_images_to_prompt": x}), single_image_checkbox, None)
102 |     shared.gradio['Generate'].click(lambda: None, None, picture_select)
103 |     shared.gradio['textbox'].submit(lambda: None, None, picture_select)
104 | 


--------------------------------------------------------------------------------
/modules/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from datetime import datetime
  4 | from pathlib import Path
  5 | 
  6 | from modules import shared
  7 | from modules.logging_colors import logger
  8 | 
  9 | 
 10 | def save_file(fname, contents):
 11 |     if fname == '':
 12 |         logger.error('File name is empty!')
 13 |         return
 14 | 
 15 |     root_folder = Path(__file__).resolve().parent.parent
 16 |     abs_path = Path(fname).resolve()
 17 |     rel_path = abs_path.relative_to(root_folder)
 18 |     if rel_path.parts[0] == '..':
 19 |         logger.error(f'Invalid file path: {fname}')
 20 |         return
 21 | 
 22 |     with open(abs_path, 'w', encoding='utf-8') as f:
 23 |         f.write(contents)
 24 | 
 25 |     logger.info(f'Saved {abs_path}.')
 26 | 
 27 | 
 28 | def delete_file(fname):
 29 |     if fname == '':
 30 |         logger.error('File name is empty!')
 31 |         return
 32 | 
 33 |     root_folder = Path(__file__).resolve().parent.parent
 34 |     abs_path = Path(fname).resolve()
 35 |     rel_path = abs_path.relative_to(root_folder)
 36 |     if rel_path.parts[0] == '..':
 37 |         logger.error(f'Invalid file path: {fname}')
 38 |         return
 39 | 
 40 |     if abs_path.exists():
 41 |         abs_path.unlink()
 42 |         logger.info(f'Deleted {fname}.')
 43 | 
 44 | 
 45 | def current_time():
 46 |     return f"{datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
 47 | 
 48 | 
 49 | def atoi(text):
 50 |     return int(text) if text.isdigit() else text.lower()
 51 | 
 52 | 
 53 | # Replace multiple string pairs in a string
 54 | def replace_all(text, dic):
 55 |     for i, j in dic.items():
 56 |         text = text.replace(i, j)
 57 | 
 58 |     return text
 59 | 
 60 | 
 61 | def natural_keys(text):
 62 |     return [atoi(c) for c in re.split(r'(\d+)', text)]
 63 | 
 64 | 
 65 | def get_available_models():
 66 |     if shared.args.flexgen:
 67 |         return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=natural_keys)
 68 |     else:
 69 |         return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)
 70 | 
 71 | 
 72 | def get_available_presets():
 73 |     return sorted(set((k.stem for k in Path('presets').glob('*.yaml'))), key=natural_keys)
 74 | 
 75 | 
 76 | def get_available_prompts():
 77 |     prompts = []
 78 |     files = set((k.stem for k in Path('prompts').glob('*.txt')))
 79 |     prompts += sorted([k for k in files if re.match('^[0-9]', k)], key=natural_keys, reverse=True)
 80 |     prompts += sorted([k for k in files if re.match('^[^0-9]', k)], key=natural_keys)
 81 |     prompts += ['Instruct-' + k for k in get_available_instruction_templates() if k != 'None']
 82 |     prompts += ['None']
 83 |     return prompts
 84 | 
 85 | 
 86 | def get_available_characters():
 87 |     paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
 88 |     return ['None'] + sorted(set((k.stem for k in paths if k.stem != "instruction-following")), key=natural_keys)
 89 | 
 90 | 
 91 | def get_available_instruction_templates():
 92 |     path = "characters/instruction-following"
 93 |     paths = []
 94 |     if os.path.exists(path):
 95 |         paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
 96 | 
 97 |     return ['None'] + sorted(set((k.stem for k in paths)), key=natural_keys)
 98 | 
 99 | 
100 | def get_available_extensions():
101 |     return sorted(set(map(lambda x: x.parts[1], Path('extensions').glob('*/script.py'))), key=natural_keys)
102 | 
103 | 
104 | def get_available_loras():
105 |     return sorted([item.name for item in list(Path(shared.args.lora_dir).glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=natural_keys)
106 | 
107 | 
108 | def get_datasets(path: str, ext: str):
109 |     return ['None'] + sorted(set([k.stem for k in Path(path).glob(f'*.{ext}') if k.stem != 'put-trainer-datasets-here']), key=natural_keys)
110 | 
111 | 
112 | def get_available_chat_styles():
113 |     return sorted(set(('-'.join(k.stem.split('-')[1:]) for k in Path('css').glob('chat_style*.css'))), key=natural_keys)
114 | 


--------------------------------------------------------------------------------
/modules/ui.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | 
  3 | import gradio as gr
  4 | import torch
  5 | 
  6 | from modules import shared
  7 | 
  8 | with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f:
  9 |     css = f.read()
 10 | with open(Path(__file__).resolve().parent / '../css/chat.css', 'r') as f:
 11 |     chat_css = f.read()
 12 | with open(Path(__file__).resolve().parent / '../css/main.js', 'r') as f:
 13 |     main_js = f.read()
 14 | with open(Path(__file__).resolve().parent / '../css/chat.js', 'r') as f:
 15 |     chat_js = f.read()
 16 | 
 17 | refresh_symbol = '\U0001f504'  # 🔄
 18 | delete_symbol = '🗑️'
 19 | save_symbol = '💾'
 20 | 
 21 | theme = gr.themes.Default(
 22 |     font=['Helvetica', 'ui-sans-serif', 'system-ui', 'sans-serif'],
 23 |     font_mono=['IBM Plex Mono', 'ui-monospace', 'Consolas', 'monospace'],
 24 | ).set(
 25 |     border_color_primary='#c5c5d2',
 26 |     button_large_padding='6px 12px',
 27 |     body_text_color_subdued='#484848',
 28 |     background_fill_secondary='#eaeaea'
 29 | )
 30 | 
 31 | 
 32 | def list_model_elements():
 33 |     elements = ['loader', 'cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed', 'gpu_split', 'max_seq_len', 'compress_pos_emb']
 34 |     for i in range(torch.cuda.device_count()):
 35 |         elements.append(f'gpu_memory_{i}')
 36 | 
 37 |     return elements
 38 | 
 39 | 
 40 | def list_interface_input_elements(chat=False):
 41 |     elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream', 'tfs', 'top_a']
 42 |     if chat:
 43 |         elements += ['name1', 'name2', 'greeting', 'context', 'chat_generation_attempts', 'stop_at_newline', 'mode', 'instruction_template', 'character_menu', 'name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template', 'chat_style', 'chat-instruct_command']
 44 | 
 45 |     elements += list_model_elements()
 46 |     return elements
 47 | 
 48 | 
 49 | def gather_interface_values(*args):
 50 |     output = {}
 51 |     for i, element in enumerate(shared.input_elements):
 52 |         output[element] = args[i]
 53 | 
 54 |     shared.persistent_interface_state = output
 55 |     return output
 56 | 
 57 | 
 58 | def apply_interface_values(state, use_persistent=False):
 59 |     if use_persistent:
 60 |         state = shared.persistent_interface_state
 61 | 
 62 |     elements = list_interface_input_elements(chat=shared.is_chat())
 63 |     if len(state) == 0:
 64 |         return [gr.update() for k in elements]  # Dummy, do nothing
 65 |     else:
 66 |         return [state[k] if k in state else gr.update() for k in elements]
 67 | 
 68 | 
 69 | class ToolButton(gr.Button, gr.components.FormComponent):
 70 |     """Small button with single emoji as text, fits inside gradio forms"""
 71 | 
 72 |     def __init__(self, **kwargs):
 73 |         super().__init__(**kwargs)
 74 | 
 75 |     def get_block_name(self):
 76 |         return "button"
 77 | 
 78 | 
 79 | def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_class):
 80 |     def refresh():
 81 |         refresh_method()
 82 |         args = refreshed_args() if callable(refreshed_args) else refreshed_args
 83 | 
 84 |         for k, v in args.items():
 85 |             setattr(refresh_component, k, v)
 86 | 
 87 |         return gr.update(**(args or {}))
 88 | 
 89 |     refresh_button = ToolButton(value=refresh_symbol, elem_classes=elem_class)
 90 |     refresh_button.click(
 91 |         fn=refresh,
 92 |         inputs=[],
 93 |         outputs=[refresh_component]
 94 |     )
 95 |     return refresh_button
 96 | 
 97 | 
 98 | def create_delete_button(**kwargs):
 99 |     return ToolButton(value=delete_symbol, **kwargs)
100 | 
101 | 
102 | def create_save_button(**kwargs):
103 |     return ToolButton(value=save_symbol, **kwargs)
104 | 


--------------------------------------------------------------------------------
/extensions/sd_api_pictures/README.MD:
--------------------------------------------------------------------------------
 1 | ## Description:
 2 | TL;DR: Lets the bot answer you with a picture!  
 3 | 
 4 | Stable Diffusion API pictures for TextGen, v.1.2.0  
 5 | An extension to [oobabooga's textgen-webui](https://github.com/oobabooga/text-generation-webui) allowing you to receive pics generated by [Automatic1111's SD-WebUI API](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
 6 | 
 7 | <details>
 8 | <summary>Interface overview</summary>
 9 | 
10 | ![Interface](https://raw.githubusercontent.com/Brawlence/SD_api_pics/main/illust/Interface.jpg)
11 | 
12 | </details>
13 | 
14 | Load it in the `--chat` mode with `--extension sd_api_pictures` alongside `send_pictures`
15 | (it's not really required, but completes the picture, *pun intended*).  
16 | 
17 | 
18 | ## History
19 | 
20 | Consider the version included with [oobabooga's repository](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures) to be STABLE, experimental developments and untested features are pushed in [Brawlence/SD_api_pics](https://github.com/Brawlence/SD_api_pics)
21 | 
22 | Lastest change:  
23 | 1.1.0 → 1.1.1 Fixed not having Auto1111's metadata in received images
24 | 
25 | ## Details
26 | 
27 | The image generation is triggered:  
28 | - manually through the 'Force the picture response' button while in `Manual` or `Immersive/Interactive` modes OR  
29 | - automatically in `Immersive/Interactive` mode if the words `'send|main|message|me'` are followed by `'image|pic|picture|photo|snap|snapshot|selfie|meme'` in the user's prompt  
30 | - always on in `Picturebook/Adventure` mode (if not currently suppressed by 'Suppress the picture response')  
31 | 
32 | ## Prerequisites
33 | 
34 | One needs an available instance of Automatic1111's webui running with an `--api` flag. Ain't tested with a notebook / cloud hosted one but should be possible.   
35 | To run it locally in parallel on the same machine, specify custom `--listen-port` for either Auto1111's or ooba's webUIs.  
36 | 
37 | ## Features overview
38 | - Connection to API check (press enter in the address box)  
39 | - [VRAM management (model shuffling)](https://github.com/Brawlence/SD_api_pics/wiki/VRAM-management-feature)  
40 | - [Three different operation modes](https://github.com/Brawlence/SD_api_pics/wiki/Modes-of-operation) (manual, interactive, always-on)  
41 | - User-defined persistent settings via settings.json
42 | 
43 | ### Connection check
44 | 
45 | Insert the Automatic1111's WebUI address and press Enter:  
46 | ![API-check](https://raw.githubusercontent.com/Brawlence/SD_api_pics/main/illust/API-check.gif)  
47 | Green mark confirms the ability to communicate with Auto1111's API on this address. Red cross means something's not right (the ext won't work).
48 | 
49 | ### Persistents settings
50 | 
51 | Create or modify the `settings.json` in the `text-generation-webui` root directory to override the defaults
52 | present in script.py, ex:
53 | 
54 | ```json
55 | {
56 |     "sd_api_pictures-manage_VRAM": 1,
57 |     "sd_api_pictures-save_img": 1,
58 |     "sd_api_pictures-prompt_prefix": "(Masterpiece:1.1), detailed, intricate, colorful, (solo:1.1)",
59 |     "sd_api_pictures-sampler_name": "DPM++ 2M Karras"
60 | }
61 | ```
62 | 
63 | will automatically set the `Manage VRAM` & `Keep original images` checkboxes and change the texts in `Prompt Prefix` and `Sampler name` on load.
64 | 
65 | ---
66 | 
67 | ## Demonstrations:
68 | 
69 | Those are examples of the version 1.0.0, but the core functionality is still the same
70 | 
71 | <details>
72 | <summary>Conversation 1</summary>
73 | 
74 | ![EXA1](https://user-images.githubusercontent.com/42910943/224866564-939a3bcb-e7cf-4ac0-a33f-b3047b55054d.jpg)
75 | ![EXA2](https://user-images.githubusercontent.com/42910943/224866566-38394054-1320-45cf-9515-afa76d9d7745.jpg)
76 | ![EXA3](https://user-images.githubusercontent.com/42910943/224866568-10ea47b7-0bac-4269-9ec9-22c387a13b59.jpg)
77 | ![EXA4](https://user-images.githubusercontent.com/42910943/224866569-326121ad-1ea1-4874-9f6b-4bca7930a263.jpg)
78 | 
79 | 
80 | </details>
81 | 
82 | <details>
83 | <summary>Conversation 2</summary>
84 | 
85 | ![Hist1](https://user-images.githubusercontent.com/42910943/224865517-c6966b58-bc4d-4353-aab9-6eb97778d7bf.jpg)
86 | ![Hist2](https://user-images.githubusercontent.com/42910943/224865527-b2fe7c2e-0da5-4c2e-b705-42e233b07084.jpg)
87 | ![Hist3](https://user-images.githubusercontent.com/42910943/224865535-a38d94e7-8975-4a46-a655-1ae1de41f85d.jpg)
88 | 
89 | </details>
90 | 
91 | 


--------------------------------------------------------------------------------
/modules/exllama.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from pathlib import Path
  3 | 
  4 | from modules import shared
  5 | from modules.logging_colors import logger
  6 | 
  7 | try:
  8 |     from exllama.generator import ExLlamaGenerator
  9 |     from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
 10 |     from exllama.tokenizer import ExLlamaTokenizer
 11 | except:
 12 |     logger.warning('Exllama module failed to load. Will attempt to load from repositories.')
 13 |     try:
 14 |         from modules.relative_imports import RelativeImport
 15 | 
 16 |         with RelativeImport("repositories/exllama"):
 17 |             from generator import ExLlamaGenerator
 18 |             from model import ExLlama, ExLlamaCache, ExLlamaConfig
 19 |             from tokenizer import ExLlamaTokenizer
 20 |     except:
 21 |         logger.error("Could not find repositories/exllama/. Make sure that exllama is cloned inside repositories/ and is up to date.")
 22 |         raise
 23 | 
 24 | 
 25 | class ExllamaModel:
 26 |     def __init__(self):
 27 |         pass
 28 | 
 29 |     @classmethod
 30 |     def from_pretrained(self, path_to_model):
 31 | 
 32 |         path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
 33 |         tokenizer_model_path = path_to_model / "tokenizer.model"
 34 |         model_config_path = path_to_model / "config.json"
 35 | 
 36 |         # Find the model checkpoint
 37 |         model_path = None
 38 |         for ext in ['.safetensors', '.pt', '.bin']:
 39 |             found = list(path_to_model.glob(f"*{ext}"))
 40 |             if len(found) > 0:
 41 |                 if len(found) > 1:
 42 |                     logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
 43 | 
 44 |                 model_path = found[-1]
 45 |                 break
 46 | 
 47 |         config = ExLlamaConfig(str(model_config_path))
 48 |         config.model_path = str(model_path)
 49 |         config.max_seq_len = shared.args.max_seq_len
 50 |         config.compress_pos_emb = shared.args.compress_pos_emb
 51 |         if shared.args.gpu_split:
 52 |             config.set_auto_map(shared.args.gpu_split)
 53 |             config.gpu_peer_fix = True
 54 | 
 55 |         model = ExLlama(config)
 56 |         tokenizer = ExLlamaTokenizer(str(tokenizer_model_path))
 57 |         cache = ExLlamaCache(model)
 58 |         generator = ExLlamaGenerator(model, tokenizer, cache)
 59 | 
 60 |         result = self()
 61 |         result.config = config
 62 |         result.model = model
 63 |         result.cache = cache
 64 |         result.tokenizer = tokenizer
 65 |         result.generator = generator
 66 |         return result, result
 67 | 
 68 |     def generate_with_streaming(self, prompt, state):
 69 |         self.generator.settings.temperature = state['temperature']
 70 |         self.generator.settings.top_p = state['top_p']
 71 |         self.generator.settings.top_k = state['top_k']
 72 |         self.generator.settings.typical = state['typical_p']
 73 |         self.generator.settings.token_repetition_penalty_max = state['repetition_penalty']
 74 |         if state['ban_eos_token']:
 75 |             self.generator.disallow_tokens([self.tokenizer.eos_token_id])
 76 |         else:
 77 |             self.generator.disallow_tokens(None)
 78 | 
 79 |         self.generator.end_beam_search()
 80 |         ids = self.generator.tokenizer.encode(prompt)
 81 |         self.generator.gen_begin_reuse(ids)
 82 |         initial_len = self.generator.sequence[0].shape[0]
 83 |         has_leading_space = False
 84 |         for i in range(state['max_new_tokens']):
 85 |             token = self.generator.gen_single_token()
 86 |             if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
 87 |                 has_leading_space = True
 88 | 
 89 |             decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
 90 |             if has_leading_space:
 91 |                 decoded_text = ' ' + decoded_text
 92 | 
 93 |             yield decoded_text
 94 |             if token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything:
 95 |                 break
 96 | 
 97 |     def generate(self, prompt, state):
 98 |         output = ''
 99 |         for output in self.generate_with_streaming(prompt, state):
100 |             pass
101 | 
102 |         return output
103 | 
104 |     def encode(self, string, **kwargs):
105 |         return self.tokenizer.encode(string)
106 | 


--------------------------------------------------------------------------------
/modules/logging_colors.py:
--------------------------------------------------------------------------------
  1 | # Copied from https://stackoverflow.com/a/1336640
  2 | 
  3 | import logging
  4 | import platform
  5 | 
  6 | logging.basicConfig(
  7 |     format='%(asctime)s %(levelname)s:%(message)s',
  8 |     datefmt='%Y-%m-%d %H:%M:%S',
  9 | )
 10 | 
 11 | 
 12 | def add_coloring_to_emit_windows(fn):
 13 |     # add methods we need to the class
 14 |     def _out_handle(self):
 15 |         import ctypes
 16 |         return ctypes.windll.kernel32.GetStdHandle(self.STD_OUTPUT_HANDLE)
 17 |     out_handle = property(_out_handle)
 18 | 
 19 |     def _set_color(self, code):
 20 |         import ctypes
 21 | 
 22 |         # Constants from the Windows API
 23 |         self.STD_OUTPUT_HANDLE = -11
 24 |         hdl = ctypes.windll.kernel32.GetStdHandle(self.STD_OUTPUT_HANDLE)
 25 |         ctypes.windll.kernel32.SetConsoleTextAttribute(hdl, code)
 26 | 
 27 |     setattr(logging.StreamHandler, '_set_color', _set_color)
 28 | 
 29 |     def new(*args):
 30 |         FOREGROUND_BLUE = 0x0001  # text color contains blue.
 31 |         FOREGROUND_GREEN = 0x0002  # text color contains green.
 32 |         FOREGROUND_RED = 0x0004  # text color contains red.
 33 |         FOREGROUND_INTENSITY = 0x0008  # text color is intensified.
 34 |         FOREGROUND_WHITE = FOREGROUND_BLUE | FOREGROUND_GREEN | FOREGROUND_RED
 35 |         # winbase.h
 36 |         # STD_INPUT_HANDLE = -10
 37 |         # STD_OUTPUT_HANDLE = -11
 38 |         # STD_ERROR_HANDLE = -12
 39 | 
 40 |         # wincon.h
 41 |         # FOREGROUND_BLACK = 0x0000
 42 |         FOREGROUND_BLUE = 0x0001
 43 |         FOREGROUND_GREEN = 0x0002
 44 |         # FOREGROUND_CYAN = 0x0003
 45 |         FOREGROUND_RED = 0x0004
 46 |         FOREGROUND_MAGENTA = 0x0005
 47 |         FOREGROUND_YELLOW = 0x0006
 48 |         # FOREGROUND_GREY = 0x0007
 49 |         FOREGROUND_INTENSITY = 0x0008  # foreground color is intensified.
 50 | 
 51 |         # BACKGROUND_BLACK = 0x0000
 52 |         # BACKGROUND_BLUE = 0x0010
 53 |         # BACKGROUND_GREEN = 0x0020
 54 |         # BACKGROUND_CYAN = 0x0030
 55 |         # BACKGROUND_RED = 0x0040
 56 |         # BACKGROUND_MAGENTA = 0x0050
 57 |         BACKGROUND_YELLOW = 0x0060
 58 |         # BACKGROUND_GREY = 0x0070
 59 |         BACKGROUND_INTENSITY = 0x0080  # background color is intensified.
 60 | 
 61 |         levelno = args[1].levelno
 62 |         if (levelno >= 50):
 63 |             color = BACKGROUND_YELLOW | FOREGROUND_RED | FOREGROUND_INTENSITY | BACKGROUND_INTENSITY
 64 |         elif (levelno >= 40):
 65 |             color = FOREGROUND_RED | FOREGROUND_INTENSITY
 66 |         elif (levelno >= 30):
 67 |             color = FOREGROUND_YELLOW | FOREGROUND_INTENSITY
 68 |         elif (levelno >= 20):
 69 |             color = FOREGROUND_GREEN
 70 |         elif (levelno >= 10):
 71 |             color = FOREGROUND_MAGENTA
 72 |         else:
 73 |             color = FOREGROUND_WHITE
 74 |         args[0]._set_color(color)
 75 | 
 76 |         ret = fn(*args)
 77 |         args[0]._set_color(FOREGROUND_WHITE)
 78 |         # print "after"
 79 |         return ret
 80 |     return new
 81 | 
 82 | 
 83 | def add_coloring_to_emit_ansi(fn):
 84 |     # add methods we need to the class
 85 |     def new(*args):
 86 |         levelno = args[1].levelno
 87 |         if (levelno >= 50):
 88 |             color = '\x1b[31m'  # red
 89 |         elif (levelno >= 40):
 90 |             color = '\x1b[31m'  # red
 91 |         elif (levelno >= 30):
 92 |             color = '\x1b[33m'  # yellow
 93 |         elif (levelno >= 20):
 94 |             color = '\x1b[32m'  # green
 95 |         elif (levelno >= 10):
 96 |             color = '\x1b[35m'  # pink
 97 |         else:
 98 |             color = '\x1b[0m'  # normal
 99 |         args[1].msg = color + args[1].msg + '\x1b[0m'  # normal
100 |         # print "after"
101 |         return fn(*args)
102 |     return new
103 | 
104 | 
105 | if platform.system() == 'Windows':
106 |     # Windows does not support ANSI escapes and we are using API calls to set the console color
107 |     logging.StreamHandler.emit = add_coloring_to_emit_windows(logging.StreamHandler.emit)
108 | else:
109 |     # all non-Windows platforms are supporting ANSI escapes so we use them
110 |     logging.StreamHandler.emit = add_coloring_to_emit_ansi(logging.StreamHandler.emit)
111 |     # log = logging.getLogger()
112 |     # log.addFilter(log_filter())
113 |     # //hdlr = logging.StreamHandler()
114 |     # //hdlr.setFormatter(formatter())
115 | 
116 | logger = logging.getLogger('text-generation-webui')
117 | logger.setLevel(logging.DEBUG)
118 | 


--------------------------------------------------------------------------------
/extensions/api/util.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import traceback
 3 | from threading import Thread
 4 | from typing import Callable, Optional
 5 | 
 6 | from modules import shared
 7 | from modules.chat import load_character_memoized
 8 | from modules.presets import load_preset_memoized
 9 | 
10 | 
11 | def build_parameters(body, chat=False):
12 | 
13 |     generate_params = {
14 |         'max_new_tokens': int(body.get('max_new_tokens', body.get('max_length', 200))),
15 |         'do_sample': bool(body.get('do_sample', True)),
16 |         'temperature': float(body.get('temperature', 0.5)),
17 |         'top_p': float(body.get('top_p', 1)),
18 |         'typical_p': float(body.get('typical_p', body.get('typical', 1))),
19 |         'epsilon_cutoff': float(body.get('epsilon_cutoff', 0)),
20 |         'eta_cutoff': float(body.get('eta_cutoff', 0)),
21 |         'tfs': float(body.get('tfs', 1)),
22 |         'top_a': float(body.get('top_a', 0)),
23 |         'repetition_penalty': float(body.get('repetition_penalty', body.get('rep_pen', 1.1))),
24 |         'encoder_repetition_penalty': float(body.get('encoder_repetition_penalty', 1.0)),
25 |         'top_k': int(body.get('top_k', 0)),
26 |         'min_length': int(body.get('min_length', 0)),
27 |         'no_repeat_ngram_size': int(body.get('no_repeat_ngram_size', 0)),
28 |         'num_beams': int(body.get('num_beams', 1)),
29 |         'penalty_alpha': float(body.get('penalty_alpha', 0)),
30 |         'length_penalty': float(body.get('length_penalty', 1)),
31 |         'early_stopping': bool(body.get('early_stopping', False)),
32 |         'mirostat_mode': int(body.get('mirostat_mode', 0)),
33 |         'mirostat_tau': float(body.get('mirostat_tau', 5)),
34 |         'mirostat_eta': float(body.get('mirostat_eta', 0.1)),
35 |         'seed': int(body.get('seed', -1)),
36 |         'add_bos_token': bool(body.get('add_bos_token', True)),
37 |         'truncation_length': int(body.get('truncation_length', body.get('max_context_length', 2048))),
38 |         'ban_eos_token': bool(body.get('ban_eos_token', False)),
39 |         'skip_special_tokens': bool(body.get('skip_special_tokens', True)),
40 |         'custom_stopping_strings': '',  # leave this blank
41 |         'stopping_strings': body.get('stopping_strings', []),
42 |     }
43 | 
44 |     preset_name = body.get('preset', 'None')
45 |     if preset_name not in ['None', None, '']:
46 |         preset = load_preset_memoized(preset_name)
47 |         generate_params.update(preset)
48 | 
49 |     if chat:
50 |         character = body.get('character')
51 |         instruction_template = body.get('instruction_template')
52 |         name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
53 |         name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
54 |         generate_params.update({
55 |             'stop_at_newline': bool(body.get('stop_at_newline', shared.settings['stop_at_newline'])),
56 |             'chat_generation_attempts': int(body.get('chat_generation_attempts', shared.settings['chat_generation_attempts'])),
57 |             'mode': str(body.get('mode', 'chat')),
58 |             'name1': name1,
59 |             'name2': name2,
60 |             'context': context,
61 |             'greeting': greeting,
62 |             'name1_instruct': name1_instruct,
63 |             'name2_instruct': name2_instruct,
64 |             'context_instruct': context_instruct,
65 |             'turn_template': turn_template,
66 |             'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
67 |         })
68 | 
69 |     return generate_params
70 | 
71 | 
72 | def try_start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None):
73 |     Thread(target=_start_cloudflared, args=[
74 |            port, max_attempts, on_start], daemon=True).start()
75 | 
76 | 
77 | def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None):
78 |     try:
79 |         from flask_cloudflared import _run_cloudflared
80 |     except ImportError:
81 |         print('You should install flask_cloudflared manually')
82 |         raise Exception(
83 |             'flask_cloudflared not installed. Make sure you installed the requirements.txt for this extension.')
84 | 
85 |     for _ in range(max_attempts):
86 |         try:
87 |             public_url = _run_cloudflared(port, port + 1)
88 | 
89 |             if on_start:
90 |                 on_start(public_url)
91 | 
92 |             return
93 |         except Exception:
94 |             traceback.print_exc()
95 |             time.sleep(3)
96 | 
97 |         raise Exception('Could not start cloudflared.')
98 | 


--------------------------------------------------------------------------------
/modules/exllama_hf.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from pathlib import Path
  3 | from typing import Any, Dict, Optional, Union
  4 | 
  5 | import torch
  6 | from torch.nn import CrossEntropyLoss
  7 | from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
  8 | from transformers.modeling_outputs import CausalLMOutputWithPast
  9 | 
 10 | from modules import shared
 11 | from modules.logging_colors import logger
 12 | 
 13 | try:
 14 |     from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
 15 | except:
 16 |     logger.warning('Exllama module failed to load. Will attempt to load from repositories.')
 17 |     try:
 18 |         from modules.relative_imports import RelativeImport
 19 | 
 20 |         with RelativeImport("repositories/exllama"):
 21 |             from model import ExLlama, ExLlamaCache, ExLlamaConfig
 22 |     except:
 23 |         logger.error("Could not find repositories/exllama/. Make sure that exllama is cloned inside repositories/ and is up to date.")
 24 |         raise
 25 | 
 26 | 
 27 | class ExllamaHF(PreTrainedModel):
 28 |     def __init__(self, config: ExLlamaConfig):
 29 |         super().__init__(PretrainedConfig())
 30 |         self.ex_config = config
 31 |         self.ex_model = ExLlama(self.ex_config)
 32 |         self.generation_config = GenerationConfig()
 33 |         self.lora = None
 34 | 
 35 |     def _validate_model_class(self):
 36 |         pass
 37 | 
 38 |     def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
 39 |         pass
 40 | 
 41 |     def prepare_inputs_for_generation(self, input_ids, **kwargs):
 42 |         return {'input_ids': input_ids, **kwargs}
 43 | 
 44 |     @property
 45 |     def device(self) -> torch.device:
 46 |         return torch.device(0)
 47 | 
 48 |     def __call__(self, *args, **kwargs):
 49 |         # TODO: Some decoding methods (such as Contrastive Search) may not work at this time
 50 |         assert len(args) == 0, 'no *args should be passed to forward'
 51 |         use_cache = kwargs.get('use_cache', True)
 52 |         labels = kwargs.get('labels', None)
 53 |         seq = kwargs['input_ids'][0].tolist()
 54 |         cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None
 55 |         if cache is None:
 56 |             cache = ExLlamaCache(self.ex_model)
 57 |             self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), cache, preprocess_only=True, lora=self.lora)
 58 | 
 59 |         logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache, lora=self.lora).to(kwargs['input_ids'].device)
 60 | 
 61 |         loss = None
 62 |         if labels is not None:
 63 |             # Shift so that tokens < n predict n
 64 |             shift_logits = logits[..., :-1, :].contiguous()
 65 |             shift_labels = labels[..., 1:].contiguous()
 66 |             # Flatten the tokens
 67 |             loss_fct = CrossEntropyLoss()
 68 |             shift_logits = shift_logits.view(-1, logits.shape[-1])
 69 |             shift_labels = shift_labels.view(-1)
 70 |             # Enable model parallelism
 71 |             shift_labels = shift_labels.to(shift_logits.device)
 72 |             loss = loss_fct(shift_logits, shift_labels)
 73 | 
 74 |         return CausalLMOutputWithPast(logits=logits, past_key_values=cache if use_cache else None)
 75 | 
 76 |     @classmethod
 77 |     def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
 78 |         assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
 79 |         if isinstance(pretrained_model_name_or_path, str):
 80 |             pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
 81 | 
 82 |         pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
 83 |         config = ExLlamaConfig(pretrained_model_name_or_path / 'config.json')
 84 | 
 85 |         # from 'oobabooga/text-generation-webui/modules/exllama.py'
 86 |         weight_path = None
 87 |         for ext in ['.safetensors', '.pt', '.bin']:
 88 |             found = list(pretrained_model_name_or_path.glob(f"*{ext}"))
 89 |             if len(found) > 0:
 90 |                 weight_path = found[-1]
 91 |                 break
 92 |         assert weight_path is not None, f'could not find weight in "{pretrained_model_name_or_path}"'
 93 | 
 94 |         config.model_path = str(weight_path)
 95 |         config.max_seq_len = shared.args.max_seq_len
 96 |         config.compress_pos_emb = shared.args.compress_pos_emb
 97 |         if shared.args.gpu_split:
 98 |             config.set_auto_map(shared.args.gpu_split)
 99 |             config.gpu_peer_fix = True
100 | 
101 |         # This slowes down a bit but align better with autogptq generation.
102 |         # TODO: Should give user choice to tune the exllama config
103 |         # config.fused_attn = False
104 |         # config.fused_mlp_thd = 0
105 | 
106 |         return ExllamaHF(config)
107 | 


--------------------------------------------------------------------------------
/extensions/superbooga/chromadb.py:
--------------------------------------------------------------------------------
  1 | import chromadb
  2 | import posthog
  3 | import torch
  4 | from chromadb.config import Settings
  5 | from sentence_transformers import SentenceTransformer
  6 | 
  7 | from modules.logging_colors import logger
  8 | 
  9 | logger.info('Intercepting all calls to posthog :)')
 10 | posthog.capture = lambda *args, **kwargs: None
 11 | 
 12 | 
 13 | class Collecter():
 14 |     def __init__(self):
 15 |         pass
 16 | 
 17 |     def add(self, texts: list[str]):
 18 |         pass
 19 | 
 20 |     def get(self, search_strings: list[str], n_results: int) -> list[str]:
 21 |         pass
 22 | 
 23 |     def clear(self):
 24 |         pass
 25 | 
 26 | 
 27 | class Embedder():
 28 |     def __init__(self):
 29 |         pass
 30 | 
 31 |     def embed(self, text: str) -> list[torch.Tensor]:
 32 |         pass
 33 | 
 34 | 
 35 | class ChromaCollector(Collecter):
 36 |     def __init__(self, embedder: Embedder):
 37 |         super().__init__()
 38 |         self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
 39 |         self.embedder = embedder
 40 |         self.collection = self.chroma_client.create_collection(name="context", embedding_function=embedder.embed)
 41 |         self.ids = []
 42 | 
 43 |     def add(self, texts: list[str]):
 44 |         if len(texts) == 0:
 45 |             return
 46 | 
 47 |         self.ids = [f"id{i}" for i in range(len(texts))]
 48 |         self.collection.add(documents=texts, ids=self.ids)
 49 | 
 50 |     def get_documents_ids_distances(self, search_strings: list[str], n_results: int):
 51 |         n_results = min(len(self.ids), n_results)
 52 |         if n_results == 0:
 53 |             return [], [], []
 54 | 
 55 |         result = self.collection.query(query_texts=search_strings, n_results=n_results, include=['documents', 'distances'])
 56 |         documents = result['documents'][0]
 57 |         ids = list(map(lambda x: int(x[2:]), result['ids'][0]))
 58 |         distances = result['distances'][0]
 59 |         return documents, ids, distances
 60 | 
 61 |     # Get chunks by similarity
 62 |     def get(self, search_strings: list[str], n_results: int) -> list[str]:
 63 |         documents, _, _ = self.get_documents_ids_distances(search_strings, n_results)
 64 |         return documents
 65 | 
 66 |     # Get ids by similarity
 67 |     def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:
 68 |         _, ids, _ = self.get_documents_ids_distances(search_strings, n_results)
 69 |         return ids
 70 | 
 71 |     # Get chunks by similarity and then sort by insertion order
 72 |     def get_sorted(self, search_strings: list[str], n_results: int) -> list[str]:
 73 |         documents, ids, _ = self.get_documents_ids_distances(search_strings, n_results)
 74 |         return [x for _, x in sorted(zip(ids, documents))]
 75 | 
 76 |     # Multiply distance by factor within [0, time_weight] where more recent is lower
 77 |     def apply_time_weight_to_distances(self, ids: list[int], distances: list[float], time_weight: float = 1.0) -> list[float]:
 78 |         if len(self.ids) <= 1:
 79 |             return distances.copy()
 80 | 
 81 |         return [distance * (1 - _id / (len(self.ids) - 1) * time_weight) for _id, distance in zip(ids, distances)]
 82 | 
 83 |     # Get ids by similarity and then sort by insertion order
 84 |     def get_ids_sorted(self, search_strings: list[str], n_results: int, n_initial: int = None, time_weight: float = 1.0) -> list[str]:
 85 |         do_time_weight = time_weight > 0
 86 |         if not (do_time_weight and n_initial is not None):
 87 |             n_initial = n_results
 88 |         elif n_initial == -1:
 89 |             n_initial = len(self.ids)
 90 | 
 91 |         if n_initial < n_results:
 92 |             raise ValueError(f"n_initial {n_initial} should be >= n_results {n_results}")
 93 | 
 94 |         _, ids, distances = self.get_documents_ids_distances(search_strings, n_initial)
 95 |         if do_time_weight:
 96 |             distances_w = self.apply_time_weight_to_distances(ids, distances, time_weight=time_weight)
 97 |             results = zip(ids, distances, distances_w)
 98 |             results = sorted(results, key=lambda x: x[2])[:n_results]
 99 |             results = sorted(results, key=lambda x: x[0])
100 |             ids = [x[0] for x in results]
101 | 
102 |         return sorted(ids)
103 | 
104 |     def clear(self):
105 |         self.collection.delete(ids=self.ids)
106 |         self.ids = []
107 | 
108 | 
109 | class SentenceTransformerEmbedder(Embedder):
110 |     def __init__(self) -> None:
111 |         self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
112 |         self.embed = self.model.encode
113 | 
114 | 
115 | def make_collector():
116 |     global embedder
117 |     return ChromaCollector(embedder)
118 | 
119 | 
120 | def add_chunks_to_collector(chunks, collector):
121 |     collector.clear()
122 |     collector.add(chunks)
123 | 
124 | 
125 | embedder = SentenceTransformerEmbedder()
126 | 


--------------------------------------------------------------------------------
/modules/models_settings.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from pathlib import Path
  3 | 
  4 | import yaml
  5 | 
  6 | from modules import shared, ui
  7 | 
  8 | 
  9 | def get_model_settings_from_yamls(model):
 10 |     settings = shared.model_config
 11 |     model_settings = {}
 12 |     for pat in settings:
 13 |         if re.match(pat.lower(), model.lower()):
 14 |             for k in settings[pat]:
 15 |                 model_settings[k] = settings[pat][k]
 16 | 
 17 |     return model_settings
 18 | 
 19 | 
 20 | def infer_loader(model_name):
 21 |     path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
 22 |     model_settings = get_model_settings_from_yamls(model_name)
 23 |     if not path_to_model.exists():
 24 |         loader = None
 25 |     elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
 26 |         loader = 'AutoGPTQ'
 27 |     elif len(list(path_to_model.glob('*ggml*.bin'))) > 0:
 28 |         loader = 'llama.cpp'
 29 |     elif re.match('.*ggml.*\.bin', model_name.lower()):
 30 |         loader = 'llama.cpp'
 31 |     elif re.match('.*rwkv.*\.pth', model_name.lower()):
 32 |         loader = 'RWKV'
 33 |     elif shared.args.flexgen:
 34 |         loader = 'FlexGen'
 35 |     else:
 36 |         loader = 'Transformers'
 37 | 
 38 |     return loader
 39 | 
 40 | 
 41 | # UI: update the command-line arguments based on the interface values
 42 | def update_model_parameters(state, initial=False):
 43 |     elements = ui.list_model_elements()  # the names of the parameters
 44 |     gpu_memories = []
 45 | 
 46 |     for i, element in enumerate(elements):
 47 |         if element not in state:
 48 |             continue
 49 | 
 50 |         value = state[element]
 51 |         if element.startswith('gpu_memory'):
 52 |             gpu_memories.append(value)
 53 |             continue
 54 | 
 55 |         if initial and vars(shared.args)[element] != vars(shared.args_defaults)[element]:
 56 |             continue
 57 | 
 58 |         # Setting null defaults
 59 |         if element in ['wbits', 'groupsize', 'model_type'] and value == 'None':
 60 |             value = vars(shared.args_defaults)[element]
 61 |         elif element in ['cpu_memory'] and value == 0:
 62 |             value = vars(shared.args_defaults)[element]
 63 | 
 64 |         # Making some simple conversions
 65 |         if element in ['wbits', 'groupsize', 'pre_layer']:
 66 |             value = int(value)
 67 |         elif element == 'cpu_memory' and value is not None:
 68 |             value = f"{value}MiB"
 69 | 
 70 |         if element in ['pre_layer']:
 71 |             value = [value] if value > 0 else None
 72 | 
 73 |         setattr(shared.args, element, value)
 74 | 
 75 |     found_positive = False
 76 |     for i in gpu_memories:
 77 |         if i > 0:
 78 |             found_positive = True
 79 |             break
 80 | 
 81 |     if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
 82 |         if found_positive:
 83 |             shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
 84 |         else:
 85 |             shared.args.gpu_memory = None
 86 | 
 87 | 
 88 | # UI: update the state variable with the model settings
 89 | def apply_model_settings_to_state(model, state):
 90 |     model_settings = get_model_settings_from_yamls(model)
 91 |     if 'loader' not in model_settings:
 92 |         loader = infer_loader(model)
 93 |         if 'wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0:
 94 |             loader = 'AutoGPTQ'
 95 | 
 96 |         # If the user is using an alternative GPTQ loader, let them keep using it
 97 |         if not (loader == 'AutoGPTQ' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF']):
 98 |             state['loader'] = loader
 99 | 
100 |     for k in model_settings:
101 |         if k in state:
102 |             state[k] = model_settings[k]
103 | 
104 |     return state
105 | 
106 | 
107 | # Save the settings for this model to models/config-user.yaml
108 | def save_model_settings(model, state):
109 |     if model == 'None':
110 |         yield ("Not saving the settings because no model is loaded.")
111 |         return
112 | 
113 |     with Path(f'{shared.args.model_dir}/config-user.yaml') as p:
114 |         if p.exists():
115 |             user_config = yaml.safe_load(open(p, 'r').read())
116 |         else:
117 |             user_config = {}
118 | 
119 |         model_regex = model + '$'  # For exact matches
120 |         for _dict in [user_config, shared.model_config]:
121 |             if model_regex not in _dict:
122 |                 _dict[model_regex] = {}
123 | 
124 |         if model_regex not in user_config:
125 |             user_config[model_regex] = {}
126 | 
127 |         for k in ui.list_model_elements():
128 |             user_config[model_regex][k] = state[k]
129 |             shared.model_config[model_regex][k] = state[k]
130 | 
131 |         with open(p, 'w') as f:
132 |             f.write(yaml.dump(user_config, sort_keys=False))
133 | 
134 |         yield (f"Settings for {model} saved to {p}")
135 | 


--------------------------------------------------------------------------------
/modules/LoRA.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | 
  3 | import torch
  4 | from peft import PeftModel
  5 | 
  6 | import modules.shared as shared
  7 | from modules.logging_colors import logger
  8 | from modules.models import reload_model
  9 | 
 10 | 
 11 | def add_lora_to_model(lora_names):
 12 |     if 'GPTQForCausalLM' in shared.model.__class__.__name__:
 13 |         add_lora_autogptq(lora_names)
 14 |     elif shared.model.__class__.__name__ in ['ExllamaModel', 'ExllamaHF']:
 15 |         add_lora_exllama(lora_names)
 16 |     else:
 17 |         add_lora_transformers(lora_names)
 18 | 
 19 | 
 20 | def add_lora_exllama(lora_names):
 21 | 
 22 |     try:
 23 |         from exllama.lora import ExLlamaLora
 24 |     except:
 25 |         try:
 26 |             from repositories.exllama.lora import ExLlamaLora
 27 |         except:
 28 |             logger.error("Could not find the file repositories/exllama/lora.py. Make sure that exllama is cloned inside repositories/ and is up to date.")
 29 |             return
 30 | 
 31 |     if len(lora_names) == 0:
 32 |         if shared.model.__class__.__name__ == 'ExllamaModel':
 33 |             shared.model.generator.lora = None
 34 |         else:
 35 |             shared.model.lora = None
 36 | 
 37 |         shared.lora_names = []
 38 |         return
 39 |     else:
 40 |         if len(lora_names) > 1:
 41 |             logger.warning('ExLlama can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
 42 | 
 43 |         lora_path = Path(f"{shared.args.lora_dir}/{lora_names[0]}")
 44 |         lora_config_path = lora_path / "adapter_config.json"
 45 |         lora_adapter_path = lora_path / "adapter_model.bin"
 46 | 
 47 |         logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
 48 |         if shared.model.__class__.__name__ == 'ExllamaModel':
 49 |             lora = ExLlamaLora(shared.model.model, str(lora_config_path), str(lora_adapter_path))
 50 |             shared.model.generator.lora = lora
 51 |         else:
 52 |             lora = ExLlamaLora(shared.model.ex_model, str(lora_config_path), str(lora_adapter_path))
 53 |             shared.model.lora = lora
 54 | 
 55 |         shared.lora_names = [lora_names[0]]
 56 |         return
 57 | 
 58 | 
 59 | # Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing
 60 | def add_lora_autogptq(lora_names):
 61 | 
 62 |     try:
 63 |         from auto_gptq import get_gptq_peft_model
 64 |         from auto_gptq.utils.peft_utils import GPTQLoraConfig
 65 |     except:
 66 |         logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
 67 |         return
 68 | 
 69 |     if len(lora_names) == 0:
 70 |         if len(shared.lora_names) > 0:
 71 |             reload_model()
 72 | 
 73 |         shared.lora_names = []
 74 |         return
 75 |     else:
 76 |         if len(lora_names) > 1:
 77 |             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
 78 | 
 79 |         peft_config = GPTQLoraConfig(
 80 |             inference_mode=True,
 81 |         )
 82 | 
 83 |         lora_path = Path(f"{shared.args.lora_dir}/{lora_names[0]}")
 84 |         logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
 85 |         shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
 86 |         shared.lora_names = [lora_names[0]]
 87 |         return
 88 | 
 89 | 
 90 | def add_lora_transformers(lora_names):
 91 |     prior_set = set(shared.lora_names)
 92 |     added_set = set(lora_names) - prior_set
 93 |     removed_set = prior_set - set(lora_names)
 94 | 
 95 |     # If no LoRA needs to be added or removed, exit
 96 |     if len(added_set) == 0 and len(removed_set) == 0:
 97 |         return
 98 | 
 99 |     # Add a LoRA when another LoRA is already present
100 |     if len(removed_set) == 0 and len(prior_set) > 0:
101 |         logger.info(f"Adding the LoRA(s) named {added_set} to the model...")
102 |         for lora in added_set:
103 |             shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
104 | 
105 |         return
106 | 
107 |     # If any LoRA needs to be removed, start over
108 |     if len(removed_set) > 0:
109 |         shared.model.disable_adapter()
110 |         shared.model = shared.model.base_model.model
111 | 
112 |     if len(lora_names) > 0:
113 |         params = {}
114 |         if not shared.args.cpu:
115 |             params['dtype'] = shared.model.dtype
116 |             if hasattr(shared.model, "hf_device_map"):
117 |                 params['device_map'] = {"base_model.model." + k: v for k, v in shared.model.hf_device_map.items()}
118 |             elif shared.args.load_in_8bit:
119 |                 params['device_map'] = {'': 0}
120 | 
121 |         logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
122 |         shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"), adapter_name=lora_names[0], **params)
123 |         for lora in lora_names[1:]:
124 |             shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
125 | 
126 |         shared.lora_names = lora_names
127 | 
128 |         if not shared.args.load_in_8bit and not shared.args.cpu:
129 |             shared.model.half()
130 |             if not hasattr(shared.model, "hf_device_map"):
131 |                 if torch.has_mps:
132 |                     device = torch.device('mps')
133 |                     shared.model = shared.model.to(device)
134 |                 else:
135 |                     shared.model = shared.model.cuda()
136 | 


--------------------------------------------------------------------------------
/extensions/multimodal/pipelines/llava/llava.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from abc import abstractmethod
  3 | from typing import List, Tuple
  4 | 
  5 | import torch
  6 | from huggingface_hub import hf_hub_download
  7 | from PIL import Image
  8 | from transformers import CLIPImageProcessor, CLIPVisionModel
  9 | 
 10 | from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
 11 | from modules import shared
 12 | from modules.logging_colors import logger
 13 | from modules.text_generation import encode
 14 | 
 15 | 
 16 | class LLaVA_v0_Pipeline(AbstractMultimodalPipeline):
 17 |     CLIP_REPO = "openai/clip-vit-large-patch14"
 18 | 
 19 |     def __init__(self, params: dict) -> None:
 20 |         super().__init__()
 21 |         self.clip_device = self._get_device("vision_device", params)
 22 |         self.clip_dtype = self._get_dtype("vision_bits", params)
 23 |         self.projector_device = self._get_device("projector_device", params)
 24 |         self.projector_dtype = self._get_dtype("projector_bits", params)
 25 |         self.image_processor, self.vision_tower, self.mm_projector = self._load_models()
 26 | 
 27 |     def _load_models(self):
 28 |         start_ts = time.time()
 29 | 
 30 |         logger.info(f"LLaVA - Loading CLIP from {LLaVA_v0_Pipeline.CLIP_REPO} as {self.clip_dtype} on {self.clip_device}...")
 31 |         image_processor = CLIPImageProcessor.from_pretrained(LLaVA_v0_Pipeline.CLIP_REPO, torch_dtype=self.clip_dtype)
 32 |         vision_tower = CLIPVisionModel.from_pretrained(LLaVA_v0_Pipeline.CLIP_REPO, torch_dtype=self.clip_dtype).to(self.clip_device)
 33 | 
 34 |         logger.info(f"LLaVA - Loading projector from {self.llava_projector_repo()} as {self.projector_dtype} on {self.projector_device}...")
 35 |         projector_path = hf_hub_download(self.llava_projector_repo(), self.llava_projector_filename())
 36 |         mm_projector = torch.nn.Linear(*self.llava_projector_shape())
 37 |         projector_data = torch.load(projector_path)
 38 |         mm_projector.weight = torch.nn.Parameter(projector_data['model.mm_projector.weight'].to(dtype=self.projector_dtype), False)
 39 |         mm_projector.bias = torch.nn.Parameter(projector_data['model.mm_projector.bias'].to(dtype=self.projector_dtype), False)
 40 |         mm_projector = mm_projector.to(self.projector_device)
 41 | 
 42 |         logger.info(f"LLaVA supporting models loaded, took {time.time() - start_ts:.2f} seconds")
 43 |         return image_processor, vision_tower, mm_projector
 44 | 
 45 |     @staticmethod
 46 |     def image_start() -> str:
 47 |         return "<im_start>"
 48 | 
 49 |     @staticmethod
 50 |     def image_end() -> str:
 51 |         return "<im_end>"
 52 | 
 53 |     @staticmethod
 54 |     def num_image_embeds() -> int:
 55 |         return 256
 56 | 
 57 |     @staticmethod
 58 |     def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
 59 |         if hasattr(shared.model.model, 'embed_tokens'):
 60 |             func = shared.model.model.embed_tokens
 61 |         else:
 62 |             func = shared.model.model.model.embed_tokens  # AutoGPTQ case
 63 | 
 64 |         return func(input_ids).to(shared.model.device, dtype=shared.model.dtype)
 65 | 
 66 |     @staticmethod
 67 |     def placeholder_embeddings() -> torch.Tensor:
 68 |         return LLaVA_v0_Pipeline.embed_tokens(encode("<im_patch>"*256, add_bos_token=False)[0])
 69 | 
 70 |     def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
 71 |         images = self.image_processor(images, return_tensors='pt')['pixel_values']
 72 |         images = images.to(self.clip_device, dtype=self.clip_dtype)
 73 | 
 74 |         with torch.no_grad():
 75 |             image_forward_outs = self.vision_tower(images, output_hidden_states=True)
 76 |             select_hidden_state_layer = -2
 77 |             select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
 78 |             image_features = select_hidden_state[:, 1:].to(self.projector_device, dtype=self.projector_dtype)
 79 |             image_features = self.mm_projector(image_features)
 80 |         return image_features.to(shared.model.device, dtype=shared.model.dtype)
 81 | 
 82 |     @staticmethod
 83 |     @abstractmethod
 84 |     def llava_projector_repo() -> str:
 85 |         pass
 86 | 
 87 |     @staticmethod
 88 |     @abstractmethod
 89 |     def llava_projector_filename() -> str:
 90 |         pass
 91 | 
 92 |     @staticmethod
 93 |     @abstractmethod
 94 |     def llava_projector_shape() -> Tuple[int, int]:
 95 |         pass
 96 | 
 97 | 
 98 | class LLaVA_v0_13B_Pipeline(LLaVA_v0_Pipeline):
 99 |     def __init__(self, params: dict) -> None:
100 |         super().__init__(params)
101 | 
102 |     @staticmethod
103 |     def name() -> str:
104 |         return "llava-13b"
105 | 
106 |     @staticmethod
107 |     def placeholder_token_id() -> int:
108 |         return 32000
109 | 
110 |     @staticmethod
111 |     def llava_projector_shape() -> Tuple[int, int]:
112 |         return (1024, 5120)
113 | 
114 |     @staticmethod
115 |     def llava_projector_filename() -> str:
116 |         return "mm_projector.bin"
117 | 
118 |     @staticmethod
119 |     def llava_projector_repo() -> str:
120 |         return "liuhaotian/LLaVA-13b-delta-v0"
121 | 
122 | 
123 | class LLaVA_v0_7B_Pipeline(LLaVA_v0_Pipeline):
124 |     def __init__(self, params: dict) -> None:
125 |         super().__init__(params)
126 | 
127 |     @staticmethod
128 |     def name() -> str:
129 |         return "llava-7b"
130 | 
131 |     @staticmethod
132 |     def placeholder_token_id() -> int:
133 |         return 32001
134 | 
135 |     @staticmethod
136 |     def llava_projector_shape() -> Tuple[int, int]:
137 |         return (1024, 4096)
138 | 
139 |     @staticmethod
140 |     def llava_projector_filename() -> str:
141 |         return "mm_projector.bin"
142 | 
143 |     @staticmethod
144 |     def llava_projector_repo() -> str:
145 |         return "liuhaotian/LLaVA-7b-delta-v0"
146 | 


--------------------------------------------------------------------------------