├── .github
│   └── workflows
│       └── build-docker.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── backend
│   ├── aria.py
│   ├── aya_vision.py
│   ├── bunny.py
│   ├── chatglm.py
│   ├── cogvlm.py
│   ├── cogvlm2.py
│   ├── deepseek-vl2.py
│   ├── dragonfly.py
│   ├── dv-qwen.py
│   ├── emu.py
│   ├── emu3.py
│   ├── florence2.py
│   ├── fuyu.py
│   ├── generic.py
│   ├── got.py
│   ├── idefics2.py
│   ├── idefics3.py
│   ├── internlm2-wqx-vl.py
│   ├── internlm2.py
│   ├── internlmxcomposer2.py
│   ├── internvl_chat.py
│   ├── joy-caption-latest.py
│   ├── joy-caption-pre-alpha.py
│   ├── llamavision.py
│   ├── llava-qwen2.py
│   ├── llava.py
│   ├── llava_next.py
│   ├── llavanextgit.py
│   ├── mantis.py
│   ├── minicpm-v-2_6.py
│   ├── minicpmv.py
│   ├── minigemini.py
│   ├── minimonkey.py
│   ├── mistral.py
│   ├── mllama.py
│   ├── molmo.py
│   ├── monkey.py
│   ├── moondream1.py
│   ├── moondream2.py
│   ├── nvlm.py
│   ├── omchat.py
│   ├── omnilmm12b.py
│   ├── ovis.py
│   ├── ovis16.py
│   ├── ovis2.py
│   ├── paligemma.py
│   ├── phi3_v.py
│   ├── qh_360vl.py
│   ├── qwen.py
│   ├── qwen2_5_vl.py
│   ├── qwen2_vl.py
│   ├── xcomposer2-vl.py
│   ├── xcomposer2.py
│   ├── xgenmm.py
│   └── yi-vl.py
├── chat_with_image.py
├── debug_info.py
├── docker-compose.alt.yml
├── docker-compose.yml
├── hf_home
│   └── hf_home.txt
├── model_conf_tests.alt.json
├── model_conf_tests.json
├── openedai.py
├── requirements.txt
├── run_tests.sh
├── test_api_model.py
├── test_models.py
├── vision-alt.sample.env
├── vision.py
├── vision.sample.env
└── vision_qna.py

/.github/workflows/build-docker.yml:
--------------------------------------------------------------------------------
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'dev'
  release:
    types: [published]

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Free Disk Space Before Build
        run: |
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          sudo rm -rf /root/.cache/

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image (dev)
        if: github.ref == 'refs/heads/dev'
        uses: docker/build-push-action@v4
        with:
          context: .
          build-args: |
            GROUP_ID=1000
            USER_ID=1000
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:dev
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          build-args: |
            GROUP_ID=1000
            USER_ID=1000
          file: Dockerfile
          push: true
          tags: |
            ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
            ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

  build-and-push-alt-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: matatonic/openedai-vision-alt
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Free Disk Space Before Build
        run: |
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          sudo rm -rf /root/.cache/

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image (dev)
        if: github.ref == 'refs/heads/dev'
        uses: docker/build-push-action@v4
        with:
          context: .
          build-args: |
            VERSION=alt
            GROUP_ID=1000
            USER_ID=1000
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:dev
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          build-args: |
            VERSION=alt
            GROUP_ID=1000
            USER_ID=1000
          file: Dockerfile
          push: true
          tags: |
            ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
            ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
#
hf_home/
vision.env
vision-alt.env
sample-*.env
test_output-*.csv
*.log

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.11-slim

RUN apt-get update && apt-get install -y git gcc \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN --mount=type=cache,target=/root/.cache/pip pip install --upgrade pip

WORKDIR /app
RUN git clone https://github.com/deepseek-ai/DeepSeek-VL2 --single-branch /app/DeepSeek-VL2 && \
    git clone https://github.com/LLaVA-VL/LLaVA-NeXT.git --single-branch /app/LLaVA-NeXT

COPY requirements.txt .
ARG VERSION=latest
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers.git@v4.49.0-AyaVision" >> requirements.txt ; fi
RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps "git+https://github.com/casper-hansen/AutoAWQ.git"
RUN --mount=type=cache,target=/root/.cache/pip pip install gptqmodel --no-build-isolation

WORKDIR /app/DeepSeek-VL2
RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -e .

WORKDIR /app/LLaVA-NeXT
RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -e .

WORKDIR /app

COPY *.py model_conf_tests.json README.md LICENSE /app/
COPY backend /app/backend

ARG USER_ID=1000
ENV USER_ID=${USER_ID}
ARG GROUP_ID=1000
ENV GROUP_ID=${GROUP_ID}
RUN groupadd -g ${GROUP_ID} openedai && \
    useradd -r -u ${USER_ID} -g ${GROUP_ID} -M -d /app openedai
RUN chown openedai:openedai /app # for .triton, .config/matplotlib

USER openedai
ENV CLI_COMMAND="python vision.py"
CMD $CLI_COMMAND

--------------------------------------------------------------------------------
/backend/aria.py:
--------------------------------------------------------------------------------
from transformers import AriaProcessor, AriaForConditionalGeneration

from vision_qna import *

# rhymes-ai/Aria

class VisionQnA(VisionQnABase):
    model_name: str = "aria" # idefics3_vision
    format: str = "chatml"
    visual_layers: List[str] = ["vision_tower", "multi_modal_projector"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.processor = AriaProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AriaForConditionalGeneration.from_pretrained(**self.params).eval()

        self.eos_token = '<|im_end|>'

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await chatml_prompt_from_messages(request.messages, img_tok = "<|img|>")

        prompt = prompt.replace("", "\n")#.replace('<|im_end|>', '<|im_end|>\n')

        if len(images) < 1:
            prompt = "<|img|>" + prompt
            images = [await url_to_image(transparent_pixel_url)]

        inputs = self.processor(images=images, text=prompt, return_tensors="pt")
        inputs["pixel_values"] = inputs["pixel_values"].to(self.model.dtype)
        inputs = inputs.to(self.model.device)

        default_params = {
            'max_new_tokens': 500,
            'do_sample': False,
            # 'temperature': 0.9, # random test failures, ex. OCR
            'stop_strings': [self.eos_token],
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            tokenizer=self.processor.tokenizer,
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/aya_vision.py:
--------------------------------------------------------------------------------
from transformers import AutoProcessor, AutoModelForImageTextToText

from vision_qna import *

class VisionQnA(VisionQnABase):
    model_name: str = "aya_vision"
    format: str = "internal"
    visual_layers: List[str] = ["multi_modal_projector", "vision_tower"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        if self.params['torch_dtype'] == torch.bfloat16:
            self.dtype = self.params['torch_dtype'] = torch.float16

        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForImageTextToText.from_pretrained(**self.params).eval() # model_id, device_map="auto",

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()


    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        messages = messages_from_messages(request.messages)

        inputs = self.processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(self.model.device)

        default_params = {
            'do_sample': True,
            'temperature': 0.3,
            # 'eos_token_id': self.processor.tokenizer.eos_token_id,
            # 'pad_token_id': self.processor.tokenizer.eos_token_id,
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/bunny.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM, logging

from vision_qna import *

import warnings
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# BAAI/Bunny-Llama-3-8B-V - vicuna (llama3?)
# BAAI/Bunny-v1_0-2B-zh
# BAAI/Bunny-v1_0-3B-zh (wont 4bit)
# BAAI/Bunny-v1_0-3B
# BAAI/Bunny-v1_0-4B - vicuna (phi3??)
# BAAI/Bunny-v1_1-4B - vicuna (phi2??)
# BAAI/Bunny-v1_1-Llama-3-8B-V - vicuna (llama3??)

class VisionQnA(VisionQnABase):
    model_name: str = "bunny"
    format: str = "vicuna"
    vision_layers: List[str] = ["vision_tower", "mm_projector"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        torch.set_default_device(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        if not images:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.device)
            image_tensor = None
        else:
            text_chunks = [self.tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.model.device)

            image_tensor = self.model.process_images(images, self.model.config).to(dtype=self.model.dtype, device=self.model.device)

        default_params = dict(
            repetition_penalty=1.0,
        )

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            input_ids=input_ids,
            images=image_tensor,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/chatglm.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM
from torchvision import transforms
import torch
from vision_qna import *

# THUDM/glm-4v-9b

class VisionQnA(VisionQnABase):
    model_name: str = "chatglm"
    format: str = 'glm-4v'
    vision_layers: List[str] = ['vision']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.transform = transforms.Compose(
            [
                transforms.Resize(
                    (self.model.config.vision_config['image_size'], self.model.config.vision_config['image_size']), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
            ]
        )

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await glm4v_prompt_from_messages(request.messages)

        input_ids = self.tokenizer.encode(prompt)
        inputs = self.tokenizer.batch_encode_plus(
            [input_ids],
            padding=False,
            truncation=False,
            max_length=None,
            return_tensors="pt",
            is_split_into_words=True,
            add_special_tokens=False
        )

        if images:
            inputs["images"] = torch.stack([ self.transform(img) for img in images ])

        inputs = inputs.to(device=self.device)

        default_params = {
            'max_new_tokens': 2500,
            'do_sample': False,
        }

        params = self.get_generation_params(request, default_params)

        # streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=False, skip_prompt=True)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/cogvlm.py:
--------------------------------------------------------------------------------
from transformers import LlamaTokenizer, AutoModelForCausalLM

from vision_qna import *

# THUDM/cogvlm-chat-hf
# THUDM/cogagent-chat-hf
import transformers
transformers.logging.set_verbosity_error()

class VisionQnA(VisionQnABase):
    model_name: str = "cogvlm"
    format: str = 'llama2'

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:

        query, history, images, system_message = await prompt_history_images_system_from_messages(
            request.messages, img_tok='', url_handler=url_to_image)

        if len(images) < 1:
            images = [ await url_to_image(transparent_pixel_url) ]

        input_by_model = self.model.build_conversation_input_ids(self.tokenizer, query=query, history=history, images=images)

        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(self.model.device),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(self.model.device),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(self.model.device),
            'images': [[input_by_model['images'][0].to(self.model.device).to(self.model.dtype)]],
        }
        if 'cross_images' in input_by_model and input_by_model['cross_images']:
            inputs['cross_images'] = [[input_by_model['cross_images'][0].to(self.model.device).to(self.model.dtype)]]

        params = self.get_generation_params(request)

        del params['top_k']

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/cogvlm2.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM

from vision_qna import *

# THUDM/cogvlm2-llama3-chat-19B
# THUDM/cogvlm2-llama3-chinese-chat-19B
import transformers
transformers.logging.set_verbosity_error()

class VisionQnA(VisionQnABase):
    model_name: str = "cogvlm2"
    format: str = 'llama3'
    vision_layers: List[str] = ['vision']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:

        query, history, images, system_message = await prompt_history_images_system_from_messages(
            request.messages, img_tok='', url_handler=url_to_image)

        input_by_model = self.model.build_conversation_input_ids(self.tokenizer, query=query, history=history, images=images, template_version='chat')

        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(self.model.device),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(self.model.device),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(self.model.device),
            'images': [[input_by_model['images'][0].to(self.model.device).to(self.model.dtype)]] if images else None,
        }

        default_params = {
            'max_new_tokens': 2048,
            'pad_token_id': 128002,
            'top_p': None, # 0.9
            'temperature': None, # 0.6
        }

        params = self.get_generation_params(request, default_params)

        # streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=False, skip_prompt=True)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/deepseek-vl2.py:
--------------------------------------------------------------------------------
from transformers import AutoModelForCausalLM

from deepseek_vl.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl.utils.io import load_pil_images

from vision_qna import *

WIP


class VisionQnA(VisionQnABase):
    model_name: str = "deepseek_vl2"
    format: str = "custom"
    visual_layers: List[str] = []

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        if not format:
            self.format = guess_model_format(model_id)

        # specify the path to the model
        self.processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_id)
        self.model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_id, **self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(device=self.device)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        ## single image conversation example
        conversation = [
            {
                "role": "<|User|>",
                "content": "\n<|ref|>The giraffe at the back.<|/ref|>.",
                "images": ["./images/visual_grounding.jpeg"],
            },
            {"role": "<|Assistant|>", "content": ""},
        ]

        ## multiple images (or in-context learning) conversation example
        # conversation = [
        #     {
        #         "role": "User",
        #         "content": "A dog wearing nothing in the foreground, "
        #                    "a dog wearing a santa hat, "
        #                    "a dog wearing a wizard outfit, and "
        #                    "what's the dog wearing?",
        #         "images": [
        #             "images/dog_a.png",
        #             "images/dog_b.png",
        #             "images/dog_c.png",
        #             "images/dog_d.png",
        #         ],
        #     },
        #     {"role": "Assistant", "content": ""}
        # ]

        conversation = ... prompt

        prepare_inputs = self.processor(
            conversations=conversation,
            images=images,
            force_batchify=True,
            system_prompt=""
        ).to(self.model.device)

        inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

        default_params = {
            'pad_token_id': self.processor.tokenizer.eos_token_id,
            'bos_token_id': self.processor.tokenizer.bos_token_id,
            'eos_token_id': self.processor.tokenizer.eos_token_id,
            'do_sample': False,
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/dragonfly.py:
--------------------------------------------------------------------------------
from threading import Thread
from transformers import AutoTokenizer, AutoProcessor, logging
from dragonfly.models.modeling_dragonfly import DragonflyForCausalLM
from dragonfly.models.processing_dragonfly import DragonflyProcessor

import warnings
# disable some warnings
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

from vision_qna import *

# togethercomputer/Llama-3-8B-Dragonfly-v1
# togethercomputer/Llama-3-8B-Dragonfly-Med-v1

class VisionQnA(VisionQnABase):
    model_name: str = "dragonfly"
    format: str = 'llama3'
    vision_layers: List[str] = ['image_encoder', 'vision_model', 'encoder', 'mpl', 'vision_embed_tokens']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        del self.params['trust_remote_code']

        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = DragonflyProcessor(image_processor=clip_processor.image_processor, tokenizer=self.tokenizer, image_encoding_style="llava-hd")

        self.model = DragonflyForCausalLM.from_pretrained(**self.params)

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(dtype=self.dtype, device=self.device)

        self.eos_token = "<|eot_id|>"
        self.eos_token_id = self.tokenizer.encode(self.eos_token, add_special_tokens=False)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await llama3_prompt_from_messages(request.messages, img_tok='')

        if not images:
            images = [ await url_to_image(transparent_pixel_url) ]

        inputs = self.processor(text=[prompt], images=images, max_length=2048, return_tensors="pt", is_generate=True).to(device=self.model.device)

        default_params = {
            'max_new_tokens': 1024,
            'eos_token_id': self.eos_token_id,
            'pad_token_id': self.eos_token_id[0],
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/dv-qwen.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM, logging

import warnings

from vision_qna import *

# disable some warnings
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# cognitivecomputations/dolphin-vision-72b
# cognitivecomputations/dolphin-vision-7b

class VisionQnA(VisionQnABase):
    model_name: str = "dolphin-vision"
    format: str = "chatml"
    visual_layers: List[str] = ["vision_tower", "mm_projector"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        if not images:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.device)
            image_tensor = None
        else:
            text_chunks = [self.tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(self.model.device)

            image_tensor = self.model.process_images(images, self.model.config).to(dtype=self.model.dtype, device=self.model.device)

        params = self.get_generation_params(request)

        generation_kwargs = dict(
            input_ids=input_ids,
            images=image_tensor,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/emu.py:
--------------------------------------------------------------------------------
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
from loguru import logger
from vision_qna import *

# BAAI/Emu2-Chat

class VisionQnA(VisionQnABase):
    model_name: str = 'emu'
    format: str = 'emu'
    vision_layers: List[str] = ["visual", "project_up", "project_down"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

        if extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False):
            if self.params['torch_dtype'] == torch.bfloat16:
                self.dtype = self.params['torch_dtype'] = torch.float16

            self.model = AutoModelForCausalLM.from_pretrained(**self.params)
        else:
            checkpoint = snapshot_download(model_id)
            with init_empty_weights():
                self.model = AutoModelForCausalLM.from_pretrained(**self.params)

            max_memory=extra_params.get('max_memory', None)

            device_map = infer_auto_device_map(self.model, max_memory=max_memory, no_split_module_classes=['Block','LlamaDecoderLayer'])
            # input and output logits should be on same device
            device_map["model.decoder.lm.lm_head"] = 0

            self.model = load_checkpoint_and_dispatch(self.model, checkpoint=checkpoint, device_map=device_map).eval()

        # self.model.device/dtype are overloaded with some other object
        logger.info(f"Loaded {model_id} on device: {self.device} with dtype: {self.params['torch_dtype']}")

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt, system = await emu_images_prompt_system_from_messages(request.messages)

        if not system:
            system = "You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses."

        prompt = system + prompt

        inputs = self.model.build_input_ids(
            text=[prompt],
            tokenizer=self.tokenizer,
            image=images if images else None
        )

        default_params = {
            'length_penalty': 1.0,
            'num_beams': 1, # for streaming
            'do_sample': True,
        }

        params = self.get_generation_params(request, default_params)

        generation_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image=inputs["image"].to(self.params['torch_dtype']) if images else None,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/emu3.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor, AutoModelForCausalLM
from transformers.generation.configuration_utils import GenerationConfig
from Emu3.emu3.mllm.processing_emu3 import Emu3Processor

from vision_qna import *

# BAAI/Emu3-Chat

VQ_HUB = "BAAI/Emu3-VisionTokenizer"

class VisionQnA(VisionQnABase):
    model_name: str = "emu3"
    format: str = "vicuna"
    visual_layers: List[str] = []

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        image_processor = AutoImageProcessor.from_pretrained(VQ_HUB, trust_remote_code=True)
        image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map=self.params['device_map'], trust_remote_code=self.params.get('trust_remote_code', False)).eval()
        self.processor = Emu3Processor(image_processor, image_tokenizer, tokenizer)

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        image = None
        text = ''

        for m in request.messages:
            if m.role == 'user':
                for c in m.content:
                    if c.type == 'image_url':
                        image = await url_to_image(c.image_url.url)
                        break

        if image is None:
            image = await url_to_image(black_pixel_url)
        text = "".join([t.text for t in request.messages[-1].content if t.text])

        inputs = self.processor(text=text, image=image, mode='U', padding_side="left", padding="longest", return_tensors="pt")

        default_params = dict(
            max_new_tokens=320,
            pad_token_id=self.processor.tokenizer.pad_token_id,
            bos_token_id=self.processor.tokenizer.bos_token_id,
            eos_token_id=self.processor.tokenizer.eos_token_id,
        )

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            input_ids=inputs.input_ids.to(self.device),
            generation_config=GenerationConfig(**params),
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/florence2.py:
--------------------------------------------------------------------------------
from transformers import AutoProcessor, AutoModelForCausalLM

from vision_qna import *

# microsoft/Florence-2-large-ft
# microsoft/Florence-2-base-ft

def select_task(prompt):
    tasks = ["<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>", "<OCR>", # simple tasks
             "<OCR_WITH_REGION>", "<OD>", "<DENSE_REGION_CAPTION>", "<REGION_PROPOSAL>",
             "<CAPTION_TO_PHRASE_GROUNDING>", "<REFERRING_EXPRESSION_SEGMENTATION>", "<REGION_TO_SEGMENTATION>",
             "<OPEN_VOCABULARY_DETECTION>", "<REGION_TO_CATEGORY>", "<REGION_TO_DESCRIPTION>"
    ]
    for task in tasks:
        if task in prompt:
            return task

    return None

class VisionQnA(VisionQnABase):
    model_name: str = "florence2"
    format: str = "florence"
    visual_layers: List[str] = ['vision_tower', 'image_proj_norm', 'image_pos_embed', 'visual_temporal_embed']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        if not format:
            self.format = guess_model_format(model_id)

        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        if len(images) < 1:
            images = [ await url_to_image(black_pixel_url) ]

        inputs = self.processor(text=prompt, images=images[0], return_tensors="pt").to(device=self.model.device, dtype=self.model.dtype)

        default_params = {
            'do_sample': False,
            'num_beams': 3,
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        tps_start = time.time()
        generated_ids = self.model.generate(**generation_kwargs)
        logger.info(f"Generated {len(generated_ids[0])} tokens at {len(generated_ids[0]) / (time.time() - tps_start):0.2f} T/s")

        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed_answer = self.processor.post_process_generation(generated_text, task=select_task(prompt), image_size=(images[0].width, images[0].height))

        for k, v in parsed_answer.items():
            return str(v)


--------------------------------------------------------------------------------
/backend/fuyu.py:
--------------------------------------------------------------------------------
from transformers import FuyuProcessor, FuyuForCausalLM

from vision_qna import *

# "adept/fuyu-8b"

class VisionQnA(VisionQnABase):
    model_name: str = "fuyu"
    format: str = "fuyu"
    vision_layers: List[str] = ["vision_embed_tokens"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        if not format:
            self.format = guess_model_format(model_id)

        del self.params['trust_remote_code'] # not needed.

        self.processor = FuyuProcessor.from_pretrained(model_id)
        self.model = FuyuForCausalLM.from_pretrained(**self.params)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        inputs = self.processor(text=prompt, images=images[0] if images else None, return_tensors="pt").to(self.model.device)

        params = self.get_generation_params(request)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/generic.py:
--------------------------------------------------------------------------------
#from transformers import AutoProcessor, AutoModel
from transformers import AutoProcessor, AutoModelForVision2Seq

from vision_qna import *

class VisionQnA(VisionQnABase):
    model_name: str = "generic"
    format: str = "generic"
    visual_layers: List[str] = []

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        if not format:
            self.format = guess_model_format(model_id)

        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModelForVision2Seq.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    # newer style
    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        messages = chat_from_messages(request.messages)

        inputs = self.processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(self.model.device)

        default_params = {
            'do_sample': True,
            'temperature': 0.3,
            # 'eos_token_id': self.processor.tokenizer.eos_token_id,
            # 'pad_token_id': self.processor.tokenizer.eos_token_id,
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.model.device)

        default_params = {
            'do_sample': False,
            # 'eos_token_id': self.processor.tokenizer.eos_token_id,
            # 'pad_token_id': self.processor.tokenizer.eos_token_id,
        }

        params = self.get_generation_params(request, default_params=default_params)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        images, prompt = await prompt_from_messages(request.messages, self.format)

        inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.model.device)

        default_params = {
            'do_sample': False,
            # 'eos_token_id': self.processor.tokenizer.eos_token_id,
            # 'pad_token_id': self.processor.tokenizer.eos_token_id,
        }

        params = self.get_generation_params(request, default_params=default_params)


        tps_start = time.time()
        output = self.model.generate(**inputs, **params)
        out_tokens = output[0][inputs.input_ids.size(1):].cpu()
        logger.info(f"Generated {len(out_tokens)} tokens at {len(out_tokens) / (time.time() - tps_start):0.2f} T/s")
        response = self.processor.tokenizer.decode(out_tokens, skip_special_tokens=True)

        return response


--------------------------------------------------------------------------------
/backend/got.py:
--------------------------------------------------------------------------------
from transformers import AutoTokenizer, AutoModel

from vision_qna import *

# ucaslcl/GOT-OCR2_0 XXX
# stepfun-ai/GOT-OCR2_0

DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
DEFAULT_IM_START_TOKEN = '<img>'
DEFAULT_IM_END_TOKEN = '</img>'

class VisionQnA(VisionQnABase):
    model_name: str = "got"
    format: str = "custom"
    visual_layers: List[str] = ['vision_tower_high', 'mm_projector_vary']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModel.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        try:
            image = None
            for m in reversed(request.messages):
                for c in m.content:
                    if c.type == 'image_url':
                        image = await url_to_file(c.image_url.url)
                        break

            response = self.model.chat(self.tokenizer, image, ocr_type='ocr') # TODO: support format and maybe convert to markdown?

            return response
        finally:
            if image:
                os.remove(image)

--------------------------------------------------------------------------------
/backend/idefics2.py:
--------------------------------------------------------------------------------
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers import AwqConfig

from vision_qna import *

# HuggingFaceM4/idefics2-8b
# HuggingFaceM4/idefics2-8b-AWQ
# HuggingFaceM4/idefics2-8b-chatty
# HuggingFaceM4/idefics2-8b-chatty-AWQ

class VisionQnA(VisionQnABase):
    format: str = 'internal'
    model_name: str = "idefics2"
    vision_layers: List[str] = ['vision_model', 'connector']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        #do_image_splitting=False
        #size= {"longest_edge": 448, "shortest_edge": 378}
        self.processor = AutoProcessor.from_pretrained(model_id)

        if '-awq' in model_id.lower():
            """
            # This is from https://huggingface.co/HuggingFaceM4/idefics2-8b
            # It doesn't work
            quantization_config = AwqConfig(
                bits=4,
                fuse_max_seq_len=4096,
                modules_to_fuse={
                    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
                    "mlp": ["gate_proj", "up_proj", "down_proj"],
                    "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
                    "use_alibi": False,
                    "num_attention_heads": 32,
                    "num_key_value_heads": 8,
                    "hidden_size": 4096,
                }
            )
            self.params['quantization_config'] = quantization_config
            """

            if self.params['torch_dtype'] == torch.bfloat16:
                self.dtype = self.params['torch_dtype'] = torch.float16

        self.model = AutoModelForVision2Seq.from_pretrained(**self.params)

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            self.model = self.model.to(self.device)

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        images, hfmessages = await images_hfmessages_from_messages(request.messages)

        prompt = self.processor.apply_chat_template(hfmessages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=images if images else None, return_tensors="pt").to(device=self.model.device)

        # Generate
        params = self.get_generation_params(request)

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break

--------------------------------------------------------------------------------
/backend/idefics3.py:
--------------------------------------------------------------------------------
from transformers import AutoProcessor, AutoModelForVision2Seq

from vision_qna import *

# HuggingFaceTB/SmolVLM-Instruct
# HuggingFaceM4/Idefics3-8B-Llama3

class VisionQnA(VisionQnABase):
    model_name: str = "idefics3"
    format: str = "internal"
    visual_layers: List[str] = ["vision_model"]

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForVision2Seq.from_pretrained(**self.params).eval()

        # bitsandbytes already moves the model to the device, so we don't need to do it again.
20 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 21 | self.model = self.model.to(self.device) 22 | 23 | self.loaded_banner() 24 | 25 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 26 | images, messages = await images_hfmessages_from_messages(request.messages) 27 | prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True) 28 | 29 | if len(images) < 1: 30 | images = [ await url_to_image(black_pixel_url) ] 31 | prompt = "\n" + prompt 32 | 33 | inputs = self.processor(text=prompt, images=images, return_tensors="pt").to(self.device) 34 | 35 | params = self.get_generation_params(request) 36 | 37 | generation_kwargs = dict( 38 | **inputs, 39 | **params, 40 | ) 41 | 42 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 43 | end = new_text.find(self.processor.tokenizer.eos_token) 44 | if end == -1: 45 | yield new_text 46 | else: 47 | yield new_text[:end] 48 | break 49 | -------------------------------------------------------------------------------- /backend/internlm2-wqx-vl.py: -------------------------------------------------------------------------------- 1 | import os 2 | from math import ceil 3 | import warnings 4 | import torch 5 | from transformers import AutoTokenizer, AutoModel, logging 6 | from torchvision import transforms 7 | from huggingface_hub import snapshot_download 8 | 9 | 10 | from vision_qna import * 11 | 12 | #logging.set_verbosity_error() 13 | #warnings.filterwarnings('ignore') 14 | 15 | # internlm/internlm2-wqx-vl-20b 16 | 17 | # --4bit: 18 | # Linear4bit.forward() takes 2 positional arguments but 3 were given 19 | 20 | class VisionQnA(VisionQnABase): 21 | model_name: str = "internlm2-wqx-vl" 22 | format: str = "chatml" 23 | vision_layers: List[str] = ['vit', 'vision_proj', 'vision_tower'] 24 | 25 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 26 | super().__init__(model_id, device, device_map, extra_params, format) 27 | 28 | #torch.set_default_dtype(self.dtype) 29 | 30 | self.params['pretrained_model_name_or_path'] = model_id = snapshot_download(model_id) 31 | 32 | #self.max_tiles = extra_params.get('max_tiles', MAX_TILES) 33 | 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 35 | self.model = AutoModel.from_pretrained(**self.params).eval() 36 | 37 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 38 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 39 | self.model = self.model.to(self.device) 40 | 41 | self.eos_token = '<|im_end|>' # [UNUSED_TOKEN_145] 42 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids([self.eos_token])[0] 43 | 44 | self.loaded_banner() 45 | 46 | 47 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 48 | images, prompt = await chatml_prompt_from_messages(request.messages, img_tok = "") 49 | 50 | system_default = ("You are an AI assistant whose name is InternLM (书生·浦语).\n" 51 | "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n" 52 | "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."), 53 | 54 | inputs = self.tokenizer([prompt], return_tensors="pt") 55 | inputs = { 56 | k: v.to(self.model.device) 57 | for k, v in inputs.items() if torch.is_tensor(v) 58 | } 59 | 60 | if images: 61 | images = self.model.vis_processor(images[-1]).unsqueeze(0).to(self.model.device) 62 | 63 | # XXX server-1 | sub_img = img.reshape(1,3,H//560,560,W//560,560).permute(0,2,4,1,3,5).reshape(-1,3,560,560).contiguous() 64 | # Ex. RuntimeError: shape '[1, 3, 1, 560, 1, 560]' is invalid for input of size 1224216 65 | img_embeds, img_split = self.model.vit([images], self.model.plora_glb_GN, self.model.plora_sub_GN) 66 | 67 | img_embeds = self.model.vision_proj(img_embeds) 68 | inputs['img_embeds'] = img_embeds 69 | 70 | default_params = { 71 | #'num_beams': 3, 72 | #'do_sample': False, 73 | "temperature": 0.8, 74 | "top_p": 0.8, 75 | 'do_sample': True, 76 | 'repetition_penalty': 1.005, 77 | 'eos_token_id': [ self.tokenizer.eos_token_id, self.eos_token_id ], # also add end-of-assistant token in eos token id to avoid unnecessary generation 78 | } 79 | params = self.get_generation_params(request, default_params) 80 | 81 | generation_kwargs = dict( 82 | **inputs, 83 | **params, 84 | ) 85 | 86 | def wrapper(**kwargs): 87 | with torch.cuda.amp.autocast(): 88 | _ = self.model.generate(**kwargs) 89 | 90 | for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 91 | end = new_text.find(self.eos_token) 92 | if end == -1: 93 | yield new_text 94 | else: 95 | yield new_text[:end] 96 | break 97 | 98 | -------------------------------------------------------------------------------- /backend/internlm2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | import torch 4 | from transformers import AutoTokenizer, AutoModel, logging 5 | 6 | from vision_qna import * 7 | 8 | logging.set_verbosity_error() 9 | warnings.filterwarnings('ignore') 10 | 11 | # internlm/internlm-xcomposer2d5 12 | MAX_TILES = 24 13 | 14 | class VisionQnA(VisionQnABase): 15 | model_name: str = "internlm2" 16 | format: str = "internal" 17 | vision_layers: List[str] = ['vit', 'vision_proj'] 18 | 19 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 20 | super().__init__(model_id, device, device_map, extra_params, format) 21 | 22 | torch.set_grad_enabled(False) 23 | 24 | self.max_tiles = extra_params.get('max_tiles', MAX_TILES) 25 | 26 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 27 | self.model = AutoModel.from_pretrained(**self.params).eval() 28 | self.model.tokenizer = self.tokenizer 29 | 30 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
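Aside (illustrative helper, not repo code): the quantization guard that this comment introduces is repeated in nearly every backend; factored out it amounts to the following, where the helper name is an assumption.

import torch.nn as nn

def maybe_move_to_device(model: nn.Module, device: str, extra_params: dict) -> nn.Module:
    # bitsandbytes places 4-bit/8-bit weights on the GPU at load time, so only
    # full-precision models need an explicit .to(device).
    quantized = extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)
    return model if quantized else model.to(device)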
31 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 32 | self.model = self.model.to(self.device) 33 | 34 | self.eos_token = '[UNUSED_TOKEN_145]' 35 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids([self.eos_token])[0] 36 | 37 | self.loaded_banner() 38 | 39 | 40 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 41 | prompt, history, files, meta_instruction = await prompt_history_images_system_from_messages(request.messages, img_tok='', url_handler=url_to_file) 42 | 43 | with torch.autocast(device_type='cuda', dtype=torch.float16): 44 | inputs, im_mask, _ = self.model.interleav_wrap_chat(prompt, files, history=history, meta_instruction=meta_instruction, hd_num=self.max_tiles) 45 | 46 | inputs = { 47 | k: v.to(self.device) 48 | for k, v in inputs.items() if torch.is_tensor(v) 49 | } 50 | inputs['im_mask'] = im_mask 51 | 52 | default_params = { 53 | #'num_beams': 3, 54 | #'do_sample': False, 55 | "temperature": 1.0, 56 | "top_p": 0.8, 57 | 'do_sample': True, 58 | 'repetition_penalty': 1.005, 59 | 'eos_token_id': [ self.tokenizer.eos_token_id, self.eos_token_id ], # also add end-of-assistant token in eos token id to avoid unnecessary generation 60 | } 61 | params = self.get_generation_params(request, default_params) 62 | 63 | generation_kwargs = dict( 64 | **inputs, 65 | **params, 66 | ) 67 | 68 | try: 69 | def wrapper(**kwargs): 70 | with torch.autocast(device_type='cuda', dtype=torch.float16): 71 | _ = self.model.generate(**kwargs) 72 | 73 | for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 74 | end = new_text.find(self.eos_token) 75 | if end == -1: 76 | yield new_text 77 | else: 78 | yield new_text[:end] 79 | break 80 | 81 | except Exception as e: 82 | logger.error(e) 83 | # raise 84 | 85 | finally: 86 | for f in files: 87 | os.remove(f) 88 | 89 | -------------------------------------------------------------------------------- /backend/internlmxcomposer2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from math import ceil 3 | import warnings 4 | import torch 5 | from transformers import AutoTokenizer, AutoModel, logging 6 | 7 | from vision_qna import * 8 | 9 | logging.set_verbosity_error() 10 | warnings.filterwarnings('ignore') 11 | 12 | # internlm/internlm-xcomposer2-4khd-7b 13 | MAX_TILES = 55 14 | 15 | # --4bit: 16 | # Linear4bit.forward() takes 2 positional arguments but 3 were given 17 | 18 | def calc_hd(image, max_num=MAX_TILES): 19 | # Not sure if this is correct, but there are no instructions for how to set it 20 | img = Image.open(image) 21 | width, height = img.size 22 | del img 23 | 24 | return min(ceil(width // 336) * ceil(height // 336), max_num) 25 | 26 | class VisionQnA(VisionQnABase): 27 | model_name: str = "internlmxcomposer2" 28 | format: str = "chatml" 29 | vision_layers: List[str] = ['vit', 'vision_proj'] 30 | 31 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 32 | super().__init__(model_id, device, device_map, extra_params, format) 33 | 34 | torch.set_default_dtype(self.dtype) 35 | 36 | self.max_tiles = extra_params.get('max_tiles', MAX_TILES) 37 | 38 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 39 | self.model = AutoModel.from_pretrained(**self.params).eval() 40 | 41 | # bitsandbytes already moves 
the model to the device, so we don't need to do it again. 42 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 43 | self.model = self.model.to(self.device) 44 | 45 | self.eos_token = '[UNUSED_TOKEN_145]' 46 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids([self.eos_token])[0] 47 | 48 | self.loaded_banner() 49 | 50 | 51 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 52 | prompt, history, files, meta_instruction = await prompt_history_images_system_from_messages(request.messages, img_tok='', url_handler=url_to_file) 53 | 54 | meta_instruction = meta_instruction if meta_instruction else ('You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n' 55 | '- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' 56 | '- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.\n' 57 | '- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.'), 58 | 59 | if files: 60 | image = self.model.encode_img(files[-1], hd_num=calc_hd(files[-1], max_num=self.max_tiles)) 61 | inputs, im_mask = self.model.interleav_wrap_chat(self.tokenizer, prompt, image, history, meta_instruction) 62 | else: 63 | inputs = self.model.build_inputs(self.tokenizer, prompt, history, meta_instruction) 64 | im_mask = torch.zeros(inputs['input_ids'].shape[:2]).cuda().bool() 65 | inputs = { 66 | k: v.to(self.device) 67 | for k, v in inputs.items() if torch.is_tensor(v) 68 | } 69 | inputs['im_mask'] = im_mask 70 | 71 | default_params = { 72 | #'num_beams': 3, 73 | #'do_sample': False, 74 | "temperature": 1.0, 75 | "top_p": 0.8, 76 | 'do_sample': True, 77 | 'repetition_penalty': 1.005, 78 | 'eos_token_id': [ self.tokenizer.eos_token_id, self.eos_token_id ], # also add end-of-assistant token in eos token id to avoid unnecessary generation 79 | } 80 | params = self.get_generation_params(request, default_params) 81 | 82 | generation_kwargs = dict( 83 | **inputs, 84 | **params, 85 | ) 86 | 87 | try: 88 | def wrapper(**kwargs): 89 | with torch.cuda.amp.autocast(): 90 | _ = self.model.generate(**kwargs) 91 | 92 | for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 93 | end = new_text.find(self.eos_token) 94 | if end == -1: 95 | yield new_text 96 | else: 97 | yield new_text[:end] 98 | break 99 | 100 | except Exception as e: 101 | logger.error(e) 102 | # raise 103 | 104 | finally: 105 | for f in files: 106 | os.remove(f) 107 | 108 | -------------------------------------------------------------------------------- /backend/joy-caption-pre-alpha.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoModelForCausalLM 2 | import torch 3 | from torch import nn 4 | import torch.amp.autocast_mode 5 | from huggingface_hub import hf_hub_download 6 | 7 | from vision_qna import * 8 | 9 | # fancyfeast/joy-caption-pre-alpha 10 | 11 | class ImageAdapter(nn.Module): 12 | def __init__(self, input_features: int, output_features: int): 13 | super().__init__() 14 | self.linear1 = nn.Linear(input_features, output_features) 15 | self.activation = nn.GELU() 16 | self.linear2 = nn.Linear(output_features, 
output_features) 17 | 18 | def forward(self, vision_outputs: torch.Tensor): 19 | x = self.linear1(vision_outputs) 20 | x = self.activation(x) 21 | x = self.linear2(x) 22 | return x 23 | 24 | 25 | class VisionQnA(VisionQnABase): 26 | model_name: str = "joy-caption-pre-alpha" 27 | format: str = "llama3" 28 | visual_layers: List[str] = [] 29 | 30 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 31 | 32 | logger.warning("Loading fancyfeast/joy-caption-pre-alpha with wpkklhc6/image_adapter.pt, ") 33 | # XXX Ignore the actual model_id 34 | if extra_params.get("load_in_4bit", False): 35 | model_id = "unsloth/Meta-Llama-3.1-8B-bnb-4bit" # no authorization required 36 | else: 37 | model_id = "meta-llama/Meta-Llama-3.1-8B" # requires authorized access 38 | 39 | super().__init__(model_id, device, device_map, extra_params, format) 40 | 41 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) 42 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 43 | 44 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 45 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 46 | self.model = self.model.to(self.device) 47 | 48 | CLIP_PATH = "google/siglip-so400m-patch14-384" 49 | 50 | self.clip_processor = AutoProcessor.from_pretrained(CLIP_PATH) 51 | self.clip_model = AutoModel.from_pretrained(CLIP_PATH) 52 | self.clip_model = self.clip_model.vision_model 53 | self.clip_model.eval() 54 | self.clip_model.requires_grad_(False) 55 | self.clip_model.to(self.device) 56 | 57 | self.image_adapter = ImageAdapter(self.clip_model.config.hidden_size, self.model.config.hidden_size) 58 | CHECKPOINT_PATH = hf_hub_download(repo_id="fancyfeast/joy-caption-pre-alpha", repo_type="space", subfolder="wpkklhc6", filename="image_adapter.pt") 59 | checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu") 60 | self.image_adapter.load_state_dict(checkpoint) 61 | self.image_adapter.eval() 62 | self.image_adapter.to(self.device) 63 | 64 | self.loaded_banner() 65 | 66 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 67 | images, prompt = await prompt_from_messages(request.messages, self.format) 68 | 69 | #prompt = "A descriptive caption for this image:\n" 70 | 71 | # Tokenize the prompt 72 | prompt_tok = self.tokenizer.encode(prompt, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False) 73 | 74 | if len(images) < 1: 75 | inputs = dict( 76 | input_ids=prompt_tok.to(device=self.device), 77 | ) 78 | else: 79 | # Preprocess image 80 | image = self.clip_processor(images=images[0], return_tensors='pt').pixel_values.to(device=self.device) 81 | 82 | # Embed image 83 | with torch.amp.autocast_mode.autocast('cuda', enabled=True): 84 | vision_outputs = self.clip_model(pixel_values=image, output_hidden_states=True) 85 | image_features = vision_outputs.hidden_states[-2] 86 | embedded_images = self.image_adapter(image_features) 87 | embedded_images = embedded_images.to(self.device) 88 | 89 | # Embed prompt 90 | prompt_embeds = self.model.model.embed_tokens(prompt_tok.to(device=self.device)) 91 | embedded_bos = self.model.model.embed_tokens(torch.tensor([[self.tokenizer.bos_token_id]], device=self.device, dtype=torch.int64)) 92 | 93 | # Construct prompts 94 | inputs_embeds = torch.cat([ 95 | embedded_bos.expand(embedded_images.shape[0], -1, -1), 96 | embedded_images.to(dtype=embedded_bos.dtype), 97 | 
prompt_embeds.expand(embedded_images.shape[0], -1, -1), 98 | ], dim=1) 99 | 100 | input_ids = torch.cat([ 101 | torch.tensor([[self.tokenizer.bos_token_id]], dtype=torch.long), 102 | torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), 103 | prompt_tok, 104 | ], dim=1).to(device=self.device) 105 | attention_mask = torch.ones_like(input_ids) 106 | 107 | inputs = dict( 108 | input_ids=input_ids, 109 | inputs_embeds=inputs_embeds, 110 | attention_mask=attention_mask, 111 | ) 112 | 113 | default_params = dict( 114 | max_new_tokens=300, 115 | do_sample=True, 116 | top_k=10, 117 | temperature=0.5, 118 | suppress_tokens=None, 119 | ) 120 | 121 | params = self.get_generation_params(request, default_params=default_params) 122 | 123 | generation_kwargs = dict( 124 | **inputs, 125 | **params, 126 | ) 127 | 128 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 129 | end = new_text.find(self.tokenizer.eos_token) 130 | if end == -1: 131 | yield new_text 132 | else: 133 | yield new_text[:end] 134 | break 135 | -------------------------------------------------------------------------------- /backend/llamavision.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | 3 | from vision_qna import * 4 | 5 | # qresearch/llama-3-vision-alpha-hf 6 | 7 | # Doesn't support generation without images 8 | 9 | class VisionQnA(VisionQnABase): 10 | model_name: str = "llamavision" 11 | format: str = "llama3" 12 | vision_layers: List[str] = ["mm_projector", "vision_model"] 13 | 14 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 15 | super().__init__(model_id, device, None, extra_params, format) 16 | 17 | if not format: 18 | self.format = guess_model_format(model_id) 19 | 20 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) 21 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 22 | 23 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
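Aside (illustrative sketch, not repo code): the joy-caption backend above builds its model input by concatenating embeddings rather than token ids. The shape bookkeeping looks like the following; all dimensions here are made up for the example.

import torch

bos = torch.randn(1, 1, 4096)              # embedded BOS token
image_embeds = torch.randn(1, 729, 4096)   # projected CLIP features (length is illustrative)
prompt_embeds = torch.randn(1, 12, 4096)   # embedded prompt tokens

# Concatenate along the sequence dimension and build a matching mask of ones,
# as done with embedded_bos / embedded_images / prompt_embeds above.
inputs_embeds = torch.cat([bos, image_embeds, prompt_embeds], dim=1)
attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.long)
print(inputs_embeds.shape)  # torch.Size([1, 742, 4096])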
24 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 25 | self.model = self.model.to(self.device) 26 | 27 | self.loaded_banner() 28 | 29 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 30 | images, prompt = await prompt_from_messages(request.messages, self.format) 31 | 32 | if len(images) < 1: 33 | images = [ await url_to_image(black_pixel_url) ] 34 | prompt = '\n' + prompt 35 | 36 | input_ids = self.model.tokenizer_image_token(prompt, self.tokenizer, -200, return_tensors="pt").unsqueeze(0).to(self.device) 37 | image_inputs = self.model.processor( 38 | images=images, 39 | return_tensors="pt", 40 | do_resize=True, 41 | size={"height": 384, "width": 384}, 42 | ) 43 | 44 | image_inputs = image_inputs["pixel_values"].to(device=self.device, dtype=self.dtype) 45 | image_forward_outs = self.model.vision_model(image_inputs, output_hidden_states=True) 46 | image_features = image_forward_outs.hidden_states[-2] 47 | projected_embeddings = self.model.mm_projector(image_features).to(self.device) 48 | embedding_layer = self.model.text_model.get_input_embeddings() # .to(self.device) 49 | new_embeds, attn_mask = self.model.process_tensors(input_ids, projected_embeddings, embedding_layer) 50 | 51 | default_params = dict( 52 | temperature=0.2, 53 | do_sample=True, 54 | ) 55 | 56 | params = self.get_generation_params(request, default_params=default_params) 57 | 58 | generation_kwargs = dict( 59 | inputs_embeds=new_embeds.to(self.device), 60 | attention_mask=attn_mask.to(self.device), 61 | eos_token_id=[ 62 | self.tokenizer.eos_token_id, 63 | self.tokenizer.convert_tokens_to_ids("<|eot_id|>"), 64 | ], 65 | pad_token_id=self.tokenizer.eos_token_id, 66 | **params, 67 | ) 68 | 69 | for new_text in threaded_streaming_generator(generate=self.model.text_model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 70 | end = new_text.find(self.tokenizer.eos_token) 71 | if end == -1: 72 | yield new_text 73 | else: 74 | yield new_text[:end] 75 | break -------------------------------------------------------------------------------- /backend/llava-qwen2.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | 3 | from vision_qna import * 4 | 5 | import transformers 6 | import warnings 7 | # disable some warnings 8 | transformers.logging.set_verbosity_error() 9 | warnings.filterwarnings('ignore') 10 | 11 | # qnguyen3/nanoLLaVA 12 | # qnguyen3/nanoLLaVA-1.5 13 | 14 | class VisionQnA(VisionQnABase): 15 | model_name: str = "llava-qwen2" 16 | format: str = "chatml" 17 | 18 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 19 | super().__init__(model_id, device, device_map, extra_params, format) 20 | 21 | torch.set_default_device(self.device) 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 23 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 24 | 25 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
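Aside (assumed helper, not repo code): the nanoLLaVA backend below splits the prompt on the image token and rejoins the tokenized chunks around the placeholder id -200. The real join_int_lists is imported from vision_qna and may differ; a plausible standalone sketch:

from typing import List

def join_with_separator(chunks: List[List[int]], sep_id: int) -> List[int]:
    # Interleave tokenized text chunks with the image placeholder id that the
    # vision tower later replaces with image features.
    out: List[int] = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            out.append(sep_id)
        out.extend(chunk)
    return out

# join_with_separator([[1, 2], [3, 4]], -200) -> [1, 2, -200, 3, 4]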
26 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 27 | self.model = self.model.to(self.device) 28 | 29 | self.loaded_banner() 30 | 31 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 32 | images, prompt = await prompt_from_messages(request.messages, self.format) 33 | 34 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] 35 | if images: 36 | text_with_img_tok = join_int_lists(text_chunks, -200) # -200 == <image> 37 | encoded_images = self.model.process_images(images, self.model.config).to(dtype=self.model.dtype) 38 | else: 39 | text_with_img_tok = text_chunks[0] 40 | encoded_images = None 41 | 42 | input_ids = torch.tensor(text_with_img_tok, dtype=torch.long).unsqueeze(0) 43 | 44 | default_params = { 45 | 'top_p': 0.8, 46 | 'temperature': 0.7, 47 | 'do_sample': True, 48 | 'pad_token_id': self.tokenizer.eos_token_id, 49 | } 50 | params = self.get_generation_params(request, default_params=default_params) 51 | 52 | generation_kwargs = dict( 53 | input_ids=input_ids, 54 | images=encoded_images, 55 | **params 56 | ) 57 | 58 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 59 | end = new_text.find(self.tokenizer.eos_token) 60 | if end == -1: 61 | yield new_text 62 | else: 63 | yield new_text[:end] 64 | break 65 | -------------------------------------------------------------------------------- /backend/llava.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, LlavaForConditionalGeneration # was LlavaProcessor 2 | from vision_qna import * 3 | 4 | # 5 | # llava-hf/bakLlava-v1-hf # llama2 6 | # llava-hf/llava-1.5-7b-hf # vicuna 7 | # llava-hf/llava-1.5-13b-hf # vicuna 8 | # Doesn't support execution without images 9 | # "hf-internal-testing/pixtral-12b" soon?
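Aside (hedged client example, not repo code): all of these backends are served through an OpenAI-compatible chat endpoint that accepts image_url content parts. The base_url, port, API key and exact model name below are assumptions; the model must match whatever the server was started with.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")  # port/key are assumptions

response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",  # one of the models listed in the comments above
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
    max_tokens=256,
)
print(response.choices[0].message.content)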
10 | 11 | class VisionQnA(VisionQnABase): 12 | model_name: str = "llava" 13 | format: str = 'vicuna' 14 | vision_layers: List[str] = ["vision_model", "vision_tower", "multi_modal_projector", "vision_encoder", "vision_language_adapter"] 15 | 16 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 17 | super().__init__(model_id, device, device_map, extra_params, format) 18 | 19 | if not format: 20 | self.format = guess_model_format(model_id) 21 | 22 | del self.params['trust_remote_code'] 23 | 24 | self.processor = AutoProcessor.from_pretrained(model_id) 25 | self.model = LlavaForConditionalGeneration.from_pretrained(**self.params) 26 | 27 | self.loaded_banner() 28 | 29 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 30 | 31 | images, prompt = await prompt_from_messages(request.messages, self.format) 32 | 33 | if len(images) < 1: 34 | images = [ await url_to_image(black_pixel_url) ] 35 | if self.format == 'pixtral': 36 | prompt = "[IMG]\n" + prompt 37 | else: 38 | prompt = "\n" + prompt 39 | 40 | inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.device) 41 | 42 | params = self.get_generation_params(request) 43 | 44 | generation_kwargs = dict( 45 | **inputs, 46 | **params, 47 | ) 48 | 49 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 50 | end = new_text.find(self.processor.tokenizer.eos_token) 51 | if end == -1: 52 | yield new_text 53 | else: 54 | yield new_text[:end] 55 | break 56 | -------------------------------------------------------------------------------- /backend/llava_next.py: -------------------------------------------------------------------------------- 1 | from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration 2 | from vision_qna import * 3 | 4 | # llava-hf/llava-v1.6-34b-hf # chatml 5 | # llava-hf/llava-v1.6-vicuna-13b-hf # vicuna 6 | # llava-hf/llava-v1.6-vicuna-7b-hf # vicuna 7 | # llava-hf/llava-v1.6-mistral-7b-hf # llama2 8 | # tiiuae/falcon-11B-vlm # falcon 9 | 10 | # llavanext doesn't support generation without images 11 | 12 | class VisionQnA(VisionQnABase): 13 | model_name: str = "llava_next" 14 | format: str = 'llama2' 15 | vision_layers: List[str] = ["vision_model", "vision_tower", "multi_modal_projector"] 16 | 17 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 18 | super().__init__(model_id, device, device_map, extra_params, format) 19 | 20 | if not format: 21 | self.format = guess_model_format(model_id) 22 | 23 | del self.params['trust_remote_code'] 24 | 25 | use_fast = 'mistral' in model_id or 'falcon' in model_id 26 | self.processor = LlavaNextProcessor.from_pretrained(model_id, use_fast=use_fast) 27 | self.model = LlavaNextForConditionalGeneration.from_pretrained(**self.params) 28 | 29 | self.loaded_banner() 30 | 31 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 32 | images, prompt = await prompt_from_messages(request.messages, self.format) 33 | 34 | if len(images) < 1: 35 | images = [ await url_to_image(black_pixel_url) ] 36 | prompt = "\n" + prompt 37 | 38 | inputs = self.processor(prompt, images, return_tensors="pt").to(self.model.device) 39 | 40 | default_params = dict( 41 | pad_token_id=self.processor.tokenizer.eos_token_id, 42 | ) 43 | 44 | params = self.get_generation_params(request, 
default_params) 45 | 46 | generation_kwargs = dict( 47 | **inputs, 48 | **params, 49 | ) 50 | 51 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 52 | end = new_text.find(self.processor.tokenizer.eos_token) 53 | if end == -1: 54 | yield new_text 55 | else: 56 | yield new_text[:end] 57 | break 58 | -------------------------------------------------------------------------------- /backend/llavanextgit.py: -------------------------------------------------------------------------------- 1 | from llava.model.builder import load_pretrained_model 2 | from llava.mm_utils import process_images, tokenizer_image_token 3 | from vision_qna import * 4 | 5 | # lmms-lab/llava-onevision-qwen2-0.5b-ov 6 | # lmms-lab/llava-onevision-qwen2-0.5b-si 7 | # lmms-lab/llava-onevision-qwen2-7b-ov 8 | # lmms-lab/llava-onevision-qwen2-7b-si 9 | # lmms-lab/llava-onevision-qwen2-72b-ov 10 | # lmms-lab/llava-onevision-qwen2-72b-si 11 | 12 | # BAAI/Aquila-VL-2B-llava-qwen 13 | 14 | import warnings 15 | warnings.filterwarnings("ignore") 16 | 17 | class VisionQnA(VisionQnABase): 18 | model_name: str = "llavanextgit" # llava_qwen 19 | format: str = 'chatml' # qwen_1_5 20 | vision_layers: List[str] = ["vision_model", "vision_tower", "multi_modal_projector"] 21 | 22 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 23 | 24 | load_in_4bit = extra_params.get('load_in_4bit', False) 25 | load_in_8bit = extra_params.get('load_in_8bit', False) 26 | if load_in_4bit: del extra_params['load_in_4bit'] 27 | if load_in_8bit: del extra_params['load_in_8bit'] 28 | 29 | super().__init__(model_id, device, device_map, extra_params, format) 30 | 31 | if not format: 32 | self.format = guess_model_format(model_id) 33 | 34 | for i in ['pretrained_model_name_or_path', 'trust_remote_code', 'low_cpu_mem_usage', 'torch_dtype']: 35 | del self.params[i] 36 | 37 | self.tokenizer, self.model, self.image_processor, max_length = load_pretrained_model(model_id, None, "llava_qwen", load_in_4bit, load_in_8bit, **self.params) 38 | 39 | self.loaded_banner() 40 | 41 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 42 | images, prompt = await prompt_from_messages(request.messages, self.format) 43 | 44 | if len(images) < 1: 45 | images = [ await url_to_image(black_pixel_url) ] 46 | prompt = "\n" + prompt 47 | 48 | image_tensor = process_images(images, self.image_processor, self.model.config) 49 | image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor] 50 | 51 | input_ids = tokenizer_image_token(prompt, self.tokenizer, -200, return_tensors="pt").unsqueeze(0).to(self.device) 52 | image_sizes = [image.size for image in images] 53 | 54 | default_params = dict( 55 | #pad_token_id=self.processor.tokenizer.eos_token_id, 56 | temperature=0.0, 57 | do_sample=False, 58 | max_new_tokens=4096, 59 | ) 60 | 61 | params = self.get_generation_params(request, default_params) 62 | 63 | generation_kwargs = dict( 64 | inputs=input_ids, 65 | images=image_tensor, 66 | image_sizes=image_sizes, 67 | **params, 68 | ) 69 | 70 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 71 | end = new_text.find(self.tokenizer.eos_token) 72 | if end == -1: 73 | yield new_text 74 | else: 75 | yield new_text[:end] 76 | break 77 | 78 | 
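Aside (hedged client example, not repo code): the stream_chat_with_images variants above surface as server-sent chunks through the same OpenAI-compatible API; endpoint and model name are assumptions, as before.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")  # assumptions
stream = client.chat.completions.create(
    model="lmms-lab/llava-onevision-qwen2-7b-ov",  # from the comments in llavanextgit.py above
    messages=[{"role": "user", "content": [
        {"type": "text", "text": "What is in this picture?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
    ]}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)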
-------------------------------------------------------------------------------- /backend/mantis.py: -------------------------------------------------------------------------------- 1 | from mantis.models.mllava import chat_mllava, MLlavaProcessor, LlavaForConditionalGeneration 2 | from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor 3 | 4 | from vision_qna import * 5 | 6 | class VisionQnA(VisionQnABase): 7 | format: str = 'internal' 8 | model_name: str = "mantis" 9 | vision_layers: List[str] = ["vision_tower", "multi_modal_projector"] 10 | 11 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 12 | super().__init__(model_id, device, device_map, extra_params, format) 13 | 14 | del self.params['trust_remote_code'] 15 | 16 | if '-fuyu' in model_id.lower(): 17 | self.processor = MFuyuProcessor.from_pretrained(model_id) 18 | self.model = MFuyuForCausalLM.from_pretrained(**self.params) 19 | else: 20 | self.processor = MLlavaProcessor.from_pretrained(model_id) 21 | self.model = LlavaForConditionalGeneration.from_pretrained(**self.params) 22 | 23 | print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") 24 | 25 | async def chat_with_images(self, request: ImageChatRequest) -> str: 26 | prompt, history, images, system = await prompt_history_images_system_from_messages(request.messages, img_tok = "", url_handler = url_to_image) 27 | 28 | default_params = { 29 | 'num_beams': 1, 30 | 'do_sample': False, 31 | } 32 | 33 | params = self.get_generation_params(request, default_params) 34 | 35 | response, history = chat_mllava(prompt, images if images else None, self.model, self.processor, history=history if history else None, **params) 36 | 37 | return response 38 | 39 | -------------------------------------------------------------------------------- /backend/minicpm-v-2_6.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | 3 | from vision_qna import * 4 | from PIL import Image 5 | from decord import VideoReader, cpu 6 | 7 | # openbmb/MiniCPM-V-2_6 8 | # openbmb/MiniCPM-V-2_6-int4 9 | 10 | MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number 11 | 12 | async def encode_video(video_path): 13 | def uniform_sample(l, n): 14 | gap = len(l) / n 15 | idxs = [int(i * gap + gap / 2) for i in range(n)] 16 | return [l[i] for i in idxs] 17 | 18 | vr = VideoReader(video_path, ctx=cpu(0)) 19 | sample_fps = round(vr.get_avg_fps() / 1) # FPS 20 | frame_idx = [i for i in range(0, len(vr), sample_fps)] 21 | if len(frame_idx) > MAX_NUM_FRAMES: 22 | frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) 23 | frames = vr.get_batch(frame_idx).asnumpy() 24 | frames = [Image.fromarray(v.astype('uint8')) for v in frames] 25 | return frames 26 | 27 | 28 | class VisionQnA(VisionQnABase): 29 | format: str = 'internal' 30 | model_name: str = "minicpm-v-2_6" 31 | vision_layers: List[str] = ["resampler", "vpm"] 32 | 33 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 34 | super().__init__(model_id, device, device_map, extra_params, format) 35 | 36 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 37 | self.model = AutoModel.from_pretrained(**self.params).eval() 38 | 39 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
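Aside (standalone sketch): encode_video above samples frames at roughly the native FPS and then thins them uniformly when they exceed MAX_NUM_FRAMES. The same logic in isolation, with illustrative numbers:

def uniform_sample(items, n):
    # Pick n indices spread evenly across the list, taking the middle of each gap.
    gap = len(items) / n
    idxs = [int(i * gap + gap / 2) for i in range(n)]
    return [items[i] for i in idxs]

frame_idx = list(range(0, 1000, 25))  # e.g. 1000 frames sampled at 25 fps -> 40 candidates
MAX_NUM_FRAMES = 16                   # illustrative cap, the backend above uses 64
if len(frame_idx) > MAX_NUM_FRAMES:
    frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
print(len(frame_idx))  # 16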
40 | if '-int4' not in model_id: 41 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 42 | self.model = self.model.to(dtype=self.params['torch_dtype'], device=self.device) 43 | 44 | self.loaded_banner() 45 | 46 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 47 | msgs = [] 48 | system_prompt = None 49 | 50 | params = self.get_generation_params(request) 51 | 52 | for m in request.messages: 53 | image = None 54 | content = [] 55 | for c in m.content: 56 | if m.role == 'user': 57 | if c.type == 'image_url': 58 | # # Video not working yet 59 | # if '.mp4' in c.image_url.url: 60 | # params["use_image_id"] = False 61 | # params["max_slice_nums"] = 2 # use 1 if cuda OOM and video resolution > 448*448 62 | # video_path = await url_to_file(c.image_url.url) 63 | # image = await encode_video(video_path) 64 | content.extend([await url_to_image(c.image_url.url)]) 65 | 66 | for c in m.content: 67 | if c.type == 'text': 68 | if m.role == 'system': 69 | system_prompt = c.text 70 | else: 71 | content.extend([c.text]) 72 | msgs.extend([{ 'role': m.role, 'content': content }]) 73 | 74 | default_params = dict( 75 | do_sample=True, 76 | top_p=0.8, 77 | temperature=0.7, 78 | ) 79 | 80 | params = self.get_generation_params(request, default_params=default_params) 81 | 82 | answer = self.model.chat( 83 | image=None, 84 | msgs=msgs, 85 | tokenizer=self.tokenizer, 86 | sampling=True, 87 | system_prompt=system_prompt, 88 | stream=True, 89 | **params, 90 | ) 91 | 92 | for new_text in answer: 93 | yield new_text 94 | 95 | -------------------------------------------------------------------------------- /backend/minicpmv.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | 3 | from vision_qna import * 4 | 5 | # openbmb/MiniCPM-Llama3-V-2_5 6 | # openbmb/MiniCPM-V-2 - 4bit broken 7 | # openbmb/MiniCPM-V aka OmniLMM-3B 8 | 9 | class VisionQnA(VisionQnABase): 10 | format: str = 'internal' 11 | model_name: str = "minicpmv" 12 | vision_layers: List[str] = ["resampler", "vpm"] 13 | 14 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 15 | super().__init__(model_id, device, device_map, extra_params, format) 16 | 17 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 18 | self.model = AutoModel.from_pretrained(**self.params).eval() 19 | 20 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 21 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 22 | self.model = self.model.to(dtype=self.params['torch_dtype'], device=self.device) 23 | 24 | self.loaded_banner() 25 | 26 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 27 | image = None 28 | msgs = [] 29 | system_prompt = None 30 | 31 | for m in request.messages: 32 | if m.role == 'user': 33 | for c in m.content: 34 | if c.type == 'image_url': 35 | image = await url_to_image(c.image_url.url) 36 | for c in m.content: 37 | if c.type == 'text': 38 | if m.role == 'system': 39 | system_prompt = c.text 40 | else: 41 | msgs.extend([{ 'role': m.role, 'content': c.text }]) 42 | 43 | if image is None: 44 | image = await url_to_image(black_pixel_url) 45 | 46 | # default uses num_beams: 3, but if streaming/sampling is requested, switch the defaults. 
47 | default_sampling_params = { 48 | 'do_sample': True, 49 | 'top_p': 0.8, 50 | 'top_k': 100, 51 | 'temperature': 0.6, 52 | } 53 | params = self.get_generation_params(request, default_sampling_params) 54 | 55 | with torch.cuda.amp.autocast(): 56 | answer = self.model.chat( 57 | image=image, 58 | msgs=msgs, 59 | context=None, 60 | tokenizer=self.tokenizer, 61 | sampling=True, 62 | system_prompt=system_prompt, 63 | stream=True, 64 | **params, 65 | ) 66 | 67 | if isinstance(answer, str): 68 | answer = [answer] 69 | 70 | for new_text in answer: 71 | if isinstance(new_text, str): 72 | yield new_text 73 | 74 | -------------------------------------------------------------------------------- /backend/minigemini.py: -------------------------------------------------------------------------------- 1 | import re 2 | from accelerate import infer_auto_device_map 3 | 4 | import transformers 5 | import warnings 6 | # disable some warnings 7 | transformers.logging.set_verbosity_error() 8 | warnings.filterwarnings('ignore') 9 | 10 | from mgm.constants import IMAGE_TOKEN_INDEX 11 | from mgm.model.builder import load_pretrained_model 12 | from mgm.mm_utils import process_images, tokenizer_image_token 13 | 14 | from vision_qna import * 15 | 16 | # YanweiLi/MGM-2B (no 4bit) 17 | # out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state) 18 | # AttributeError: 'Parameter' object has no attribute 'quant_state' 19 | 20 | # YanweiLi/MGM-7B 21 | # YanweiLi/MGM-7B-HD 22 | # YanweiLi/MGM-13B 23 | # YanweiLi/MGM-34B 24 | # YanweiLi/MGM-34B-HD 25 | # YanweiLi/MGM-13B-HDs 26 | # YanweiLi/MGM-8x7B-HD 27 | # YanweiLi/MGM-8x7B 28 | 29 | # TODO: 30 | # YanweiLi/MGM-8B-HD 31 | # YanweiLi/MGM-8B 32 | 33 | class VisionQnA(VisionQnABase): 34 | model_name: str = "minigemini" 35 | format: str = "llama2" 36 | vision_layers: List[str] = ["vision_tower", "vision_tower_aux", "vlm_uni_aux_projector", "vlm_uni_val_projector"] 37 | 38 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 39 | super().__init__(model_id, device, device_map, extra_params, format) 40 | 41 | if not format: 42 | self.format = guess_model_format(model_id) 43 | 44 | model_base, model_name = model_id.split('/', 1) 45 | del self.params['low_cpu_mem_usage'] 46 | del self.params['pretrained_model_name_or_path'] 47 | del self.params['trust_remote_code'] 48 | 49 | self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model( 50 | model_id, None, model_name, **self.params) 51 | 52 | self.loaded_banner() 53 | 54 | async def chat_with_images(self, request: ImageChatRequest) -> str: 55 | images, prompt = await prompt_from_messages(request.messages, self.format) 56 | 57 | if len(images) < 1: 58 | images = [ await url_to_image(black_pixel_url) ] 59 | prompt = '\n' + prompt 60 | 61 | if hasattr(self.model.config, 'image_size_aux'): 62 | if not hasattr(self.image_processor, 'image_size_raw'): 63 | self.image_processor.image_size_raw = self.image_processor.crop_size.copy() 64 | self.image_processor.crop_size['height'] = self.model.config.image_size_aux 65 | self.image_processor.crop_size['width'] = self.model.config.image_size_aux 66 | self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux 67 | 68 | image_tensor = process_images(images, self.image_processor, self.model.config) 69 | 70 | image_grid = getattr(self.model.config, 'image_grid', 1) 71 | if hasattr(self.model.config, 'image_size_aux'): 72 | raw_shape = 
[self.image_processor.image_size_raw['height'] * image_grid, 73 | self.image_processor.image_size_raw['width'] * image_grid] 74 | image_tensor_aux = image_tensor 75 | image_tensor = torch.nn.functional.interpolate(image_tensor, 76 | size=raw_shape, 77 | mode='bilinear', 78 | align_corners=False) 79 | else: 80 | image_tensor_aux = [] 81 | 82 | if image_grid >= 2: 83 | raw_image = image_tensor.reshape(3, 84 | image_grid, 85 | self.image_processor.image_size_raw['height'], 86 | image_grid, 87 | self.image_processor.image_size_raw['width']) 88 | raw_image = raw_image.permute(1, 3, 0, 2, 4) 89 | raw_image = raw_image.reshape(-1, 3, 90 | self.image_processor.image_size_raw['height'], 91 | self.image_processor.image_size_raw['width']) 92 | 93 | if getattr(self.model.config, 'image_global', False): 94 | global_image = image_tensor 95 | if len(global_image.shape) == 3: 96 | global_image = global_image[None] 97 | global_image = torch.nn.functional.interpolate(global_image, 98 | size=[self.image_processor.image_size_raw['height'], 99 | self.image_processor.image_size_raw['width']], 100 | mode='bilinear', 101 | align_corners=False) 102 | # [image_crops, image_global] 103 | raw_image = torch.cat([raw_image, global_image], dim=0) 104 | image_tensor = raw_image.contiguous() 105 | image_tensor = image_tensor.unsqueeze(0) 106 | 107 | if type(image_tensor) is list: 108 | image_tensor = [image.to(self.model.device, dtype=torch.float16) for image in image_tensor] 109 | image_tensor_aux = [image.to(self.model.device, dtype=torch.float16) for image in image_tensor_aux] 110 | else: 111 | image_tensor = image_tensor.to(self.model.device, dtype=torch.float16) 112 | image_tensor_aux = image_tensor_aux.to(self.model.device, dtype=torch.float16) 113 | 114 | 115 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device) 116 | 117 | params = self.get_generation_params(request) 118 | 119 | if 'top_k' in params: del params['top_k'] # avoid warnings 120 | 121 | with torch.inference_mode(): 122 | output_ids = self.model.generate( 123 | input_ids, 124 | images=image_tensor, 125 | images_aux=image_tensor_aux if len(image_tensor_aux)>0 else None, 126 | bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token 127 | eos_token_id=self.tokenizer.eos_token_id, # End of sequence token 128 | pad_token_id=self.tokenizer.pad_token_id, # Pad token 129 | **params, 130 | ) 131 | answer = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 132 | return answer 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /backend/mistral.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import snapshot_download 3 | from safetensors import safe_open 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from mistral_inference.transformer import Transformer 8 | from mistral_inference.generate import generate 9 | from mistral_common.tokens.tokenizers.mistral import MistralTokenizer 10 | 11 | from vision_qna import * 12 | 13 | # mistralai/Pixtral-12B-2409 14 | 15 | class VisionQnA(VisionQnABase): 16 | model_name: str = "mistral" 17 | format: str = "pixtral" 18 | visual_layers: List[str] = ["vision_encoder", 'vision_language_adapter'] 19 | 20 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 21 | super().__init__(model_id, device, 
device_map, extra_params, format) 22 | 23 | mistral_models_path = snapshot_download(repo_id=model_id, allow_patterns=["params.json", "consolidated.safetensors", "tekken.json"]) 24 | 25 | self.tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tekken.json") 26 | self.model = Transformer.from_folder(mistral_models_path, device=self.device, dtype=self.dtype) 27 | 28 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 29 | #if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 30 | # self.model = self.model.to(self.device) 31 | 32 | self.loaded_banner() 33 | 34 | async def chat_with_images(self, request: ImageChatRequest) -> str: 35 | prompt = await pixtral_messages(request.messages) 36 | 37 | # tokenize image urls and text 38 | tokenized = self.tokenizer.encode_chat_completion(prompt) 39 | 40 | generation_kwargs = dict( 41 | eos_id = self.tokenizer.instruct_tokenizer.tokenizer.eos_id, 42 | max_tokens = request.max_tokens, 43 | temperature= 0.35 if request.temperature is None else request.temperature, 44 | ) 45 | 46 | tps_start = time.time() 47 | out_tokens, _ = generate([tokenized.tokens], self.model, images=[tokenized.images], **generation_kwargs) 48 | logger.info(f"Generated {len(out_tokens[0])} tokens at {len(out_tokens[0]) / (time.time() - tps_start):0.2f} T/s") 49 | 50 | return self.tokenizer.decode(out_tokens[0]) 51 | -------------------------------------------------------------------------------- /backend/mllama.py: -------------------------------------------------------------------------------- 1 | from transformers import MllamaForConditionalGeneration, AutoProcessor 2 | 3 | from vision_qna import * 4 | 5 | # meta-llama/Llama-3.2-11B-Vision-Instruct 6 | # meta-llama/Llama-3.2-90B-Vision-Instruct 7 | 8 | class VisionQnA(VisionQnABase): 9 | model_name: str = "mllama" 10 | format: str = "llama3" 11 | visual_layers: List[str] = ['vision_model', 'multi_modal_projector'] 12 | 13 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 14 | super().__init__(model_id, device, device_map, extra_params, format) 15 | 16 | del self.params['trust_remote_code'] 17 | 18 | self.processor = AutoProcessor.from_pretrained(model_id) 19 | self.model = MllamaForConditionalGeneration.from_pretrained(**self.params).eval() 20 | 21 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
22 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 23 | self.model = self.model.to(self.device) 24 | 25 | self.loaded_banner() 26 | 27 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 28 | images, prompt = await llama3_prompt_from_messages(request.messages, img_tok = "<|image|>") 29 | 30 | if len(images) < 1: 31 | images = [ await url_to_image(black_pixel_url) ] 32 | prompt = "<|image|>" + prompt 33 | 34 | inputs = self.processor(images, prompt, return_tensors="pt").to(self.model.device) 35 | 36 | default_params = dict(do_sample=True) 37 | 38 | params = self.get_generation_params(request, default_params=default_params) 39 | 40 | generation_kwargs = dict( 41 | **inputs, 42 | **params, 43 | ) 44 | 45 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor, generation_kwargs=generation_kwargs): 46 | yield new_text 47 | -------------------------------------------------------------------------------- /backend/molmo.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig 2 | 3 | from vision_qna import * 4 | 5 | # allenai/MolmoE-1B-0924 XXX problems with performance and RAM usage 6 | # allenai/Molmo-7B-D-0924 # faster 7 | # allenai/Molmo-7B-O-0924 8 | # allenai/Molmo-72B-0924 9 | # SeanScripts/Molmo-72B-0924-nf4 10 | # cyan2k/molmo-7B-D-bnb-4bit XXX needs tensorflow-cpu 11 | # cyan2k/molmo-7B-O-bnb-4bit XXX needs tensorflow-cpu 12 | 13 | class VisionQnA(VisionQnABase): 14 | model_name: str = "molmo" 15 | format: str = "chatml" 16 | visual_layers: List[str] = ['vision_backbone'] 17 | 18 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 19 | super().__init__(model_id, device, device_map, extra_params, format) 20 | 21 | #self.dtype = self.params['torch_dtype'] = 'auto' # torch.float32 22 | 23 | self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False), torch_dtype=self.params['torch_dtype'], device_map=self.params['device_map']) 24 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 25 | 26 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
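Aside (illustrative sketch): several backends above substitute a black placeholder image (black_pixel_url, defined elsewhere in vision_qna.py) when a request contains no image, so models that require pixel input can still answer text-only prompts. The PIL equivalent:

from PIL import Image

def black_fallback_image(size: int = 1) -> Image.Image:
    # A tiny all-black RGB image to stand in for a missing user image.
    return Image.new("RGB", (size, size), color=(0, 0, 0))

img = black_fallback_image()
print(img.size, img.getpixel((0, 0)))  # (1, 1) (0, 0, 0)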
27 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 28 | self.model = self.model.to(self.device) 29 | 30 | self.eos_token_id = self.processor.tokenizer.encode(self.processor.tokenizer.eos_token)[0] 31 | 32 | self.loaded_banner() 33 | 34 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 35 | images, prompt = await chatml_prompt_from_messages(request.messages, img_tok = "<|image|>") 36 | 37 | if len(images) < 1: 38 | images = [ await url_to_image(black_pixel_url) ] 39 | prompt = "<|image|>" + prompt 40 | 41 | # process the image and text 42 | inputs = self.processor.process( 43 | images=images, 44 | text=prompt, 45 | ) 46 | 47 | inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()} 48 | 49 | default_params = dict( 50 | eos_token_id=self.eos_token_id, 51 | pad_token_id=self.eos_token_id 52 | ) 53 | 54 | params = self.get_generation_params(request, default_params) 55 | 56 | generation_kwargs = dict( 57 | batch=inputs, 58 | generation_config=GenerationConfig(**params) 59 | ) 60 | 61 | def wrapper(**kwargs): 62 | with torch.amp.autocast('cuda', dtype=self.dtype): 63 | _ = self.model.generate_from_batch(**kwargs) 64 | 65 | for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 66 | end = new_text.find(self.processor.tokenizer.eos_token) 67 | if end == -1: 68 | yield new_text 69 | else: 70 | yield new_text[:end] 71 | break 72 | -------------------------------------------------------------------------------- /backend/monkey.py: -------------------------------------------------------------------------------- 1 | import os 2 | from transformers import AutoTokenizer, AutoModelForCausalLM 3 | 4 | from vision_qna import * 5 | 6 | # echo840/Monkey 7 | # echo840/Monkey-Chat 8 | 9 | class VisionQnA(VisionQnABase): 10 | model_name: str = "monkey" 11 | format: str = 'phi15' # phi15-ish 12 | vision_layers: List[str] = ["vision", "vision_tower", "resampler", "visual", "in_proj","out_proj","c_fc","c_proj"] 13 | 14 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 15 | super().__init__(model_id, device, device_map, extra_params, format) 16 | 17 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 18 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 19 | 20 | self.tokenizer.padding_side = 'left' 21 | self.tokenizer.pad_token_id = self.tokenizer.eod_id 22 | self.eos_token = self.tokenizer.decode(self.tokenizer.eod_id) # <|endoftext|> 23 | 24 | self.loaded_banner() 25 | 26 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 27 | try: # 28 | files, prompt = await phi15_prompt_from_messages(request.messages, img_tok = "{} ", img_end = '', url_handler = url_to_file) 29 | 30 | input_ids = self.tokenizer(prompt, return_tensors='pt', padding='longest') 31 | 32 | attention_mask = input_ids.attention_mask.to(self.model.device) 33 | input_ids = input_ids.input_ids.to(self.model.device) 34 | 35 | default_params = { 36 | 'top_p': None, 37 | 'do_sample': False, 38 | } 39 | 40 | params = self.get_generation_params(request, default_params=default_params) 41 | 42 | generation_kwargs = dict( 43 | input_ids=input_ids, 44 | attention_mask=attention_mask, 45 | num_beams=1, 46 | min_new_tokens=1, 47 | length_penalty=1, 48 | 
num_return_sequences=1, 49 | output_hidden_states=True, 50 | pad_token_id=self.tokenizer.eod_id, 51 | eos_token_id=self.tokenizer.eod_id, 52 | **params, 53 | ) 54 | 55 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 56 | end = new_text.find(self.eos_token) 57 | if end == -1: 58 | yield new_text 59 | else: 60 | yield new_text[:end] 61 | break 62 | 63 | 64 | finally: 65 | for f in files: 66 | os.remove(f) 67 | 68 | -------------------------------------------------------------------------------- /backend/moondream1.py: -------------------------------------------------------------------------------- 1 | import re 2 | from transformers import CodeGenTokenizerFast, AutoModelForCausalLM 3 | 4 | from vision_qna import * 5 | 6 | class VisionQnA(VisionQnABase): 7 | model_name: str = "moondream1" 8 | format: str = 'phi15' 9 | 10 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 11 | super().__init__(model_id, device, device_map, extra_params, format) 12 | 13 | # not supported yet 14 | del self.params['device_map'] 15 | 16 | self.tokenizer = CodeGenTokenizerFast.from_pretrained(model_id) 17 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 18 | 19 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 20 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 21 | self.model = self.model.to(self.device) 22 | 23 | self.loaded_banner() 24 | 25 | async def chat_with_images(self, request: ImageChatRequest) -> str: 26 | images, prompt = await prompt_from_messages(request.messages, self.format) 27 | 28 | encoded_images = self.model.encode_image(images[0]).to(self.model.device) if images else None 29 | 30 | params = self.get_generation_params(request) 31 | 32 | # XXX currently broken here... 33 | """ 34 | File "hf_home/modules/transformers_modules/vikhyatk/moondream1/f6e9da68e8f1b78b8f3ee10905d56826db7a5802/modeling_phi.py", line 318, in forward 35 | padding_mask.masked_fill_(key_padding_mask, 0.0) 36 | RuntimeError: The expanded size of the tensor (747) must match the existing size (748) at non-singleton dimension 1. Target sizes: [1, 747]. 
Tensor sizes: [1, 748] 37 | """ 38 | answer = self.model.generate( 39 | encoded_images, 40 | prompt, 41 | eos_text="", 42 | tokenizer=self.tokenizer, 43 | **params, 44 | )[0] 45 | answer = re.sub("<$| AsyncGenerator[str, None]: 30 | images, prompt = await prompt_from_messages(request.messages, self.format) 31 | 32 | encoded_images = self.model.encode_image(images) if images else None 33 | inputs_embeds = self.model.input_embeds(prompt, encoded_images, self.tokenizer) 34 | 35 | params = self.get_generation_params(request) 36 | 37 | #streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=False, skip_prompt=True) 38 | 39 | generation_kwargs = dict( 40 | eos_token_id=self.tokenizer.eos_token_id, 41 | bos_token_id=self.tokenizer.bos_token_id, 42 | pad_token_id=self.tokenizer.bos_token_id, 43 | inputs_embeds=inputs_embeds, 44 | **params, 45 | ) 46 | for new_text in threaded_streaming_generator(generate=self.model.text_model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 47 | end = new_text.find(self.tokenizer.eos_token) 48 | if end == -1: 49 | yield new_text 50 | else: 51 | yield new_text[:end] 52 | break 53 | -------------------------------------------------------------------------------- /backend/nvlm.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | import torchvision.transforms as T 3 | from torchvision.transforms.functional import InterpolationMode 4 | 5 | from vision_qna import * 6 | 7 | # nvidia/NVLM-D-72B 8 | 9 | MAX_TILES = 6 10 | 11 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 12 | IMAGENET_STD = (0.229, 0.224, 0.225) 13 | 14 | def build_transform(input_size): 15 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD 16 | transform = T.Compose([ 17 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 18 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 19 | T.ToTensor(), 20 | T.Normalize(mean=MEAN, std=STD) 21 | ]) 22 | return transform 23 | 24 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): 25 | best_ratio_diff = float('inf') 26 | best_ratio = (1, 1) 27 | area = width * height 28 | for ratio in target_ratios: 29 | target_aspect_ratio = ratio[0] / ratio[1] 30 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 31 | if ratio_diff < best_ratio_diff: 32 | best_ratio_diff = ratio_diff 33 | best_ratio = ratio 34 | elif ratio_diff == best_ratio_diff: 35 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 36 | best_ratio = ratio 37 | return best_ratio 38 | 39 | def dynamic_preprocess(image, min_num=1, max_num=MAX_TILES, image_size=448, use_thumbnail=False): 40 | orig_width, orig_height = image.size 41 | aspect_ratio = orig_width / orig_height 42 | 43 | # calculate the existing image aspect ratio 44 | target_ratios = set( 45 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if 46 | i * j <= max_num and i * j >= min_num) 47 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) 48 | 49 | # find the closest aspect ratio to the target 50 | target_aspect_ratio = find_closest_aspect_ratio( 51 | aspect_ratio, target_ratios, orig_width, orig_height, image_size) 52 | 53 | # calculate the target width and height 54 | target_width = image_size * target_aspect_ratio[0] 55 | target_height = image_size * target_aspect_ratio[1] 56 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 57 | 58 | # resize the image 59 | 
resized_img = image.resize((target_width, target_height)) 60 | processed_images = [] 61 | for i in range(blocks): 62 | box = ( 63 | (i % (target_width // image_size)) * image_size, 64 | (i // (target_width // image_size)) * image_size, 65 | ((i % (target_width // image_size)) + 1) * image_size, 66 | ((i // (target_width // image_size)) + 1) * image_size 67 | ) 68 | # split the image 69 | split_img = resized_img.crop(box) 70 | processed_images.append(split_img) 71 | assert len(processed_images) == blocks 72 | if use_thumbnail and len(processed_images) != 1: 73 | thumbnail_img = image.resize((image_size, image_size)) 74 | processed_images.append(thumbnail_img) 75 | return processed_images 76 | 77 | 78 | def load_image(image, input_size=448, max_num=MAX_TILES): 79 | #image = Image.open(image_file).convert('RGB') 80 | transform = build_transform(input_size=input_size) 81 | images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) 82 | pixel_values = [transform(image) for image in images] 83 | pixel_values = torch.stack(pixel_values) 84 | return pixel_values 85 | 86 | 87 | class VisionQnA(VisionQnABase): 88 | model_name: str = "nvlm" 89 | format: str = "chatml" 90 | vision_layers: List[str] = ["vision_model"] 91 | 92 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 93 | super().__init__(model_id, device, device_map, extra_params, format) 94 | 95 | self.max_tiles = extra_params.get('max_tiles', MAX_TILES) 96 | 97 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=self.params.get('trust_remote_code', False)) 98 | self.model = AutoModel.from_pretrained(**self.params).eval() 99 | 100 | self.eos_token = '<|im_end|>' 101 | self.IMG_CONTEXT_TOKEN='<|vision_pad|>' 102 | self.IMG_START_TOKEN = '' # <|vision_start|> ? 103 | self.IMG_END_TOKEN = '' # <|vision_end|> ? 104 | self.model.img_context_token_id = self.tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN) 105 | 106 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
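Aside (worked example, not repo code): the tiling arithmetic in dynamic_preprocess above, assuming find_closest_aspect_ratio picked a 3x2 grid for a 1344x896 input; the numbers are illustrative.

image_size = 448
target_aspect_ratio = (3, 2)                          # chosen by find_closest_aspect_ratio
target_width = image_size * target_aspect_ratio[0]    # 1344
target_height = image_size * target_aspect_ratio[1]   # 896
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]  # 6 tiles, plus an optional thumbnail
print(target_width, target_height, blocks)  # 1344 896 6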
107 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 108 | self.model = self.model.to(self.device) 109 | 110 | self.loaded_banner() 111 | 112 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 113 | images, prompt = await prompt_from_messages(request.messages, self.format) 114 | 115 | if len(images) < 1: 116 | pixel_values = None 117 | else: 118 | pixel_values = load_image(images[-1], max_num=self.max_tiles).to(self.model.dtype).cuda() 119 | 120 | for num_patches in [pixel_values.shape[0]]: 121 | tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)] + ["<tile_global_thumbnail>"] 122 | image_tokens = '' 123 | for tile_pos_identifier in tile_pos_identifiers: 124 | image_tokens += tile_pos_identifier + self.IMG_CONTEXT_TOKEN * self.model.num_image_token 125 | image_tokens = self.IMG_START_TOKEN + image_tokens + self.IMG_END_TOKEN 126 | prompt = prompt.replace('<image>', image_tokens, 1) 127 | 128 | model_inputs = self.tokenizer(prompt, return_tensors='pt') 129 | input_ids = model_inputs['input_ids'].cuda() 130 | attention_mask = model_inputs['attention_mask'].cuda() 131 | 132 | default_params = dict( 133 | max_new_tokens=1024, 134 | do_sample=False, 135 | pad_token_id=self.tokenizer.eos_token_id, 136 | ) 137 | 138 | params = self.get_generation_params(request, default_params) 139 | 140 | del params['use_cache'] 141 | 142 | generation_kwargs = dict( 143 | pixel_values=pixel_values, 144 | input_ids=input_ids, 145 | attention_mask=attention_mask, 146 | **params, 147 | ) 148 | 149 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 150 | end = new_text.find(self.eos_token) 151 | if end == -1: 152 | yield new_text 153 | else: 154 | yield new_text[:end] 155 | break 156 | -------------------------------------------------------------------------------- /backend/omchat.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModel, AutoProcessor, AutoTokenizer 2 | 3 | from vision_qna import * 4 | 5 | # omlab/omchat-v2.0-13B-single-beta_hf 6 | 7 | class VisionQnA(VisionQnABase): 8 | model_name: str = "omchat" 9 | format: str = "chatml" 10 | visual_layers: List[str] = ['vision_tower', 'multi_modal_projector'] 11 | 12 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 13 | super().__init__(model_id, device, device_map, extra_params, format) 14 | 15 | if self.params['torch_dtype'] == torch.bfloat16: 16 | self.dtype = self.params['torch_dtype'] = torch.float16 17 | 18 | self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 19 | self.model = AutoModel.from_pretrained(**self.params).eval() 20 | 21 | # bitsandbytes already moves the model to the device, so we don't need to do it again.
22 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 23 | self.model = self.model.to(self.device) 24 | 25 | # XXX bug fix, model seems to alter the config after first generation 26 | self.eos_token_id = self.model.generation_config.eos_token_id 27 | 28 | self.loaded_banner() 29 | 30 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 31 | images, prompt = await chatml_prompt_from_messages(request.messages, img_tok='') 32 | 33 | if len(images) < 1: 34 | images = None 35 | 36 | inputs = self.processor(prompt, images=images, return_tensors="pt").to(self.model.device) 37 | 38 | default_params = dict( 39 | do_sample=False, 40 | eos_token_id=self.eos_token_id, 41 | pad_token_id=self.processor.tokenizer.pad_token_id, 42 | ) 43 | 44 | params = self.get_generation_params(request, default_params=default_params) 45 | 46 | generation_kwargs = dict( 47 | **inputs, 48 | **params, 49 | ) 50 | 51 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 52 | end = new_text.find(self.processor.tokenizer.eos_token) 53 | if end == -1: 54 | yield new_text 55 | else: 56 | yield new_text[:end] 57 | break 58 | -------------------------------------------------------------------------------- /backend/omnilmm12b.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | 3 | from vision_qna import * 4 | 5 | # openbmb/OmniLMM-12B 6 | 7 | class VisionQnA(VisionQnABase): 8 | format: str = 'internal' 9 | model_name: str = "omnilmm12b" 10 | 11 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 12 | super().__init__(model_id, device, device_map, extra_params, format) 13 | 14 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=2048, trust_remote_code=self.params.get('trust_remote_code', False)) 15 | self.model = AutoModel.from_pretrained(**self.params).to(dtype=self.params['torch_dtype']).eval() 16 | 17 | print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") 18 | 19 | async def chat_with_images(self, request: ImageChatRequest) -> str: 20 | # 3B 21 | image = None 22 | msgs = [] 23 | 24 | for m in request.messages: 25 | if m.role == 'user': 26 | for c in m.content: 27 | if c.type == 'image_url': 28 | image = await url_to_image(c.image_url.url) 29 | if c.type == 'text': 30 | msgs.extend([{ 'role': 'user', 'content': c.text }]) 31 | elif m.role == 'assistant': 32 | for c in m.content: 33 | if c.type == 'text': 34 | msgs.extend([{ 'role': 'assistant', 'content': c.text }]) 35 | 36 | params = self.get_generation_params(request) 37 | 38 | answer, context, _ = self.model.chat( 39 | image=image, 40 | msgs=msgs, 41 | context=None, 42 | tokenizer=self.tokenizer, 43 | **params, 44 | ) 45 | 46 | return answer 47 | -------------------------------------------------------------------------------- /backend/ovis.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM 2 | 3 | from vision_qna import * 4 | 5 | # AIDC-AI/Ovis1.5-Gemma2-9B 6 | # AIDC-AI/Ovis1.5-Llama3-8B 7 | 8 | class VisionQnA(VisionQnABase): 9 | model_name: str = "generic" 10 | format: str = "gemma" # or llama3 11 | visual_layers: List[str] = ['visual_tokenizer', 'vte'] 12 | 13 | def __init__(self, model_id: str, device: str, 
device_map: str = 'auto', extra_params = {}, format = None): 14 | super().__init__(model_id, device, device_map, extra_params, format) 15 | 16 | if not format: 17 | self.format = guess_model_format(model_id) 18 | 19 | self.params['multimodal_max_length'] = 8192 20 | 21 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 22 | 23 | self.text_tokenizer = self.model.get_text_tokenizer() 24 | self.visual_tokenizer = self.model.get_visual_tokenizer() 25 | self.conversation_formatter = self.model.get_conversation_formatter() 26 | 27 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 28 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 29 | self.model = self.model.to(self.device) 30 | 31 | self.loaded_banner() 32 | 33 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 34 | images, prompt = await prompt_from_messages(request.messages, self.format) 35 | 36 | if len(images) < 1: 37 | images = [ await url_to_image(black_pixel_url) ] 38 | prompt = "\n" + prompt 39 | 40 | tok_chunks = [self.text_tokenizer(chunk, add_special_tokens=False).input_ids for chunk in prompt.split('')] 41 | 42 | 43 | input_ids = join_int_lists(tok_chunks, -200) # -200 == 44 | input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device=self.model.device) 45 | 46 | text_attention_masks = torch.ne(input_ids, self.text_tokenizer.pad_token_id).to(device=self.model.device) 47 | pixel_values = [ self.visual_tokenizer.preprocess_image(image).to( 48 | dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device) for image in images ] 49 | 50 | 51 | # hack section, skip model.generate because of cache bug with gemma2 52 | _, inputs_embeds, _, attention_mask = self.model.merge_multimodal( 53 | text_input_ids=input_ids, 54 | text_attention_masks=text_attention_masks, 55 | text_labels=None, 56 | pixel_values=pixel_values, 57 | ) 58 | 59 | """ 60 | # Hybrid cache implementation for Gemma2 - this is disabled for now, due to an error with this version of transformers 61 | # AttributeError: 'HybridCache' object has no attribute 'max_batch_size' 62 | if getattr(self.generation_config, 'cache_implementation') == 'hybrid': # mainly for Gemma2 63 | kwargs['past_key_values'] = self._get_hybrid_cache_for_llm( 64 | getattr(kwargs, "num_beams", 1), kwargs['max_new_tokens'] + inputs_embeds.shape[-2]) 65 | self.get_llm()._supports_cache_class = True 66 | kwargs['cache_implementation'] = None 67 | """ 68 | 69 | default_params = dict( 70 | max_new_tokens=1024, 71 | do_sample=False, 72 | top_p=None, 73 | top_k=None, 74 | temperature=None, 75 | repetition_penalty=None, 76 | eos_token_id=self.model.generation_config.eos_token_id, 77 | pad_token_id=self.text_tokenizer.pad_token_id, 78 | use_cache=True, 79 | num_beams=1, 80 | ) 81 | 82 | params = self.get_generation_params(request, default_params=default_params) 83 | 84 | generation_kwargs = dict( 85 | # inputs=input_ids 86 | inputs_embeds=inputs_embeds, 87 | attention_mask=attention_mask, 88 | **params, 89 | ) 90 | 91 | for new_text in threaded_streaming_generator(generate=self.model.llm.generate, tokenizer=self.text_tokenizer, generation_kwargs=generation_kwargs): 92 | end = new_text.find(self.text_tokenizer.eos_token) 93 | if end == -1: 94 | yield new_text 95 | else: 96 | yield new_text[:end] 97 | break 98 | -------------------------------------------------------------------------------- /backend/ovis16.py: 
-------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM 2 | 3 | from vision_qna import * 4 | 5 | # AIDC-AI/Ovis1.6-Llama3.2-3B 6 | # AIDC-AI/Ovis1.6-Gemma2-9B 7 | # AIDC-AI/Ovis1.6-Gemma2-27B 8 | 9 | IMAGE_TOKEN = "" 10 | 11 | class VisionQnA(VisionQnABase): 12 | model_name: str = "generic" 13 | format: str = "custom" 14 | visual_layers: List[str] = ['visual_tokenizer', 'vte'] 15 | 16 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 17 | super().__init__(model_id, device, device_map, extra_params, format) 18 | 19 | self.params['multimodal_max_length'] = 8192 20 | 21 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 22 | 23 | self.text_tokenizer = self.model.get_text_tokenizer() 24 | self.visual_tokenizer = self.model.get_visual_tokenizer() 25 | 26 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 27 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 28 | self.model = self.model.to(self.device) 29 | 30 | self.loaded_banner() 31 | 32 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 33 | conversation = [] 34 | images = [] 35 | 36 | for m in request.messages: 37 | content = '' 38 | for c in m.content: 39 | if c.type == 'image_url': 40 | image = await url_to_image(c.image_url.url) 41 | images.extend([image]) 42 | content += IMAGE_TOKEN + '\n' 43 | elif c.type == 'text': 44 | content += c.text 45 | 46 | if content: 47 | if m.role == 'user': 48 | conversation.extend([{'from': 'human', 'value': content }]) 49 | elif m.role == 'assistant': 50 | conversation.extend([{'from': 'gpt', 'value': content }]) 51 | # system is ignored 52 | 53 | if len(images) < 1: 54 | images = [ await url_to_image(black_pixel_url) ] 55 | conversation[0]['value'] = IMAGE_TOKEN + '\n' + conversation[0]['value'] 56 | 57 | _prompt, input_ids, pixel_values = self.model.preprocess_inputs(conversation, images) 58 | attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) 59 | input_ids = input_ids.unsqueeze(0).to(device=self.model.device) 60 | attention_mask = attention_mask.unsqueeze(0).to(device=self.model.device) 61 | pixel_values = [pixel_values.to(dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device)] 62 | 63 | _, inputs_embeds, labels, attention_mask = self.model.merge_multimodal( 64 | text_input_ids=input_ids, 65 | text_attention_masks=attention_mask, 66 | text_labels=None, 67 | pixel_values=pixel_values, 68 | left_padding=True 69 | ) 70 | 71 | default_params = dict( 72 | max_new_tokens=1024, 73 | do_sample=False, 74 | top_p=None, 75 | top_k=None, 76 | temperature=None, 77 | repetition_penalty=None, 78 | eos_token_id=self.model.generation_config.eos_token_id, 79 | pad_token_id=self.text_tokenizer.pad_token_id, 80 | use_cache=True, 81 | num_beams=1, 82 | ) 83 | 84 | params = self.get_generation_params(request, default_params=default_params) 85 | 86 | generation_kwargs = dict( 87 | inputs_embeds=inputs_embeds, 88 | attention_mask=attention_mask, 89 | **params, 90 | ) 91 | 92 | for new_text in threaded_streaming_generator(generate=self.model.llm.generate, tokenizer=self.text_tokenizer, generation_kwargs=generation_kwargs): 93 | end = new_text.find(self.text_tokenizer.eos_token) 94 | if end == -1: 95 | yield new_text 96 | else: 97 | yield new_text[:end] 98 | break 99 | 
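The backend/ovis.py file above builds its input_ids by tokenizing the prompt around the image placeholder and joining the chunks with the image token index (-200) via join_int_lists from vision_qna.py, which is not included in this section. A minimal sketch of what that call site implies, assuming a simple join-with-separator behaviour (the body here is an assumption for illustration, not the project's actual implementation):

def join_int_lists(chunks, sep):
    # Join lists of token ids, inserting `sep` (e.g. -200, the image token index)
    # at every boundary between chunks.
    out = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            out.append(sep)
        out.extend(chunk)
    return out

# e.g. join_int_lists([[1, 2], [3, 4]], -200) -> [1, 2, -200, 3, 4]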
-------------------------------------------------------------------------------- /backend/ovis2.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM 2 | 3 | from vision_qna import * 4 | 5 | #AIDC-AI/Ovis2-1B 6 | #AIDC-AI/Ovis2-2B 7 | #AIDC-AI/Ovis2-4B 8 | #AIDC-AI/Ovis2-8B 9 | #AIDC-AI/Ovis2-16B 10 | #AIDC-AI/Ovis2-34B 11 | 12 | IMAGE_TOKEN = "" 13 | 14 | class VisionQnA(VisionQnABase): 15 | model_name: str = "ovis2" 16 | format: str = "custom" 17 | visual_layers: List[str] = ['visual_tokenizer', 'vte'] 18 | 19 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 20 | super().__init__(model_id, device, device_map, extra_params, format) 21 | 22 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 23 | 24 | self.text_tokenizer = self.model.get_text_tokenizer() 25 | self.visual_tokenizer = self.model.get_visual_tokenizer() 26 | 27 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 28 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 29 | self.model = self.model.to(self.device) 30 | 31 | self.loaded_banner() 32 | 33 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 34 | conversation = [] 35 | images = [] 36 | 37 | for m in request.messages: 38 | content = '' 39 | for c in m.content: 40 | if c.type == 'image_url': 41 | image = await url_to_image(c.image_url.url) 42 | images.extend([image]) 43 | content += IMAGE_TOKEN + '\n' 44 | elif c.type == 'text': 45 | content += c.text 46 | 47 | if content: 48 | if m.role == 'user': 49 | conversation.extend([{'from': 'human', 'value': content }]) 50 | elif m.role == 'assistant': 51 | conversation.extend([{'from': 'gpt', 'value': content }]) 52 | # system is ignored 53 | 54 | if len(images) < 1: 55 | images = [ await url_to_image(black_pixel_url) ] 56 | conversation[0]['value'] = IMAGE_TOKEN + '\n' + conversation[0]['value'] 57 | 58 | _prompt, input_ids, pixel_values = self.model.preprocess_inputs(conversation, images) 59 | attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) 60 | input_ids = input_ids.unsqueeze(0).to(device=self.model.device) 61 | attention_mask = attention_mask.unsqueeze(0).to(device=self.model.device) 62 | pixel_values = [pixel_values.to(dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device)] 63 | 64 | default_params = dict( 65 | max_new_tokens=1024, 66 | do_sample=False, 67 | top_p=None, 68 | top_k=None, 69 | temperature=None, 70 | repetition_penalty=None, 71 | eos_token_id=self.model.generation_config.eos_token_id, 72 | pad_token_id=self.text_tokenizer.pad_token_id, 73 | use_cache=True, 74 | num_beams=1, 75 | ) 76 | 77 | params = self.get_generation_params(request, default_params=default_params) 78 | 79 | generation_kwargs = dict( 80 | inputs=input_ids, 81 | pixel_values=pixel_values, 82 | attention_mask=attention_mask, 83 | **params, 84 | ) 85 | 86 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.text_tokenizer, generation_kwargs=generation_kwargs): 87 | end = new_text.find(self.text_tokenizer.eos_token) 88 | if end == -1: 89 | yield new_text 90 | else: 91 | yield new_text[:end] 92 | break 93 | -------------------------------------------------------------------------------- /backend/paligemma.py: 
-------------------------------------------------------------------------------- 1 | # "google/paligemma2-3b-ft-docci-448" 2 | # "google/paligemma2-10b-ft-docci-448" 3 | # "google/paligemma2-28b-pt-896" - pretrain 4 | 5 | from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration 6 | from vision_qna import * 7 | 8 | class VisionQnA(VisionQnABase): 9 | model_name: str = "paligemma2" 10 | format: str = "gemma" # doesn't seem to actually be instruction trained 11 | visual_layers: List[str] = ["vision_tower", "multi_modal_projector"] 12 | 13 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 14 | super().__init__(model_id, device, device_map, extra_params, format) 15 | 16 | if not format: 17 | self.format = guess_model_format(model_id) 18 | 19 | for i in ['trust_remote_code']: 20 | del self.params[i] 21 | 22 | self.model = PaliGemmaForConditionalGeneration.from_pretrained(**self.params).eval() 23 | self.processor = PaliGemmaProcessor.from_pretrained(model_id) 24 | 25 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 26 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 27 | self.model = self.model.to(self.device) 28 | 29 | self.loaded_banner() 30 | 31 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 32 | images, prompt = await prompt_from_messages(request.messages, self.format) 33 | 34 | if len(images) < 1: 35 | images = [ await url_to_image(black_pixel_url) ] 36 | prompt = "\n" + prompt 37 | 38 | # Instruct the model to create a caption in English 39 | #prompt = "caption en" 40 | inputs = self.processor(text=prompt, images=images, return_tensors="pt").to(dtype=self.dtype, device=self.device) 41 | 42 | default_params = { 43 | 'do_sample': False, 44 | # 'eos_token_id': self.processor.tokenizer.eos_token_id, 45 | # 'pad_token_id': self.processor.tokenizer.eos_token_id, 46 | } 47 | 48 | params = self.get_generation_params(request, default_params=default_params) 49 | 50 | generation_kwargs = dict( 51 | **inputs, 52 | **params, 53 | ) 54 | 55 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 56 | end = new_text.find(self.processor.tokenizer.eos_token) 57 | if end == -1: 58 | yield new_text 59 | else: 60 | yield new_text[:end] 61 | break 62 | -------------------------------------------------------------------------------- /backend/phi3_v.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, AutoModelForCausalLM 2 | 3 | from vision_qna import * 4 | 5 | # microsoft/Phi-3-vision-128k-instruct 6 | # microsoft/Phi-3.5-vision-instruct 7 | # failspy/Phi-3-vision-128k-instruct-abliterated-alpha 8 | 9 | class VisionQnA(VisionQnABase): 10 | format: str = 'phi3_v' 11 | model_name: str = "phi3" 12 | vision_layers: List[str] = ["vision_embed_tokens"] 13 | 14 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 15 | super().__init__(model_id, device, device_map, extra_params, format) 16 | 17 | self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 18 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 19 | 20 | self.loaded_banner() 21 | 22 | async def stream_chat_with_images(self, request: 
ImageChatRequest) -> AsyncGenerator[str, None]: 23 | images, prompt = await phi3_prompt_from_messages(request.messages, img_tok = "<|image_{}|>\n") # numbered image token 24 | 25 | inputs = self.processor(prompt, images=images if images else None, return_tensors="pt").to(self.model.device) 26 | 27 | default_params = { 28 | "do_sample": False, 29 | "eos_token_id": self.processor.tokenizer.eos_token_id, 30 | } 31 | 32 | params = self.get_generation_params(request, default_params) 33 | 34 | generation_kwargs = dict( 35 | **inputs, 36 | **params, 37 | ) 38 | 39 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 40 | end = new_text.find(self.processor.tokenizer.eos_token) 41 | if end == -1: 42 | yield new_text 43 | else: 44 | yield new_text[:end] 45 | break 46 | -------------------------------------------------------------------------------- /backend/qh_360vl.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | 3 | import transformers 4 | import warnings 5 | # disable some warnings 6 | transformers.logging.set_verbosity_error() 7 | warnings.filterwarnings('ignore') 8 | 9 | from vision_qna import * 10 | # qihoo360/360VL-8B 11 | # qihoo360/360VL-70B 12 | 13 | # 4bit 70B: 14 | # RuntimeError: mat1 and mat2 shapes cannot be multiplied (1170x8192 and 1x3584) 15 | 16 | class VisionQnA(VisionQnABase): 17 | model_name: str = "360vl" 18 | format: str = "llama3" 19 | vision_layers: List[str] = ["vision_tower"] 20 | 21 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 22 | super().__init__(model_id, device, device_map, extra_params, format) 23 | 24 | if not format: 25 | self.format = guess_model_format(model_id) 26 | 27 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 28 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 29 | 30 | self.vision_tower = self.model.get_vision_tower() 31 | self.vision_tower.load_model() 32 | self.vision_tower.to(device=self.device, dtype=self.dtype) 33 | self.image_processor = self.vision_tower.image_processor 34 | self.tokenizer.pad_token = self.tokenizer.eos_token 35 | 36 | self.terminators = [ 37 | self.tokenizer.convert_tokens_to_ids("<|eot_id|>",) 38 | ] 39 | 40 | self.loaded_banner() 41 | 42 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 43 | images, prompt = await llama3_prompt_from_messages(request.messages, img_tok = "<|reserved_special_token_44|>\n") 44 | 45 | default_system = "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
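        # Note: the image placeholder used in the prompt above, <|reserved_special_token_44|>,
        # corresponds to token id 128049 in the Llama-3 vocabulary. The lines below look for that
        # single id and swap it for -200 (the model's internal image-token index), then preprocess
        # the first image with the sliding-window processor; if no such id is found, the
        # ValueError from .index() is silently ignored and no image tensor is prepared.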
46 | 47 | input_ids = self.tokenizer.encode(prompt, return_tensors="pt") 48 | 49 | input_id_list = input_ids[0].tolist() 50 | try: 51 | input_id_list[input_id_list.index(128049)] = -200 52 | image_tensor = self.model.process_images_slid_window(images[0], self.image_processor).unsqueeze(0) 53 | except ValueError as e: 54 | pass 55 | 56 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype, device=input_ids.device).unsqueeze(0) 57 | 58 | default_params = dict( 59 | num_beams=1, 60 | ) 61 | 62 | params = self.get_generation_params(request, default_params) 63 | 64 | generation_kwargs = dict( 65 | input_ids=input_ids.to(device=self.device), 66 | images=image_tensor.to(dtype=self.dtype, device=self.device) if images else None, 67 | eos_token_id=self.terminators, 68 | **params, 69 | ) 70 | 71 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 72 | end = new_text.find(self.tokenizer.eos_token) 73 | if end == -1: 74 | yield new_text 75 | else: 76 | yield new_text[:end] 77 | break 78 | -------------------------------------------------------------------------------- /backend/qwen.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | import os 4 | from vision_qna import * 5 | 6 | # "Qwen/Qwen-VL-Chat" # 13GB 7 | # "Qwen/Qwen-VL-Chat-int4" # 11GB (bad, bugs) 8 | 9 | class VisionQnA(VisionQnABase): 10 | model_name: str = "qwen" 11 | format: 'chatml' 12 | vision_layers: List[str] = ['visual'] 13 | 14 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 15 | super().__init__(model_id, device, device_map, extra_params, format) 16 | 17 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 18 | self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() 19 | 20 | self.loaded_banner() 21 | 22 | async def chat_with_images(self, request: ImageChatRequest) -> str: 23 | prompt, history, files, system_prompt = await prompt_history_images_system_from_messages( 24 | request.messages, img_tok='', url_handler=url_to_file) 25 | 26 | if system_prompt is None: 27 | system_prompt = "You are an helpful assistant." 28 | 29 | # 1st dialogue turn 30 | if files: 31 | query_list = [{'image': files[-1]}, {'text': prompt}] 32 | else: 33 | query_list = [{'text': prompt}] 34 | 35 | query = self.tokenizer.from_list_format(query_list) 36 | 37 | default_params = { 38 | 'top_p': 0.3, 39 | } 40 | 41 | params = self.get_generation_params(request) 42 | 43 | answer, history = self.model.chat(self.tokenizer, query=query, history=history, system=system_prompt, **params) 44 | 45 | for f in files: 46 | os.remove(f) 47 | 48 | return answer 49 | 50 | """ 51 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 52 | try: 53 | prompt, history, files, system_prompt = await prompt_history_images_system_from_messages( 54 | request.messages, img_tok='', url_handler=url_to_file) 55 | 56 | # 1st dialogue turn 57 | query = self.tokenizer.from_list_format([ 58 | {'image': files[-1] if files else []}, 59 | {'text': prompt}, 60 | ]) 61 | 62 | if system_prompt is None: 63 | system_prompt = "You are an helpful assistant." 64 | 65 | max_window_size = 16384 # generation_config.max_window_size 66 | 67 | # XXX make_context isn't available. 
68 | raw_text, context_tokens = self.model.make_context( 69 | self.tokenizer, 70 | query, 71 | history=history, 72 | system=system_prompt, 73 | max_window_size=max_window_size, 74 | chat_format=self.format, 75 | ) 76 | 77 | input_ids = torch.tensor([context_tokens]).to(self.model.device) 78 | 79 | inputs = dict( 80 | input_ids=input_ids, 81 | stop_words_ids=[[self.tokenizer.im_end_id], [self.tokenizer.im_start_id]], 82 | return_dict_in_generate=False, 83 | ) 84 | 85 | default_params = { 86 | 'top_p': 0.3, 87 | } 88 | 89 | params = self.get_generation_params(request, default_params=default_params) 90 | 91 | generation_kwargs = dict( 92 | **inputs, 93 | **params, 94 | ) 95 | 96 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 97 | end = new_text.find(self.tokenizer.eos_token) 98 | if end == -1: 99 | yield new_text 100 | else: 101 | yield new_text[:end] 102 | break 103 | 104 | except Exception as e: 105 | logger.error(e) 106 | # raise 107 | 108 | finally: 109 | for f in files: 110 | os.remove(f) 111 | """ 112 | # XXX native streaming doesn't work 113 | """ 114 | File "/app/backend/qwen-vl.py", line 72, in stream_chat_with_images 115 | for new_text in streamer: 116 | File "/app/hf_home/modules/transformers_modules/Qwen/Qwen-VL-Chat/f57cfbd358cb56b710d963669ad1bcfb44cdcdd8/modeling_qwen.py", line 1021, in stream_generator 117 | for token in self.generate_stream( 118 | ^^^^^^^^^^^^^^^^^^^^^ 119 | File "/usr/local/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context 120 | return func(*args, **kwargs) 121 | ^^^^^^^^^^^^^^^^^^^^^ 122 | File "/usr/local/lib/python3.11/site-packages/transformers_stream_generator/main.py", line 208, in generate 123 | ] = self._prepare_attention_mask_for_generation( 124 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 125 | File "/usr/local/lib/python3.11/site-packages/transformers/generation/utils.py", line 473, in _prepare_attention_mask_for_generation 126 | torch.isin(elements=inputs, test_elements=pad_token_id).any() 127 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 128 | TypeError: isin() received an invalid combination of arguments - got (test_elements=int, elements=Tensor, ), but expected one of: 129 | * (Tensor elements, Tensor test_elements, *, bool assume_unique, bool invert, Tensor out) 130 | * (Number element, Tensor test_elements, *, bool assume_unique, bool invert, Tensor out) 131 | * (Tensor elements, Number test_element, *, bool assume_unique, bool invert, Tensor out) 132 | """ -------------------------------------------------------------------------------- /backend/qwen2_5_vl.py: -------------------------------------------------------------------------------- 1 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor 2 | from qwen_vl_utils import process_vision_info 3 | 4 | import os 5 | from vision_qna import * 6 | 7 | #Qwen/Qwen2.5-VL-72B-Instruct 8 | #Qwen/Qwen2.5-VL-72B-Instruct-AWQ 9 | #Qwen/Qwen2.5-VL-7B-Instruct 10 | #Qwen/Qwen2.5-VL-7B-Instruct-AWQ 11 | #Qwen/Qwen2.5-VL-3B-Instruct 12 | #Qwen/Qwen2.5-VL-3B-Instruct-AWQ 13 | 14 | 15 | class VisionQnA(VisionQnABase): 16 | model_name: str = "qwen2_5_vl" 17 | format: 'chatml' 18 | vision_layers: List[str] = ['visual'] 19 | 20 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 21 | super().__init__(model_id, device, device_map, extra_params, format) 22 | 23 | if ('awq' in 
model_id.lower() or 'gptq' in model_id.lower()) and self.dtype == torch.bfloat16: 24 | self.dtype = self.params['torch_dtype'] = torch.float16 # recommended 25 | 26 | self.processor = AutoProcessor.from_pretrained(model_id) 27 | 28 | del self.params['trust_remote_code'] 29 | 30 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(**self.params).eval() 31 | 32 | self.loaded_banner() 33 | 34 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 35 | # image_tag = '<|vision_start|><|image_pad|><|vision_end|>' 36 | 37 | messages = [] 38 | 39 | for m in request.messages: 40 | if m.role == 'user': 41 | msg = { 'role': m.role, 'content': [] } 42 | for c in m.content: 43 | if c.type == 'image_url': 44 | msg['content'].extend([{'type': c.type, 'image': c.image_url.url}]) 45 | elif c.type == 'text': 46 | msg['content'].extend([{'type': c.type, 'text': c.text}]) 47 | elif c.type == 'video': # not likely to work. 48 | msg['content'].extend([{'type': c.type, 'video': c.image_url.url}]) 49 | else: 50 | ctext = "".join([c.text for c in m.content]) # fix for multiple system prompt contents #19 51 | msg = { 'role': m.role, 'content': [{ 'type': 'text', 'text': ctext }] } 52 | 53 | messages.extend([msg]) 54 | 55 | text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 56 | 57 | image_inputs, video_inputs = process_vision_info(messages) 58 | inputs = self.processor( 59 | text=[text], 60 | images=image_inputs, 61 | videos=video_inputs, 62 | padding=True, 63 | return_tensors="pt", 64 | ) 65 | inputs = inputs.to(self.device) 66 | 67 | params = self.get_generation_params(request, default_params={}) 68 | 69 | generation_kwargs = dict( 70 | **inputs, 71 | **params, 72 | ) 73 | 74 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 75 | end = new_text.find(self.processor.tokenizer.eos_token) 76 | if end == -1: 77 | yield new_text 78 | else: 79 | yield new_text[:end] 80 | break 81 | -------------------------------------------------------------------------------- /backend/qwen2_vl.py: -------------------------------------------------------------------------------- 1 | from transformers import Qwen2VLForConditionalGeneration, AutoProcessor 2 | from qwen_vl_utils import process_vision_info 3 | 4 | import os 5 | from vision_qna import * 6 | 7 | # Qwen/Qwen2-VL-2B-Instruct-AWQ 8 | # Qwen/Qwen2-VL-2B-Instruct 9 | # Qwen/Qwen2-VL-7B-Instruct-AWQ 10 | # Qwen/Qwen2-VL-7B-Instruct 11 | # Qwen/Qwen2-VL-72B-Instruct-AWQ 12 | # Qwen/Qwen2-VL-72B-Instruct 13 | # Qwen/QVQ-72B-Preview 14 | # Not recommended: 15 | # X Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4 16 | # X Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8 17 | # X Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4 18 | # X Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8 19 | # X Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 20 | # X Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8 21 | 22 | # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4 23 | # Performance: for A100 80GB Qwen claim 30-40 T/s, I can't reproduce with this setup, I see more like 5-10 T/s. 
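# For reference, stream_chat_with_images below reshapes the OpenAI-style request into the
# message format consumed by qwen_vl_utils.process_vision_info before templating; an
# illustrative user turn (values made up, not taken from a real request) ends up as:
# [{'role': 'user', 'content': [
#     {'type': 'image_url', 'image': 'data:image/png;base64,...'},
#     {'type': 'text', 'text': 'Describe this image.'}]}]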
24 | 25 | class VisionQnA(VisionQnABase): 26 | model_name: str = "qwen2_vl" 27 | format: 'chatml' 28 | vision_layers: List[str] = ['visual'] 29 | 30 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 31 | super().__init__(model_id, device, device_map, extra_params, format) 32 | 33 | if ('awq' in model_id.lower() or 'gptq' in model_id.lower()) and self.dtype == torch.bfloat16: 34 | self.dtype = self.params['torch_dtype'] = torch.float16 # recommended 35 | 36 | self.processor = AutoProcessor.from_pretrained(model_id) 37 | 38 | del self.params['trust_remote_code'] 39 | 40 | self.model = Qwen2VLForConditionalGeneration.from_pretrained(**self.params).eval() 41 | 42 | self.loaded_banner() 43 | 44 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 45 | # image_tag = '<|vision_start|><|image_pad|><|vision_end|>' 46 | 47 | messages = [] 48 | 49 | for m in request.messages: 50 | if m.role == 'user': 51 | msg = { 'role': m.role, 'content': [] } 52 | for c in m.content: 53 | if c.type == 'image_url': 54 | msg['content'].extend([{'type': c.type, 'image': c.image_url.url}]) 55 | elif c.type == 'text': 56 | msg['content'].extend([{'type': c.type, 'text': c.text}]) 57 | elif c.type == 'video': # not likely to work. 58 | msg['content'].extend([{'type': c.type, 'video': c.image_url.url}]) 59 | else: 60 | ctext = "".join([c.text for c in m.content]) # fix for multiple system prompt contents #19 61 | msg = { 'role': m.role, 'content': [{ 'type': 'text', 'text': ctext }] } 62 | 63 | messages.extend([msg]) 64 | 65 | text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 66 | 67 | image_inputs, video_inputs = process_vision_info(messages) 68 | inputs = self.processor( 69 | text=[text], 70 | images=image_inputs, 71 | videos=video_inputs, 72 | padding=True, 73 | return_tensors="pt", 74 | ) 75 | inputs = inputs.to(self.device) 76 | 77 | params = self.get_generation_params(request, default_params={}) 78 | 79 | generation_kwargs = dict( 80 | **inputs, 81 | **params, 82 | ) 83 | 84 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): 85 | end = new_text.find(self.processor.tokenizer.eos_token) 86 | if end == -1: 87 | yield new_text 88 | else: 89 | yield new_text[:end] 90 | break 91 | -------------------------------------------------------------------------------- /backend/xcomposer2-vl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.amp 3 | from transformers import AutoTokenizer, AutoModel 4 | from vision_qna import * 5 | import auto_gptq 6 | import torch 7 | 8 | import transformers 9 | import warnings 10 | # disable some warnings 11 | transformers.logging.set_verbosity_error() 12 | warnings.filterwarnings('ignore') 13 | 14 | # internlm/internlm-xcomposer2-vl-7b # ~21GB 15 | # internlm/internlm-xcomposer2-vl-7b-4bit # ~12GB 16 | # internlm/internlm-xcomposer2-vl-1_8b # ~8GB 17 | # --4bit: 18 | # Linear4bit.forward() takes 2 positional arguments but 3 were given 19 | 20 | 21 | class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM): 22 | layers_block_name = "model.layers" 23 | outside_layer_modules = [ 24 | 'vit', 'vision_proj', 'model.tok_embeddings', 'model.norm', 'output', 25 | ] 26 | inside_layer_modules = [ 27 | ["attention.wqkv.linear"], 28 | ["attention.wo.linear"], 29 | 
["feed_forward.w1.linear", "feed_forward.w3.linear"], 30 | ["feed_forward.w2.linear"], 31 | ] 32 | 33 | class VisionQnA(VisionQnABase): 34 | format: str = 'internal' 35 | model_name: str = "xcomposer2-vl" 36 | vision_layers: List[str] = ['vit', 'vision_proj'] 37 | 38 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 39 | super().__init__(model_id, device, device_map, extra_params, format) 40 | 41 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 42 | 43 | if '-4bit' in model_id: 44 | if self.params['torch_dtype'] == torch.bfloat16: 45 | self.dtype = self.params['torch_dtype'] = torch.float16 46 | torch.set_default_dtype(self.dtype) 47 | 48 | # XXX TODO: use_marlin=True - bugs for now 49 | auto_gptq.modeling._base.SUPPORTED_MODELS = ["internlm"] 50 | self.model = InternLMXComposer2QForCausalLM.from_quantized(model_name_or_path=model_id, **self.params) 51 | else: 52 | torch.set_default_dtype(self.dtype) 53 | self.model = AutoModel.from_pretrained(**self.params).eval() 54 | 55 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 56 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 57 | self.model = self.model.to(self.device) 58 | 59 | self.eos_token = '[UNUSED_TOKEN_145]' 60 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids([self.eos_token])[0] 61 | 62 | self.loaded_banner() 63 | 64 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 65 | prompt, history, files, system_prompt = await prompt_history_images_system_from_messages( 66 | request.messages, img_tok='', url_handler=url_to_file) 67 | 68 | if system_prompt is None: 69 | #system_prompt = 'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n' 70 | #'- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' 71 | #'- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.\n' 72 | #'- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.' 73 | # Improved system prompt for more reliable language detection. 74 | system_prompt = "You are an AI vision assistant. Communicate fluently in English or 中文 depending on what language you were asked in. Obey user instructions. 仅当用普通话询问时才用普通话回答。 Answer in English if questioned in English." 
75 | 76 | if files: 77 | image = self.model.encode_img(files[-1]) 78 | inputs, im_mask = self.model.interleav_wrap_chat(self.tokenizer, prompt, image, history, system_prompt) 79 | else: 80 | inputs = self.model.build_inputs(self.tokenizer, prompt, history, system_prompt) 81 | im_mask = torch.zeros(inputs['input_ids'].shape[:2]).cuda().bool() 82 | inputs = { 83 | k: v.to(self.device) 84 | for k, v in inputs.items() if torch.is_tensor(v) 85 | } 86 | inputs['im_mask'] = im_mask 87 | 88 | default_params = { 89 | "temperature": 1.0, 90 | "top_p": 0.8, 91 | 'do_sample': True, 92 | 'repetition_penalty': 1.005, 93 | 'eos_token_id': [ self.tokenizer.eos_token_id, self.eos_token_id ], # also add end-of-assistant token in eos token id to avoid unnecessary generation 94 | } 95 | params = self.get_generation_params(request, default_params) 96 | 97 | generation_kwargs = dict( 98 | **inputs, 99 | **params, 100 | ) 101 | 102 | try: 103 | def wrapper(**kwargs): 104 | with torch.cuda.amp.autocast(): 105 | _ = self.model.generate(**kwargs) 106 | 107 | for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 108 | end = new_text.find(self.eos_token) 109 | if end == -1: 110 | yield new_text 111 | else: 112 | yield new_text[:end] 113 | break 114 | 115 | except Exception as e: 116 | logger.error(e) 117 | # raise 118 | 119 | finally: 120 | for f in files: 121 | os.remove(f) 122 | 123 | -------------------------------------------------------------------------------- /backend/xcomposer2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from transformers import AutoTokenizer, AutoModel, logging 3 | from vision_qna import * 4 | import auto_gptq 5 | import torch 6 | 7 | import warnings 8 | # disable some warnings 9 | logging.set_verbosity_error() 10 | warnings.filterwarnings('ignore') 11 | 12 | # internlm/internlm-xcomposer2-7b 13 | # --4bit | TypeError: Linear4bit.forward() takes 2 positional arguments but 3 were given 14 | # internlm/internlm-xcomposer2-7b-4bit # pretty bad. 15 | 16 | class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM): 17 | layers_block_name = "model.layers" 18 | outside_layer_modules = [ 19 | 'vit', 'vision_proj', 'model.tok_embeddings', 'model.norm', 'output', 20 | ] 21 | inside_layer_modules = [ 22 | ["attention.wqkv.linear"], 23 | ["attention.wo.linear"], 24 | ["feed_forward.w1.linear", "feed_forward.w3.linear"], 25 | ["feed_forward.w2.linear"], 26 | ] 27 | 28 | class VisionQnA(VisionQnABase): 29 | format: str = 'internal' 30 | model_name: str = "xcomposer2" 31 | vision_layers: List[str] = ['vit', 'vision_proj'] 32 | 33 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 34 | super().__init__(model_id, device, device_map, extra_params, format) 35 | 36 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 37 | if '-4bit' in model_id: 38 | if self.params['torch_dtype'] == torch.bfloat16: 39 | self.dtype = self.params['torch_dtype'] = torch.float16 # fix bugs. 
40 | torch.set_default_dtype(self.dtype) 41 | 42 | # XXX TODO: use_marlin=True 43 | auto_gptq.modeling._base.SUPPORTED_MODELS = ["internlm"] 44 | self.model = InternLMXComposer2QForCausalLM.from_quantized(model_name_or_path=model_id, **self.params) 45 | else: 46 | torch.set_default_dtype(self.dtype) 47 | self.model = AutoModel.from_pretrained(**self.params).eval() 48 | 49 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 50 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 51 | self.model = self.model.to(self.device) 52 | 53 | self.eos_token = '[UNUSED_TOKEN_145]' 54 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids([self.eos_token])[0] 55 | 56 | self.loaded_banner() 57 | 58 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 59 | query, history, images, meta_instruction = await prompt_history_images_system_from_messages( 60 | request.messages, img_tok='', url_handler=url_to_image) 61 | 62 | if meta_instruction is None: 63 | #meta_instruction = 'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n' 64 | #'- InternLM-XComposer (浦语·灵笔) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' 65 | #'- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.' 66 | # This only works if the input is in English, Chinese input still receives Chinese output. 67 | meta_instruction = "You are an AI visual assistant. Communicate in English. Do what the user instructs." 68 | 69 | # this may be the only difference with, -vl, images in PIL instead of files. 
70 | image = torch.stack([self.model.vis_processor(image) for image in images]) if images else None 71 | 72 | if image is None: 73 | inputs = self.model.build_inputs(self.tokenizer, query, history, meta_instruction) 74 | im_mask = torch.zeros(inputs['input_ids'].shape[:2]).cuda().bool() 75 | else: 76 | image = self.model.encode_img(image) 77 | inputs, im_mask = self.model.interleav_wrap_chat(self.tokenizer, query, image, history, meta_instruction) 78 | inputs = { 79 | k: v.to(self.device) 80 | for k, v in inputs.items() if torch.is_tensor(v) 81 | } 82 | inputs['im_mask'] = im_mask 83 | 84 | default_params = { 85 | "temperature": 1.0, 86 | "top_p": 0.8, 87 | 'do_sample': True, 88 | } 89 | 90 | params = self.get_generation_params(request, default_params) 91 | 92 | generation_kwargs = dict( 93 | **inputs, 94 | **params, 95 | ) 96 | 97 | try: 98 | def wrapper(**kwargs): 99 | with torch.cuda.amp.autocast(): 100 | _ = self.model.generate(**kwargs) 101 | 102 | for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 103 | end = new_text.find(self.eos_token) 104 | if end == -1: 105 | yield new_text 106 | else: 107 | yield new_text[:end] 108 | break 109 | 110 | except Exception as e: 111 | logger.error(e) 112 | # raise 113 | -------------------------------------------------------------------------------- /backend/xgenmm.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria 2 | 3 | from vision_qna import * 4 | 5 | # Salesforce/xgen-mm-phi3-mini-instruct-r-v1 6 | # xgen-mm-phi3-mini-instruct-interleave-r-v1.5 7 | # xgen-mm-phi3-mini-instruct-singleimg-r-v1.5 8 | # xgen-mm-phi3-mini-instruct-dpo-r-v1.5 9 | 10 | class VisionQnA(VisionQnABase): 11 | model_name: str = "xgenmm" 12 | format: str = 'phi3' 13 | vision_layers: List[str] = ['vision_encoder', 'vision_tokenizer'] 14 | 15 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 16 | super().__init__(model_id, device, device_map, extra_params, format) 17 | 18 | # Doesn't work with accelerate 19 | # Errors: 20 | # NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device. 21 | # also doesn't work with --load-in-4bit for the same reason 22 | self.params['low_cpu_mem_usage'] = False 23 | del self.params['device_map'] 24 | 25 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False), use_fast=False, legacy=False) 26 | self.model = AutoModelForVision2Seq.from_pretrained(**self.params).eval() 27 | 28 | # bitsandbytes already moves the model to the device, so we don't need to do it again. 
29 | if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): 30 | self.model = self.model.to(self.device) 31 | 32 | self.tokenizer = self.model.update_special_tokens(self.tokenizer) 33 | 34 | self.image_processor = AutoImageProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) 35 | self.eos_token = "<|end|>" 36 | 37 | self.loaded_banner() 38 | 39 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 40 | images, prompt = await phi3_prompt_from_messages(request.messages, img_tok = "\n") 41 | default_system = ("A chat between a curious user and an artificial intelligence assistant. " 42 | "The assistant gives helpful, detailed, and polite answers to the user's questions.") 43 | 44 | inputs = self.tokenizer([prompt], return_tensors="pt").to(device=self.model.device) 45 | image_sizes = [] 46 | 47 | if images: 48 | image_sizes = [img.size for img in images] 49 | 50 | if 'r-v1.5' in self._model_id: 51 | image_sizes = [image_sizes] 52 | 53 | image_list = [] 54 | for img in images: 55 | image_list.append(self.image_processor([img], return_tensors="pt", image_aspect_ratio='anyres')['pixel_values'].to(dtype=self.model.dtype)) 56 | 57 | inputs['pixel_values'] = [image_list] 58 | else: 59 | inputs['pixel_values'] = self.image_processor(images, return_tensors="pt", image_aspect_ratio='anyres')['pixel_values'].to(dtype=self.model.dtype) 60 | 61 | else: 62 | inputs['pixel_values'] = None 63 | 64 | default_params = { 65 | 'pad_token_id': self.tokenizer.pad_token_id, 66 | 'eos_token_id': 32007, # <|end|> 67 | 'do_sample': False, 68 | 'max_new_tokens': 768, 69 | 'top_p': None, 70 | 'num_beams': 1, 71 | 'image_size': image_sizes, 72 | } 73 | 74 | params = self.get_generation_params(request, default_params=default_params) 75 | 76 | # errors 77 | del params['use_cache'] 78 | 79 | generation_kwargs = dict( 80 | **inputs, 81 | **params, 82 | ) 83 | 84 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 85 | end = new_text.find(self.eos_token) 86 | if end == -1: 87 | yield new_text 88 | else: 89 | yield new_text[:end] 90 | break 91 | -------------------------------------------------------------------------------- /backend/yi-vl.py: -------------------------------------------------------------------------------- 1 | from llava.conversation import conv_templates 2 | from llava.mm_utils import ( 3 | expand2square, 4 | get_model_name_from_path, 5 | load_pretrained_model, 6 | tokenizer_image_token, 7 | ) 8 | from llava.model import LlavaLlamaForCausalLM 9 | from llava.model.constants import IMAGE_TOKEN_INDEX 10 | 11 | from transformers import AutoTokenizer 12 | 13 | 14 | from vision_qna import * 15 | 16 | # 01-ai/Yi-VL-34B 17 | # 01-ai/Yi-VL-6B 18 | 19 | class VisionQnA(VisionQnABase): 20 | model_name: str = "qwen-vl" 21 | format: str = 'chatml' 22 | 23 | def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): 24 | super().__init__(model_id, device, device_map, extra_params, format) 25 | 26 | if not format: 27 | self.format = guess_model_format(model_id) 28 | 29 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) 30 | self.model = LlavaLlamaForCausalLM.from_pretrained(**self.params) 31 | 32 | image_processor = None 33 | self.model.resize_token_embeddings(len(self.tokenizer)) 34 | self.vision_tower = self.model.get_vision_tower() 35 
| 36 | if not self.vision_tower.is_loaded: 37 | self.vision_tower.load_model() 38 | self.vision_tower.to(device=self.model.device, dtype=self.model.dtype) 39 | self.image_processor = self.vision_tower.image_processor 40 | 41 | if hasattr(self.model.config, "max_sequence_length"): 42 | context_len = self.model.config.max_sequence_length 43 | else: 44 | context_len = 2048 45 | 46 | print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") 47 | 48 | async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: 49 | # XXX 50 | images, prompt = await prompt_from_messages(request.messages, self.format) 51 | 52 | encoded_images = self.model.encode_image(images) 53 | inputs = self.tokenizer(prompt, encoded_images, return_tensors="pt") 54 | 55 | params = self.get_generation_params(request) 56 | 57 | generation_kwargs = dict( 58 | **inputs, 59 | **params, 60 | ) 61 | 62 | for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs): 63 | end = new_text.find(self.tokenizer.eos_token) 64 | if end == -1: 65 | yield new_text 66 | else: 67 | yield new_text[:end] 68 | break 69 | -------------------------------------------------------------------------------- /chat_with_image.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | try: 3 | import dotenv 4 | dotenv.load_dotenv(override=True) 5 | except: 6 | pass 7 | 8 | import os 9 | import requests 10 | import argparse 11 | from datauri import DataURI 12 | from openai import OpenAI 13 | 14 | 15 | def url_for_api(img_url: str = None, filename: str = None, always_data=False) -> str: 16 | if img_url.startswith('http'): 17 | response = requests.get(img_url) 18 | 19 | img_data = response.content 20 | content_type = response.headers['content-type'] 21 | return str(DataURI.make(mimetype=content_type, charset='utf-8', base64=True, data=img_data)) 22 | elif img_url.startswith('file:'): 23 | img_url = img_url.replace('file://', '').replace('file:', '') 24 | return str(DataURI.from_file(img_url)) 25 | 26 | return img_url 27 | 28 | if __name__ == '__main__': 29 | # Initialize argparse 30 | parser = argparse.ArgumentParser(description='Test vision using OpenAI', 31 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 32 | parser.add_argument('-s', '--system-prompt', type=str, default=None, help="Set a system prompt.") 33 | parser.add_argument('--openai-model', type=str, default="gpt-4-vision-preview", help="OpenAI model to use.") 34 | parser.add_argument('-S', '--start-with', type=str, default=None, help="Start reply with, ex. 
'Sure, ' (doesn't work with all models)") 35 | parser.add_argument('-m', '--max-tokens', type=int, default=None, help="Max tokens to generate.") 36 | parser.add_argument('-t', '--temperature', type=float, default=None, help="Temperature.") 37 | parser.add_argument('-p', '--top_p', type=float, default=None, help="top_p") 38 | parser.add_argument('-u', '--keep-remote-urls', action='store_true', help="Normally, http urls are converted to data: urls for better latency.") 39 | parser.add_argument('-1', '--single', action='store_true', help='Single turn Q&A, output is only the model response.') 40 | parser.add_argument('--no-stream', action='store_true', help='Disable streaming response.') 41 | parser.add_argument('image_url', type=str, help='URL or image file to be tested') 42 | parser.add_argument('questions', type=str, nargs='*', help='The question to ask the image') 43 | args = parser.parse_args() 44 | 45 | client = OpenAI(base_url=os.environ.get('OPENAI_BASE_URL', 'http://localhost:5006/v1'), 46 | api_key=os.environ.get('OPENAI_API_KEY', 'sk-ip')) 47 | 48 | params = {} 49 | if args.max_tokens is not None: 50 | params['max_tokens'] = args.max_tokens 51 | if args.temperature is not None: 52 | params['temperature'] = args.temperature 53 | if args.top_p is not None: 54 | params['top_p'] = args.top_p 55 | params['stream'] = not args.no_stream 56 | 57 | image_url = args.image_url 58 | 59 | if not image_url.startswith('http'): 60 | image_url = str(DataURI.from_file(image_url)) 61 | elif not args.keep_remote_urls: 62 | image_url = url_for_api(image_url) 63 | 64 | messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else [] 65 | content = [{ "type": "image_url", "image_url": { "url": image_url } }, 66 | { "type": "text", "text": ' '.join(args.questions) }] 67 | messages.extend([{ "role": "user", "content": content }]) 68 | 69 | while True: 70 | if args.start_with: 71 | messages.extend([{ "role": "assistant", "content": [{ "type": "text", "text": args.start_with }] }]) 72 | 73 | response = client.chat.completions.create(model=args.openai_model, messages=messages, **params) 74 | 75 | if not args.single: 76 | print(f"Answer: ", end='', flush=True) 77 | 78 | assistant_text = '' 79 | 80 | if args.no_stream: 81 | assistant_text = response.choices[0].message.content 82 | print(assistant_text) 83 | else: 84 | for chunk in response: 85 | assistant_text += chunk.choices[0].delta.content 86 | print(chunk.choices[0].delta.content, end='', flush=True) 87 | 88 | print('') 89 | 90 | if args.single: 91 | break 92 | 93 | image_url = None 94 | try: 95 | q = input("\nQuestion: ") 96 | 97 | if q.startswith('http') or q.startswith('data:') or q.startswith('file:'): 98 | image_url = q 99 | if image_url.startswith('http') and args.keep_remote_urls: 100 | pass 101 | else: 102 | image_url = url_for_api(image_url) 103 | 104 | q = input("Question: ") 105 | 106 | except EOFError as e: 107 | print('') 108 | break 109 | 110 | content = [{"type": "image_url", "image_url": { "url": image_url } }] if image_url else [] 111 | content.extend([{ 'type': 'text', 'text': assistant_text }]) 112 | messages.extend([{ "role": "assistant", "content": content }, 113 | { "role": "user", "content": [{ 'type': 'text', 'text': q }] }]) 114 | 115 | 116 | -------------------------------------------------------------------------------- /debug_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import torch 4 | import 
re 5 | import sys 6 | import pkg_resources 7 | 8 | def get_cuda_info(): 9 | print("---- cuda") 10 | print(f"CUDA version: {torch.version.cuda}") 11 | print(f"torch.cuda.is_available(): {torch.cuda.is_available()}") 12 | for i in range(0, torch.cuda.device_count()): 13 | print(f"CUDA device[{i}]{torch.cuda.get_device_capability(i)}: {torch.cuda.get_device_name(i)}") 14 | 15 | def get_python_version(): 16 | print("---- python") 17 | print(sys.version) 18 | 19 | def get_pip_packages(): 20 | print("---- pip") 21 | try: 22 | packages = set(["transformers"]) 23 | with open("requirements.txt", "r") as f: 24 | for line in f.readlines(): 25 | line = line.strip() 26 | if not line or line.startswith("#") or line.startswith("http:") or line.startswith("https:") or line.startswith("git+"): 27 | continue 28 | package = re.split(r"[=<>;#\[ ]", line)[0] 29 | packages.add(package) 30 | 31 | for package in sorted(list(packages)): 32 | try: 33 | version = pkg_resources.get_distribution(package).version 34 | print(f"{package}=={version}") 35 | except pkg_resources.DistributionNotFound: 36 | print(f"{package}: Not found") 37 | 38 | except FileNotFoundError: 39 | print("requirements.txt not found") 40 | 41 | get_cuda_info() 42 | get_python_version() 43 | get_pip_packages() 44 | -------------------------------------------------------------------------------- /docker-compose.alt.yml: -------------------------------------------------------------------------------- 1 | services: 2 | openedai-vision-alt: 3 | build: 4 | args: 5 | - VERSION=alt 6 | - USER_ID=${UID:-1000} 7 | - GROUP_ID=${GID:-1000} 8 | dockerfile: Dockerfile 9 | user: ${UID:-1000}:${GID:-1000} 10 | container_name: openedai-vision-alt 11 | image: ghcr.io/matatonic/openedai-vision-alt 12 | env_file: vision-alt.env # your settings go here 13 | volumes: 14 | - ./hf_home:/app/hf_home # for Hugging Face model cache 15 | - ./model_conf_tests.alt.json:/app/model_conf_tests.json 16 | ports: 17 | - 5006:5006 18 | #runtime: nvidia 19 | deploy: 20 | resources: 21 | reservations: 22 | devices: 23 | - driver: nvidia 24 | #device_ids: ['0', '1'] # Select a gpu, or 25 | count: all 26 | capabilities: [gpu] 27 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | openedai-vision: 3 | build: 4 | args: 5 | - VERSION=latest 6 | - USER_ID=${UID:-1000} 7 | - GROUP_ID=${GID:-1000} 8 | dockerfile: Dockerfile 9 | user: ${UID:-1000}:${GID:-1000} 10 | container_name: openedai-vision 11 | image: ghcr.io/matatonic/openedai-vision 12 | env_file: vision.env # your settings go here 13 | volumes: 14 | - ./hf_home:/app/hf_home # for Hugging Face model cache 15 | - ./model_conf_tests.json:/app/model_conf_tests.json 16 | ports: 17 | - 5006:5006 18 | #runtime: nvidia 19 | deploy: 20 | resources: 21 | reservations: 22 | devices: 23 | - driver: nvidia 24 | #device_ids: ['0', '1'] # Select a gpu, or 25 | count: all 26 | capabilities: [gpu] 27 | -------------------------------------------------------------------------------- /hf_home/hf_home.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matatonic/openedai-vision/13a9c8d9e8c82f157665f8e382f09a008d547fec/hf_home/hf_home.txt -------------------------------------------------------------------------------- /model_conf_tests.alt.json: -------------------------------------------------------------------------------- 1 | [ 2 | 
["AIDC-AI/Ovis1.6-Gemma2-9B", "-A", "flash_attention_2"], 3 | ["AIDC-AI/Ovis1.6-Gemma2-27B", "-A", "flash_attention_2"], 4 | ["AIDC-AI/Ovis1.5-Gemma2-9B", "-A", "flash_attention_2"], 5 | ["omlab/omchat-v2.0-13B-single-beta_hf", "-A", "flash_attention_2"], 6 | ["BAAI/Bunny-Llama-3-8B-V", "--load-in-4bit"], 7 | ["BAAI/Bunny-Llama-3-8B-V"], 8 | ["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"], 9 | ["BAAI/Bunny-v1_0-2B-zh"], 10 | ["BAAI/Bunny-v1_0-3B", "--load-in-4bit"], 11 | ["BAAI/Bunny-v1_0-3B"], 12 | ["BAAI/Bunny-v1_0-3B-zh"], 13 | ["BAAI/Bunny-v1_0-4B", "--load-in-4bit"], 14 | ["BAAI/Bunny-v1_0-4B"], 15 | ["BAAI/Bunny-v1_1-4B", "--load-in-4bit"], 16 | ["BAAI/Bunny-v1_1-4B"], 17 | ["BAAI/Bunny-v1_1-Llama-3-8B-V", "--load-in-4bit"], 18 | ["BAAI/Bunny-v1_1-Llama-3-8B-V"], 19 | ["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"], 20 | ["BAAI/Emu3-Chat", "--load-in-4bit", "-A", "flash_attention_2"], 21 | ["BAAI/Emu3-Chat", "-A", "flash_attention_2"], 22 | ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 23 | ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"], 24 | ["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], 25 | ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 26 | ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"], 27 | ["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], 28 | ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"], 29 | ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"], 30 | ["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--max-tiles", "40", "--load-in-4bit"], 31 | ["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--load-in-4bit"], 32 | ["OpenGVLab/Mini-InternVL-Chat-4B-V1-5"], 33 | ["THUDM/cogagent-chat-hf", "--load-in-4bit"], 34 | ["THUDM/cogagent-chat-hf"], 35 | ["THUDM/cogvlm-chat-hf", "--load-in-4bit"], 36 | ["THUDM/cogvlm-chat-hf"], 37 | ["THUDM/cogvlm2-llama3-chat-19B", "--load-in-4bit"], 38 | ["THUDM/cogvlm2-llama3-chat-19B"], 39 | ["THUDM/cogvlm2-llama3-chinese-chat-19B", "--load-in-4bit"], 40 | ["THUDM/cogvlm2-llama3-chinese-chat-19B"], 41 | ["THUDM/glm-4v-9b", "--device-map", "cuda:0", "--load-in-4bit"], 42 | ["THUDM/glm-4v-9b", "--device-map", "cuda:0"], 43 | ["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0", "--load-in-4bit"], 44 | ["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0"], 45 | ["TIGER-Lab/Mantis-8B-clip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 46 | ["TIGER-Lab/Mantis-8B-clip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0"], 47 | ["TIGER-Lab/Mantis-8B-siglip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 48 | ["TIGER-Lab/Mantis-8B-siglip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0"], 49 | ["allenai/MolmoE-1B-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"], 50 | ["allenai/MolmoE-1B-0924", "-A", "flash_attention_2", "--load-in-4bit"], 51 | ["allenai/MolmoE-1B-0924", "-A", "flash_attention_2"], 52 | ["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"], 53 | ["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2", "--load-in-4bit"], 54 | ["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2"], 55 | ["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"], 56 | 
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit"], 57 | ["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2"], 58 | ["allenai/Molmo-72B-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"], 59 | ["allenai/Molmo-72B-0924", "-A", "flash_attention_2", "--load-in-4bit"], 60 | ["cognitivecomputations/dolphin-vision-72b", "-A", "flash_attention_2", "--load-in-4bit", "--device-map", "cuda:0"], 61 | ["cognitivecomputations/dolphin-vision-7b", "-A", "flash_attention_2", "--load-in-4bit", "--device-map", "cuda:0"], 62 | ["cognitivecomputations/dolphin-vision-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], 63 | ["echo840/Monkey-Chat", "--load-in-4bit"], 64 | ["echo840/Monkey-Chat"], 65 | ["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2", "--load-in-4bit"], 66 | ["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2"], 67 | ["fancyfeast/joy-caption-pre-alpha", "--load-in-4bit", "-A", "flash_attention_2"], 68 | ["google/paligemma2-3b-ft-docci-448", "-A", "flash_attention_2"], 69 | ["google/paligemma2-10b-ft-docci-448", "-A", "flash_attention_2"], 70 | ["llava-hf/llava-v1.6-mistral-7b-hf", "-A", "flash_attention_2", "--load-in-4bit"], 71 | ["llava-hf/llava-v1.6-mistral-7b-hf", "-A", "flash_attention_2"], 72 | ["microsoft/Florence-2-large-ft", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 73 | ["microsoft/Florence-2-large-ft", "-A", "flash_attention_2", "--device-map", "cuda:0"], 74 | ["microsoft/Phi-3-vision-128k-instruct", "-A", "flash_attention_2", "--load-in-4bit"], 75 | ["microsoft/Phi-3-vision-128k-instruct", "-A", "flash_attention_2"], 76 | ["microsoft/Phi-3.5-vision-instruct", "-A", "flash_attention_2", "--load-in-4bit"], 77 | ["microsoft/Phi-3.5-vision-instruct", "-A", "flash_attention_2"], 78 | ["omlab/omchat-v2.0-13B-single-beta_hf", "-A", "flash_attention_2"], 79 | ["openbmb/MiniCPM-V-2_6-int4", "-A", "flash_attention_2", "--device-map", "cuda:0"], 80 | ["openbmb/MiniCPM-V", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 81 | ["openbmb/MiniCPM-V", "-A", "flash_attention_2", "--device-map", "cuda:0"], 82 | ["openbmb/MiniCPM-V-2", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 83 | ["openbmb/MiniCPM-V-2", "-A", "flash_attention_2", "--device-map", "cuda:0"], 84 | ["qnguyen3/nanoLLaVA", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 85 | ["qnguyen3/nanoLLaVA", "-A", "flash_attention_2", "--device-map", "cuda:0"], 86 | ["qnguyen3/nanoLLaVA-1.5", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 87 | ["qnguyen3/nanoLLaVA-1.5", "-A", "flash_attention_2", "--device-map", "cuda:0"], 88 | ["qihoo360/360VL-8B", "-A", "flash_attention_2", "--load-in-4bit"], 89 | ["qihoo360/360VL-8B", "-A", "flash_attention_2"], 90 | ["rhymes-ai/Aria", "-A", "flash_attention_2"], 91 | ["tiiuae/falcon-11B-vlm", "-A", "flash_attention_2", "--load-in-4bit"], 92 | ["tiiuae/falcon-11B-vlm", "-A", "flash_attention_2"] 93 | ] 94 | -------------------------------------------------------------------------------- /model_conf_tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["AIDC-AI/Ovis2-1B", "-A", "flash_attention_2"], 3 | ["AIDC-AI/Ovis2-2B", "-A", "flash_attention_2"], 4 | ["AIDC-AI/Ovis2-4B", "-A", "flash_attention_2"], 5 | ["AIDC-AI/Ovis2-8B", "-A", "flash_attention_2"], 6 | ["AIDC-AI/Ovis2-16B", "-A", "flash_attention_2"], 7 | 
["AIDC-AI/Ovis2-34B", "-A", "flash_attention_2"], 8 | ["AIDC-AI/Ovis1.6-Llama3.2-3B", "-A", "flash_attention_2"], 9 | ["AIDC-AI/Ovis1.5-Llama3-8B", "-A", "flash_attention_2"], 10 | ["BAAI/Aquila-VL-2B-llava-qwen", "-A", "flash_attention_2", "--load-in-4bit"], 11 | ["BAAI/Aquila-VL-2B-llava-qwen", "-A", "flash_attention_2"], 12 | ["BAAI/Emu2-Chat", "--load-in-4bit"], 13 | ["CohereForAI/aya-vision-8b", "-A", "flash_attention_2"], 14 | ["CohereForAI/aya-vision-32b", "-A", "flash_attention_2"], 15 | ["HuggingFaceTB/SmolVLM-Instruct", "-A", "flash_attention_2", "--load-in-4bit"], 16 | ["HuggingFaceTB/SmolVLM-Instruct", "-A", "flash_attention_2"], 17 | ["HuggingFaceM4/Idefics3-8B-Llama3", "-A", "flash_attention_2", "--load-in-4bit"], 18 | ["HuggingFaceM4/Idefics3-8B-Llama3", "-A", "flash_attention_2"], 19 | ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--load-in-4bit"], 20 | ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40", "--load-in-4bit"], 21 | ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], 22 | ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], 23 | ["OpenGVLab/InternVL2_5-1B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 24 | ["OpenGVLab/InternVL2_5-1B", "--device-map", "cuda:0", "--max-tiles", "12"], 25 | ["OpenGVLab/InternVL2_5-2B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 26 | ["OpenGVLab/InternVL2_5-2B", "--device-map", "cuda:0", "--max-tiles", "12"], 27 | ["OpenGVLab/InternVL2_5-4B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 28 | ["OpenGVLab/InternVL2_5-4B", "--device-map", "cuda:0", "--max-tiles", "12"], 29 | ["OpenGVLab/InternVL2_5-8B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 30 | ["OpenGVLab/InternVL2_5-8B", "--device-map", "cuda:0", "--max-tiles", "12"], 31 | ["OpenGVLab/InternVL2_5-26B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 32 | ["OpenGVLab/InternVL2_5-26B", "--device-map", "cuda:0", "--max-tiles", "12"], 33 | ["OpenGVLab/InternVL2_5-38B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 34 | ["OpenGVLab/InternVL2_5-38B", "--device-map", "cuda:0", "--max-tiles", "12"], 35 | ["OpenGVLab/InternVL2_5-78B", "--device-map", "cuda:0", "--max-tiles", "12", "--load-in-4bit"], 36 | ["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0", "--load-in-4bit"], 37 | ["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0"], 38 | ["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0", "--load-in-4bit"], 39 | ["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0"], 40 | ["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0", "--load-in-4bit"], 41 | ["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0"], 42 | ["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0", "--load-in-4bit"], 43 | ["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0"], 44 | ["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0", "--load-in-4bit"], 45 | ["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0"], 46 | ["OpenGVLab/InternVL2-Llama3-76B", "--device-map", "cuda:0", "--load-in-4bit"], 47 | ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"], 48 | ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"], 49 | ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"], 50 | ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5"], 51 | ["Qwen/Qwen-VL-Chat", "--load-in-4bit"], 52 | ["Qwen/Qwen-VL-Chat"], 53 | ["Qwen/Qwen2-VL-2B-Instruct-AWQ", "-A", "flash_attention_2"], 54 | ["Qwen/Qwen2-VL-2B-Instruct", 
"-A", "flash_attention_2"], 55 | ["Qwen/Qwen2-VL-7B-Instruct-AWQ", "-A", "flash_attention_2"], 56 | ["Qwen/Qwen2-VL-7B-Instruct", "-A", "flash_attention_2"], 57 | ["Qwen/Qwen2-VL-72B-Instruct-AWQ", "-A", "flash_attention_2"], 58 | ["Qwen/Qwen2.5-VL-3B-Instruct-AWQ", "-F"], 59 | ["Qwen/Qwen2.5-VL-3B-Instruct", "-F"], 60 | ["Qwen/Qwen2.5-VL-7B-Instruct-AWQ", "-F"], 61 | ["Qwen/Qwen2.5-VL-7B-Instruct", "-F"], 62 | ["Qwen/Qwen2.5-VL-72B-Instruct", "-4F"], 63 | ["Qwen/Qwen2.5-VL-72B-Instruct-AWQ", "-F"], 64 | ["Qwen/QVQ-72B-Preview", "-A", "flash_attention_2", "--load-in-4bit"], 65 | ["kosbu/QVQ-72B-Preview-AWQ", "-A", "flash_attention_2"], 66 | ["Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5"], 67 | ["Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"], 68 | ["Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5"], 69 | ["Salesforce/xgen-mm-phi3-mini-instruct-r-v1"], 70 | ["adept/fuyu-8b", "--device-map", "cuda:0", "--load-in-4bit"], 71 | ["adept/fuyu-8b", "--device-map", "cuda:0"], 72 | ["fancyfeast/joy-caption-alpha-two", "--load-in-4bit", "-A", "flash_attention_2"], 73 | ["fancyfeast/joy-caption-alpha-two", "-A", "flash_attention_2"], 74 | ["fancyfeast/joy-caption-pre-alpha", "-A", "flash_attention_2"], 75 | ["internlm/internlm-xcomposer2d5-7b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 76 | ["internlm/internlm-xcomposer2d5-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], 77 | ["internlm/internlm-xcomposer2-4khd-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], 78 | ["llava-hf/llava-1.5-13b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 79 | ["llava-hf/llava-1.5-13b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0"], 80 | ["llava-hf/llava-1.5-7b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 81 | ["llava-hf/llava-1.5-7b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0"], 82 | ["llava-hf/llava-v1.6-34b-hf", "-A", "flash_attention_2", "--load-in-4bit"], 83 | ["llava-hf/llava-v1.6-34b-hf", "-A", "flash_attention_2"], 84 | ["llava-hf/llava-v1.6-vicuna-13b-hf", "-A", "flash_attention_2", "--load-in-4bit"], 85 | ["llava-hf/llava-v1.6-vicuna-13b-hf", "-A", "flash_attention_2"], 86 | ["llava-hf/llava-v1.6-vicuna-7b-hf", "-A", "flash_attention_2", "--load-in-4bit"], 87 | ["llava-hf/llava-v1.6-vicuna-7b-hf", "-A", "flash_attention_2"], 88 | ["lmms-lab/llava-onevision-qwen2-0.5b-ov", "-A", "flash_attention_2"], 89 | ["lmms-lab/llava-onevision-qwen2-7b-ov", "-A", "flash_attention_2"], 90 | ["meta-llama/Llama-3.2-11B-Vision-Instruct", "-A", "flash_attention_2", "--load-in-4bit"], 91 | ["meta-llama/Llama-3.2-11B-Vision-Instruct", "-A", "flash_attention_2"], 92 | ["meta-llama/Llama-3.2-90B-Vision-Instruct", "-A", "flash_attention_2", "--load-in-4bit"], 93 | ["microsoft/Florence-2-base-ft", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 94 | ["microsoft/Florence-2-base-ft", "-A", "flash_attention_2", "--device-map", "cuda:0"], 95 | ["mistralai/Pixtral-12B-2409"], 96 | ["mx262/MiniMonkey", "-A", "flash_attention_2", "--load-in-4bit"], 97 | ["mx262/MiniMonkey", "-A", "flash_attention_2"], 98 | ["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 99 | ["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0"], 100 | ["openbmb/MiniCPM-Llama3-V-2_5", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], 101 | ["openbmb/MiniCPM-Llama3-V-2_5", "-A", "flash_attention_2", 
"--device-map", "cuda:0"], 102 | ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0", "--load-in-4bit"], 103 | ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0"], 104 | ["vikhyatk/moondream2", "-A", "flash_attention_2", "--load-in-4bit"], 105 | ["vikhyatk/moondream2", "-A", "flash_attention_2"] 106 | ] 107 | -------------------------------------------------------------------------------- /openedai.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import asyncio 3 | from fastapi import FastAPI, Request, HTTPException 4 | from fastapi.middleware.cors import CORSMiddleware 5 | from fastapi.responses import PlainTextResponse, JSONResponse 6 | from loguru import logger 7 | 8 | 9 | def single_request(timeout_seconds=300): 10 | lock = asyncio.Lock() 11 | 12 | def decorator(func): 13 | @functools.wraps(func) 14 | async def wrapper(*args, **kwargs): 15 | try: 16 | async with asyncio.timeout(timeout_seconds): 17 | if not await lock.acquire(): 18 | raise HTTPException( 19 | status_code=429, 20 | detail="Server is currently processing another request" 21 | ) 22 | try: 23 | return await func(*args, **kwargs) 24 | finally: 25 | lock.release() 26 | except asyncio.TimeoutError: 27 | raise HTTPException( 28 | status_code=504, 29 | detail="Request processing timed out" 30 | ) 31 | return wrapper 32 | return decorator 33 | 34 | class OpenAIError(Exception): 35 | pass 36 | 37 | class APIError(OpenAIError): 38 | message: str 39 | code: str = None 40 | param: str = None 41 | type: str = None 42 | 43 | def __init__(self, message: str, code: int = 500, param: str = None, internal_message: str = ''): 44 | super().__init__(message) 45 | self.message = message 46 | self.code = code 47 | self.param = param 48 | self.type = self.__class__.__name__, 49 | self.internal_message = internal_message 50 | 51 | def __repr__(self): 52 | return "%s(message=%r, code=%d, param=%s)" % ( 53 | self.__class__.__name__, 54 | self.message, 55 | self.code, 56 | self.param, 57 | ) 58 | 59 | class InternalServerError(APIError): 60 | pass 61 | 62 | class ServiceUnavailableError(APIError): 63 | def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''): 64 | super().__init__(message, code, internal_message) 65 | 66 | class APIStatusError(APIError): 67 | status_code: int = 400 68 | 69 | def __init__(self, message: str, param: str = None, internal_message: str = ''): 70 | super().__init__(message, self.status_code, param, internal_message) 71 | 72 | class BadRequestError(APIStatusError): 73 | status_code: int = 400 74 | 75 | class AuthenticationError(APIStatusError): 76 | status_code: int = 401 77 | 78 | class PermissionDeniedError(APIStatusError): 79 | status_code: int = 403 80 | 81 | class NotFoundError(APIStatusError): 82 | status_code: int = 404 83 | 84 | class ConflictError(APIStatusError): 85 | status_code: int = 409 86 | 87 | class UnprocessableEntityError(APIStatusError): 88 | status_code: int = 422 89 | 90 | class RateLimitError(APIStatusError): 91 | status_code: int = 429 92 | 93 | class OpenAIStub(FastAPI): 94 | def __init__(self, **kwargs) -> None: 95 | super().__init__(**kwargs) 96 | self.models = {} 97 | 98 | self.add_middleware( 99 | CORSMiddleware, 100 | allow_origins=["*"], 101 | allow_credentials=True, 102 | allow_methods=["*"], 103 | allow_headers=["*"] 104 | ) 105 | 106 | @self.exception_handler(Exception) 107 | def openai_exception_handler(request: Request, exc: Exception) -> JSONResponse: 108 | # 
Generic server errors 109 | #logger.opt(exception=exc).error("Logging exception traceback") 110 | 111 | return JSONResponse(status_code=500, content={ 112 | 'message': 'InternalServerError', 113 | 'code': 500, 114 | }) 115 | 116 | @self.exception_handler(APIError) 117 | def openai_apierror_handler(request: Request, exc: APIError) -> JSONResponse: 118 | # Server error 119 | logger.opt(exception=exc).error("Logging exception traceback") 120 | 121 | if exc.internal_message: 122 | logger.info(exc.internal_message) 123 | 124 | return JSONResponse(status_code = exc.code, content={ 125 | 'message': exc.message, 126 | 'code': exc.code, 127 | 'type': exc.__class__.__name__, 128 | 'param': exc.param, 129 | }) 130 | 131 | @self.exception_handler(APIStatusError) 132 | def openai_statuserror_handler(request: Request, exc: APIStatusError) -> JSONResponse: 133 | # Client side error 134 | logger.info(repr(exc)) 135 | 136 | if exc.internal_message: 137 | logger.info(exc.internal_message) 138 | 139 | return JSONResponse(status_code = exc.code, content={ 140 | 'message': exc.message, 141 | 'code': exc.code, 142 | 'type': exc.__class__.__name__, 143 | 'param': exc.param, 144 | }) 145 | 146 | @self.middleware("http") 147 | async def log_requests(request: Request, call_next): 148 | logger.debug(f"Request path: {request.url.path}") 149 | logger.debug(f"Request method: {request.method}") 150 | logger.debug(f"Request headers: {request.headers}") 151 | logger.debug(f"Request query params: {request.query_params}") 152 | logger.debug(f"Request body: {await request.body()}") # can be huge... 153 | 154 | response = await call_next(request) 155 | 156 | logger.debug(f"Response status code: {response.status_code}") 157 | logger.debug(f"Response headers: {response.headers}") 158 | 159 | return response 160 | 161 | @self.get('/v1/billing/usage') 162 | @self.get('/v1/dashboard/billing/usage') 163 | async def handle_billing_usage(): 164 | return { 'total_usage': 0 } 165 | 166 | @self.get("/", response_class=PlainTextResponse) 167 | @self.head("/", response_class=PlainTextResponse) 168 | @self.options("/", response_class=PlainTextResponse) 169 | async def root(): 170 | return PlainTextResponse(content="", status_code=200 if self.models else 503) 171 | 172 | @self.get("/health") 173 | async def health(): 174 | return {"status": "ok" if self.models else "unk" } 175 | 176 | @self.get("/v1/models") 177 | async def get_model_list(): 178 | return self.model_list() 179 | 180 | @self.get("/v1/models/{model}") 181 | async def get_model_info(model: str): 182 | return self.model_info(model) 183 | 184 | def register_model(self, name: str, model: str = None) -> None: 185 | self.models[name] = model if model else name 186 | 187 | def deregister_model(self, name: str) -> None: 188 | if name in self.models: 189 | del self.models[name] 190 | 191 | def model_info(self, model: str) -> dict: 192 | result = { 193 | "id": model, 194 | "object": "model", 195 | "created": 0, 196 | "owned_by": "user" 197 | } 198 | return result 199 | 200 | def model_list(self) -> dict: 201 | if not self.models: 202 | return {} 203 | 204 | result = { 205 | "object": "list", 206 | "data": [ self.model_info(model) for model in list(set(self.models.keys() | self.models.values())) if model ] 207 | } 208 | 209 | return result 210 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | # AWQ and GPTQ support need to be 
installed separately 3 | bitsandbytes==0.44.1 4 | datasets 5 | fastapi 6 | # See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases 7 | # And: https://github.com/Dao-AILab/flash-attention/releases for linux. 8 | https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 9 | https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 10 | https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.12" 11 | https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" 12 | https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.5.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 13 | https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.5.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 14 | https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.5.0cxx11abiFALSE-cp312-cp312-win_amd64.whl; platform_system == "Windows" and python_version == "3.12" 15 | https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.5.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" 16 | flash_attn; python_version != "3.10" and python_version != "3.11" and python_version != "3.12" and python_version != "3.13" 17 | hf_transfer 18 | loguru 19 | numpy 20 | openai 21 | peft 22 | protobuf 23 | pydantic 24 | python-datauri 25 | quanto 26 | requests 27 | sentencepiece 28 | sse_starlette 29 | torch==2.5.* 30 | uvicorn 31 | wandb 32 | xformers 33 | 34 | # moondream 35 | deepspeed 36 | einops 37 | einops-exts 38 | httpx 39 | markdown2[all] 40 | open_clip_torch 41 | shortuuid 42 | timm 43 | tokenizers 44 | torchvision 45 | 46 | # qwen 47 | matplotlib 48 | optimum 49 | tiktoken 50 | transformers_stream_generator 51 | qwen-vl-utils[decord] 52 | 53 | # 360vl 54 | logger 55 | 56 | # mistral 57 | mistral_inference 58 | mistral_common[opencv] 59 | 60 | # got-ocr2 61 | verovio 62 | 63 | # Aria. 
needs to build a bunch and doesn't work without many extra packages 64 | # BYOB, use it if you need it 65 | #grouped_gemm 66 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DATE=$(date +%F-%H-%M) 3 | FN="sample-${DATE}.env" 4 | CSV="test_output-${DATE}.csv" 5 | CSV_LOG="test_output-${DATE}.log" 6 | touch ${FN} ${CSV} 7 | docker run --runtime nvidia --gpus all \ 8 | -v ./hf_home:/app/hf_home -v ./model_conf_tests.json:/app/model_conf_tests.json -v ./${FN}:/app/vision.sample.env -v ./${CSV}:/app/test_output.csv \ 9 | -e HF_HOME=hf_home -e HF_HUB_ENABLE_HF_TRANSFER=1 -e HF_TOKEN=${HF_TOKEN} \ 10 | -e CUDA_VISIBLE_DEVICES=1,0 -e OPENEDAI_DEVICE_MAP="sequential" \ 11 | -e CLI_COMMAND="/usr/bin/env python test_models.py -v --log-level INFO" \ 12 | -u $(id -u):$(id -g) --expose=5006 --name openedai-vision-test-${DATE} \ 13 | ghcr.io/matatonic/openedai-vision 2> >(tee ${CSV_LOG} >&2) 14 | -------------------------------------------------------------------------------- /vision-alt.sample.env: -------------------------------------------------------------------------------- 1 | # This sample env file can be used to set environment variables for the docker-compose.yml 2 | # Copy this file to vision.env and uncomment the model of your choice. 3 | HF_HOME=hf_home 4 | HF_HUB_ENABLE_HF_TRANSFER=1 5 | #HF_TOKEN=hf-... 6 | #CUDA_VISIBLE_DEVICES=1,0 7 | #CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.6s, mem: 8.8GB, 13/13 tests passed. 8 | #CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 8.3s, mem: 19.7GB, 13/13 tests passed. 9 | #CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 12.0s, mem: 9.4GB, 13/13 tests passed. 10 | #CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.7s, mem: 19.7GB, 13/13 tests passed. 11 | #CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 26.2s, mem: 29.9GB, 13/13 tests passed. 12 | #CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 22.9s, mem: 39.5GB, 13/13 tests passed. 13 | #CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.0s, mem: 29.7GB, 13/13 tests passed. 14 | #CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 31.1s, mem: 29.9GB, 13/13 tests passed. 15 | #CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.8s, mem: 39.4GB, 13/13 tests passed. 16 | #CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.9s, mem: 29.7GB, 13/13 tests passed. 17 | #CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.6s, mem: 5.9GB, 13/13 tests passed. 18 | #CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 10.9GB, 13/13 tests passed. 
19 | #CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 10.8s, mem: 9.9GB, 13/13 tests passed. 20 | #CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.5s, mem: 6.3GB, 13/13 tests passed. 21 | #CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.6s, mem: 11.3GB, 13/13 tests passed. 22 | #CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 32.0s, mem: 13.4GB, 13/13 tests passed. 23 | #CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 22.8s, mem: 37.6GB, 13/13 tests passed. 24 | #CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 27.8s, mem: 12.9GB, 13/13 tests passed. 25 | #CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 21.3s, mem: 36.5GB, 13/13 tests passed. 26 | #CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 35.4s, mem: 22.4GB, 13/13 tests passed. 27 | #CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.3s, mem: 40.7GB, 13/13 tests passed. 28 | #CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 123.3s, mem: 22.4GB, 13/13 tests passed. 29 | #CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 125.6s, mem: 40.7GB, 13/13 tests passed. 30 | #CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test fail❌, time: 4.3s, mem: 17.8GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} 31 | #CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test fail❌, time: 4.1s, mem: 29.1GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} 32 | #CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.1s, mem: 7.6GB, 13/13 tests passed. 33 | #CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.8s, mem: 17.5GB, 13/13 tests passed. 34 | #CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.2s, mem: 8.4GB, 13/13 tests passed. 35 | #CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.6s, mem: 18.1GB, 13/13 tests passed. 36 | #CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (timeout). 37 | #CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test pass✅, time: 45.3s, mem: 8.9GB, 13/13 tests passed. 38 | #CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --device-map cuda:0" # test fail❌, time: 7.8s, mem: 16.6GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} 39 | #CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 21.9s, mem: 8.1GB, 13/13 tests passed. 
40 | #CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2" # test pass✅, time: 17.9s, mem: 17.4GB, 13/13 tests passed. 41 | #CLI_COMMAND="python vision.py -m omlab/omchat-v2.0-13B-single-beta_hf -A flash_attention_2" # test pass✅, time: 28.5s, mem: 42.9GB, 13/13 tests passed. 42 | #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.0s, mem: 5.5GB, 13/13 tests passed. 43 | #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.1s, mem: 8.7GB, 13/13 tests passed. 44 | #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.8s, mem: 3.6GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} 45 | #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.5s, mem: 9.1GB, 13/13 tests passed. 46 | #CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 15.0s, mem: 8.4GB, 13/13 tests passed. 47 | #CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2" # test pass✅, time: 8.7s, mem: 17.3GB, 13/13 tests passed. 48 | #CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2 --load-in-4bit" # test pass✅, time: 17.5s, mem: 17.1GB, 13/13 tests passed. 49 | #CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2" # test pass✅, time: 15.9s, mem: 32.4GB, 13/13 tests passed. 50 | --------------------------------------------------------------------------------
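For quick reference, the sketch below shows how the pieces above fit together from a client's point of view: it talks to the server started by vision.py on port 5006 (the port exposed in docker-compose.yml and run_tests.sh) using the same OpenAI-compatible message layout that chat_with_image.py builds and the /v1/models route defined in openedai.py. This is a minimal, hedged example, not part of the repository: the image URL is a placeholder, and it assumes the server is already running with a model loaded (whatever CLI_COMMAND you enabled in your vision.env).

#!/usr/bin/env python
# Minimal usage sketch (assumption: the vision.py server is running on localhost:5006
# with one model loaded; the image URL below is a placeholder).
import os
from openai import OpenAI

client = OpenAI(base_url=os.environ.get('OPENAI_BASE_URL', 'http://localhost:5006/v1'),
                api_key=os.environ.get('OPENAI_API_KEY', 'sk-ip'))

# The /v1/models route served by openedai.py returns whichever model was registered at startup.
model_id = client.models.list().data[0].id

# Same content layout as chat_with_image.py: one image_url part followed by a text part.
response = client.chat.completions.create(
    model=model_id,
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/example.jpg'}},  # placeholder URL
            {'type': 'text', 'text': 'Describe this image.'},
        ],
    }],
    max_tokens=256,
)
print(response.choices[0].message.content)

chat_with_image.py wraps this same call with argparse options for streaming, temperature, top_p, remote-vs-data URLs, and multi-turn follow-up questions.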