├── .gitignore ├── README.md ├── assets ├── arch.png ├── assistant.png ├── camparison.bmp ├── human.png ├── moti.png └── resources │ ├── image_1.png │ ├── image_2.png │ ├── image_3.png │ ├── image_4.png │ ├── image_5.png │ ├── image_6.png │ ├── image_7.png │ ├── image_8.png │ ├── image_9.png │ └── video_1.mp4 ├── builtin_plan.json ├── cllm ├── __init__.py ├── agents │ ├── __init__.py │ ├── base.py │ ├── builtin │ │ ├── __init__.py │ │ ├── plans.py │ │ ├── prompts.py │ │ └── tools.py │ ├── container.py │ └── tog │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── controller.py │ │ ├── interpretor.py │ │ ├── planner.py │ │ └── responser.py ├── app │ ├── __init__.py │ └── gradio.py ├── services │ ├── __init__.py │ ├── anything2image │ │ ├── __init__.py │ │ ├── api.py │ │ ├── imagebind │ │ │ ├── __init__.py │ │ │ ├── bpe │ │ │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── data.py │ │ │ └── models │ │ │ │ ├── __init__.py │ │ │ │ ├── helpers.py │ │ │ │ ├── imagebind_model.py │ │ │ │ ├── multimodal_preprocessors.py │ │ │ │ └── transformer.py │ │ ├── launch.py │ │ └── tools.py │ ├── audio │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── general │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── hf_pipeline.py │ ├── image_editing │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ ├── ldm_inpainting │ │ │ ├── __init__.py │ │ │ ├── config.yaml │ │ │ ├── ldm │ │ │ │ ├── __init__.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── autoencoder.py │ │ │ │ │ ├── diffusion │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── classifier.py │ │ │ │ │ │ ├── ddim.py │ │ │ │ │ │ ├── ddpm.py │ │ │ │ │ │ └── plms.py │ │ │ │ │ └── quantize.py │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── diffusionmodules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── model.py │ │ │ │ │ │ ├── openaimodel.py │ │ │ │ │ │ └── util.py │ │ │ │ │ ├── distributions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── distributions.py │ │ │ │ │ ├── ema.py │ │ │ │ │ ├── encoders │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── modules.py │ │ │ │ │ ├── image_degradation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bsrgan.py │ │ │ │ │ │ ├── bsrgan_light.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ └── test.png │ │ │ │ │ │ └── utils_image.py │ │ │ │ │ ├── losses │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── x_transformer.py │ │ │ │ └── util.py │ │ │ └── wrapper.py │ │ └── tools.py │ ├── image_generation │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── image_inpainting │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ ├── ldm_inpainting │ │ │ ├── __init__.py │ │ │ ├── config.yaml │ │ │ ├── ldm │ │ │ │ ├── __init__.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── autoencoder.py │ │ │ │ │ ├── diffusion │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── classifier.py │ │ │ │ │ │ ├── ddim.py │ │ │ │ │ │ ├── ddpm.py │ │ │ │ │ │ └── plms.py │ │ │ │ │ └── quantize.py │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── diffusionmodules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── model.py │ │ │ │ │ │ ├── openaimodel.py │ │ │ │ │ │ └── util.py │ │ │ │ │ ├── distributions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── distributions.py │ │ │ │ │ ├── ema.py │ │ │ │ │ ├── encoders │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── modules.py │ │ │ │ │ ├── image_degradation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bsrgan.py │ │ │ │ │ │ ├── bsrgan_light.py │ │ │ │ │ │ ├── 
utils │ │ │ │ │ │ │ └── test.png │ │ │ │ │ │ └── utils_image.py │ │ │ │ │ ├── losses │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── x_transformer.py │ │ │ │ └── util.py │ │ │ └── wrapper.py │ │ └── tools.py │ ├── image_perception │ │ ├── __init__.py │ │ ├── api.py │ │ ├── configs │ │ │ └── GroundingDINO_SwinT_OGC.py │ │ ├── launch.py │ │ ├── sam_preditor.py │ │ └── tools.py │ ├── image_processing │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── launch.py │ ├── llama2 │ │ ├── api.py │ │ ├── launch.py │ │ └── llama2.py │ ├── nlp │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ ├── llms │ │ │ ├── __init__.py │ │ │ ├── chat_models.py │ │ │ └── memory │ │ │ │ ├── __init__.py │ │ │ │ ├── message_memory.py │ │ │ │ └── utils.py │ │ └── tools.py │ ├── pool.py │ ├── tog │ │ ├── __init__.py │ │ ├── api.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── resource_expert_prompts.py │ │ │ ├── solution_expert_prompts.py │ │ │ ├── task_decomposition_prompts.py │ │ │ ├── task_solver_prompts.py │ │ │ ├── tog_config.py │ │ │ └── tools.json │ │ ├── launch.py │ │ ├── tool.py │ │ └── utils.py │ ├── utils.py │ ├── video │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ └── vqa │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py └── utils.py ├── docker ├── Dockerfile ├── docker-compose-gradio.yml ├── docker-compose-tog.yml └── docker-compose-tool.yml ├── eval_data ├── tool100.json └── tool2k.json ├── requirements.txt ├── run.sh ├── setup.py └── tests ├── test_controller.py └── test_tool.py /.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | !tests/services/test.png 3 | model_zoo 4 | .vscode 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | .DS_Store 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | #.idea/ 166 | client_resources/ 167 | server_resources/ 168 | certificate/ 169 | logs/ 170 | bash.sh 171 | -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/arch.png -------------------------------------------------------------------------------- /assets/assistant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/assistant.png -------------------------------------------------------------------------------- /assets/camparison.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/camparison.bmp -------------------------------------------------------------------------------- /assets/human.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/human.png -------------------------------------------------------------------------------- /assets/moti.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/moti.png -------------------------------------------------------------------------------- /assets/resources/image_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_1.png -------------------------------------------------------------------------------- /assets/resources/image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_2.png -------------------------------------------------------------------------------- /assets/resources/image_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_3.png -------------------------------------------------------------------------------- /assets/resources/image_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_4.png -------------------------------------------------------------------------------- /assets/resources/image_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_5.png -------------------------------------------------------------------------------- /assets/resources/image_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_6.png 
-------------------------------------------------------------------------------- /assets/resources/image_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_7.png -------------------------------------------------------------------------------- /assets/resources/image_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_8.png -------------------------------------------------------------------------------- /assets/resources/image_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_9.png -------------------------------------------------------------------------------- /assets/resources/video_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/video_1.mp4 -------------------------------------------------------------------------------- /builtin_plan.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /cllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/__init__.py -------------------------------------------------------------------------------- /cllm/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Tool, Action 2 | from .container import * 3 | -------------------------------------------------------------------------------- /cllm/agents/base.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from enum import Enum 3 | from typing import Callable, List 4 | import json 5 | from pathlib import Path 6 | from collections import OrderedDict 7 | 8 | 9 | @dataclass 10 | class Action: 11 | """The action represent an assignment. 
12 | `output = tool_name(**inputs)` 13 | 14 | Examples: 15 | >>> mask = segmentation_by_mask(image=image, prompt_mask=prompt_mask) 16 | >>> image = image_inpainting(image=image, mask=mask) 17 | """ 18 | 19 | tool_name: str = (None,) 20 | inputs: dict = (None,) 21 | outputs: List[str] = (None,) 22 | 23 | def __str__(self) -> str: 24 | args = ", ".join([f"{k}={v}" for k, v in self.inputs.items()]) 25 | return "{} = {}(".format(", ".join(self.outputs), self.tool_name) + args + ")" 26 | 27 | def dict(self): 28 | args = {str(k): str(v) for k, v in self.inputs.items()} 29 | # args = {str(item["name"]): str(item["value"]) for item in self.inputs} 30 | rets = [o if isinstance(o, str) else str(o) for o in self.outputs] 31 | return { 32 | "tool": self.tool_name, 33 | "inputs": args, 34 | "outputs": rets, 35 | } 36 | 37 | 38 | class DataType(Enum): 39 | TEXT = "text" 40 | TAGS = "tags" 41 | TITLE = "title" 42 | # HTML = "text.html" 43 | HTML = "html" 44 | LOCATION = "location" 45 | WEATHER = "weather" 46 | TIME = "time" 47 | 48 | IMAGE = "image" 49 | VIDEO = "video" 50 | AUDIO = "audio" 51 | ANY = "any" 52 | NONE = "none" 53 | 54 | SEGMENTATION = "image.segmentation" 55 | EDGE = "image.edge" 56 | LINE = "image.line" 57 | HED = "image.hed" 58 | CANNY = "image.canny" 59 | SCRIBBLE = "image.scribble" 60 | POSE = "image.pose" 61 | DEPTH = "image.depth" 62 | NORMAL = "image.normal" 63 | 64 | MASK = "image.mask" # SAM mask 65 | POINT = "point" 66 | BBOX = "bbox" # {'label': 'dog', 'box': [1,2,3,4], 'score': 0.9} 67 | CATEGORY = "category" 68 | 69 | LIST = "list" 70 | 71 | def __str__(self): 72 | return self.value 73 | 74 | def __eq__(self, other): 75 | if isinstance(other, str): 76 | return self.value == other 77 | elif isinstance(other, self.__class__): 78 | return self.value == other.value 79 | else: 80 | return False 81 | 82 | 83 | @dataclass 84 | class Resource: 85 | name: str 86 | type: DataType 87 | value: None 88 | # description: str = None 89 | 90 | def dict(self): 91 | return { 92 | "name": self.name, 93 | "type": str(self.type), 94 | "value": str(self.value), 95 | # "description": self.description, 96 | } 97 | 98 | 99 | @dataclass 100 | class Tool: 101 | class Domain(Enum): 102 | IMAGE_PERCEPTION = "image-perception" 103 | IMAGE_GENERATION = "image-generation" 104 | IMAGE_EDITING = "image-editing" 105 | IMAGE_PROCESSING = "image-processing" 106 | AUDIO_PERCEPTION = "audio-perception" 107 | AUDIO_GENERATION = "audio-generation" 108 | VIDEO_PERCEPTION = "video-perception" 109 | VIDEO_GENERATION = "video-generation" 110 | VIDEO_PROCESSING = "video-processing" 111 | VIDEO_EDITING = "video-editing" 112 | VIDEO_CUTTING = "video-cutting" 113 | NATURAL_LANGUAGE_PROCESSING = "natural-language-processing" 114 | CODE_GENERATION = "code-generation" 115 | VISUAL_QUESTION_ANSWERING = "visual-question-answering" 116 | QUESTION_ANSWERING = "question-answering" 117 | GENERAL = "general" 118 | 119 | def __str__(self): 120 | return self.value 121 | 122 | @dataclass 123 | class Argument: 124 | name: str 125 | type: DataType 126 | description: str 127 | 128 | def dict(self): 129 | return { 130 | "name": self.name, 131 | "type": str(self.type), 132 | "description": self.description, 133 | } 134 | 135 | name: str 136 | description: str 137 | domain: Domain 138 | model: Callable 139 | 140 | usages: List[str] = field(default_factory=lambda: []) 141 | args: List[Argument] = field(default_factory=lambda: []) 142 | returns: List[Argument] = field(default_factory=lambda: []) 143 | 144 | def dict(self): 145 | return { 146 | 
"name": self.name, 147 | "description": self.description, 148 | "domain": str(self.domain), 149 | "args": [a.dict() for a in self.args], 150 | "returns": [r.dict() for r in self.returns], 151 | } 152 | 153 | 154 | NON_FILE_TYPES = [ 155 | DataType.TAGS, 156 | DataType.TEXT, 157 | DataType.TITLE, 158 | DataType.BBOX, 159 | DataType.CATEGORY, 160 | DataType.LIST, 161 | DataType.LOCATION, 162 | DataType.POINT, 163 | DataType.WEATHER, 164 | DataType.TIME, 165 | ] 166 | 167 | 168 | if __name__ == "__main__": 169 | s = [ 170 | [Action("a", {"aa": [Path("/a/d/e/t.txt")]}, [Path("/a/aa.txt")])], 171 | Action("b", {"bb": "bbb"}, ["bbb"]), 172 | ] 173 | print(json.dumps(s, indent=4, default=lambda o: o.dict())) 174 | -------------------------------------------------------------------------------- /cllm/agents/builtin/__init__.py: -------------------------------------------------------------------------------- 1 | from . import plans 2 | from . import prompts 3 | from . import tools 4 | from .plans import BUILTIN_PLANS, load_builtin_plans 5 | from .prompts import RUN_PROMPT_TEMPLATE, CODE_PROMPT_TEMPLATE 6 | from .tools import TOOLS 7 | -------------------------------------------------------------------------------- /cllm/agents/builtin/prompts.py: -------------------------------------------------------------------------------- 1 | RUN_PROMPT_TEMPLATE = """ 2 | I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. 3 | To help you, I will give you access to a set of tools that you can use. Each tool is a Python function and has a docstring explaining the task it performs, the inputs it expects and the outputs it returns. 4 | You should first explain which tool you will use to perform the task and for what reason, then write the code in Python. 5 | Each instruction in Python should be a simple assignment. 6 | 7 | Tools: 8 | <> 9 | 10 | Task: "How are you." 11 | 12 | I will use `question_answering` to answer the given question. 13 | 14 | Answer: 15 | ``` 16 | output = question_answering(text=_task_) 17 | ``` 18 | 19 | Task: "describe `image_1`." 20 | 21 | I will use `image_captioning` to answer the question on `image_1`. 22 | 23 | Answer: 24 | ``` 25 | output = image_captioning(image=image_1) 26 | ``` 27 | 28 | Task: "generate an image with a dog." 29 | 30 | I will use `text_to_image` to generate an image. 31 | 32 | Answer: 33 | ``` 34 | output = text_to_image(text="an image with a dog") 35 | ``` 36 | 37 | Task: "<>" 38 | 39 | """ 40 | 41 | CODE_PROMPT_TEMPLATE = ''' 42 | I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. 43 | - To help you, I will give you access to a set of tools that you can use. 44 | - Each tool is a Python function and has a docstring explaining the task it performs, the inputs it expects and the outputs it returns. 45 | - You should first explain which tool you will use to perform the task and for what reason, then write the code in Python. 46 | - Each instruction in Python should be a simple assignment. 47 | 48 | Tools: 49 | <> 50 | 51 | ======= 52 | 53 | History: 54 | ``` 55 | # Task: "How are you." 56 | ``` 57 | 58 | I will use `question_answering` to answer the given question. 59 | 60 | Answer: 61 | ``` 62 | # Task: "How are you." 63 | output = question_answering(text=_task_) 64 | ``` 65 | 66 | ======= 67 | 68 | History: 69 | ``` 70 | # Task: "describe the given image." 
71 | ``` 72 | 73 | I will use `image_question_answering` to answer the question on the input image. 74 | 75 | Answer: 76 | ``` 77 | # Task: "describe the given image." 78 | output = image_question_answering(text=_task_, image=image) 79 | ``` 80 | 81 | ======= 82 | 83 | History: 84 | ``` 85 | output = text_to_image(text="an image with a dog") 86 | # Task: "describe the given image." 87 | ``` 88 | 89 | I will use `image_question_answering` to answer the question on the input image. 90 | 91 | Answer: 92 | ``` 93 | # Task: "describe the given image." 94 | output = image_question_answering(text=_task_, image=output) 95 | ``` 96 | 97 | ======= 98 | 99 | History: 100 | ``` 101 | # Task: "generate an image with a dog." 102 | ``` 103 | 104 | I will use `text_to_image` to generate an image. 105 | 106 | Answer: 107 | ``` 108 | # Task: "generate an image with a dog." 109 | output = text_to_image(text="an image with a dog") 110 | ``` 111 | 112 | ======= 113 | 114 | History: 115 | ``` 116 | <> 117 | # Task: "<>" 118 | ``` 119 | 120 | 121 | ''' 122 | -------------------------------------------------------------------------------- /cllm/agents/container.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | from pathlib import Path 4 | import json 5 | from .base import DataType 6 | from cllm.utils import get_real_path 7 | 8 | FILE_EXT = { 9 | "image": ["png", "jpeg", "jpg", "gif", "bmp", "tiff", "webp"], 10 | "video": ["mp4", "mov", "avi", "mkv"], 11 | "audio": ["wav", "mp3"], 12 | } 13 | 14 | 15 | class Container: 16 | def __init__(self, name, rtype, value) -> None: 17 | self.name = name 18 | self.rtype = rtype 19 | self.value = value 20 | 21 | def to_chatbot(self): 22 | pass 23 | 24 | def __str__(self): 25 | pass 26 | 27 | def __repr__(self) -> str: 28 | return str(self) 29 | 30 | 31 | class File(Container): 32 | def to_chatbot(self): 33 | return str(self.value) 34 | 35 | @property 36 | def filename(self): 37 | return os.path.basename(self.value) 38 | 39 | def __str__(self): 40 | return f"`{self.filename}`" 41 | 42 | 43 | class HTML(File): 44 | def to_chatbot(self): 45 | return str(self.value) 46 | 47 | def __str__(self): 48 | return f"`{self.filename}`" 49 | 50 | 51 | class Image(File): 52 | def __str__(self): 53 | return f"`{self.filename}`" 54 | 55 | 56 | class Video(File): 57 | def __str__(self): 58 | return f"`{self.filename}`" 59 | 60 | 61 | class Audio(File): 62 | def __str__(self): 63 | return f"`{self.filename}`" 64 | 65 | 66 | class Text(Container): 67 | def to_chatbot(self): 68 | if isinstance(self.value, str): 69 | return self.value 70 | elif isinstance(self.value, (list, tuple, dict)): 71 | return json.dumps(self.value, indent=2) 72 | return self.value 73 | 74 | def __str__(self): 75 | if isinstance(self.value, (list, dict)): 76 | return json.dumps(self.value) 77 | elif isinstance(self.value, str): 78 | return self.value 79 | return str(self.value) 80 | 81 | 82 | def auto_type(name, rtype, value): 83 | if value is None: 84 | return None 85 | if "image" in str(rtype): 86 | return Image(name, rtype, get_real_path(value)) 87 | if DataType.VIDEO == rtype: 88 | return Video(name, rtype, get_real_path(value)) 89 | if DataType.AUDIO == rtype: 90 | return Audio(name, rtype, get_real_path(value)) 91 | if DataType.HTML == rtype: 92 | return HTML(name, rtype, get_real_path(value)) 93 | return Text(name, rtype, value) 94 | -------------------------------------------------------------------------------- 
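A minimal usage sketch for the `auto_type` helper defined in cllm/agents/container.py above. This sketch is illustrative only and is not a file in the repository; the resource names and printed values are assumptions, and it presumes the `cllm` package is importable:
# Hypothetical sketch (not part of the repo): wrap raw tool outputs into chat-friendly containers.
from cllm.agents.base import DataType
from cllm.agents.container import auto_type

# File-typed values become Image/Video/Audio/HTML containers; everything else becomes Text.
image_out = auto_type("output_0", DataType.IMAGE, "image_1.png")
bbox_out = auto_type("output_1", DataType.BBOX, [{"label": "dog", "box": [1, 2, 3, 4], "score": 0.9}])

print(str(image_out))         # file containers render as their basename, e.g. `image_1.png`
print(bbox_out.to_chatbot())  # non-file values are wrapped in Text and serialized as JSON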
/cllm/agents/tog/__init__.py: -------------------------------------------------------------------------------- 1 | from .planner import Planner 2 | from .controller import Controller -------------------------------------------------------------------------------- /cllm/agents/tog/compiler.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | import ast 3 | 4 | from cllm.agents.base import Action 5 | 6 | 7 | class Parser: 8 | def parse(self, plan) -> List[Action]: 9 | # ignore indent 10 | input = '\n'.join([line.strip() for line in plan.split('\n')]) 11 | actions = [] 12 | for stmt in ast.parse(input).body: 13 | if isinstance(stmt, ast.Assign): 14 | assign: ast.Assign = stmt 15 | output: ast.Name = assign.targets[0] 16 | func_call: ast.Call = assign.value 17 | func_name: ast.Name = func_call.func 18 | kwargs: List[ast.keyword] = func_call.keywords 19 | args = {} 20 | for kwarg in kwargs: 21 | k = kwarg.arg 22 | if isinstance(kwarg.value, ast.Name): 23 | v = kwarg.value.id 24 | else: 25 | v = ast.literal_eval(kwarg.value) 26 | args[k] = v 27 | action = Action(tool_name=func_name.id, outputs=[output.id], inputs=args) 28 | actions.append(action) 29 | return actions 30 | 31 | 32 | class Compiler: 33 | def __init__(self): 34 | self.parser = Parser() 35 | 36 | def compile(self, plan: Union[str, List[Union[Action, str]]]) -> List[Action]: 37 | """ The input can be a plain string, a list of structured `Action` objects, 38 | or a mix of structured `Action` objects and unstructured action strings. 39 | """ 40 | actions = self.parse(plan) 41 | actions = self.correct(actions) 42 | return actions 43 | 44 | def parse(self, plan) -> List[Action]: 45 | if isinstance(plan, str): 46 | return self.parser.parse(plan) 47 | 48 | actions = [] 49 | for action in plan: 50 | if isinstance(action, str): 51 | action = self.parser.parse(action)[0] 52 | actions.append(action) 53 | 54 | return actions 55 | 56 | def correct(self, actions): 57 | return actions 58 | -------------------------------------------------------------------------------- /cllm/agents/tog/controller.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import logging 3 | from typing import Tuple, List 4 | import copy 5 | from pathlib import Path 6 | from cllm.agents import container 7 | import json 8 | from collections import OrderedDict 9 | 10 | from cllm.agents.builtin import BUILTIN_PLANS, load_builtin_plans 11 | from cllm.agents.container import auto_type 12 | from cllm.agents.base import DataType, NON_FILE_TYPES 13 | 14 | from cllm.agents.tog.interpretor import Interpretor 15 | from cllm.agents.tog.planner import Planner 16 | from cllm.agents.tog.responser import generate_response 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class Controller: 22 | def __init__(self, stream=True, interpretor_kwargs={}): 23 | self.stream = stream 24 | self.planner = Planner(self.stream) 25 | self.interpretor = Interpretor(**interpretor_kwargs) 26 | self.SHORTCUT = "**Using builtin shortcut solution.**" 27 | BUILTIN_PLANS.update(load_builtin_plans("builtin_plan.json")) 28 | logger.info(BUILTIN_PLANS) 29 | 30 | def plan(self, request: str, state: dict): 31 | logger.info(request) 32 | 33 | resource_memory = state.get("resources", {}) 34 | raw_solution = None 35 | # shortcut for builtin plan 36 | for trigger_prompt, _ in BUILTIN_PLANS.items(): 37 | if request == trigger_prompt: 38 | return self.SHORTCUT 39 | 40 | # dynamic execution 41 |
if raw_solution is None: 42 | raw_solution = self.planner.plan(request, resource_memory) 43 | return raw_solution 44 | 45 | def parse_solution_from_stream(self, raw_solution): 46 | return self.planner.parse(raw_solution) 47 | 48 | def execute(self, raw_solution: str, state: dict): 49 | resource_memory = state.get("resources") 50 | request = state["request"] 51 | solution = None 52 | if raw_solution == self.SHORTCUT: 53 | for trigger_prompt, builtin_plan in BUILTIN_PLANS.items(): 54 | if request == trigger_prompt: 55 | solution = builtin_plan 56 | solution = self._fill_args(solution, resource_memory) 57 | else: 58 | solution = self.planner.parse(raw_solution) 59 | 60 | if not solution: 61 | return None 62 | try: 63 | history_msgs = state.get("history_msgs") 64 | return self.interpretor.interpret(solution, history_msgs) 65 | except Exception as e: 66 | traceback.print_exc() 67 | return None 68 | 69 | def reply(self, executed_plan: dict, outputs: list, state: dict): 70 | error_response = [ 71 | auto_type( 72 | "response", 73 | DataType.TEXT, 74 | "Sorry, I cannot understand your request due to an internal error.", 75 | ) 76 | ] 77 | state = copy.deepcopy(state) 78 | if ( 79 | executed_plan is None 80 | or len(executed_plan) == 0 81 | or outputs is None 82 | or len(outputs) == 0 83 | ): 84 | return error_response, state 85 | resources = state.get("resources", OrderedDict()) 86 | for o in outputs: 87 | if isinstance(o, container.File): 88 | resources[str(o.filename)] = str(o.rtype) 89 | state["resources"] = resources 90 | response = generate_response(state["request"], executed_plan, outputs) 91 | if len(response) == 0: 92 | return error_response, state 93 | logger.info(response) 94 | return response, state 95 | 96 | def run(self, task: str, state: dict) -> Tuple[List, str]: 97 | try: 98 | return self._run(task, state) 99 | except: 100 | traceback.print_exc() 101 | logger.info(traceback.format_exc()) 102 | return [ 103 | auto_type( 104 | "response", 105 | DataType.TEXT, 106 | "Sorry, I cannot understand your request due to an internal error.", 107 | ) 108 | ], "Internal Error" 109 | 110 | def _run(self, task: str, state: dict) -> Tuple[List, str]: 111 | state["request"] = task 112 | _, plan = self.plan(task, state) 113 | logger.info(plan) 114 | executed_plan = self.execute(plan, state) 115 | 116 | state["outputs"] = [] 117 | executed_plan = list(executed_plan) 118 | for result_per_step, executed_solutions, wrapped_outputs in executed_plan: 119 | state["executed_solutions"] = executed_solutions 120 | for _, output in enumerate(wrapped_outputs): 121 | if output is None or output.value is None: 122 | continue 123 | state["outputs"].extend(wrapped_outputs) 124 | 125 | outputs = self.reply(state["executed_solutions"], state["outputs"], state) 126 | 127 | logger.info(outputs) 128 | return outputs, executed_plan 129 | 130 | def _fill_args(self, plan, memory): 131 | plan = copy.deepcopy(plan) 132 | latest_resource = OrderedDict() 133 | for key, val in memory.items(): 134 | latest_resource[val] = key 135 | 136 | for actions in plan: 137 | for action in actions: 138 | for key, val in action.inputs.items(): 139 | if "" not in val: 140 | action.inputs[key] = latest_resource.get(val, val) 141 | return plan 142 | 143 | 144 | if __name__ == "__main__": 145 | controller = Controller(False) 146 | task = "describe the image in details." 
147 | state = { 148 | "resources": { 149 | "image_3.png": "image", 150 | }, 151 | "history_msgs": [], 152 | } 153 | outputs, executed_plan = controller.run(task, state) 154 | print(outputs[0]) 155 | print("*" * 40) 156 | print(executed_plan) 157 | print("Done!") 158 | -------------------------------------------------------------------------------- /cllm/agents/tog/planner.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | import logging 4 | 5 | from ..base import Action, NON_FILE_TYPES 6 | from cllm.services.tog import TaskSolver, TaskDecomposer, config 7 | from cllm.services.nlp.llms import ChatOpenAI, MessageMemory 8 | from cllm.services.tog.api import tog, task_decomposer 9 | from collections import OrderedDict 10 | import copy 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class Planner: 17 | def __init__( 18 | self, streaming=False, backend="remote", device="cuda:0", **llm_kwargs 19 | ): 20 | self.streaming = streaming 21 | if backend == "local": 22 | self.cfg = config 23 | self.device = device 24 | self.mem = MessageMemory(**self.cfg.memory) 25 | self.llm = ChatOpenAI(temperature=0.2, **llm_kwargs) 26 | self.tog = TaskSolver(self.llm, self.cfg.task_solver_config, device).solve 27 | self.decomposer = TaskDecomposer(device, self.cfg.task_decomposer_cfg).solve 28 | elif backend == "remote": 29 | self.decomposer = task_decomposer 30 | self.tog = tog 31 | else: 32 | raise ValueError("Backend should be chosen from [remote, local]") 33 | 34 | def _find_latest_resource(self, resources, type): 35 | for key, val in list(resources.items())[::-1]: 36 | if val == type: 37 | return key 38 | return None 39 | 40 | def _check_task_decomposition( 41 | self, task_decomposition: str | list, available_resources: dict 42 | ): 43 | copy_task_decomposition = copy.deepcopy(task_decomposition) 44 | available_resources = copy.deepcopy(available_resources) 45 | if isinstance(copy_task_decomposition, str): 46 | copy_task_decomposition = json.loads(copy_task_decomposition) 47 | 48 | for subtask in copy_task_decomposition: 49 | for arg in subtask["args"]: 50 | if arg["type"] in NON_FILE_TYPES: 51 | continue 52 | 53 | r_type = available_resources.get(arg["value"], "None").split(".")[-1] 54 | if arg["value"] not in available_resources or arg["type"] != r_type: 55 | new_value = self._find_latest_resource( 56 | available_resources, arg["type"] 57 | ) 58 | if new_value is None: 59 | logger.error( 60 | f"No available resource for {arg['value']} with type {arg['type']}" 61 | ) 62 | return None 63 | 64 | arg["value"] = new_value 65 | 66 | available_resources[subtask["returns"][0]["value"]] = subtask["returns"][0][ 67 | "type" 68 | ] 69 | return json.dumps(copy_task_decomposition, indent=2, ensure_ascii=False) 70 | 71 | def wrap_request(self, request, memory): 72 | logger.info(memory) 73 | resource_list = {k: v.split(".")[-1] for k, v in memory.items()} 74 | request = f"Resource list: {resource_list}\n{request}" 75 | logger.info(f"Input: {request}") 76 | # print(f"Input: {request}") 77 | return request 78 | 79 | def solve_streaming(self, request: str, memory: dict = OrderedDict()): 80 | request = self.wrap_request(request, memory) 81 | sub_tasks = self.decomposer(request, streaming=self.streaming) 82 | logger.info(f"Task decomposition: \n{sub_tasks}") 83 | sub_tasks = self._check_task_decomposition(sub_tasks, memory) 84 | yield sub_tasks 85 | if sub_tasks in [None, "", []]: 86 | yield None 87 | else: 88 | solutions = 
self.tog(request, sub_tasks, streaming=self.streaming) 89 | yield solutions 90 | 91 | def solve(self, request: str, memory: dict = OrderedDict()) -> List: 92 | request = self.wrap_request(request, memory) 93 | sub_tasks = self.decomposer(request) 94 | solutions = self.tog(request, sub_tasks) 95 | return sub_tasks, solutions 96 | 97 | def plan(self, task, memory: dict = OrderedDict()) -> List: 98 | if self.streaming: 99 | return self.solve_streaming(task, memory) 100 | else: 101 | return self.solve(task, memory) 102 | 103 | def _check_solutions(self, solution: List | str) -> bool: 104 | if isinstance(solution, str): 105 | solution = json.loads(solution) 106 | if len(solution) == 0: 107 | return False 108 | 109 | valid = True 110 | for i, stage_candiate in enumerate(solution): 111 | if len(stage_candiate) == 0: 112 | logger.error(f"No solution is found in {i}-th subtask.") 113 | valid = False 114 | elif ( 115 | "solution" in stage_candiate[0] 116 | and len(stage_candiate[0]["solution"]) == 0 117 | ): 118 | logger.error(f"No solution is found in {i+1}-th subtask.") 119 | valid = False 120 | else: 121 | logger.info(f"Solutions for {i+1}-th subtask:\n{stage_candiate}") 122 | return valid 123 | 124 | def parse(self, solution: List | str) -> List[List[Action]]: 125 | if isinstance(solution, str): 126 | solution = json.loads(solution) 127 | 128 | if not self._check_solutions(solution): 129 | return None 130 | 131 | if isinstance(solution[0], Action): 132 | return solution 133 | 134 | stages = [] 135 | for i, stage_candiate in enumerate(solution): 136 | stage = stage_candiate[0]["solution"] 137 | actions = [] 138 | for action in stage: 139 | inputs = {arg["name"]: arg["value"] for arg in action["args"]} 140 | outputs = [r["value"] for r in action["returns"]] 141 | actions.append( 142 | Action(action["tool_name"], inputs=inputs, outputs=outputs) 143 | ) 144 | stages.append(actions) 145 | return stages 146 | 147 | def __call__( 148 | self, request: str, memory: dict = OrderedDict() 149 | ) -> List[List[Action]]: 150 | solution = self.solve(request, memory) 151 | return self.parse(solution) 152 | -------------------------------------------------------------------------------- /cllm/agents/tog/responser.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import logging 3 | 4 | from cllm.services.nlp.llms.chat_models import ChatOpenAI 5 | from cllm.services.nlp.llms.memory import MessageMemory 6 | from langchain.schema import SystemMessage 7 | 8 | from cllm.agents.base import DataType 9 | from cllm.agents import container 10 | 11 | 12 | RESPONSE_GENERATION_PROMPT = """Your name is ControlLLM, an AI-powered assistant developed by OpenGV-lab from Shanghai Artificial Intelligence Laboratory. For user's request, the system executes the solution and collects the results based on the following workflow. You need to respond to user requests based on the following information. 13 | Here are the information for you reference. 14 | 15 | ## User Request 16 | {} 17 | 18 | ## Workflow and Execution Results 19 | {} 20 | 21 | Now you should pay attention to Collected Results. You first must answer the user’s request in a straightforward manner. Then you need to summarize the workflow and intermediate results friendly. Some of the results may not be accurate and need you to use your judgement in making decisions. If the results contain file names, you have to output the file name directly. 
Only if there is nothing returned by tools, you should tell user you can not finish the task. Now, please friendly summarize the results and answer the question for the user requests `{}`. 22 | """.strip() 23 | 24 | 25 | SIMPLE_RESPONSE_GENERATION_PROMPT = """Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai Artificial Intelligence Laboratory. You need to respond to user requests based on the following information. 26 | Here are the information for you reference. 27 | 28 | ## User Request 29 | {} 30 | 31 | ## Workflow and Execution Results 32 | {} 33 | 34 | Now, please friendly summarize the results and answer the question for the user requests `{}`. 35 | """.strip() 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | def generate_response(user_input, solution, output_files): 41 | if ( 42 | len(solution) <= 1 43 | and len(solution[0]) <= 1 44 | and solution[0][0].tool_name == "question_answering" 45 | ): 46 | content = SIMPLE_RESPONSE_GENERATION_PROMPT.format( 47 | user_input, solution, user_input 48 | ) 49 | else: 50 | content = RESPONSE_GENERATION_PROMPT.format(user_input, solution, user_input) 51 | 52 | logger.info("##### Response Generation #####") 53 | logger.info(content) 54 | 55 | chat = ChatOpenAI(model_name="gpt-3.5-turbo-16k") 56 | messages = [SystemMessage(content=content)] 57 | output = chat(messages) 58 | logger.info(output) 59 | 60 | # files = [output for output in output_files if isinstance(output, container.File)] 61 | # return [container.Text('Response', DataType.TEXT, output)] + files 62 | return [container.Text("Response", DataType.TEXT, output)] 63 | -------------------------------------------------------------------------------- /cllm/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/app/__init__.py -------------------------------------------------------------------------------- /cllm/services/__init__.py: -------------------------------------------------------------------------------- 1 | from cllm.services.image_editing.api import ( 2 | inpainting_ldm, 3 | inpainting_ldm_general, 4 | partial_image_editing, 5 | instruct_pix2pix, 6 | image_cropping, 7 | image_matting, 8 | draw_bbox_on_image, 9 | ) 10 | from cllm.services.image_generation.api import ( 11 | text2image, 12 | image2image, 13 | cannytext2image, 14 | linetext2image, 15 | hedtext2image, 16 | scribbletext2image, 17 | posetext2image, 18 | segtext2image, 19 | depthtext2image, 20 | normaltext2image, 21 | ) 22 | 23 | from cllm.services.image_processing.api import ( 24 | image2canny, 25 | image2line, 26 | image2hed, 27 | image2scribble, 28 | image2pose, 29 | image2depth, 30 | image2normal, 31 | ) 32 | from cllm.services.image_perception.api import ( 33 | object_detection, 34 | image_classification, 35 | ocr, 36 | segment_objects, 37 | visual_grounding, 38 | image_captioning, 39 | segment_by_mask, 40 | segment_by_points, 41 | set_image, 42 | segment_all, 43 | seg_by_mask, 44 | seg_by_points, 45 | ) 46 | from cllm.services.video.api import ( 47 | video_classification, 48 | video_captioning, 49 | image_audio_to_video, 50 | video_to_webpage, 51 | dub_video, 52 | image_to_video, 53 | ) 54 | from cllm.services.audio.api import ( 55 | text_to_music, 56 | text_to_speech, 57 | audio_classification, 58 | ) 59 | from cllm.services.general.api import ( 60 | select, 61 | count, 62 | remote_logging, 63 | ) 64 | from cllm.services.nlp.api 
import ( 65 | text_to_text_generation, 66 | title_generation, 67 | text_to_tags, 68 | question_answering_with_context, 69 | openai_chat_model, 70 | summarization, 71 | extract_location, 72 | sentiment_analysis, 73 | get_weather, 74 | summarize_weather_condition, 75 | get_time, 76 | ) 77 | from cllm.services.vqa.api import image_qa 78 | 79 | from fastapi import FastAPI 80 | from .pool import ModelPool 81 | 82 | app = FastAPI() 83 | pool = ModelPool() 84 | -------------------------------------------------------------------------------- /cllm/services/anything2image/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import Anything2Image -------------------------------------------------------------------------------- /cllm/services/anything2image/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 5 | 6 | 7 | def audio2image(audio): 8 | url = "http://localhost:10049/chat" 9 | # files = {"image": open("assets/ADE_val_00000529.jpg", "rb")} 10 | data = {"audio": audio} 11 | response = requests.post(url, data=data) 12 | return response.json() 13 | -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import load_and_transform_text, load_and_transform_audio_data, load_and_transform_video_data, load_and_transform_vision_data, load_and_transform_thermal_data 2 | from .models.imagebind_model import imagebind_huge, ModalityType -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/anything2image/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/anything2image/imagebind/models/__init__.py -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/models/helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | import math 9 | 10 | import einops 11 | import numpy as np 12 | import torch 13 | 14 | import torch.nn as nn 15 | 16 | 17 | class Normalize(nn.Module): 18 | def __init__(self, dim: int) -> None: 19 | super().__init__() 20 | self.dim = dim 21 | 22 | def forward(self, x): 23 | return torch.nn.functional.normalize(x, dim=self.dim, p=2) 24 | 25 | 26 | class LearnableLogitScaling(nn.Module): 27 | def __init__( 28 | self, 29 | logit_scale_init: float = 1 / 0.07, 30 | learnable: bool = True, 31 | max_logit_scale: float = 100, 32 | ) -> None: 33 | super().__init__() 34 | self.max_logit_scale = max_logit_scale 35 | self.logit_scale_init = logit_scale_init 36 | self.learnable = learnable 37 | log_logit_scale = torch.ones([]) * np.log(self.logit_scale_init) 38 | if learnable: 39 | self.log_logit_scale = nn.Parameter(log_logit_scale) 40 | else: 41 | self.register_buffer("log_logit_scale", log_logit_scale) 42 | 43 | def forward(self, x): 44 | return torch.clip(self.log_logit_scale.exp(), max=self.max_logit_scale) * x 45 | 46 | def extra_repr(self): 47 | st = f"logit_scale_init={self.logit_scale_init},learnable={self.learnable}, max_logit_scale={self.max_logit_scale}" 48 | return st 49 | 50 | 51 | class EinOpsRearrange(nn.Module): 52 | def __init__(self, rearrange_expr: str, **kwargs) -> None: 53 | super().__init__() 54 | self.rearrange_expr = rearrange_expr 55 | self.kwargs = kwargs 56 | 57 | def forward(self, x): 58 | assert isinstance(x, torch.Tensor) 59 | return einops.rearrange(x, self.rearrange_expr, **self.kwargs) 60 | 61 | 62 | class VerboseNNModule(nn.Module): 63 | """ 64 | Wrapper around nn.Module that prints registered buffers and parameter names. 65 | """ 66 | 67 | @staticmethod 68 | def get_readable_tensor_repr(name: str, tensor: torch.Tensor) -> str: 69 | st = ( 70 | "(" 71 | + name 72 | + "): " 73 | + "tensor(" 74 | + str(tuple(tensor[1].shape)) 75 | + ", requires_grad=" 76 | + str(tensor[1].requires_grad) 77 | + ")\n" 78 | ) 79 | return st 80 | 81 | def extra_repr(self) -> str: 82 | named_modules = set() 83 | for p in self.named_modules(): 84 | named_modules.update([p[0]]) 85 | named_modules = list(named_modules) 86 | 87 | string_repr = "" 88 | for p in self.named_parameters(): 89 | name = p[0].split(".")[0] 90 | if name not in named_modules: 91 | string_repr += self.get_readable_tensor_repr(name, p) 92 | 93 | for p in self.named_buffers(): 94 | name = p[0].split(".")[0] 95 | string_repr += self.get_readable_tensor_repr(name, p) 96 | 97 | return string_repr 98 | 99 | 100 | def cast_if_src_dtype( 101 | tensor: torch.Tensor, src_dtype: torch.dtype, tgt_dtype: torch.dtype 102 | ): 103 | updated = False 104 | if tensor.dtype == src_dtype: 105 | tensor = tensor.to(dtype=tgt_dtype) 106 | updated = True 107 | return tensor, updated 108 | 109 | 110 | class QuickGELU(nn.Module): 111 | # From https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py#L166 112 | def forward(self, x: torch.Tensor): 113 | return x * torch.sigmoid(1.702 * x) 114 | 115 | 116 | class SelectElement(nn.Module): 117 | def __init__(self, index) -> None: 118 | super().__init__() 119 | self.index = index 120 | 121 | def forward(self, x): 122 | assert x.ndim >= 3 123 | return x[:, self.index, ...] 
124 | 125 | 126 | class SelectEOSAndProject(nn.Module): 127 | """ 128 | Text Pooling used in OpenCLIP 129 | """ 130 | 131 | def __init__(self, proj: nn.Module) -> None: 132 | super().__init__() 133 | self.proj = proj 134 | 135 | def forward(self, x, seq_len): 136 | assert x.ndim == 3 137 | # x is of shape B x L x D 138 | # take features from the eot embedding (eot_token is the highest number in each sequence) 139 | x = x[torch.arange(x.shape[0]), seq_len] 140 | x = self.proj(x) 141 | return x 142 | -------------------------------------------------------------------------------- /cllm/services/anything2image/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import uvicorn 5 | from fastapi import FastAPI, UploadFile, File, Form, Response 6 | from PIL import Image 7 | import io 8 | import uuid 9 | 10 | from .tools import Anything2Image 11 | 12 | parser = argparse.ArgumentParser(description="Anything2Image API") 13 | parser.add_argument("--port", type=int, default=10049, help="Port") 14 | args = parser.parse_args() 15 | 16 | app = FastAPI() 17 | model = Anything2Image('cuda:0') 18 | 19 | TMP_DIR = 'anything2image_tmp' 20 | os.makedirs(TMP_DIR, exist_ok=True) 21 | 22 | 23 | def get_bytes_value(image): 24 | img_byte_arr = io.BytesIO() 25 | image.save(img_byte_arr, format='png') 26 | return img_byte_arr.getvalue() 27 | 28 | 29 | @app.post("/audio2image") 30 | async def audio2image(audio: UploadFile = File(None)): 31 | audio_bytes = audio.file.read() 32 | audio_path = os.path.join(TMP_DIR, f"{str(uuid.uuid4())[:6]}_{audio.filename}") 33 | with open(audio_path, "wb") as fout: 34 | fout.write(audio_bytes) 35 | output = model.audio2image(audio_path) 36 | buffer = get_bytes_value(output) 37 | return Response(content=buffer, media_type="image/png") 38 | 39 | 40 | if __name__ == '__main__': 41 | uvicorn.run(app, host="0.0.0.0", port=args.port) 42 | -------------------------------------------------------------------------------- /cllm/services/anything2image/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from diffusers import StableUnCLIPImg2ImgPipeline 3 | from .
import imagebind as ib 4 | 5 | 6 | class Anything2Image: 7 | def __init__(self, device): 8 | pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( 9 | "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16" 10 | ) 11 | self.device = device 12 | self.pipe = pipe.to(device) 13 | self.pipe.enable_model_cpu_offload() 14 | self.pipe.enable_vae_slicing() 15 | 16 | self.model = ib.imagebind_huge(pretrained=True) 17 | self.model.eval() 18 | self.model.to(device) 19 | 20 | def audio2image(self, audio_path): 21 | embeddings = self.model.forward({ 22 | ib.ModalityType.AUDIO: ib.load_and_transform_audio_data([audio_path], self.device), 23 | }) 24 | embeddings = embeddings[ib.ModalityType.AUDIO] 25 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 26 | return images[0] 27 | 28 | def thermal2image(self, thermal_path): 29 | embeddings = self.model.forward({ 30 | ib.ModalityType.THERMAL: ib.load_and_transform_thermal_data([thermal_path], self.device), 31 | }) 32 | embeddings = embeddings[ib.ModalityType.THERMAL] 33 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 34 | return images[0] 35 | 36 | def audioimage2image(self, image_path, audio_path): 37 | embeddings = self.model.forward({ 38 | ib.ModalityType.VISION: ib.load_and_transform_vision_data([image_path], self.device), 39 | }, normalize=False) 40 | img_embeddings = embeddings[ib.ModalityType.VISION] 41 | embeddings = self.model.forward({ 42 | ib.ModalityType.AUDIO: ib.load_and_transform_audio_data([audio_path], self.device), 43 | }) 44 | audio_embeddings = embeddings[ib.ModalityType.AUDIO] 45 | embeddings = (img_embeddings + audio_embeddings) / 2 46 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 47 | return images[0] 48 | 49 | def audiotext2image(self, audio_path, text): 50 | embeddings = self.model.forward({ 51 | ib.ModalityType.TEXT: ib.load_and_transform_text([text], self.device), 52 | }, normalize=False) 53 | text_embeddings = embeddings[ib.ModalityType.TEXT] 54 | 55 | embeddings = self.model.forward({ 56 | ib.ModalityType.AUDIO: ib.load_and_transform_audio_data([audio_path], self.device), 57 | }) 58 | audio_embeddings = embeddings[ib.ModalityType.AUDIO] 59 | embeddings = text_embeddings * 0.5 + audio_embeddings * 0.5 60 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 61 | return images[0] 62 | -------------------------------------------------------------------------------- /cllm/services/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/audio/__init__.py -------------------------------------------------------------------------------- /cllm/services/audio/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import uuid 4 | import numpy as np 5 | import os.path as osp 6 | import whisper 7 | import uvicorn 8 | from fastapi import UploadFile, File, Form 9 | from fastapi.responses import JSONResponse, StreamingResponse, FileResponse 10 | 11 | from .tools import * 12 | 13 | from cllm.services import app, pool 14 | from cllm.services.utils import AudioResponse 15 | from ..hf_pipeline import HuggingfacePipeline 16 | 17 | 18 | parser = argparse.ArgumentParser(description="Audio API") 19 | parser.add_argument("--host", type=str, default="localhost", 
help="Host") 20 | parser.add_argument("--port", type=int, default=10049, help="Port") 21 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 22 | args = parser.parse_args() 23 | 24 | RESOURCE_ROOT = os.environ.get("SERVER_ROOT", "./server_resources") 25 | os.makedirs(RESOURCE_ROOT, exist_ok=True) 26 | 27 | 28 | @app.post("/audio_classification") 29 | @pool.register(lambda: HuggingfacePipeline("audio-classification", args.device)) 30 | async def audio_classification(audio: UploadFile = File(None)): 31 | bytes = audio.file.read() 32 | model = audio_classification.__wrapped__.model 33 | output = model(bytes) 34 | return JSONResponse(output) 35 | 36 | 37 | @app.post("/automatic_speech_recognition") 38 | @pool.register(lambda: HuggingfacePipeline("automatic-speech-recognition", args.device)) 39 | async def automatic_speech_recognition(audio: UploadFile = File(None)): 40 | bytes = audio.file.read() 41 | model = automatic_speech_recognition.__wrapped__.model 42 | output = model(bytes) 43 | return JSONResponse(output) 44 | 45 | 46 | @app.post("/text_to_music") 47 | @pool.register(lambda: Text2Music(args.device)) 48 | async def text_to_music(text: str = Form(...)): 49 | model = text_to_music.__wrapped__.model 50 | output = model(text) 51 | return AudioResponse(output) 52 | 53 | 54 | @app.post("/text_to_speech") 55 | @pool.register( 56 | lambda: HuggingfacePipeline("text-to-speech", args.device, model="suno/bark") 57 | ) 58 | async def text_to_speech(text: str = Form(...)): 59 | model = text_to_speech.__wrapped__.model 60 | speech = model(text) 61 | save_path = osp.join(RESOURCE_ROOT, f"{str(uuid.uuid4())[:6]}_audio.wav") 62 | scipy.io.wavfile.write( 63 | save_path, 64 | rate=speech["sampling_rate"], 65 | data=speech["audio"][0].astype(np.float32), 66 | ) 67 | return AudioResponse(save_path) 68 | 69 | 70 | @app.post("/speech_to_text") 71 | @pool.register(lambda: whisper.load_model("base", args.device)) 72 | async def speech_to_text(audio: UploadFile = File(None)): 73 | model = speech_to_text.__wrapped__.model 74 | save_path = osp.join(RESOURCE_ROOT, f"{str(uuid.uuid4())[:6]}_audio.wav") 75 | with open(save_path, "wb") as fout: 76 | fout.write(audio.file.read()) 77 | result = model.transcribe(save_path) 78 | text = result["text"] 79 | return JSONResponse(text) 80 | 81 | 82 | if __name__ == "__main__": 83 | uvicorn.run(app, host=args.host, port=args.port) 84 | -------------------------------------------------------------------------------- /cllm/services/audio/tools.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | pipeline, 3 | AutoModel, 4 | AutoProcessor, 5 | MusicgenForConditionalGeneration, 6 | ) 7 | from PIL import Image 8 | import torch 9 | import scipy 10 | import io 11 | import numpy as np 12 | 13 | 14 | ''' 15 | class Text2Speech: 16 | def __init__(self, device): 17 | self.device = device 18 | self.processor = AutoProcessor.from_pretrained("suno/bark-small") 19 | self.model = AutoModel.from_pretrained("suno/bark-small") 20 | # self.model.to(self.device) 21 | 22 | def __call__(self, text): 23 | inputs = self.processor( 24 | text = [text], 25 | padding=True, 26 | return_tensors="pt", 27 | ).to(self.device) 28 | audio_values = self.model.generate(**inputs, do_sample=True) 29 | 30 | # TODO 31 | save_path = 'resources/test.wav' 32 | sampling_rate = self.model.config.audio_encoder.sampling_rate 33 | scipy.io.wavfile.write(save_path, rate=sampling_rate, data=audio_values[0, 0].numpy()) 34 | return 
save_path 35 | 36 | def to(self, device): 37 | self.model.to(device) 38 | ''' 39 | 40 | 41 | class Text2Music: 42 | def __init__(self, device): 43 | self.device = device 44 | self.dtype = torch.float16 45 | self.processor = AutoProcessor.from_pretrained( 46 | "facebook/musicgen-small" 47 | ) 48 | self.model = MusicgenForConditionalGeneration.from_pretrained( 49 | "facebook/musicgen-small", torch_dtype=self.dtype 50 | ) 51 | self.model.to(device=self.device) 52 | 53 | def __call__(self, text: str): 54 | inputs = self.processor( 55 | text=[text], 56 | padding=True, 57 | return_tensors="pt", 58 | ).to(self.device) 59 | audio_values = self.model.generate(**inputs, max_new_tokens=512) 60 | 61 | # TODO 62 | stream = io.BytesIO() 63 | sampling_rate = self.model.config.audio_encoder.sampling_rate 64 | scipy.io.wavfile.write( 65 | stream, 66 | rate=sampling_rate, 67 | data=audio_values[0, 0].cpu().numpy().astype(np.float32), 68 | ) 69 | stream.seek(0) 70 | return stream 71 | 72 | def to(self, device): 73 | self.device = device 74 | self.model.to(device) 75 | 76 | 77 | if __name__ == "__main__": 78 | model = Text2Music('auto') 79 | print( 80 | model( 81 | "An 80s driving pop song with heavy drums and synth pads in the background" 82 | ) 83 | ) 84 | -------------------------------------------------------------------------------- /cllm/services/general/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/general/__init__.py -------------------------------------------------------------------------------- /cllm/services/general/api.py: -------------------------------------------------------------------------------- 1 | from re import I 2 | from typing import List 3 | from pathlib import Path 4 | import os 5 | import requests 6 | 7 | __ALL__ = ["remote_logging", "select", "count"] 8 | 9 | HOST = "localhost" 10 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 11 | 12 | 13 | def setup(host="localhost", port=10056): 14 | global HOST, PORT 15 | HOST = host 16 | PORT = port 17 | 18 | 19 | def select(**kwargs): 20 | if "bbox_list" in kwargs: 21 | list = kwargs["bbox_list"] 22 | condition = kwargs["condition"] 23 | return [l for l in list if l["label"] == condition] 24 | if "mask_list" in kwargs: 25 | list = kwargs["mask_list"] 26 | condition = kwargs["condition"] 27 | # return combine_masks([l for l in list if l['label'] == condition]) 28 | return [l for l in list if l["label"] == condition] 29 | if "category_list" in kwargs: 30 | list = kwargs["category_list"] 31 | condition = kwargs["condition"] 32 | # return combine_masks([l for l in list if l['label'] == condition]) 33 | return [l for l in list if l["label"] == condition] 34 | 35 | 36 | def count(**kwargs): 37 | len_of_list = 0 38 | if "bbox_list" in kwargs: 39 | len_of_list = len(kwargs["bbox_list"]) 40 | elif "mask_list" in kwargs: 41 | len_of_list = len(kwargs["mask_list"]) 42 | 43 | return f"The length of the given list is {len_of_list}" 44 | 45 | 46 | def remote_logging( 47 | history_msgs: list, 48 | task_decomposition: list, 49 | solution: list, 50 | record: str, 51 | like: bool, 52 | **kwargs, 53 | ): 54 | host = kwargs.get("host", HOST) 55 | port = kwargs.get("port", PORT) 56 | url = f"http://{host}:{port}/remote_logging" 57 | data = { 58 | "history_msgs": history_msgs, 59 | "task_decomposition": task_decomposition, 60 | "solution": solution, 61 | "record": record, 62 | "like": like, 63 | } 64 | 
response = requests.post(url, data=data) 65 | return response.content 66 | -------------------------------------------------------------------------------- /cllm/services/general/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from .tools import * 3 | from fastapi import Form, Body 4 | from fastapi.responses import JSONResponse 5 | from cllm.services import app, pool 6 | import uvicorn 7 | 8 | 9 | parser = argparse.ArgumentParser(description="Image Perception API") 10 | parser.add_argument("--host", type=str, default="localhost", help="Host") 11 | parser.add_argument("--port", type=int, default=10049, help="Port") 12 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 13 | args = parser.parse_args() 14 | 15 | 16 | @app.post("/remote_logging") 17 | @pool.register(lambda: Logger(args.device)) 18 | async def remote_logging( 19 | history_msgs: list = Body(...), 20 | task_decomposition: list = Body(...), 21 | solution: list = Body(...), 22 | record: str = Form(...), 23 | like: bool = Form(...), 24 | ): 25 | model = remote_logging.__wrapped__.model 26 | output = model(history_msgs, task_decomposition, solution, record, like) 27 | return JSONResponse(output) 28 | 29 | 30 | if __name__ == "__main__": 31 | uvicorn.run(app, host=args.host, port=args.port) 32 | -------------------------------------------------------------------------------- /cllm/services/general/tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import uuid 5 | 6 | 7 | class Logger: 8 | def __init__(self, device): 9 | self.device = device 10 | 11 | def __call__( 12 | self, 13 | history_msgs: list, 14 | task_decomposition: list, 15 | solution: list, 16 | record: str, 17 | like: bool, 18 | ): 19 | os.makedirs("logs", exist_ok=True) 20 | print(f"solution: {solution}") 21 | print(f"solution: {type(solution)}") 22 | state = { 23 | "history": history_msgs, 24 | "task_decomposition": task_decomposition, 25 | "solution": solution, 26 | "record": record, 27 | "like": like, 28 | } 29 | file_name = f'logs/{time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())}_{str(uuid.uuid4())[:6]}.json' 30 | json.dump(state, open(file_name, "w"), indent=4) 31 | 32 | def to(self, device): 33 | return self 34 | -------------------------------------------------------------------------------- /cllm/services/hf_pipeline.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from PIL import Image 3 | import torch 4 | 5 | 6 | class HuggingfacePipeline: 7 | def __init__(self, task, device="cpu", **kwargs): 8 | # dtype=None 9 | self.device = device 10 | self.task = task 11 | self.pipeline = pipeline(task, device=device, **kwargs) 12 | 13 | def __call__(self, *args, **kwargs): 14 | # print(f'HuggingfacePipeline. type(image): {type(image)}') 15 | output = self.pipeline(*args, **kwargs) 16 | # print(f'end HuggingfacePipeline. 
output: {output}') 17 | return output 18 | 19 | def to(self, device): 20 | self.pipeline.model.to(device=device) 21 | 22 | 23 | class HuggingfacePipelineNLP: 24 | def __init__(self, task=None, device="cpu", **kwargs): 25 | # dtype=None 26 | self.device = device 27 | self.task = task 28 | self.model = pipeline(task, device=device, **kwargs) 29 | 30 | def __call__(self, text: str, *args, **kwargs): 31 | if self.task == "summarization": 32 | output = self.model(text, *args, **kwargs) 33 | elif self.task == "text2text-generation": 34 | output = self.model(text, *args, **kwargs) 35 | else: 36 | output = self.model(text, *args, **kwargs) 37 | if self.task in ["summarization", "text2text-generation"]: 38 | return list(output[0].values())[0] 39 | if self.task == "question-answering": 40 | return output["answer"] 41 | return output 42 | 43 | def to(self, device): 44 | self.model.model.to(device) 45 | return self 46 | -------------------------------------------------------------------------------- /cllm/services/image_editing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import FastAPI, UploadFile, File, Form 5 | from fastapi.responses import StreamingResponse 6 | 7 | from PIL import Image 8 | import io 9 | 10 | from .tools import * 11 | from cllm.services import app, pool 12 | 13 | parser = argparse.ArgumentParser(description="Image Editing API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument("--port", type=int, default=10049, help="Port") 16 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 17 | args = parser.parse_args() 18 | 19 | 20 | def ImageResponse(image): 21 | img_stream = io.BytesIO() 22 | image.save(img_stream, format="png") 23 | img_stream.seek(0) 24 | return StreamingResponse(img_stream, media_type="image/png") 25 | 26 | 27 | @app.post("/instruct_pix2pix") 28 | @pool.register(lambda: InstructPix2Pix(args.device)) 29 | async def instruct_pix2pix(image: UploadFile = File(None), text: str = Form(...)): 30 | image_bytes = image.file.read() 31 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 32 | model = instruct_pix2pix.__wrapped__.model 33 | output = model(image, text) 34 | return ImageResponse(output) 35 | 36 | 37 | @app.post("/partial_image_editing") 38 | @pool.register(lambda: PartialImageEditing(args.device)) 39 | async def partial_image_editing( 40 | image: UploadFile = File(None), 41 | mask: UploadFile = File(None), 42 | prompt: str = Form(...), 43 | ): 44 | print(f"image: {image}; \n\nmask: {mask}; \n\nprompt: {prompt}") 45 | image_bytes = image.file.read() 46 | image = Image.open(io.BytesIO(image_bytes)) 47 | mask_bytes = mask.file.read() 48 | mask = Image.open(io.BytesIO(mask_bytes)) 49 | model = partial_image_editing.__wrapped__.model 50 | output = model(image, mask, prompt) 51 | return ImageResponse(output) 52 | 53 | 54 | @app.post("/inpainting_ldm") 55 | @pool.register(lambda: LDMInpainting(args.device)) 56 | async def inpainting_ldm(image: UploadFile = File(None), mask: UploadFile = File(None)): 57 | image = 
Image.open(io.BytesIO(image.file.read())).convert("RGB") 58 | mask = Image.open(io.BytesIO(mask.file.read())) 59 | model = inpainting_ldm.__wrapped__.model 60 | output = model(image, mask) 61 | return ImageResponse(output) 62 | 63 | 64 | if __name__ == "__main__": 65 | uvicorn.run(app, host=args.host, port=args.port) 66 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/__init__.py: -------------------------------------------------------------------------------- 1 | from .wrapper import LDMInpainter -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | monitor: val/loss 16 | scheduler_config: 17 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 18 | params: 19 | verbosity_interval: 0 20 | warm_up_steps: 1000 21 | max_decay_steps: 50000 22 | lr_start: 0.001 23 | lr_max: 0.1 24 | lr_min: 0.0001 25 | unet_config: 26 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 27 | params: 28 | image_size: 64 29 | in_channels: 7 30 | out_channels: 3 31 | model_channels: 256 32 | attention_resolutions: 33 | - 8 34 | - 4 35 | - 2 36 | num_res_blocks: 2 37 | channel_mult: 38 | - 1 39 | - 2 40 | - 3 41 | - 4 42 | num_heads: 8 43 | resblock_updown: true 44 | first_stage_config: 45 | target: ldm.models.autoencoder.VQModelInterface 46 | params: 47 | embed_dim: 3 48 | n_embed: 8192 49 | monitor: val/rec_loss 50 | ddconfig: 51 | attn_type: none 52 | double_z: false 53 | z_channels: 3 54 | resolution: 256 55 | in_channels: 3 56 | out_ch: 3 57 | ch: 128 58 | ch_mult: 59 | - 1 60 | - 2 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: ldm.modules.losses.DummyLoss 67 | cond_stage_config: __is_first_stage__ 68 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 
15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/models/__init__.py 
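A note on LambdaWarmUpCosineScheduler above: it returns a learning-rate multiplier rather than a learning rate, which is why its docstring says to use it with a base_lr of 1.0. Below is a minimal usage sketch (not from the repository; the AdamW optimizer and the throwaway torch.nn.Linear module are assumed only for illustration), using the hyperparameters from scheduler_config in the config.yaml shown earlier:

import torch
from cllm.services.image_editing.ldm_inpainting.ldm.lr_scheduler import LambdaWarmUpCosineScheduler

# Warm up linearly for 1000 steps, then cosine-decay toward lr_min by step 50000
# (values mirror scheduler_config.params in config.yaml above).
sched_fn = LambdaWarmUpCosineScheduler(
    warm_up_steps=1000, lr_min=0.0001, lr_max=0.1, lr_start=0.001, max_decay_steps=50000
)
optimizer = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=1.0)  # base_lr of 1.0
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=sched_fn)
# per training step: optimizer.step(); lr_scheduler.step()

The same pattern applies to LambdaWarmUpCosineScheduler2 and LambdaLinearScheduler, which take per-cycle lists in place of the scalar arguments.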
-------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 
| logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 
61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class DummyLoss(nn.Module): 4 | def __init__(self): 5 | super().__init__() -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from omegaconf import OmegaConf 6 | import numpy as np 7 | 8 | from .ldm.models.diffusion.ddim import DDIMSampler 9 | from .ldm.util import instantiate_from_config 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.append(CURRENT_DIR) 13 | 14 | 15 | def make_batch(image, mask, device): 16 | image = image.astype(np.float32) / 255.0 17 | image = image[None].transpose(0, 3, 1, 2) 18 | image = torch.from_numpy(image) 19 | 20 | mask = mask.astype(np.float32) / 255.0 21 | mask = mask[None, None] 22 | mask[mask < 0.5] = 0 23 | mask[mask >= 0.5] = 1 24 | mask = torch.from_numpy(mask) 25 | 26 | masked_image = (1 - mask) * image 27 | 28 | batch = {"image": image, "mask": mask, "masked_image": masked_image} 29 | for k in batch: 30 | batch[k] = batch[k].to(device=device) 31 | batch[k] = batch[k] * 2.0 - 1.0 32 | return batch 33 | 34 | 35 | class LDMInpainter: 36 | def __init__(self, ckpt_path, 
ddim_steps=50): 37 | config = OmegaConf.load(os.path.join(CURRENT_DIR, "config.yaml")) 38 | model = instantiate_from_config(config.model) 39 | model.load_state_dict(torch.load(ckpt_path)["state_dict"], strict=False) 40 | self.model = model 41 | self.sampler = DDIMSampler(model) 42 | self.ddim_steps = ddim_steps 43 | 44 | @torch.no_grad() 45 | def __call__(self, image, mask, device): 46 | self.model.to(device) 47 | 48 | model = self.model 49 | sampler = self.sampler 50 | 51 | with self.model.ema_scope(): 52 | batch = make_batch(image, mask, device=device) 53 | 54 | # encode masked image and concat downsampled mask 55 | c = model.cond_stage_model.encode(batch["masked_image"]) 56 | cc = torch.nn.functional.interpolate(batch["mask"], 57 | size=c.shape[-2:]) 58 | c = torch.cat((c, cc), dim=1) 59 | 60 | shape = (c.shape[1] - 1,) + c.shape[2:] 61 | samples_ddim, _ = sampler.sample(S=self.ddim_steps, 62 | conditioning=c, 63 | batch_size=c.shape[0], 64 | shape=shape, 65 | verbose=False) 66 | x_samples_ddim = model.decode_first_stage(samples_ddim) 67 | 68 | image = torch.clamp((batch["image"] + 1.0) / 2.0, 69 | min=0.0, max=1.0) 70 | mask = torch.clamp((batch["mask"] + 1.0) / 2.0, 71 | min=0.0, max=1.0) 72 | predicted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, 73 | min=0.0, max=1.0) 74 | 75 | inpainted = (1 - mask) * image + mask * predicted_image 76 | inpainted = inpainted.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 77 | 78 | # offload to cpu to save memory 79 | self.model.to(torch.device('cpu')) 80 | return inpainted.astype(np.uint8) 81 | -------------------------------------------------------------------------------- /cllm/services/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import ( 2 | Text2Image, CannyText2Image, LineText2Image, 3 | HedText2Image, ScribbleText2Image, PoseText2Image, SegText2Image, 4 | DepthText2Image, NormalText2Image 5 | ) 6 | -------------------------------------------------------------------------------- /cllm/services/image_generation/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import requests 5 | from PIL import Image 6 | from cllm.services.utils import get_bytes_value 7 | 8 | 9 | __ALL__ = [ 10 | "text2image", 11 | "cannytext2image", 12 | "linetext2image", 13 | "hedtext2image", 14 | "scribbletext2image", 15 | "posetext2image", 16 | "segtext2image", 17 | "depthtext2image", 18 | "normaltext2image" "image2image", 19 | ] 20 | 21 | 22 | HOST = "localhost" 23 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 24 | 25 | 26 | def setup(host="localhost", port=10049): 27 | global HOST, PORT 28 | HOST = host 29 | PORT = port 30 | 31 | 32 | def text2image(text, **kwargs): 33 | host = kwargs.get("host", HOST) 34 | port = kwargs.get("port", PORT) 35 | url = f"http://{host}:{port}/text2image" 36 | data = {"text": text} 37 | response = requests.post(url, data=data) 38 | return response.content 39 | 40 | 41 | def image2image(image, **kwargs): 42 | host = kwargs.get("host", HOST) 43 | port = kwargs.get("port", PORT) 44 | url = f"http://{host}:{port}/image2image" 45 | files = {"image": (image, get_bytes_value(image))} 46 | response = requests.post(url, files=files) 47 | return response.content 48 | 49 | 50 | def _imagetext2image(image, text, endpoint, **kwargs): 51 | host = kwargs.get("host", HOST) 52 | port = kwargs.get("port", PORT) 53 | url = f"http://{host}:{port}/{endpoint}" 54 | data = {"text": text} 55 | files = 
{"image": (image, get_bytes_value(image))} 56 | response = requests.post(url, files=files, data=data) 57 | # image = Image.open(io.BytesIO(response.content)) 58 | # image = io.BytesIO(response.content) 59 | # return image 60 | return response.content 61 | 62 | 63 | def cannytext2image(edge, text, **kwargs): 64 | return _imagetext2image(edge, text, endpoint="cannytext2image", **kwargs) 65 | 66 | 67 | def linetext2image(line, text, **kwargs): 68 | return _imagetext2image(line, text, endpoint="linetext2image", **kwargs) 69 | 70 | 71 | def hedtext2image(hed, text, **kwargs): 72 | return _imagetext2image(hed, text, endpoint="hedtext2image", **kwargs) 73 | 74 | 75 | def scribbletext2image(scribble, text, **kwargs): 76 | return _imagetext2image(scribble, text, endpoint="scribbletext2image", **kwargs) 77 | 78 | 79 | def posetext2image(pose, text, **kwargs): 80 | return _imagetext2image(pose, text, endpoint="posetext2image", **kwargs) 81 | 82 | 83 | def segtext2image(segmentation, text, **kwargs): 84 | return _imagetext2image(segmentation, text, endpoint="segtext2image", **kwargs) 85 | 86 | 87 | def depthtext2image(depth, text, **kwargs): 88 | return _imagetext2image(depth, text, endpoint="depthtext2image", **kwargs) 89 | 90 | 91 | def normaltext2image(normal, text, **kwargs): 92 | return _imagetext2image(normal, text, endpoint="normaltext2image", **kwargs) 93 | -------------------------------------------------------------------------------- /cllm/services/image_generation/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import UploadFile, File, Form 5 | from PIL import Image 6 | import io 7 | 8 | from .tools import * 9 | from cllm.services import app, pool 10 | from cllm.services.utils import ImageResponse 11 | 12 | 13 | parser = argparse.ArgumentParser(description="Image Generation API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument("--port", type=int, default=10049, help="Port") 16 | parser.add_argument("--device", type=str, default="cuda:0", help="Port") 17 | args = parser.parse_args() 18 | 19 | 20 | # def ImageResponse(image): 21 | # img_stream = io.BytesIO() 22 | # image.save(img_stream, format="png") 23 | # img_stream.seek(0) 24 | 25 | # return StreamingResponse(img_stream, media_type="image/png") 26 | 27 | 28 | # @app.post("/text2image") 29 | # @pool.register(lambda: Text2Image(args.device)) 30 | # async def text2image(text: str = Form(...)): 31 | # model = text2image.__wrapped__.model 32 | # output = model(text) 33 | # return ImageResponse(output) 34 | 35 | 36 | @app.post("/text2image") 37 | @pool.register(lambda: PixArtAlpha(args.device)) 38 | async def text2image(text: str = Form(...)): 39 | model = text2image.__wrapped__.model 40 | output = model(text) 41 | return ImageResponse(output) 42 | 43 | 44 | @app.post("/image2image") 45 | @pool.register(lambda: Image2Image(args.device)) 46 | async def image2image(image: UploadFile = File(None)): 47 | image_bytes = image.file.read() 48 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 49 | model = image2image.__wrapped__.model 50 | output = model(image) 51 | return ImageResponse(output) 52 | 53 | 54 | @app.post("/cannytext2image") 55 | @pool.register(lambda: CannyText2Image(args.device)) 56 | async def cannytext2image(image: UploadFile = File(None), text: str = Form(...)): 57 | image_bytes = image.file.read() 58 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 59 | 
model = cannytext2image.__wrapped__.model 60 | output = model(image, text) 61 | return ImageResponse(output) 62 | 63 | 64 | @app.post("/linetext2image") 65 | @pool.register(lambda: LineText2Image(args.device)) 66 | async def linetext2image(image: UploadFile = File(None), text: str = Form(...)): 67 | image_bytes = image.file.read() 68 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 69 | model = linetext2image.__wrapped__.model 70 | output = model(image, text) 71 | return ImageResponse(output) 72 | 73 | 74 | @app.post("/hedtext2image") 75 | @pool.register(lambda: HedText2Image(args.device)) 76 | async def hedtext2image(image: UploadFile = File(None), text: str = Form(...)): 77 | image_bytes = image.file.read() 78 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 79 | model = hedtext2image.__wrapped__.model 80 | output = model(image, text) 81 | return ImageResponse(output) 82 | 83 | 84 | @app.post("/scribbletext2image") 85 | @pool.register(lambda: ScribbleText2Image(args.device)) 86 | async def scribbletext2image(image: UploadFile = File(None), text: str = Form(...)): 87 | image_bytes = image.file.read() 88 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 89 | model = scribbletext2image.__wrapped__.model 90 | output = model(image, text) 91 | return ImageResponse(output) 92 | 93 | 94 | @app.post("/posetext2image") 95 | @pool.register(lambda: PoseText2Image(args.device)) 96 | async def posetext2image(image: UploadFile = File(None), text: str = Form(...)): 97 | image_bytes = image.file.read() 98 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 99 | model = posetext2image.__wrapped__.model 100 | output = model(image, text) 101 | return ImageResponse(output) 102 | 103 | 104 | @app.post("/segtext2image") 105 | @pool.register(lambda: SegText2Image(args.device)) 106 | async def segtext2image(image: UploadFile = File(None), text: str = Form(...)): 107 | image.file.seek(0) 108 | image_bytes = image.file.read() 109 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 110 | model = segtext2image.__wrapped__.model 111 | output = model(image, text) 112 | return ImageResponse(output) 113 | 114 | 115 | @app.post("/depthtext2image") 116 | @pool.register(lambda: DepthText2Image(args.device)) 117 | async def depthtext2image(image: UploadFile = File(None), text: str = Form(...)): 118 | image_bytes = image.file.read() 119 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 120 | model = depthtext2image.__wrapped__.model 121 | output = model(image, text) 122 | return ImageResponse(output) 123 | 124 | 125 | @app.post("/normaltext2image") 126 | @pool.register(lambda: NormalText2Image(args.device)) 127 | async def normaltext2image(image: UploadFile = File(None), text: str = Form(...)): 128 | image_bytes = image.file.read() 129 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 130 | model = normaltext2image.__wrapped__.model 131 | output = model(image, text) 132 | return ImageResponse(output) 133 | 134 | 135 | if __name__ == "__main__": 136 | uvicorn.run(app, host=args.host, port=args.port) 137 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/__init__.py --------------------------------------------------------------------------------
/cllm/services/image_inpainting/api.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Union, List, Dict 3 | from PIL import Image, ImageChops 4 | import io 5 | import os 6 | 7 | import requests 8 | from cllm.services.utils import get_bytes_value 9 | 10 | __ALL__ = [ 11 | "inpainting_ldm", 12 | ] 13 | 14 | 15 | HOST = "localhost" 16 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 17 | 18 | 19 | def setup(host="localhost", port=10052): 20 | global HOST, PORT 21 | HOST = host 22 | PORT = port 23 | 24 | 25 | def combine_masks(mask_images): 26 | if mask_images is None or len(mask_images) == 0: 27 | return None 28 | 29 | # Create a new blank image to store the combined mask 30 | combined_mask = Image.open(io.BytesIO(mask_images[0])).convert("1") 31 | 32 | # Iterate through each mask image and combine them 33 | for mask_image in mask_images: 34 | mask = Image.open(io.BytesIO(mask_image)).convert("1") 35 | combined_mask = ImageChops.logical_or(combined_mask, mask) 36 | stream = io.BytesIO() 37 | combined_mask.save(stream, "png") 38 | stream.seek(0) 39 | # return {"label": mask_images[0]["label"], "mask": stream.getvalue()} 40 | return stream.getvalue() 41 | 42 | 43 | def inpainting_ldm_general(image, mask: Union[bytes, List], **kwargs): 44 | if mask in [None, b"", []]: 45 | return get_bytes_value(image) 46 | 47 | mask = copy.deepcopy(mask) 48 | if isinstance(mask, List): 49 | if not isinstance(mask[0], dict): 50 | mask_list = get_bytes_value(mask) 51 | else: 52 | mask_list = [] 53 | for m in mask: 54 | mask_list.append(get_bytes_value(m["mask"])) 55 | mask = combine_masks(mask_list) 56 | 57 | return inpainting_ldm(image, mask, **kwargs) 58 | 59 | 60 | def inpainting_ldm(image, mask, **kwargs): 61 | if mask in [None, b""]: 62 | return get_bytes_value(image) 63 | 64 | host = kwargs.get("host", HOST) 65 | port = kwargs.get("port", PORT) 66 | url = f"http://{host}:{port}/inpainting_ldm" 67 | files = { 68 | "image": (image, get_bytes_value(image)), 69 | "mask": get_bytes_value(mask), 70 | } 71 | response = requests.post(url, files=files) 72 | return response.content 73 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import FastAPI, UploadFile, File 5 | from fastapi.responses import StreamingResponse 6 | from PIL import Image 7 | import io 8 | 9 | from .tools import * 10 | from cllm.services import app, pool 11 | 12 | parser = argparse.ArgumentParser(description="Image Inpainting API") 13 | parser.add_argument("--host", type=str, default="localhost", help="Host") 14 | parser.add_argument("--port", type=int, default=10049, help="Port") 15 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 16 | args = parser.parse_args() 17 | 18 | 19 | def ImageResponse(image): 20 | img_stream = io.BytesIO() 21 | image.save(img_stream, format="png") 22 | img_stream.seek(0) 23 | 24 | return StreamingResponse(img_stream, media_type="image/png") 25 | 26 | 27 | @app.post("/inpainting_ldm") 28 | @pool.register(lambda: LDMInpainting(args.device)) 29 | async def inpainting_ldm(image: UploadFile = File(None), mask: UploadFile = File(None)): 30 | image = Image.open(io.BytesIO(image.file.read())) 31 | mask = Image.open(io.BytesIO(mask.file.read())) 32 | model = inpainting_ldm.__wrapped__.model 33 | output = 
model(image, mask) 34 | return ImageResponse(output) 35 | 36 | 37 | if __name__ == "__main__": 38 | uvicorn.run(app, host=args.host, port=args.port) 39 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/__init__.py: -------------------------------------------------------------------------------- 1 | from .wrapper import LDMInpainter -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | monitor: val/loss 16 | scheduler_config: 17 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 18 | params: 19 | verbosity_interval: 0 20 | warm_up_steps: 1000 21 | max_decay_steps: 50000 22 | lr_start: 0.001 23 | lr_max: 0.1 24 | lr_min: 0.0001 25 | unet_config: 26 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 27 | params: 28 | image_size: 64 29 | in_channels: 7 30 | out_channels: 3 31 | model_channels: 256 32 | attention_resolutions: 33 | - 8 34 | - 4 35 | - 2 36 | num_res_blocks: 2 37 | channel_mult: 38 | - 1 39 | - 2 40 | - 3 41 | - 4 42 | num_heads: 8 43 | resblock_updown: true 44 | first_stage_config: 45 | target: ldm.models.autoencoder.VQModelInterface 46 | params: 47 | embed_dim: 3 48 | n_embed: 8192 49 | monitor: val/rec_loss 50 | ddconfig: 51 | attn_type: none 52 | double_z: false 53 | z_channels: 3 54 | resolution: 256 55 | in_channels: 3 56 | out_ch: 3 57 | ch: 128 58 | ch_mult: 59 | - 1 60 | - 2 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: ldm.modules.losses.DummyLoss 67 | cond_stage_config: __is_first_stage__ 68 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 
15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/models/__init__.py 
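A brief clarification of unet_config.in_channels: 7 in the config.yaml above: with concat_mode: true, the UNet input is the noisy 3-channel VQ latent concatenated with the conditioning built in wrapper.py, namely the 3-channel latent of the masked image plus the 1-channel downsampled mask. A small sanity-check sketch, assuming that reading of the config:

noisy_latent_channels = 3   # ddconfig.z_channels of the VQ first stage
masked_image_latent = 3     # cond_stage encodes masked_image into the same latent space
downsampled_mask = 1        # single-channel mask appended with torch.cat in wrapper.py
assert noisy_latent_channels + masked_image_latent + downsampled_mask == 7  # unet_config.in_channels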
-------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | 
return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 
61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class DummyLoss(nn.Module): 4 | def __init__(self): 5 | super().__init__() -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from omegaconf import OmegaConf 6 | import numpy as np 7 | 8 | from .ldm.models.diffusion.ddim import DDIMSampler 9 | from .ldm.util import instantiate_from_config 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.append(CURRENT_DIR) 13 | 14 | 15 | def make_batch(image, mask, device): 16 | image = image.astype(np.float32) / 255.0 17 | image = image[None].transpose(0, 3, 1, 2) 18 | image = torch.from_numpy(image) 19 | 20 | mask = mask.astype(np.float32) / 255.0 21 | mask = mask[None, None] 22 | mask[mask < 0.5] = 0 23 | mask[mask >= 0.5] = 1 24 | mask = torch.from_numpy(mask) 25 | 26 | masked_image = (1 - mask) * image 27 | 28 | batch = {"image": image, "mask": mask, "masked_image": masked_image} 29 | for k in batch: 30 | batch[k] = batch[k].to(device=device) 31 | batch[k] = batch[k] * 2.0 - 1.0 32 | return batch 33 | 34 | 35 | class LDMInpainter: 36 | def __init__(self, 
ckpt_path, ddim_steps=50): 37 | config = OmegaConf.load(os.path.join(CURRENT_DIR, "config.yaml")) 38 | model = instantiate_from_config(config.model) 39 | model.load_state_dict(torch.load(ckpt_path)["state_dict"], strict=False) 40 | self.model = model 41 | self.sampler = DDIMSampler(model) 42 | self.ddim_steps = ddim_steps 43 | 44 | @torch.no_grad() 45 | def __call__(self, image, mask, device): 46 | self.model.to(device) 47 | 48 | model = self.model 49 | sampler = self.sampler 50 | 51 | with self.model.ema_scope(): 52 | batch = make_batch(image, mask, device=device) 53 | 54 | # encode masked image and concat downsampled mask 55 | c = model.cond_stage_model.encode(batch["masked_image"]) 56 | cc = torch.nn.functional.interpolate(batch["mask"], 57 | size=c.shape[-2:]) 58 | c = torch.cat((c, cc), dim=1) 59 | 60 | shape = (c.shape[1] - 1,) + c.shape[2:] 61 | samples_ddim, _ = sampler.sample(S=self.ddim_steps, 62 | conditioning=c, 63 | batch_size=c.shape[0], 64 | shape=shape, 65 | verbose=False) 66 | x_samples_ddim = model.decode_first_stage(samples_ddim) 67 | 68 | image = torch.clamp((batch["image"] + 1.0) / 2.0, 69 | min=0.0, max=1.0) 70 | mask = torch.clamp((batch["mask"] + 1.0) / 2.0, 71 | min=0.0, max=1.0) 72 | predicted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, 73 | min=0.0, max=1.0) 74 | 75 | inpainted = (1 - mask) * image + mask * predicted_image 76 | inpainted = inpainted.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 77 | 78 | # offload to cpu to save memory 79 | self.model.to(torch.device('cpu')) 80 | return inpainted.astype(np.uint8) 81 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | import wget 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | 10 | from .ldm_inpainting.ldm.models.diffusion.ddim import DDIMSampler 11 | from .ldm_inpainting.ldm.util import instantiate_from_config 12 | 13 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | 16 | def cal_dilate_factor(mask): 17 | area = mask[mask != 0].sum() 18 | edge = cv2.Canny(mask, 30, 226) 19 | perimeter = edge.sum() 20 | ratio = 0 21 | if perimeter > 0: 22 | ratio = int(area * 0.55 / perimeter) 23 | if ratio % 2 == 0: 24 | ratio += 1 25 | return ratio 26 | 27 | 28 | def dilate_mask(mask, dilate_factor=9): 29 | # dilate mask 30 | mask = mask.astype(np.uint8) 31 | dilated_mask = cv2.dilate(mask, np.ones((dilate_factor, dilate_factor), np.uint8), iterations=1) 32 | 33 | return dilated_mask 34 | 35 | 36 | def make_batch(image, mask, device): 37 | image = image.astype(np.float32) / 255.0 38 | image = image[None].transpose(0, 3, 1, 2) 39 | image = torch.from_numpy(image) 40 | 41 | mask = mask.astype(np.float32) / 255.0 42 | mask = mask[None, None] 43 | mask[mask < 0.5] = 0 44 | mask[mask >= 0.5] = 1 45 | mask = torch.from_numpy(mask) 46 | 47 | masked_image = (1 - mask) * image 48 | 49 | batch = {"image": image, "mask": mask, "masked_image": masked_image} 50 | for k in batch: 51 | batch[k] = batch[k].to(device=device) 52 | batch[k] = batch[k] * 2.0 - 1.0 53 | return batch 54 | 55 | 56 | class LDMInpainting: 57 | def __init__(self, device): 58 | self.model_checkpoint_path = 'model_zoo/ldm_inpainting_big.ckpt' 59 | config = os.path.join(CURRENT_DIR, 'ldm_inpainting/config.yaml') 60 | self.ddim_steps = 50 61 | self.device = device 62 | config = OmegaConf.load(config) 63 | 
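# Editor's note (hedged sketch): `instantiate_from_config` from ldm/util.py builds the model
# object dynamically from the OmegaConf node loaded above. Conceptually it does roughly the
# following, assuming the config node carries a dotted `target` class path plus optional
# `params` (names follow the usual ldm convention and are not re-verified here):
#
#     import importlib
#     def instantiate_from_config(cfg):
#         module_name, cls_name = cfg["target"].rsplit(".", 1)
#         cls = getattr(importlib.import_module(module_name), cls_name)
#         return cls(**cfg.get("params", dict()))
#
# so the next line constructs the latent-diffusion inpainting model declared in
# ldm_inpainting/config.yaml without this file importing that class directly.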
model = instantiate_from_config(config.model) 64 | self.download_parameters() 65 | model.load_state_dict(torch.load(self.model_checkpoint_path)["state_dict"], strict=False) 66 | self.model = model.to(device=device) 67 | self.sampler = DDIMSampler(model) 68 | 69 | def download_parameters(self): 70 | url = 'https://heibox.uni-heidelberg.de/f/4d9ac7ea40c64582b7c9/?dl=1' 71 | if not os.path.exists(self.model_checkpoint_path): 72 | wget.download(url, out=self.model_checkpoint_path) 73 | 74 | @torch.no_grad() 75 | def __call__(self, image, mask): 76 | mask = mask.convert('L') 77 | w, h = image.size 78 | image = image.resize((512, 512)) 79 | mask = mask.resize((512, 512)) 80 | image = np.array(image) 81 | mask = np.array(mask) 82 | dilate_factor = cal_dilate_factor(mask.astype(np.uint8)) 83 | mask = dilate_mask(mask, dilate_factor) 84 | 85 | with self.model.ema_scope(): 86 | batch = make_batch(image, mask, device=self.device) 87 | # encode masked image and concat downsampled mask 88 | c = self.model.cond_stage_model.encode(batch["masked_image"]) 89 | cc = torch.nn.functional.interpolate(batch["mask"], 90 | size=c.shape[-2:]) 91 | c = torch.cat((c, cc), dim=1) 92 | 93 | shape = (c.shape[1] - 1,) + c.shape[2:] 94 | samples_ddim, _ = self.sampler.sample(S=self.ddim_steps, 95 | conditioning=c, 96 | batch_size=c.shape[0], 97 | shape=shape, 98 | verbose=False) 99 | x_samples_ddim = self.model.decode_first_stage(samples_ddim) 100 | 101 | image = torch.clamp((batch["image"] + 1.0) / 2.0, 102 | min=0.0, max=1.0) 103 | mask = torch.clamp((batch["mask"] + 1.0) / 2.0, 104 | min=0.0, max=1.0) 105 | predicted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, 106 | min=0.0, max=1.0) 107 | 108 | inpainted = (1 - mask) * image + mask * predicted_image 109 | inpainted = inpainted.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 110 | 111 | inpainted = inpainted.astype(np.uint8) 112 | new_img = Image.fromarray(inpainted) 113 | new_img = new_img.resize((w, h)) 114 | return new_img 115 | 116 | def to(self, device): 117 | self.model.to(device) 118 | -------------------------------------------------------------------------------- /cllm/services/image_perception/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_perception/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_perception/configs/GroundingDINO_SwinT_OGC.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_T_224_1k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = 
"bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /cllm/services/image_perception/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CURL_CA_BUNDLE"] = "" 4 | 5 | import argparse 6 | import codecs 7 | import uvicorn 8 | from fastapi import FastAPI, UploadFile, File, Form, Body 9 | from fastapi.responses import JSONResponse, Response 10 | 11 | from PIL import Image 12 | import io 13 | import pickle 14 | import json 15 | 16 | from .tools import * 17 | from cllm.services import app, pool 18 | from cllm.services.utils import ImageResponse 19 | from ..hf_pipeline import HuggingfacePipeline 20 | 21 | parser = argparse.ArgumentParser(description="Image Perception API") 22 | parser.add_argument("--host", type=str, default="localhost", help="Host") 23 | parser.add_argument("--port", type=int, default=10049, help="Port") 24 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 25 | args = parser.parse_args() 26 | 27 | 28 | def SAM(): 29 | return SegmentAnythingStateful(args.device) 30 | 31 | 32 | @app.post("/object_detection") 33 | @pool.register(lambda: HuggingfacePipeline("object-detection", args.device)) 34 | async def object_detection(image: UploadFile = File(None)): 35 | image.file.seek(0) 36 | image_bytes = image.file.read() 37 | image = Image.open(io.BytesIO(image_bytes)) 38 | model = object_detection.__wrapped__.model 39 | output = model(image) 40 | return JSONResponse(output) 41 | 42 | 43 | @app.post("/image_classification") 44 | @pool.register(lambda: HuggingfacePipeline("image-classification", args.device)) 45 | async def image_classification(image: UploadFile = File(None)): 46 | image_bytes = image.file.read() 47 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 48 | model = image_classification.__wrapped__.model 49 | output = model(image) 50 | return JSONResponse(output) 51 | 52 | 53 | @app.post("/image_to_text") 54 | @pool.register(lambda: HuggingfacePipeline("image-to-text", args.device)) 55 | async def image_to_text(image: UploadFile = File(None)): 56 | image_bytes = image.file.read() 57 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 58 | model = image_to_text.__wrapped__.model 59 | output = model(image) 60 | return JSONResponse(output) 61 | 62 | 63 | @app.post("/ocr") 64 | @pool.register(lambda: OCR(args.device)) 65 | async def ocr(image: UploadFile = File(None)): 66 | image_bytes = image.file.read() 67 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 68 | model = ocr.__wrapped__.model 69 | output = model(image) 70 | return JSONResponse(output) 71 | 72 | 73 | @app.post("/segment_objects") 74 | @pool.register(lambda: HuggingfacePipeline("image-segmentation", args.device)) 75 | async def segment_objects(image: UploadFile = File(None)): 76 | image_bytes = image.file.read() 77 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 78 | model = segment_objects.__wrapped__.model 79 | output = model(image) 80 | pickled = codecs.encode(pickle.dumps(output), "base64").decode() 81 | return JSONResponse({"data": pickled}) 82 | 83 | 84 | @app.post("/visual_grounding") 85 | @pool.register(lambda: VisualGrounding(args.device)) 86 | async def 
visual_grounding(query: str = Form(...), image: UploadFile = File(...)): 87 | image_bytes = image.file.read() 88 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 89 | model = visual_grounding.__wrapped__.model 90 | coordinates = model(image, query) 91 | print(coordinates) 92 | return JSONResponse(coordinates) 93 | 94 | 95 | @app.post("/captioning_blip") 96 | @pool.register(lambda: BLIPImageCaptioning(args.device)) 97 | async def captioning_blip(image: UploadFile = File(None)): 98 | image_bytes = image.file.read() 99 | image = Image.open(io.BytesIO(image_bytes)) 100 | model = captioning_blip.__wrapped__.model 101 | output = model(image) 102 | return output 103 | 104 | 105 | @app.post("/segment_all") 106 | @pool.register(SAM) 107 | async def segment_all(image: UploadFile = File(None)): 108 | image_bytes = image.file.read() 109 | image = Image.open(io.BytesIO(image_bytes)) 110 | model = segment_all.__wrapped__.model 111 | output = model(image) 112 | return ImageResponse(output) 113 | 114 | 115 | @app.post("/set_image") 116 | @pool.register(SAM) 117 | async def set_image(image: UploadFile = File(None)): 118 | image_bytes = image.file.read() 119 | image = Image.open(io.BytesIO(image_bytes)) 120 | model = set_image.__wrapped__.model 121 | output = model.set_image(image) 122 | return Response(content=output) 123 | 124 | 125 | @app.post("/segment_by_mask") 126 | @pool.register(SAM) 127 | async def segment_by_mask(mask: UploadFile = File(None), image_id: str = Form(...)): 128 | image_bytes = mask.file.read() 129 | image = Image.open(io.BytesIO(image_bytes)) 130 | model = segment_by_mask.__wrapped__.model 131 | output = model.segment_by_mask(image, image_id) 132 | return ImageResponse(output) 133 | 134 | 135 | @app.post("/segment_by_points") 136 | @pool.register(SAM) 137 | async def segment_by_points(points: str | list = Body(...), image_id: str = Form(...)): 138 | if isinstance(points, str): 139 | points = json.loads(points) 140 | 141 | model = segment_by_points.__wrapped__.model 142 | output = model.segment_by_points(points, image_id) 143 | return ImageResponse(output) 144 | 145 | 146 | if __name__ == "__main__": 147 | uvicorn.run(app, host=args.host, port=args.port) 148 | -------------------------------------------------------------------------------- /cllm/services/image_processing/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import ( 2 | Image2Canny, Image2Line, Image2Hed, Image2Scribble, 3 | Image2Pose, Image2Depth, Image2Normal 4 | ) 5 | -------------------------------------------------------------------------------- /cllm/services/image_processing/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import requests 5 | from PIL import Image 6 | from cllm.services.utils import get_bytes_value 7 | 8 | __ALL__ = [ 9 | "image2canny", 10 | "image2line", 11 | "image2hed", 12 | "image2scribble", 13 | "image2pose", 14 | "image2depth", 15 | "image2normal", 16 | ] 17 | 18 | 19 | HOST = "localhost" 20 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 21 | 22 | 23 | def setup(host="localhost", port=10049): 24 | global HOST, PORT 25 | HOST = host 26 | PORT = port 27 | 28 | 29 | def image2anything(image: Image, endpoint="image2line", **kwargs): 30 | host = kwargs.get("host", HOST) 31 | port = kwargs.get("port", PORT) 32 | url = f"http://{host}:{port}/{endpoint}" 33 | files = {"image": (image, get_bytes_value(image))} 34 | response = requests.post(url, 
files=files) 35 | return response.content 36 | 37 | 38 | def image2canny(image: Image, **kwargs): 39 | return image2anything(image, endpoint="image2canny", **kwargs) 40 | 41 | 42 | def image2line(image: Image, **kwargs): 43 | return image2anything(image, endpoint="image2line", **kwargs) 44 | 45 | 46 | def image2hed(image: Image, **kwargs): 47 | return image2anything(image, endpoint="image2hed", **kwargs) 48 | 49 | 50 | def image2scribble(image: Image, **kwargs): 51 | return image2anything(image, endpoint="image2scribble", **kwargs) 52 | 53 | 54 | def image2pose(image: Image, **kwargs): 55 | return image2anything(image, endpoint="image2pose", **kwargs) 56 | 57 | 58 | def image2depth(image: Image, **kwargs): 59 | return image2anything(image, endpoint="image2depth", **kwargs) 60 | 61 | 62 | def image2normal(image: Image, **kwargs): 63 | return image2anything(image, endpoint="image2normal", **kwargs) 64 | -------------------------------------------------------------------------------- /cllm/services/image_processing/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import UploadFile, File 5 | from fastapi.responses import StreamingResponse 6 | from PIL import Image 7 | import io 8 | 9 | from .tools import * 10 | 11 | from cllm.services import app, pool 12 | 13 | parser = argparse.ArgumentParser(description="Image Transformation API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument("--port", type=int, default=10049, help="Port") 16 | args = parser.parse_args() 17 | 18 | 19 | def ImageResponse(image): 20 | img_stream = io.BytesIO() 21 | image.save(img_stream, format="png") 22 | img_stream.seek(0) 23 | 24 | return StreamingResponse(img_stream, media_type="image/png") 25 | 26 | 27 | @app.post("/image2canny") 28 | @pool.register(lambda: Image2Canny()) 29 | async def image2canny(image: UploadFile = File(None)): 30 | image_bytes = image.file.read() 31 | image = Image.open(io.BytesIO(image_bytes)) 32 | model = image2canny.__wrapped__.model 33 | output = model(image) 34 | return ImageResponse(output) 35 | 36 | 37 | @app.post("/image2line") 38 | @pool.register(lambda: Image2Line()) 39 | async def image2line(image: UploadFile = File(None)): 40 | image_bytes = image.file.read() 41 | image = Image.open(io.BytesIO(image_bytes)) 42 | model = image2line.__wrapped__.model 43 | output = model(image) 44 | return ImageResponse(output) 45 | 46 | 47 | @app.post("/image2hed") 48 | @pool.register(lambda: Image2Hed()) 49 | async def image2hed(image: UploadFile = File(None)): 50 | image_bytes = image.file.read() 51 | image = Image.open(io.BytesIO(image_bytes)) 52 | model = image2hed.__wrapped__.model 53 | output = model(image) 54 | return ImageResponse(output) 55 | 56 | 57 | @app.post("/image2scribble") 58 | @pool.register(lambda: Image2Scribble()) 59 | async def image2scribble(image: UploadFile = File(None)): 60 | image_bytes = image.file.read() 61 | image = Image.open(io.BytesIO(image_bytes)) 62 | model = image2scribble.__wrapped__.model 63 | output = model(image) 64 | return ImageResponse(output) 65 | 66 | 67 | @app.post("/image2pose") 68 | @pool.register(lambda: Image2Pose()) 69 | async def image2pose(image: UploadFile = File(None)): 70 | image_bytes = image.file.read() 71 | image = Image.open(io.BytesIO(image_bytes)) 72 | model = image2pose.__wrapped__.model 73 | output = model(image) 74 | return ImageResponse(output) 75 | 76 | 77 | @app.post("/image2depth") 
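# Editor's note (hedged): every endpoint in this file follows the same two-decorator pattern.
# `app.post` registers the FastAPI route, while `pool.register` (see cllm/services/pool.py)
# defers construction of the tool -- here Image2Depth -- until the first request, caches it in
# the shared ModelPool, and moves the least recently used models back to CPU whenever a
# RuntimeError (typically CUDA out-of-memory) is raised. The constructed instance is what each
# handler then reads back through `<endpoint>.__wrapped__.model`.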
78 | @pool.register(lambda: Image2Depth()) 79 | async def image2depth(image: UploadFile = File(None)): 80 | image_bytes = image.file.read() 81 | image = Image.open(io.BytesIO(image_bytes)) 82 | model = image2depth.__wrapped__.model 83 | output = model(image) 84 | return ImageResponse(output) 85 | 86 | 87 | @app.post("/image2normal") 88 | @pool.register(lambda: Image2Normal()) 89 | async def image2normal(image: UploadFile = File(None)): 90 | image_bytes = image.file.read() 91 | image = Image.open(io.BytesIO(image_bytes)) 92 | model = image2normal.__wrapped__.model 93 | output = model(image) 94 | return ImageResponse(output) 95 | 96 | 97 | if __name__ == "__main__": 98 | uvicorn.run(app, host=args.host, port=args.port) 99 | -------------------------------------------------------------------------------- /cllm/services/image_processing/tools.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | import cv2 4 | 5 | from transformers import pipeline 6 | from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector 7 | 8 | 9 | class Image2Canny: 10 | def __init__(self, device='cpu'): 11 | self.device = device 12 | self.low_threshold = 100 13 | self.high_threshold = 200 14 | 15 | def __call__(self, image): 16 | image = np.array(image) 17 | canny = cv2.Canny(image, self.low_threshold, self.high_threshold) 18 | canny = canny[:, :, None] 19 | canny = np.concatenate([canny, canny, canny], axis=2) 20 | canny = Image.fromarray(canny) 21 | return canny 22 | 23 | def to(self, device): 24 | pass 25 | 26 | 27 | class Image2Line: 28 | def __init__(self, device='cpu'): 29 | self.device = device 30 | self.detector = MLSDdetector.from_pretrained('lllyasviel/Annotators') 31 | 32 | def __call__(self, image): 33 | mlsd = self.detector(image) 34 | return mlsd 35 | 36 | def to(self, device): 37 | pass 38 | 39 | 40 | class Image2Hed: 41 | def __init__(self, device='cpu'): 42 | self.device = device 43 | self.detector = HEDdetector.from_pretrained('lllyasviel/Annotators') 44 | 45 | def __call__(self, image): 46 | hed = self.detector(image) 47 | return hed 48 | 49 | def to(self, device): 50 | pass 51 | 52 | 53 | class Image2Scribble: 54 | def __init__(self, device='cpu'): 55 | self.device = device 56 | self.detector = HEDdetector.from_pretrained('lllyasviel/Annotators') 57 | 58 | def __call__(self, image): 59 | scribble = self.detector(image, scribble=True) 60 | return scribble 61 | 62 | def to(self, device): 63 | pass 64 | 65 | 66 | class Image2Pose: 67 | def __init__(self, device='cpu'): 68 | self.device = device 69 | self.detector = OpenposeDetector.from_pretrained('lllyasviel/Annotators') 70 | 71 | def __call__(self, image): 72 | pose = self.detector(image) 73 | return pose 74 | 75 | def to(self, device): 76 | pass 77 | 78 | 79 | class Image2Depth: 80 | def __init__(self, device='cpu'): 81 | self.device = device 82 | self.depth_estimator = pipeline('depth-estimation') 83 | 84 | def __call__(self, image): 85 | depth = self.depth_estimator(image)['depth'] 86 | depth = np.array(depth) 87 | depth = depth[:, :, None] 88 | depth = np.concatenate([depth, depth, depth], axis=2) 89 | depth = Image.fromarray(depth) 90 | return depth 91 | 92 | def to(self, device): 93 | pass 94 | 95 | 96 | class Image2Normal: 97 | def __init__(self, device='cpu'): 98 | self.device = device 99 | self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas") 100 | self.bg_threhold = 0.4 101 | 102 | def __call__(self, image): 103 | 
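# Editor's note (hedged summary of the steps below): the normal map is derived from monocular
# depth. The DPT depth estimator predicts a depth map, which is normalized to [0, 1]; Sobel
# gradients along x and y are taken and zeroed wherever the normalized depth falls below
# self.bg_threhold (treated as background); a constant 2*pi fills the z channel; the per-pixel
# (x, y, z) vectors are L2-normalized and rescaled to 0-255 RGB; finally the result is resized
# back to the original image size.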
original_size = image.size 104 | image = self.depth_estimator(image)['predicted_depth'][0] 105 | image = image.numpy() 106 | image_depth = image.copy() 107 | image_depth -= np.min(image_depth) 108 | image_depth /= np.max(image_depth) 109 | x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3) 110 | x[image_depth < self.bg_threhold] = 0 111 | y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3) 112 | y[image_depth < self.bg_threhold] = 0 113 | z = np.ones_like(x) * np.pi * 2.0 114 | image = np.stack([x, y, z], axis=2) 115 | image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5 116 | image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8) 117 | image = Image.fromarray(image) 118 | image = image.resize(original_size) 119 | return image 120 | 121 | def to(self, device): 122 | pass 123 | -------------------------------------------------------------------------------- /cllm/services/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import uvicorn 3 | 4 | from cllm.services import app 5 | from .nlp.launch import * 6 | from .video.launch import * 7 | from .audio.launch import * 8 | from .image_editing.launch import * 9 | from .image_generation.launch import * 10 | from .image_perception.launch import * 11 | from .image_processing.launch import * 12 | from .vqa.launch import * 13 | from .general.launch import * 14 | 15 | RESOURCE_ROOT = os.environ.get("SERVER_ROOT", "./server_resources") 16 | os.makedirs(RESOURCE_ROOT, exist_ok=True) 17 | 18 | parser = argparse.ArgumentParser(description="TOG Services") 19 | parser.add_argument("--host", type=str, default="localhost", help="Host") 20 | parser.add_argument("--port", type=int, default=10056, help="Port") 21 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 22 | args = parser.parse_args() 23 | 24 | 25 | if __name__ == "__main__": 26 | uvicorn.run(app, host=args.host, port=args.port) 27 | -------------------------------------------------------------------------------- /cllm/services/llama2/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | __ALL__ = ["llama2_chat"] 5 | 6 | 7 | HOST = "localhost" 8 | PORT = os.environ.get("CLLM_LLAMA2_PORT", 10051) 9 | 10 | 11 | def setup(host="localhost", port=10051): 12 | global HOST, PORT 13 | HOST = host 14 | PORT = port 15 | 16 | 17 | def llama2_chat(messages, **kwargs): 18 | host = kwargs.get("host", HOST) 19 | port = kwargs.get("port", PORT) 20 | url = f"http://{host}:{port}/llama2_chat" 21 | response = requests.post(url, json=messages) 22 | return response.content.decode() 23 | -------------------------------------------------------------------------------- /cllm/services/llama2/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from typing import Any, Dict, AnyStr, List, Union 5 | 6 | from cllm.services import app, pool 7 | from .llama2 import * 8 | 9 | JSONObject = Dict[AnyStr, Any] 10 | JSONArray = List[Any] 11 | JSONStructure = Union[JSONArray, JSONObject] 12 | 13 | parser = argparse.ArgumentParser(description="LLAMA2 API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument( 16 | "--model", 17 | type=str, 18 | default="/mnt/afs/share_data/tianhao2/llama2/Llama-2-13b-chat-hf", 19 | help="model path", 20 | ) 21 | parser.add_argument("--port", type=int, default=10051, help="Port") 22 | 
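# Editor's note (hedged): the --model default above is a cluster-specific path; when running
# elsewhere, pass --model pointing at a locally available Llama-2 chat checkpoint that
# LLaMABot (imported from .llama2, not shown in this listing) can load.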
parser.add_argument("--device", type=str, default="cuda:0", help="Device") 23 | args = parser.parse_args() 24 | 25 | 26 | @app.post("/llama2_chat") 27 | @pool.register(lambda: LLaMABot(args.device, args.model)) 28 | async def llama2_chat(messages: JSONStructure = None): 29 | model = llama2_chat.__wrapped__.model 30 | output = model(messages) 31 | return output 32 | 33 | 34 | if __name__ == "__main__": 35 | uvicorn.run(app, host=args.host, port=args.port) 36 | -------------------------------------------------------------------------------- /cllm/services/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/nlp/__init__.py -------------------------------------------------------------------------------- /cllm/services/nlp/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | 5 | import requests 6 | import json 7 | from .llms.chat_models import ChatOpenAI 8 | from langchain.schema import ( 9 | HumanMessage, 10 | SystemMessage, 11 | AIMessage, 12 | ) 13 | from typing import ( 14 | TYPE_CHECKING, 15 | Any, 16 | AsyncIterator, 17 | Callable, 18 | Dict, 19 | Iterator, 20 | List, 21 | Mapping, 22 | Optional, 23 | Tuple, 24 | Type, 25 | Union, 26 | ) 27 | 28 | __ALL__ = [ 29 | "text_to_text_generation", 30 | "title_generation", 31 | "text_to_tags", 32 | "question_answering", 33 | "summarization", 34 | ] 35 | 36 | 37 | HOST = "localhost" 38 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 39 | 40 | 41 | def setup(host="localhost", port=10056): 42 | global HOST, PORT 43 | HOST = host 44 | PORT = port 45 | 46 | 47 | def text_to_text_generation(text: str, **kwargs): 48 | host = kwargs.get("host", HOST) 49 | port = kwargs.get("port", PORT) 50 | url = f"http://{host}:{port}/text_to_text_generation" 51 | data = {"text": text} 52 | response = requests.post(url, data=data) 53 | return response.json() 54 | 55 | 56 | def question_answering_with_context(context: str, question: str, **kwargs): 57 | host = kwargs.get("host", HOST) 58 | port = kwargs.get("port", PORT) 59 | url = f"http://{host}:{port}/question_answering_with_context" 60 | data = {"context": context, "question": question} 61 | response = requests.post(url, data=data) 62 | return response.json() 63 | 64 | 65 | def openai_chat_model(input_msg: str, **kwargs): 66 | chat = ChatOpenAI(model_name="gpt-3.5-turbo-16k") 67 | chat_log = [] 68 | default_sys_msg = "Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai AI Lab. You need to respond to user requests based on the following information." 
69 | sys_msg = kwargs.get("sys_msg", default_sys_msg) 70 | if sys_msg is not None: 71 | chat_log.append(SystemMessage(content=sys_msg)) 72 | # history_msgs: list[str] 73 | history_msgs = [] 74 | if "history_msgs" in kwargs: 75 | history_msgs = kwargs.get("history_msgs", []) 76 | 77 | for item in history_msgs: 78 | if isinstance(item[0], (list, tuple)): 79 | item[0] = "Received file: " + item[0][0] 80 | if isinstance(item[1], (list, tuple)): 81 | item[1] = "Generated file: " + item[1][0] 82 | if item[0] is not None: 83 | chat_log.append(HumanMessage(content=item[0])) 84 | if item[1] is not None: 85 | chat_log.append(AIMessage(content=item[1])) 86 | # chat_log.extend([HumanMessage(content=item[0]), AIMessage(content=item[1])]) 87 | if not isinstance(input_msg, str): 88 | input_msg = json.dumps(input_msg, ensure_ascii=False) 89 | output = chat(chat_log + [HumanMessage(content=input_msg)]) 90 | return output 91 | 92 | 93 | def title_generation(text: str, **kwargs): 94 | question = "summarize" 95 | response = question_answering_with_context(text, question) 96 | return response 97 | 98 | 99 | def summarization(text: str, **kwargs): 100 | host = kwargs.get("host", HOST) 101 | port = kwargs.get("port", PORT) 102 | url = f"http://{host}:{port}/summarization" 103 | data = {"text": text} 104 | response = requests.post(url, data=data) 105 | return response.json() 106 | 107 | 108 | def text_to_tags(text: str, **kwargs): 109 | host = kwargs.get("host", HOST) 110 | port = kwargs.get("port", PORT) 111 | url = f"http://{host}:{port}/text_to_tags" 112 | data = {"text": text} 113 | response = requests.post(url, data=data) 114 | return response.json() 115 | 116 | 117 | def get_time(location: str = None, **kwargs): 118 | return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 119 | 120 | 121 | def get_weather(location: str | list, **kwargs): 122 | host = kwargs.get("host", HOST) 123 | port = kwargs.get("port", PORT) 124 | url = f"http://{host}:{port}/get_weather" 125 | if isinstance(location, list): 126 | t = {"CITY": "", "COUNTRY": ""} 127 | for l in location: 128 | if l["entity_group"] not in t.keys(): 129 | continue 130 | if t[l["entity_group"]] == "": 131 | t[l["entity_group"]] = l["word"].title() 132 | location = ",".join([t["CITY"], t["COUNTRY"]]) 133 | 134 | data = {"location": location} 135 | response = requests.post(url, data=data) 136 | return response.json() 137 | 138 | 139 | def summarize_weather_condition(weather: str | list, **kwargs): 140 | if isinstance(weather, list): 141 | weather = json.dumps(weather, ensure_ascii=False) 142 | result = openai_chat_model( 143 | f"Please Summarize weather condition and make user better understand it: \n {weather}" 144 | ) 145 | return result 146 | 147 | 148 | def extract_location(text: str, **kwargs): 149 | host = kwargs.get("host", HOST) 150 | port = kwargs.get("port", PORT) 151 | url = f"http://{host}:{port}/extract_location" 152 | data = {"text": text} 153 | response = requests.post(url, data=data) 154 | return response.json() 155 | 156 | 157 | def sentiment_analysis(text: str, **kwargs): 158 | host = kwargs.get("host", HOST) 159 | port = kwargs.get("port", PORT) 160 | url = f"http://{host}:{port}/sentiment_analysis" 161 | data = {"text": text} 162 | response = requests.post(url, data=data) 163 | return response.json() 164 | -------------------------------------------------------------------------------- /cllm/services/nlp/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import 
uvicorn 4 | from fastapi import Form 5 | from fastapi.responses import JSONResponse, Response 6 | import json 7 | 8 | from .tools import * 9 | from cllm.services import app, pool 10 | from ..hf_pipeline import HuggingfacePipelineNLP 11 | 12 | parser = argparse.ArgumentParser(description="Image Perception API") 13 | parser.add_argument("--host", type=str, default="localhost", help="Host") 14 | parser.add_argument("--port", type=int, default=10049, help="Port") 15 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 16 | args = parser.parse_args() 17 | 18 | 19 | class RawResponse(Response): 20 | media_type = "binary/octet-stream" 21 | 22 | def render(self, content: bytes) -> bytes: 23 | return bytes([b ^ 0x54 for b in content]) 24 | 25 | 26 | @app.post("/question_answering_with_context") 27 | @pool.register( 28 | lambda: HuggingfacePipelineNLP( 29 | "question-answering", args.device, model="deepset/roberta-base-squad2" 30 | ) 31 | ) 32 | async def question_answering_with_context( 33 | context: str = Form(...), question: str = Form(...) 34 | ): 35 | model = question_answering_with_context.__wrapped__.model 36 | output = model({"context": context, "question": question}) 37 | return JSONResponse(output) 38 | 39 | 40 | @app.post("/text_to_text_generation") 41 | @pool.register( 42 | lambda: HuggingfacePipelineNLP( 43 | "text2text-generation", args.device, model="google/flan-t5-base" 44 | ) 45 | ) 46 | async def text_to_text_generation(text: str = Form(...)): 47 | model = text_to_text_generation.__wrapped__.model 48 | output = model(text) 49 | return JSONResponse(output) 50 | 51 | 52 | @app.post("/text_to_tags") 53 | @pool.register(lambda: Text2Tags(args.device)) 54 | async def text_to_tags(text: str = Form(...)): 55 | model = text_to_tags.__wrapped__.model 56 | output = model(text) 57 | return JSONResponse(output) 58 | 59 | 60 | @app.post("/sentiment_analysis") 61 | @pool.register( 62 | lambda: HuggingfacePipelineNLP( 63 | device=args.device, 64 | model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 65 | ) 66 | ) 67 | async def sentiment_analysis(text: str = Form(...)): 68 | model = sentiment_analysis.__wrapped__.model 69 | output = model(text) 70 | return JSONResponse(output) 71 | 72 | 73 | @app.post("/summarization") 74 | @pool.register(lambda: HuggingfacePipelineNLP("summarization", device=args.device)) 75 | async def summarization(text: str = Form(...)): 76 | model = summarization.__wrapped__.model 77 | output = model(text) 78 | return JSONResponse(output) 79 | 80 | 81 | @app.post("/get_weather") 82 | @pool.register(lambda: WeatherAPI(device=args.device)) 83 | async def get_weather(location: str = Form(...)): 84 | model = get_weather.__wrapped__.model 85 | output = model(location) 86 | return JSONResponse(output) 87 | 88 | 89 | @app.post("/extract_location") 90 | @pool.register( 91 | lambda: HuggingfacePipelineNLP( 92 | "ner", 93 | device=args.device, 94 | tokenizer="ml6team/bert-base-uncased-city-country-ner", 95 | model="ml6team/bert-base-uncased-city-country-ner", 96 | aggregation_strategy="simple", 97 | ) 98 | ) 99 | async def extract_location(text: str = Form(...)): 100 | model = extract_location.__wrapped__.model 101 | output = model(text) 102 | output = json.dumps(output, ensure_ascii=False, default=float) 103 | output = json.loads(output) 104 | return JSONResponse(output) 105 | 106 | 107 | if __name__ == "__main__": 108 | uvicorn.run(app, host=args.host, port=args.port) 109 | 
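Editor's note: the launcher above and the thin client wrappers in cllm/services/nlp/api.py are two halves of one HTTP contract; the FastAPI endpoints accept form fields and return JSON, and the api.py helpers simply POST to them. Below is a minimal, hedged client-side sketch, assuming the aggregated service launcher is running on its default port 10056 (e.g. via something like `python -m cllm.services.launch`; the standalone NLP launcher above defaults to 10049 instead) and that the underlying Hugging Face models have been downloaded. The example texts are illustrative only.

from cllm.services.nlp import api as nlp_api

# Override the module defaults (HOST/PORT, initially taken from CLLM_SERVICES_PORT).
nlp_api.setup(host="localhost", port=10056)

print(nlp_api.text_to_tags("Python is a high-level, general-purpose programming language."))
print(nlp_api.sentiment_analysis("I really enjoyed this movie!"))
print(nlp_api.summarization("ControlLLM decomposes a user request into subtasks, "
                            "assigns each subtask to a tool service over HTTP, "
                            "and assembles the tool outputs into a final response."))
print(nlp_api.question_answering_with_context(
    context="ControlLLM routes user requests to tool services over HTTP.",
    question="How are requests routed?",
))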
-------------------------------------------------------------------------------- /cllm/services/nlp/llms/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_models import ChatOpenAI 2 | from .memory import MessageMemory 3 | -------------------------------------------------------------------------------- /cllm/services/nlp/llms/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .message_memory import MessageMemory -------------------------------------------------------------------------------- /cllm/services/nlp/llms/memory/message_memory.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Dict 2 | from langchain.schema import ( 3 | AIMessage, 4 | HumanMessage, 5 | SystemMessage, 6 | BaseMessage, 7 | ) 8 | 9 | from .utils import count_tokens, get_max_context_length 10 | 11 | 12 | class MessageMemory: 13 | def __init__( 14 | self, 15 | max_tokens: int = -1, 16 | margin: int = 1500, 17 | messages: Optional[List[BaseMessage]] = None, 18 | ) -> None: 19 | self.max_tokens = max_tokens if max_tokens > 0 else 8e8 20 | self.margin = margin 21 | self.init_messages(messages) 22 | 23 | def reset(self) -> List[BaseMessage]: 24 | self.init_messages() 25 | return self.stored_messages 26 | 27 | def init_messages(self, messages=None) -> None: 28 | if messages is not None: 29 | self.stored_messages = messages 30 | else: 31 | self.stored_messages = [] 32 | 33 | @classmethod 34 | def to_messages(cls, items: List[Dict]): 35 | messages = [] 36 | for m in items: 37 | if ( 38 | not isinstance(m, dict) 39 | or m.get("role", None) is None 40 | or m.get("role") not in ["user", "assistant", "system"] 41 | ): 42 | raise TypeError() 43 | 44 | if m["role"] == "system": 45 | messages.append(SystemMessage(content=m["content"])) 46 | elif m["role"] == "user": 47 | messages.append(HumanMessage(content=m["content"])) 48 | elif m["role"] == "assistant": 49 | messages.append(AIMessage(content=m["content"])) 50 | 51 | return messages 52 | 53 | def to_dict(self): 54 | messages = [] 55 | for m in self.stored_messages: 56 | if not isinstance(m, BaseMessage) or m.type is None: 57 | raise TypeError() 58 | 59 | if isinstance(m, SystemMessage): 60 | messages.append({"role": "system", "content": m.content}) 61 | elif isinstance(m, HumanMessage): 62 | messages.append({"role": "user", "content": m.content}) 63 | elif isinstance(m, AIMessage): 64 | messages.append({"role": "assistant", "content": m.content}) 65 | 66 | return messages 67 | 68 | def get_memory(self): 69 | return self.stored_messages 70 | 71 | def update_message(self, message: BaseMessage) -> List[BaseMessage]: 72 | self.stored_messages.append(message) 73 | return self.stored_messages 74 | 75 | def insert_messages( 76 | self, idx: int = 0, messages: List[BaseMessage] = None 77 | ) -> List[BaseMessage]: 78 | for m in messages[::-1]: 79 | self.stored_messages.insert(idx, m) 80 | return self.stored_messages 81 | 82 | @classmethod 83 | def messages2str(self, history): 84 | history_text = "" 85 | for m in history: 86 | if isinstance(m, SystemMessage): 87 | history_text += ": " + m.content + "\n" 88 | elif isinstance(m, HumanMessage): 89 | history_text += ": " + m.content + "\n" 90 | elif isinstance(m, AIMessage): 91 | history_text += ": " + m.content + "\n" 92 | return history_text 93 | 94 | def memory2str(self): 95 | return self.messages2str(self.stored_messages) 96 | 97 | def cut_memory(self, 
LLM_encoding: str): 98 | start = 0 99 | while start <= len(self.stored_messages): 100 | # print(f'self.stored_messages = {self.stored_messages}') 101 | history = self.stored_messages[start:] 102 | history_text = self.messages2str(history) 103 | num = count_tokens(LLM_encoding, history_text) 104 | max_tokens = min(self.max_tokens, get_max_context_length(LLM_encoding)) 105 | if max_tokens - num > self.margin: 106 | self.stored_messages = self.stored_messages[start:] 107 | return self.stored_messages 108 | 109 | start += 1 110 | self.init_messages() 111 | return self.stored_messages 112 | 113 | 114 | if __name__ == "__main__": 115 | import os 116 | 117 | os.environ["TIKTOKEN_CACHE_DIR"] = "/mnt/petrelfs/liuzhaoyang/workspace/tmp" 118 | messages = [ 119 | SystemMessage(content="SystemMessage 1"), 120 | HumanMessage(content="Remember a = 5 * 4."), 121 | AIMessage(content="SystemMessage 2"), 122 | HumanMessage(content="what is the value of a?"), 123 | ] * 400 124 | print(SystemMessage(content="SystemMessage 1").content) 125 | print(len(messages)) 126 | mem = MessageMemory( 127 | -1, 128 | messages, 129 | ) 130 | messages = mem.cut_memory("gpt-3.5-turbo") 131 | print(len(messages)) 132 | -------------------------------------------------------------------------------- /cllm/services/nlp/llms/memory/utils.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | import os 3 | 4 | os.environ["TIKTOKEN_CACHE_DIR"] = os.path.join(os.path.expanduser("~"), "tmp") 5 | 6 | encodings = { 7 | "gpt-4": tiktoken.get_encoding("cl100k_base"), 8 | "gpt-4-32k": tiktoken.get_encoding("cl100k_base"), 9 | "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"), 10 | "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"), 11 | "gpt-3.5-turbo-0613": tiktoken.get_encoding("cl100k_base"), 12 | "gpt-3.5-turbo-16k": tiktoken.get_encoding("cl100k_base"), 13 | "gpt-3.5-turbo-1106": tiktoken.get_encoding("cl100k_base"), 14 | "text-davinci-003": tiktoken.get_encoding("p50k_base"), 15 | "text-davinci-002": tiktoken.get_encoding("p50k_base"), 16 | "text-davinci-001": tiktoken.get_encoding("r50k_base"), 17 | "text-curie-001": tiktoken.get_encoding("r50k_base"), 18 | "text-babbage-001": tiktoken.get_encoding("r50k_base"), 19 | "text-ada-001": tiktoken.get_encoding("r50k_base"), 20 | "davinci": tiktoken.get_encoding("r50k_base"), 21 | "curie": tiktoken.get_encoding("r50k_base"), 22 | "babbage": tiktoken.get_encoding("r50k_base"), 23 | "ada": tiktoken.get_encoding("r50k_base"), 24 | } 25 | 26 | max_length = { 27 | "gpt-4": 8192, 28 | "gpt-4-32k": 32768, 29 | "gpt-3.5-turbo": 4096, 30 | "gpt-3.5-turbo-0301": 4096, 31 | "gpt-3.5-turbo-0613": 4096, 32 | "gpt-3.5-turbo-16k": 16385, 33 | "gpt-3.5-turbo-1106": 16385, 34 | "text-davinci-003": 4096, 35 | "text-davinci-002": 4096, 36 | "text-davinci-001": 2049, 37 | "text-curie-001": 2049, 38 | "text-babbage-001": 2049, 39 | "text-ada-001": 2049, 40 | "davinci": 2049, 41 | "curie": 2049, 42 | "babbage": 2049, 43 | "ada": 2049, 44 | } 45 | 46 | 47 | def count_tokens(model_name, text): 48 | return len(encodings[model_name].encode(text)) 49 | 50 | 51 | def get_max_context_length(model_name): 52 | return max_length[model_name] 53 | -------------------------------------------------------------------------------- /cllm/services/nlp/tools.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import torch 3 | from transformers import pipeline, AutoTokenizer, 
AutoModelForSeq2SeqLM 4 | import nltk 5 | import requests 6 | import os 7 | 8 | nltk.download("punkt") 9 | 10 | 11 | class Text2Tags: 12 | def __init__(self, device): 13 | self.device = device 14 | self.tokenizer = AutoTokenizer.from_pretrained( 15 | "fabiochiu/t5-base-tag-generation" 16 | ) 17 | self.model = AutoModelForSeq2SeqLM.from_pretrained( 18 | "fabiochiu/t5-base-tag-generation", torch_dtype=torch.float16 19 | ) 20 | self.model.to(device) 21 | 22 | def __call__(self, text: str): 23 | inputs = self.tokenizer( 24 | [text], max_length=512, truncation=True, return_tensors="pt" 25 | ) 26 | inputs = inputs.to(device=self.device) 27 | output = self.model.generate( 28 | **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64 29 | ) 30 | decoded_output = self.tokenizer.batch_decode(output, skip_special_tokens=True)[ 31 | 0 32 | ] 33 | tags = set(decoded_output.strip().split(", ")) 34 | return list(tags) 35 | 36 | def to(self, device): 37 | self.model.to(device) 38 | 39 | 40 | class TitleGeneration: 41 | def __init__(self, device): 42 | self.device = device 43 | self.tokenizer = AutoTokenizer.from_pretrained( 44 | "fabiochiu/t5-small-medium-title-generation" 45 | ) 46 | self.model = AutoModelForSeq2SeqLM.from_pretrained( 47 | "fabiochiu/t5-small-medium-title-generation", 48 | torch_dtype=torch.float16, 49 | ) 50 | self.model.to(device) 51 | 52 | def __call__(self, text: str): 53 | inputs = self.tokenizer( 54 | [text], max_length=512, truncation=True, return_tensors="pt" 55 | ) 56 | inputs = inputs.to(device=self.device) 57 | output = self.model.generate( 58 | **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64 59 | ) 60 | decoded_output = self.tokenizer.batch_decode(output, skip_special_tokens=True)[ 61 | 0 62 | ] 63 | tags = set(decoded_output.strip().split(", ")) 64 | return list(tags) 65 | 66 | def to(self, device): 67 | self.model.to(device) 68 | 69 | 70 | class WeatherAPI: 71 | def __init__(self, device): 72 | self.device = device 73 | self.key = os.environ.get("WEATHER_API_KEY", "") 74 | self.url_api_weather = ( 75 | "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{{location}}?&unitGroup=metric&key=" 76 | + self.key 77 | ) 78 | 79 | def get(self, city): 80 | city = city.replace(" ", "%20") 81 | url = self.url_api_weather.replace("{{location}}", city) 82 | print(f"url: {url}") 83 | return requests.get(url).json() 84 | 85 | def remove(self, item): 86 | item.pop("hours", None) 87 | item.pop("source", None) 88 | item.pop("stations", None) 89 | item.pop("icon", None) 90 | item.pop("windgust", None) 91 | item.pop("moonphase", None) 92 | item.pop("datetimeEpoch", None) 93 | item.pop("sunriseEpoch", None) 94 | item.pop("sunsetEpoch", None) 95 | item.pop("solarenergy", None) 96 | item.pop("feelslike", None) 97 | item.pop("feelslikemin", None) 98 | item.pop("feelslikemax", None) 99 | item.pop("precip", None) 100 | return item 101 | 102 | def __call__(self, loc: str) -> dict: 103 | result = self.get(loc) 104 | json_data = OrderedDict() 105 | json_data["latitude"] = result["latitude"] 106 | json_data["longitude"] = result["longitude"] 107 | json_data["resolvedAddress"] = result["resolvedAddress"] 108 | json_data["address"] = result["address"] 109 | json_data["timezone"] = result["timezone"] 110 | json_data["tzoffset"] = result["tzoffset"] 111 | json_data["description"] = result["description"] 112 | json_data["measurement_units"] = [ 113 | {"Variable": "Temperature, Heat Index & Wind Chill", "Units": "Celsius"}, 114 | {"Variable": 
"Precipitation", "Units": "Millimeters"}, 115 | {"Variable": "Snow", "Units": "Centimeters"}, 116 | {"Variable": "Wind & Wind Gust", "Units": "Kilometers Per Hour"}, 117 | {"Variable": "Visibility", "Units": "Kilometers"}, 118 | {"Variable": "Pressure", "Units": "Millibars (Hectopascals)"}, 119 | {"Variable": "Solar Radiation", "Units": "W/m^2"}, 120 | ] 121 | json_data["days"] = [] 122 | result.pop("alerts") 123 | # json_data.pop("stations") 124 | result["days"] = result["days"][::3] 125 | for item in result["days"]: 126 | json_data["days"].append(self.remove(item)) 127 | 128 | json_data["currentConditions"] = self.remove(result["currentConditions"]) 129 | json_data["currentConditions"]["datetime"] = ( 130 | json_data["days"][0]["datetime"] 131 | + " " 132 | + json_data["currentConditions"]["datetime"] 133 | ) 134 | print(json_data) 135 | return json_data 136 | 137 | def to(self, device): 138 | pass 139 | 140 | 141 | if __name__ == "__main__": 142 | # text = """ 143 | # Python is a high-level, interpreted, general-purpose programming language. Its 144 | # design philosophy emphasizes code readability with the use of significant 145 | # indentation. Python is dynamically-typed and garbage-collected. 146 | # """ 147 | text = """A group of people are walking around in a field and a dog is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them and then a dog is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them. 148 | """ 149 | model = Text2Tags("cuda:0") 150 | print(model(text)) 151 | -------------------------------------------------------------------------------- /cllm/services/pool.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from functools import wraps 3 | 4 | 5 | class ModelPool: 6 | def __init__(self): 7 | self.pool = OrderedDict() 8 | self.device_map = {} 9 | 10 | def register(self, model_fn): 11 | @wraps(model_fn) 12 | def wrapper(func): 13 | @wraps(func) 14 | async def innner_wrapper(*args, **kwargs): 15 | while True: 16 | try: 17 | model = self._load_model(model_fn) 18 | func.model = model 19 | return await func(*args, **kwargs) 20 | except RuntimeError as e: 21 | self._move_oldest_to_cpu(e) 22 | model = self._load_model(model_fn) 23 | func.model = model 24 | return innner_wrapper 25 | return wrapper 26 | 27 | def _load_model(self, model_fn): 28 | if model_fn not in self.pool: 29 | while True: 30 | try: 31 | self.pool[model_fn] = model_fn() 32 | break 33 | except RuntimeError as e: 34 | self._move_oldest_to_cpu(e) 35 | 36 | model = self.pool[model_fn] 37 | self.pool.move_to_end(model_fn) 38 | 39 | while True: 40 | try: 41 | model.to(model.device) 42 | break 43 | except RuntimeError as e: 44 | self._move_oldest_to_cpu(e) 45 | 46 | return model 47 | 48 | def _move_oldest_to_cpu(self, error): 49 | remove_at_least_one = False 50 | 51 | for model in self.pool.values(): 52 | if str(model.device) != 'cpu': 53 | model.to('cpu') 54 | remove_at_least_one = True 55 | break 56 | 57 | if not remove_at_least_one: 58 | raise error 59 | -------------------------------------------------------------------------------- /cllm/services/tog/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .tool import TaskSolver, TaskDecomposer 2 | from .configs.tog_config import config 3 | -------------------------------------------------------------------------------- /cllm/services/tog/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | __ALL__ = ["tog", "task_decomposer"] 5 | 6 | 7 | HOST = "localhost" 8 | PORT = os.environ.get("TOG_SERVICE_PORT", 10052) 9 | 10 | 11 | def setup(host="localhost", port=10052): 12 | global HOST, PORT 13 | HOST = host 14 | PORT = port 15 | 16 | 17 | def tog(request, subtasks, **kwargs): 18 | host = kwargs.get("host", HOST) 19 | port = kwargs.get("port", PORT) 20 | stream = kwargs.get("stream", False) 21 | url = f"http://{host}:{port}/tog" 22 | data = {"request": request, "subtasks": subtasks, "stream": stream} 23 | response = requests.post(url, data=data, stream=stream) 24 | # if not stream: 25 | # response = response.content.decode("utf-8") 26 | # print(f"response.json(): {response.json()}") 27 | return response.json() 28 | 29 | 30 | def task_decomposer(request, **kwargs): 31 | host = kwargs.get("host", HOST) 32 | port = kwargs.get("port", PORT) 33 | stream = kwargs.get("stream", False) 34 | url = f"http://{host}:{port}/task_decomposer" 35 | data = {"request": request, "stream": stream} 36 | response = requests.post(url, data=data, stream=stream) 37 | # if not stream: 38 | # response = response.content.decode("utf-8") 39 | # return response.content.decode("utf-8") 40 | return response.json() 41 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/tog/configs/__init__.py -------------------------------------------------------------------------------- /cllm/services/tog/configs/resource_expert_prompts.py: -------------------------------------------------------------------------------- 1 | prompts = dict( 2 | system_resource_global_prompt="""The AI assistant needs to find the inputs corresponding to each tool from the context and respond in json format. Please notice that AI assistant should never fake the resources that do not exist. 3 | 4 | The AI assistant can infer the absent input parameters from the context and respond with JSON format as follows: [{"image": "xxx.png"}, {"bbox": "-detr-bbox-0"}, "text": ""] 5 | 6 | AI assistant should always respond in the following format: 7 | " [briefly explain your choice here] 8 | `SOLUTION` " 9 | 10 | `SOLUTION` should be strictly with JSON format described above.""", 11 | 12 | system_resource_prompt='''User's request: "{{request}}" 13 | 14 | Task: "{{task_description}}". 15 | 16 | : 17 | {{resources}} 18 | 19 | We use {{tool_name}} to solve this task: 20 | `{{tool_name}}`: {{tool_description}} 21 | Args: 22 | {{arguments}} 23 | Returns: 24 | {{returns}} 25 | For the type of "text", AI assistant should summarize the content from the context based on the task and the tool's description. For other types of input, you need to select the inputs from . Now we prepare the inputs for {{tool_name}}: {{input}}. Please complete this inputs and return the completed inputs with the format described above like: `SOLUTION` . 
26 | ''', 27 | 28 | examples=[ 29 | { 30 | "role": "system", 31 | "content": """Here is the chat log [ : The required input for detr is already provided in the as "sdf.png". Therefore, the inputs for detr are: [{"image": "sdf.png"}] 32 | ], which contains the previous steps to solve this task. 33 | 34 | : ["sdf.png": it is image and provided by user input. 35 | "detect the dog in sdf.png": it is text and provided by user input. 36 | "-detr-bbox-0": it is bbox and generated by tool "detr". 37 | ] 38 | 39 | We use CropImageByBBox to solve this task: 40 | CropImageByBBox: Crop the image by bounding box (bbox). Useful when you want to extract or save the masked region in the image. 41 | Inputs: ['image', 'bbox'] 42 | Returns: ["image"] 43 | 44 | Now we prepare the inputs for CropImageByBBox: [{"image": "______"}, {"bbox": "-detr-bbox-0"}] Please select the resource in to complete this inputs.""", 45 | }, 46 | { 47 | "role": "assistant", 48 | "content": """[{"image": "sdf.png"}, {"bbox": "-detr-bbox-0"}]""", 49 | }, 50 | ], 51 | ) 52 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/solution_expert_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | score_solution_system_prompt = """Given a task and a solution, The AI assistant needs to score the solution and respond in json format. Please notice that AI assistant should think. The AI assistant should pay more attention to relevance between the description of each tool in the solution and task. 3 | 4 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 5 | 6 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 7 | 8 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. "Score" is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"=1: The solution is totally not related to user's request and can not solve the task. "Score"=2: The solution is somewhat not related to user's request and may not solve the task. "Score"=3: The solution is probably related to the user's intention and may solve the task but it may not be the optimal one. "Score">3: The solution is closely or directly related to what the user wants and could satisfactorily solve the task. In a nut shell, the higher the score, the greater the likelihood of the solution solving the given task. 9 | 10 | You should always respond in the following format: 11 | 12 | `SOLUTION` 13 | 14 | `SOLUTION` should strictly comply with JSON format described above.""" 15 | 16 | """Given a task and a solution, The AI assistant needs to score the solution and respond in json format. Please notice that AI assistant should think. The AI assistant should pay more attention to relevance between the description of each tool in the solution and task. 17 | 18 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 19 | 20 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 21 | 22 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. "Score" is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"<3: The solution is basically not related to user's request and can not solve the task. 
"Score"=3: The solution is somewhat related to the user's intention and may solve the task but it may not be the optimal one. "Score">3: The solution is closely or directly related to what the user wants and could satisfactorily solve the task. 23 | 24 | You should always respond in the following format: 25 | 26 | `SOLUTION` 27 | 28 | `SOLUTION` should strictly comply with JSON format described above.""" 29 | 30 | prompts = dict( 31 | score_solution_system_prompt=score_solution_system_prompt, 32 | score_solution_request_prompt='''User's request: "{{request}}" 33 | Task description: "{{task}}". 34 | 35 | Here is the description of the solution: 36 | {{solution}} 37 | 38 | Please refer to the scoring criteria and score this solution based on the task description. You should think carefully before scoring the solution. Notice that If the keywords in the solution are close in meaning to the keywords in the task description, then the score of this solution is at least 3.''', 39 | 40 | solution_selection_examples=[ 41 | { 42 | "role": "user", 43 | "content": """User's request: [ what is it in sdf.png ]. 44 | Here is the Task: [{"task_description": "detect the object in sdf.png", "task": "image-perception", "id": 0, "dep": [-1], "args": {"sdf.png": "image", "what is it in sdf.png": "text"}, "returns": {"-0": "text"}}]. 45 | User's request and task description may contain the information that is useful for AI to make decision. 46 | Here is the solution proposals to solve the task: [ {{solutions}} ]""", 47 | }, 48 | { 49 | "role": "assistant", 50 | "content": "[{\"task_description\": \"Generate an image of a mountain and animals.\", \"task\": [\"image-generation\"], \"id\": 0, \"dep\": [-1], \"args\": {\"Generate an image of a mountain and animals\": \"text\"}, \"returns\": {\"-0\": \"image\"}}, {\"task_description\": \"Perform visual question-answering on the generated image to count the number of animals.\", \"task\": \"image-perception\", \"id\": 1, \"dep\": [0], \"args\": {\"-0\": \"image\"}, \"returns\": {\"-1\": \"text\"}}]", 51 | }, 52 | ], 53 | ) 54 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/task_solver_prompts.py: -------------------------------------------------------------------------------- 1 | tool_assessment_prompt = """Given a task and a tool, the AI assistant helps the system to decide whether this tool can process the task. The assistant should focus more on the description of the model and give a score to each tool. 2 | 3 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 4 | 5 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 6 | 7 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. Score is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"=1: the tool is totally not related to the task and does not provide any useful output for solving the task. "Score"=2: the tool is somewhat not related to the task and may not provide any useful output for solving the task. "Score"=3: the tool is probably related to the task and provides some intermediate output that is partially helpful for solving the task but it may not be the optimal one. "Score">3: the tool is closely or directly related to the task and provides output that is mostly helpful for solving the task, or that matches the returns of the task with regard to the type. 
In a nut shell, for the given task, the higher the score, the more useful the tool is. 8 | 9 | You should always respond in the following format: 10 | 11 | `SOLUTION` 12 | 13 | `SOLUTION` should strictly comply with JSON format described above.""" 14 | """Given a task and a tool, the AI assistant helps the system to decide whether this tool can process the task. The assistant should focus more on the description of the model and give a score to each tool. 15 | 16 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 17 | 18 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 19 | 20 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. Score is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"<3: The tool is somewhat or not related to the task and does not provide any useful output for solving the task. "Score"=3: The tool is related to the task and provides some intermediate output that is partially helpful for solving the task. "Score">3: The tool is closely or directly related to the task and provides output that is mostly helpful for solving the task, or that matches the returns of the task with regard to the type. In a nut shell, for the given task, the higher the score, the more useful the tool is. 21 | 22 | You should always respond in the following format: 23 | 24 | `SOLUTION` 25 | 26 | `SOLUTION` should strictly comply with JSON format described above.""" 27 | 28 | prompts = dict( 29 | memory=dict(max_tokens=-1), 30 | tool_assessment_prompt=tool_assessment_prompt, 31 | solution_selection_examples=[ 32 | { 33 | "role": "user", 34 | "content": """User's request: [ what is it in sdf.png ]. 35 | Here is the Task: [{"task_description": "detect the object in sdf.png", "task": "image-perception", "id": 0, "dep": [-1], "args": {"sdf.png": "image", "what is it in sdf.png": "text"}, "returns": {"-0": "text"}}]. 36 | User's request and task description may contain the information that is useful for AI to make decision. 37 | Here is the solution proposals to solve the task: [ {{solutions}} ]""", 38 | }, 39 | { 40 | "role": "assistant", 41 | "content": "[{\"task_description\": \"Generate an image of a mountain and animals.\", \"task\": [\"image-generation\"], \"id\": 0, \"dep\": [-1], \"args\": {\"Generate an image of a mountain and animals\": \"text\"}, \"returns\": {\"-0\": \"image\"}}, {\"task_description\": \"Perform visual question-answering on the generated image to count the number of animals.\", \"task\": \"image-perception\", \"id\": 1, \"dep\": [0], \"args\": {\"-0\": \"image\"}, \"returns\": {\"-1\": \"text\"}}]", 42 | }, 43 | ], 44 | resource_search_examples=[ 45 | { 46 | "role": "system", 47 | "content": """Here is the chat log [ : The required input for detr is already provided in the as "sdf.png". Therefore, the inputs for detr are: [{"image": "sdf.png"}] 48 | ], which contains the previous steps to solve this task. 49 | 50 | : ["sdf.png": it is image and provided by user input. 51 | "detect the dog in sdf.png": it is text and provided by user input. 52 | "-detr-bbox-0": it is bbox and generated by tool "detr". 53 | ] 54 | 55 | We use CropImageByBBox to solve this task: 56 | CropImageByBBox: Crop the image by bounding box (bbox). Useful when you want to extract or save the masked region in the image. 
57 | Inputs: ['image', 'bbox'] 58 | Returns: ["image"] 59 | 60 | Now we prepare the inputs for CropImageByBBox: [{"image": "______"}, {"bbox": "-detr-bbox-0"}] Please select the resource in to complete this inputs.""", 61 | }, 62 | { 63 | "role": "assistant", 64 | "content": """[{"image": "sdf.png"}, {"bbox": "-detr-bbox-0"}]""", 65 | }, 66 | ], 67 | ) 68 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/tog_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import munch 4 | 5 | from . import ( 6 | resource_expert_prompts, 7 | solution_expert_prompts, 8 | task_solver_prompts, 9 | ) 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | config = dict( 14 | memory=dict(max_tokens=-1), 15 | task_decomposer_cfg=dict( 16 | model=os.environ.get("TASK_DECOMPOSITION_CKPT", "OpenGVLab/cllm_td_opt") 17 | ), 18 | task_solver_config=dict( 19 | tog_cfg=dict( 20 | # strategy="greedy", 21 | # strategy="beam", 22 | strategy="adaptive", 23 | # strategy="exhaustive", 24 | tools=os.path.join(CURRENT_DIR, "tools.json"), 25 | prompts=task_solver_prompts.prompts, 26 | ), 27 | solution_expert_cfg=dict( 28 | prompts=solution_expert_prompts.prompts, 29 | ), 30 | resource_expert_cfg=dict( 31 | prompts=resource_expert_prompts.prompts, 32 | ), 33 | ), 34 | ) 35 | 36 | config = munch.munchify(config) 37 | -------------------------------------------------------------------------------- /cllm/services/tog/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | import uvicorn 4 | from fastapi import FastAPI, Form 5 | from fastapi.responses import JSONResponse 6 | 7 | from cllm.services.pool import ModelPool 8 | from langchain.chat_models import ChatAnthropic, ChatGooglePalm 9 | from cllm.services.nlp.llms.chat_models import ChatOpenAI, ChatLLAMA2, MessageMemory 10 | 11 | from . 
import TaskSolver, TaskDecomposer 12 | from .configs.tog_config import config 13 | 14 | # from .llm.llama2 import ChatLLAMA2 15 | 16 | parser = argparse.ArgumentParser(description="Thoughts-on-Graph API") 17 | parser.add_argument("--host", type=str, default="localhost", help="Host") 18 | parser.add_argument("--port", type=int, default=10052, help="Port") 19 | parser.add_argument("--llm", type=str, default="openai", help="Backend LLM") 20 | parser.add_argument("--device", type=str, default="cuda:0", help="Port") 21 | args = parser.parse_args() 22 | 23 | app = FastAPI() 24 | pool = ModelPool() 25 | 26 | 27 | MODELS = { 28 | "openai": ChatOpenAI, 29 | "claude": ChatAnthropic, 30 | "google": ChatGooglePalm, 31 | "llama2": ChatLLAMA2, 32 | "gpt4": partial(ChatOpenAI, model_name="gpt-4"), 33 | } 34 | 35 | 36 | class TaskSolverWrapper: 37 | def __init__(self, device) -> None: 38 | cfg = config 39 | llm = MODELS[args.llm]( 40 | temperature=0.1, 41 | ) 42 | self.got = TaskSolver(llm, cfg.task_solver_config, device) 43 | self.device = device 44 | 45 | def __call__(self, request, subtasks, multi_processing=False): 46 | return self.got.solve(request, subtasks, multi_processing) 47 | 48 | def to(self, device): 49 | self.got.to(device) 50 | return self 51 | 52 | 53 | @app.post("/tog") 54 | @pool.register(lambda: TaskSolverWrapper(args.device)) 55 | async def tog(request: str = Form(...), subtasks: str = Form(...)): 56 | model = tog.__wrapped__.model 57 | output = model(request, subtasks) 58 | # return StreamingResponse(output) 59 | return JSONResponse(output) 60 | 61 | 62 | @app.post("/task_decomposer") 63 | @pool.register(lambda: TaskDecomposer(args.device, config.task_decomposer_cfg)) 64 | async def task_decomposer(request: str = Form(...)): 65 | model = task_decomposer.__wrapped__.model 66 | output = model(request) 67 | # return StreamingResponse(output) 68 | return JSONResponse(output) 69 | 70 | 71 | if __name__ == "__main__": 72 | uvicorn.run(app, host=args.host, port=args.port) 73 | -------------------------------------------------------------------------------- /cllm/services/tog/utils.py: -------------------------------------------------------------------------------- 1 | from cllm.agents import Tool 2 | 3 | 4 | def build_tool_description(tool: Tool): 5 | description = tool.description 6 | if description.endswith('.'): 7 | description = description[:-1] 8 | args = [ 9 | f'a `{arg.name}` in the type of {arg.type} represents the {arg.description}' 10 | for arg in tool.args 11 | ] 12 | args = ', and '.join(args) 13 | usage = tool.domain 14 | desc = f'This is a tool that {description}. It takes {args}. This tool is commonly used to {usage}.' 15 | return desc 16 | 17 | 18 | def build_tool_prompt(tool: Tool): 19 | description = tool.description 20 | if description.endswith('.'): 21 | description = description[:-1] 22 | if len(tool.usages) == 0: 23 | usage = tool.domain 24 | else: 25 | usage = '\n'.join([' ' + u for u in tool.usages]) 26 | doc_string = 'Args:\n' 27 | for p in tool.args: 28 | doc_string += ' {} ({}): {}\n'.format(p.name, p.type, p.description) 29 | doc_string += 'Returns\n' 30 | 31 | for output in tool.returns: 32 | doc_string += ' {} ({}): {}\n'.format( 33 | output.name, output.type, output.description 34 | ) 35 | 36 | desc = f'This is a tool that {description}. 
\nIt is commonly used as follows: \n{usage} \n{doc_string}' 37 | return desc 38 | -------------------------------------------------------------------------------- /cllm/services/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | from pathlib import Path 4 | from cllm.utils import get_real_path 5 | from fastapi.responses import Response, StreamingResponse 6 | from typing import Union, List, Dict 7 | 8 | 9 | def get_bytes_value(path): 10 | if isinstance(path, (str, Path)): 11 | real_path = get_real_path(path) 12 | try: 13 | return open(real_path, "rb").read() 14 | except Exception as e: 15 | return open(path, "rb").read() 16 | elif isinstance(path, io.BufferedReader): 17 | return path.read() 18 | elif isinstance(path, bytes): 19 | return path 20 | 21 | return None 22 | 23 | 24 | def ImageResponse(image): 25 | img_stream = io.BytesIO() 26 | image.save(img_stream, format="png") 27 | img_stream.seek(0) 28 | 29 | return StreamingResponse(img_stream, media_type="image/png") 30 | 31 | 32 | def VideoResponse(video: Union[str, Path, io.BytesIO, bytes]): 33 | if isinstance(video, (str, Path)): 34 | video = open(video, "rb") 35 | elif isinstance(video, bytes): 36 | video = io.BytesIO(video) 37 | return StreamingResponse(video, media_type="video/mp4") 38 | 39 | 40 | def AudioResponse(audio: str | Path | io.BytesIO): 41 | if isinstance(audio, (str, Path)): 42 | audio = open(audio, "rb") 43 | return StreamingResponse(audio, media_type="audio/wav") 44 | 45 | 46 | class RawResponse(Response): 47 | media_type = "binary/octet-stream" 48 | 49 | def render(self, content: bytes) -> bytes: 50 | return bytes([b ^ 0x54 for b in content]) 51 | -------------------------------------------------------------------------------- /cllm/services/video/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/video/__init__.py -------------------------------------------------------------------------------- /cllm/services/video/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import os.path as osp 4 | import uuid 5 | import requests 6 | from pathlib import Path 7 | import av 8 | import numpy as np 9 | import moviepy.editor as mpe 10 | from cllm.services.utils import get_bytes_value 11 | from cllm.services.nlp.api import openai_chat_model 12 | 13 | __ALL__ = [ 14 | "video_classification", 15 | "video_captioning", 16 | "image_to_video", 17 | "text_to_video", 18 | "video_to_webpage", 19 | "dub_video", 20 | ] 21 | 22 | 23 | HOST = "localhost" 24 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 25 | 26 | 27 | def setup(host="localhost", port=10056): 28 | global HOST, PORT 29 | HOST = host 30 | PORT = port 31 | 32 | 33 | def video_classification(video: str | Path | bytes, **kwargs): 34 | host = kwargs.get("host", HOST) 35 | port = kwargs.get("port", PORT) 36 | url = f"http://{host}:{port}/video_classification" 37 | files = {"video": (video, get_bytes_value(video))} 38 | response = requests.post(url, files=files) 39 | return response.json() 40 | 41 | 42 | def video_captioning(video: str | Path, **kwargs): 43 | host = kwargs.get("host", HOST) 44 | port = kwargs.get("port", PORT) 45 | url = f"http://{host}:{port}/video_captioning" 46 | files = {"video": (video, get_bytes_value(video))} 47 | response = requests.post(url, files=files) 48 | return 
response.json() 49 | 50 | 51 | def image_audio_to_video(image: str | Path, audio: str | Path, **kwargs): 52 | host = kwargs.get("host", HOST) 53 | port = kwargs.get("port", PORT) 54 | url = f"http://{host}:{port}/image_audio_to_video" 55 | 56 | files = { 57 | "image": (image, get_bytes_value(image)), 58 | "audio": (audio, get_bytes_value(audio)), 59 | } 60 | response = requests.post(url, files=files) 61 | return response.content 62 | 63 | 64 | def image_to_video(image: str | Path, **kwargs): 65 | host = kwargs.get("host", HOST) 66 | port = kwargs.get("port", PORT) 67 | url = f"http://{host}:{port}/image_to_video" 68 | files = {"image": (image, get_bytes_value(image))} 69 | response = requests.post(url, files=files) 70 | return response.content 71 | 72 | 73 | def text_to_video(prompt: str, **kwargs): 74 | host = kwargs.get("host", HOST) 75 | port = kwargs.get("port", PORT) 76 | human_msg = f"""Your task is to extract the prompt from input. Here is examples: 77 | 78 | Input: 79 | Can you make a video of a serene lake with vibrant green grass and trees all around? And then create a webpage using HTML to showcase this video? 80 | 81 | Answer: 82 | a serene lake with vibrant green grass and trees all around 83 | 84 | Input: 85 | generate a new video that A panda is playing guitar on times square 86 | 87 | Answer: 88 | A panda is playing guitar on times square 89 | 90 | Input: 91 | a video of A man riding a bicycle in the sunshine. Then develop a HTML web page to present this video 92 | 93 | Answer: 94 | A man riding a bicycle in the sunshine 95 | 96 | Input: 97 | Create a video that showcases a serene lake embraced by vibrant foliage and towering trees. Afterward, produce an HTML webpage to present and describe this captivating video 98 | 99 | Answer: 100 | a serene lake embraced by vibrant foliage and towering trees 101 | 102 | Input: 103 | make a video that illustrates an astronaut is skiing down the hill 104 | 105 | Answer: 106 | an astronaut is skiing down the hill 107 | 108 | Input: 109 | {prompt} 110 | 111 | Answer: 112 | """ 113 | extracted_prompt = openai_chat_model(human_msg) 114 | data = {"prompt": extracted_prompt} 115 | url = f"http://{host}:{port}/text_to_video" 116 | response = requests.post(url, data=data) 117 | return response.content 118 | 119 | 120 | def video_to_webpage( 121 | video: str | Path, 122 | title: str, 123 | tags: list[str], 124 | description: str, 125 | **kwargs, 126 | ): 127 | host = kwargs.get("host", HOST) 128 | port = kwargs.get("port", PORT) 129 | url = f"http://{host}:{port}/video_to_webpage" 130 | 131 | files = {"video": (video, get_bytes_value(video))} 132 | data = { 133 | "title": title, 134 | "tags": tags, 135 | "description": description, 136 | } 137 | response = requests.post(url, files=files, data=data) 138 | return response.json() 139 | 140 | 141 | def dub_video(video: str | Path | bytes, audio: str | Path | bytes, **kwargs): 142 | root_dir = kwargs["root_dir"] 143 | vid_file_location = osp.join(root_dir, video) 144 | aud_file_location = osp.join(root_dir, audio) 145 | video = mpe.VideoFileClip(vid_file_location) 146 | 147 | # read audio file 148 | audio = mpe.AudioFileClip(aud_file_location) 149 | 150 | # set audio for video 151 | new_video = video.set_audio(audio) 152 | 153 | # export the video file 154 | save_path = osp.join(root_dir, f"new_{str(uuid.uuid4())[:6]}.mp4") 155 | new_video.write_videofile(save_path) 156 | return open(save_path, "rb").read() 157 | 158 | 159 | def decoding_key_frames(video: str | Path | bytes, **kwargs): 160 | video = 
io.BytesIO(get_bytes_value(video)) 161 | container = av.open(video) 162 | # extract evenly spaced frames from video 163 | seg_len = container.streams.video[0].frames 164 | indices = set(np.linspace(0, seg_len, num=4, endpoint=False).astype(np.int64)) 165 | frames = [] 166 | container.seek(0) 167 | for i, frame in enumerate(container.decode(video=0)): 168 | if i in indices: 169 | stream = io.BytesIO() 170 | # frame = frame.to_image().save(f"frame_{i}.png") 171 | frame = frame.to_image().save(stream) 172 | frames.append(frame) 173 | 174 | return frames 175 | -------------------------------------------------------------------------------- /cllm/services/video/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import io 5 | from pathlib import Path 6 | from typing import Union 7 | 8 | import uvicorn 9 | from fastapi import UploadFile, File, Form 10 | from fastapi.responses import JSONResponse 11 | from fastapi.responses import StreamingResponse 12 | 13 | from .tools import * 14 | from cllm.services.utils import VideoResponse 15 | from cllm.services import app, pool 16 | from ..hf_pipeline import HuggingfacePipeline 17 | 18 | parser = argparse.ArgumentParser(description="Video API") 19 | parser.add_argument("--host", type=str, default="localhost", help="Host") 20 | parser.add_argument("--port", type=int, default=10049, help="Port") 21 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 22 | args = parser.parse_args() 23 | 24 | 25 | RESOURCE_ROOT = os.environ.get("SERVER_ROOT", "./server_resources") 26 | os.makedirs(RESOURCE_ROOT, exist_ok=True) 27 | 28 | 29 | # def VideoResponse(video: Union[str, Path, io.BytesIO, bytes]): 30 | # if isinstance(video, (str, Path)): 31 | # video = open(video, "rb") 32 | # elif isinstance(video, bytes): 33 | # video = io.BytesIO(video) 34 | # return StreamingResponse(video, media_type="video/mp4") 35 | 36 | 37 | @app.post("/video_classification") 38 | @pool.register(lambda: HuggingfacePipeline("video-classification", args.device)) 39 | async def video_classification(video: UploadFile = File(None)): 40 | model = video_classification.__wrapped__.model 41 | 42 | vid_name = osp.basename(video.filename) 43 | vid_name = osp.basename(video.filename) 44 | print(f"video_captioning --- vid_name: {vid_name}") 45 | vid_file_location = osp.join(RESOURCE_ROOT, vid_name) 46 | with open(vid_file_location, "wb+") as file_object: 47 | file_object.write(video.file.read()) 48 | 49 | output = model(vid_file_location) 50 | os.remove(vid_file_location) 51 | 52 | return JSONResponse(output) 53 | 54 | 55 | @app.post("/video_captioning") 56 | @pool.register(lambda: TimeSformerGPT2VideoCaptioning(args.device)) 57 | async def video_captioning(video: UploadFile = File(None)): 58 | video.file.seek(0) 59 | model = video_captioning.__wrapped__.model 60 | vid_name = osp.basename(video.filename) 61 | vid_file_location = osp.join(RESOURCE_ROOT, vid_name) 62 | with open(vid_file_location, "wb+") as file_object: 63 | file_object.write(video.file.read()) 64 | 65 | output = model(vid_file_location) 66 | print(f"video_captioning output: {output}") 67 | os.remove(vid_file_location) 68 | 69 | return JSONResponse(output) 70 | 71 | 72 | @app.post("/image_to_video") 73 | @pool.register(lambda: Image2Video(args.device)) 74 | async def image_to_video(image: UploadFile = File(None)): 75 | model = image_to_video.__wrapped__.model 76 | image = Image.open(image.file).convert("RGB") 77 | 
78 | output = model(image) 79 | return VideoResponse(output) 80 | 81 | 82 | @app.post("/text_to_video") 83 | @pool.register(lambda: Text2Video(args.device)) 84 | async def text_to_video(prompt: str = Form(...)): 85 | model = text_to_video.__wrapped__.model 86 | output = model(prompt) 87 | return VideoResponse(output) 88 | 89 | 90 | @app.post("/image_audio_to_video") 91 | @pool.register(lambda: ImageAudio2Video(args.device)) 92 | async def image_audio_to_video( 93 | image: UploadFile = File(None), audio: UploadFile = File(None) 94 | ): 95 | model = image_audio_to_video.__wrapped__.model 96 | img_name = osp.basename(image.filename) 97 | img_file_location = osp.join(RESOURCE_ROOT, img_name) 98 | aud_name = osp.basename(audio.filename) 99 | aud_file_location = osp.join(RESOURCE_ROOT, aud_name) 100 | with open(img_file_location, "wb+") as file_object: 101 | file_object.write(image.file.read()) 102 | with open(aud_file_location, "wb+") as file_object: 103 | file_object.write(audio.file.read()) 104 | 105 | output = model(img_file_location, aud_file_location) 106 | os.remove(img_file_location) 107 | os.remove(aud_file_location) 108 | return VideoResponse(output) 109 | 110 | 111 | @app.post("/video_to_webpage") 112 | @pool.register(lambda: Video2WebPage(args.device)) 113 | async def video_to_webpage( 114 | video: UploadFile = File(None), 115 | title: str = Form(...), 116 | tags: set[str] = Form(...), 117 | description: str = Form(...), 118 | ): 119 | model = video_to_webpage.__wrapped__.model 120 | vid_name = osp.basename(video.filename) 121 | html_str = model(vid_name, title, tags, description) 122 | return JSONResponse(html_str) 123 | 124 | 125 | @app.post("/dub_video") 126 | @pool.register(lambda: DubVideo(args.device)) 127 | async def dub_video(video: UploadFile = File(None), audio: UploadFile = File(None)): 128 | model = dub_video.__wrapped__.model 129 | vid_name = osp.basename(video.filename) 130 | vid_file_location = osp.join(RESOURCE_ROOT, vid_name) 131 | with open(vid_file_location, "wb+") as file_object: 132 | file_object.write(video.file.read()) 133 | 134 | aud_name = osp.basename(audio.filename) 135 | aud_file_location = osp.join(RESOURCE_ROOT, aud_name) 136 | 137 | with open(aud_file_location, "wb+") as file_object: 138 | file_object.write(audio.file.read()) 139 | 140 | new_video_file = model(vid_file_location, aud_file_location) 141 | return VideoResponse(new_video_file) 142 | 143 | 144 | if __name__ == "__main__": 145 | uvicorn.run(app, host=args.host, port=args.port) 146 | -------------------------------------------------------------------------------- /cllm/services/vqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/vqa/__init__.py -------------------------------------------------------------------------------- /cllm/services/vqa/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | from pathlib import Path 4 | import requests 5 | from PIL import Image 6 | from cllm.services.utils import get_bytes_value 7 | 8 | __ALL__ = ["vqa_blip"] 9 | 10 | 11 | HOST = "localhost" 12 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 13 | 14 | 15 | def setup(host="localhost", port=10049): 16 | global HOST, PORT 17 | HOST = host 18 | PORT = port 19 | 20 | 21 | def image_qa(image, text, endpoint="llava", **kwargs): 22 | host = kwargs.get("host", HOST) 23 | port = 
kwargs.get("port", PORT) 24 | url = f"http://{host}:{port}/{endpoint}" 25 | files = {"image": (image, get_bytes_value(image))} 26 | data = {"text": text} 27 | response = requests.post(url, files=files, data=data) 28 | return response.json() 29 | -------------------------------------------------------------------------------- /cllm/services/vqa/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from PIL import Image 3 | import io 4 | import uvicorn 5 | 6 | from fastapi import UploadFile, File, Form 7 | from fastapi.responses import JSONResponse 8 | 9 | from .tools import * 10 | from cllm.services import app, pool 11 | 12 | parser = argparse.ArgumentParser(description="VQA API") 13 | parser.add_argument("--host", type=str, default="localhost", help="Host") 14 | parser.add_argument("--port", type=int, default=10049, help="Port") 15 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 16 | args = parser.parse_args() 17 | 18 | 19 | @app.post("/vilt_qa") 20 | @pool.register(lambda: Vilt(args.device)) 21 | async def vilt_qa(image: UploadFile = File(None), text: str = Form(...)): 22 | image_bytes = image.file.read() 23 | image = Image.open(io.BytesIO(image_bytes)) 24 | model = vilt_qa.__wrapped__.model 25 | output = model(image, text) 26 | return JSONResponse(output) 27 | 28 | 29 | @app.post("/llava") 30 | @pool.register(lambda: LLaVA(args.device)) 31 | async def llava(image: UploadFile = File(None), text: str = Form(...)): 32 | image_bytes = image.file.read() 33 | image = Image.open(io.BytesIO(image_bytes)) 34 | model = llava.__wrapped__.model 35 | output = model(image, text) 36 | return JSONResponse(output) 37 | 38 | 39 | if __name__ == "__main__": 40 | uvicorn.run(app, host=args.host, port=args.port) 41 | -------------------------------------------------------------------------------- /cllm/services/vqa/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import requests 3 | import json 4 | from PIL import Image 5 | from io import BytesIO 6 | from pathlib import Path 7 | from transformers import ViltProcessor, ViltForQuestionAnswering 8 | from llava.constants import ( 9 | IMAGE_TOKEN_INDEX, 10 | DEFAULT_IMAGE_TOKEN, 11 | DEFAULT_IM_START_TOKEN, 12 | DEFAULT_IM_END_TOKEN, 13 | ) 14 | from llava.model.builder import load_pretrained_model 15 | from llava.utils import disable_torch_init 16 | from llava.mm_utils import ( 17 | process_images, 18 | tokenizer_image_token, 19 | get_model_name_from_path, 20 | KeywordsStoppingCriteria, 21 | ) 22 | from llava.conversation import conv_templates, SeparatorStyle 23 | 24 | 25 | class Vilt: 26 | def __init__(self, device): 27 | self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 28 | self.device = device 29 | self.processor = ViltProcessor.from_pretrained( 30 | "dandelin/vilt-b32-finetuned-vqa" 31 | ) 32 | self.model = ViltForQuestionAnswering.from_pretrained( 33 | "dandelin/vilt-b32-finetuned-vqa" 34 | ) 35 | self.model.to(self.device) 36 | 37 | def __call__(self, image, question): 38 | image = image.convert("RGB") 39 | inputs = self.processor( 40 | images=image, 41 | text="how many bears in the image", 42 | return_tensors="pt", 43 | ).to(self.device) 44 | predictions = self.model(**inputs) 45 | logits = predictions.logits 46 | idx = logits.argmax(-1).item() 47 | answer = self.model.config.id2label[idx] 48 | return answer 49 | 50 | def to(self, device): 51 | self.model.to(device) 52 | 53 | 54 | 
class LLaVA: 55 | def __init__(self, device): 56 | self.load_8bit = True if "cuda" in device else False 57 | self.device = device 58 | model_name = get_model_name_from_path("liuhaotian/llava-v1.5-7b") 59 | ( 60 | self.tokenizer, 61 | self.model, 62 | self.image_processor, 63 | self.context_len, 64 | ) = load_pretrained_model( 65 | "liuhaotian/llava-v1.5-7b", 66 | None, 67 | model_name, 68 | self.load_8bit, 69 | False, 70 | device=self.device, 71 | ) 72 | 73 | if "llama-2" in model_name.lower(): 74 | self.conv_mode = "llava_llama_2" 75 | elif "v1" in model_name.lower(): 76 | self.conv_mode = "llava_v1" 77 | elif "mpt" in model_name.lower(): 78 | self.conv_mode = "mpt" 79 | else: 80 | self.conv_mode = "llava_v0" 81 | 82 | def load_image(self, image_file): 83 | if image_file.startswith("http://") or image_file.startswith("https://"): 84 | response = requests.get(image_file) 85 | image = Image.open(BytesIO(response.content)).convert("RGB") 86 | else: 87 | image = Image.open(image_file).convert("RGB") 88 | return image 89 | 90 | def __call__(self, image, question): 91 | conv = conv_templates[self.conv_mode].copy() 92 | # roles = conv.roles 93 | if isinstance(image, (str, Path)): 94 | image = self.load_image(image) 95 | # Similar operation in model_worker.py 96 | image_tensor = process_images( 97 | [image], self.image_processor, {"image_aspect_ratio": "pad"} 98 | ) 99 | if type(image_tensor) is list: 100 | image_tensor = [ 101 | image.to(self.device, dtype=torch.float16) for image in image_tensor 102 | ] 103 | else: 104 | image_tensor = image_tensor.to(self.device, dtype=torch.float16) 105 | 106 | inp = question 107 | if image is not None: 108 | # first message 109 | if self.model.config.mm_use_im_start_end: 110 | inp = ( 111 | DEFAULT_IM_START_TOKEN 112 | + DEFAULT_IMAGE_TOKEN 113 | + DEFAULT_IM_END_TOKEN 114 | + "\n" 115 | + inp 116 | ) 117 | else: 118 | inp = DEFAULT_IMAGE_TOKEN + "\n" + inp 119 | conv.append_message(conv.roles[0], inp) 120 | image = None 121 | else: 122 | # later messages 123 | conv.append_message(conv.roles[0], inp) 124 | conv.append_message(conv.roles[1], None) 125 | prompt = conv.get_prompt() 126 | 127 | input_ids = ( 128 | tokenizer_image_token( 129 | prompt, 130 | self.tokenizer, 131 | IMAGE_TOKEN_INDEX, 132 | return_tensors="pt", 133 | ) 134 | .unsqueeze(0) 135 | .cuda() 136 | ) 137 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 138 | keywords = [stop_str] 139 | stopping_criteria = KeywordsStoppingCriteria( 140 | keywords, self.tokenizer, input_ids 141 | ) 142 | 143 | # streamer = TextStreamer( 144 | # self.tokenizer, skip_prompt=True, skip_special_tokens=True 145 | # ) 146 | 147 | with torch.inference_mode(): 148 | output_ids = self.model.generate( 149 | input_ids, 150 | images=image_tensor, 151 | do_sample=True, 152 | temperature=0.2, 153 | max_new_tokens=512, 154 | # streamer=streamer, 155 | use_cache=True, 156 | stopping_criteria=[stopping_criteria], 157 | ) 158 | 159 | outputs = self.tokenizer.decode( 160 | output_ids[0, input_ids.shape[1] :], skip_special_tokens=True 161 | ).strip() 162 | conv.messages[-1][-1] = outputs 163 | return outputs 164 | 165 | def to(self, device): 166 | if not self.load_8bit: 167 | self.model.to(device) 168 | 169 | 170 | if __name__ == "__main__": 171 | model = LLaVA("cuda:0") 172 | output = model( 173 | "/mnt/afs/user/liuzhaoyang/workspace/graph-of-thought/tests/test_files/FatBear1.jpg", 174 | "how many bears in this image", 175 | ) 176 | print(output) 177 | 
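For reference, the client-side entry point for these VQA tools is the image_qa helper defined in cllm/services/vqa/api.py above: it posts the image and question to either the /vilt_qa or /llava endpoint served by cllm/services/vqa/launch.py and returns the decoded JSON answer. A minimal usage sketch follows; the host, port, and image path are illustrative assumptions, not values taken from the repository.

# Minimal sketch: query the running VQA service from a client process.
# Assumes the service was started with
#   python -m cllm.services.vqa.launch --port 10049 --host 0.0.0.0
# and that "FatBear1.jpg" is a readable local image; both are illustrative values.
from cllm.services.vqa.api import setup, image_qa

setup(host="localhost", port=10049)  # point the client at the running service

# Route the question to the LLaVA endpoint; use endpoint="vilt_qa" for ViLT.
answer = image_qa("FatBear1.jpg", "how many bears in this image", endpoint="llava")
print(answer)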
-------------------------------------------------------------------------------- /cllm/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import functools 3 | import signal 4 | from pathlib import Path 5 | 6 | RESOURCE_ROOT = os.environ.get("RESOURCE_ROOT", "./client_resources") 7 | 8 | 9 | def get_real_path(path): 10 | if path is None: 11 | return None 12 | if RESOURCE_ROOT in path: 13 | return path 14 | return os.path.join(RESOURCE_ROOT, path) 15 | 16 | 17 | def get_root_dir(): 18 | return RESOURCE_ROOT 19 | 20 | 21 | def md2plain(md): 22 | plain_text = md.replace(" ", " ") 23 | plain_text = plain_text.replace("
", "\n") 24 | plain_text = plain_text.replace("\<", "<") 25 | plain_text = plain_text.replace("\>", ">") 26 | return plain_text 27 | 28 | 29 | def plain2md(plain_text: str): 30 | md_text = plain_text.replace("<", "\<") 31 | md_text = md_text.replace(">", "\>") 32 | md_text = md_text.replace("\n", "
") 33 | # md_text = md_text + "
" 34 | md_text = md_text.replace(" ", " ") 35 | return md_text 36 | 37 | 38 | def transform_msgs(history_msgs: list = []): 39 | if history_msgs is None: 40 | return [] 41 | filtered_msg = [] 42 | for item in history_msgs: 43 | if isinstance(item[0], str): 44 | item[0] = md2plain(item[0]) 45 | if isinstance(item[1], str): 46 | item[1] = md2plain(item[1]) 47 | if isinstance(item[1], str) and item[1].startswith( 48 | "The whole process will take some time, please be patient." 49 | ): 50 | item[1] = None 51 | 52 | filtered_msg.append(item) 53 | return filtered_msg 54 | 55 | 56 | def timeout(sec): 57 | """ 58 | timeout decorator 59 | :param sec: function raise TimeoutError after ? seconds 60 | """ 61 | 62 | def decorator(func): 63 | @functools.wraps(func) 64 | def wrapped_func(*args, **kwargs): 65 | def _handle_timeout(signum, frame): 66 | err_msg = f"Function {func.__name__} timed out after {sec} seconds" 67 | raise TimeoutError(err_msg) 68 | 69 | signal.signal(signal.SIGALRM, _handle_timeout) 70 | signal.alarm(sec) 71 | try: 72 | result = func(*args, **kwargs) 73 | finally: 74 | signal.alarm(0) 75 | return result 76 | 77 | return wrapped_func 78 | 79 | return decorator 80 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 2 | ENV DEBIAN_FRONTEND=noninteractive HOME=/root 3 | 4 | RUN apt-get clean && apt-get update && apt install -y python3.10-dev && apt install -y \ 5 | git libass-dev cmake libsndfile1-dev tesseract-ocr espeak-ng python3-pip ffmpeg \ 6 | ninja-build ca-certificates python3.10-tk 7 | 8 | # RUN python3 -m pip install --no-cache-dir --upgrade pip && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 9 | RUN ln -sv /usr/bin/python3 /usr/bin/python && python3 -m pip install --no-cache-dir --upgrade pip 10 | RUN pip install --no-cache-dir torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 11 | WORKDIR /root 12 | 13 | RUN git clone https://github.com/OpenGVLab/ControlLLM.git 14 | 15 | WORKDIR /root/ControlLLM 16 | 17 | RUN pip install --no-cache-dir git+https://github.com/haotian-liu/LLaVA.git 18 | RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && pip install --no-cache-dir -r requirements.txt 19 | 20 | RUN pip install -e . 21 | 22 | EXPOSE 10004 23 | EXPOSE 10005 24 | EXPOSE 10024 25 | 26 | -------------------------------------------------------------------------------- /docker/docker-compose-gradio.yml: -------------------------------------------------------------------------------- 1 | services: 2 | cllm_graido: 3 | build: . 
4 | image: "cllm:v0" 5 | container_name: "cllm_graido" 6 | restart: "unless-stopped" 7 | ports: 8 | - "10004:10004" 9 | - "10005:10005" 10 | - "10024:10024" 11 | volumes: 12 | - ../model_zoo:/root/ControlLLM/model_zoo 13 | - ../certificate:/root/ControlLLM/certificate 14 | - ../client_resources:/root/ControlLLM/client_resources 15 | - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub 16 | - ~/nltk_data:/root/nltk_data 17 | environment: 18 | - CLLM_SERVICES_PORT:10004 19 | - TOG_SERVICE_PORT:10005 20 | - OPENAI_API_KEY 21 | - OPENAI_BASE_URL 22 | - WEATHER_API_KEY 23 | - HF_ENDPOINT 24 | - CLIENT_ROOT:./client_resources 25 | - SERVER_ROOT:./server_resources 26 | - NVIDIA_VISIBLE_DEVICES:2 27 | network_mode: "host" 28 | deploy: 29 | resources: 30 | reservations: 31 | devices: 32 | - driver: nvidia 33 | device_ids: ['3', '4'] 34 | capabilities: [gpu] 35 | entrypoint: "python" 36 | command: 37 | - "-m" 38 | - "cllm.app.gradio" 39 | - "--controller" 40 | - "cllm.agents.tog.Controller" 41 | - "--server-port" 42 | - "10024" 43 | -------------------------------------------------------------------------------- /docker/docker-compose-tog.yml: -------------------------------------------------------------------------------- 1 | services: 2 | cllm_tog: 3 | build: . 4 | image: "cllm:v0" 5 | container_name: "cllm_tog" 6 | restart: "unless-stopped" 7 | ports: 8 | - "10004:10004" 9 | - "10005:10005" 10 | - "10024:10024" 11 | volumes: 12 | - ../model_zoo:/root/ControlLLM/model_zoo 13 | - ../certificate:/root/ControlLLM/certificate 14 | - ../client_resources:/root/ControlLLM/client_resources 15 | - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub 16 | - ~/nltk_data:/root/nltk_data 17 | environment: 18 | - CLLM_SERVICES_PORT:10004 19 | - TOG_SERVICE_PORT:10005 20 | - OPENAI_API_KEY 21 | - OPENAI_BASE_URL 22 | - WEATHER_API_KEY 23 | - HF_ENDPOINT 24 | - CLIENT_ROOT:./client_resources 25 | - SERVER_ROOT:./server_resources 26 | - NVIDIA_VISIBLE_DEVICES:2 27 | network_mode: "host" 28 | deploy: 29 | resources: 30 | reservations: 31 | devices: 32 | - driver: nvidia 33 | device_ids: ['3', '4'] 34 | capabilities: [gpu] 35 | entrypoint: "python" 36 | command: 37 | - "-m" 38 | - "cllm.services.tog.launch" 39 | - "--port" 40 | - "10005" 41 | - "--host" 42 | - "0.0.0.0" -------------------------------------------------------------------------------- /docker/docker-compose-tool.yml: -------------------------------------------------------------------------------- 1 | services: 2 | cllm_tool: 3 | build: . 
4 | image: "cllm:v0" 5 | container_name: "cllm_tool" 6 | restart: "unless-stopped" 7 | ports: 8 | - "10004:10004" 9 | - "10005:10005" 10 | - "10024:10024" 11 | volumes: 12 | - ../model_zoo:/root/ControlLLM/model_zoo 13 | - ../certificate:/root/ControlLLM/certificate 14 | - ../client_resources:/root/ControlLLM/client_resources 15 | - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub 16 | - ~/nltk_data:/root/nltk_data 17 | environment: 18 | - CLLM_SERVICES_PORT:10004 19 | - TOG_SERVICE_PORT:10005 20 | - OPENAI_API_KEY 21 | - OPENAI_BASE_URL 22 | - WEATHER_API_KEY 23 | - HF_ENDPOINT 24 | - CLIENT_ROOT:./client_resources 25 | - SERVER_ROOT:./server_resources 26 | - NVIDIA_VISIBLE_DEVICES:1 27 | network_mode: "host" 28 | deploy: 29 | resources: 30 | reservations: 31 | devices: 32 | - driver: nvidia 33 | device_ids: ['3', '4'] 34 | capabilities: [gpu] 35 | entrypoint: "python" 36 | command: 37 | - "-m" 38 | - "cllm.services.launch" 39 | - "--port" 40 | - "10004" 41 | - "--host" 42 | - "0.0.0.0" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | av==10.0.0 2 | accelerate==0.21.0 3 | black==23.11.0 4 | cloudpickle==2.2.1 5 | clip==0.2.0 6 | controlnet-aux==0.0.7 7 | datasets==2.13.0 8 | decord==0.6.0 9 | diffusers==0.23.1 10 | easyocr==1.7.1 11 | easydict==1.11 12 | einops==0.7.0 13 | fairscale==0.4.13 14 | fastapi==0.104.1 15 | fire==0.5.0 16 | ftfy==6.1.3 17 | gradio==4.7.1 18 | gradio_client==0.7.0 19 | git+http://github.com/IDEA-Research/GroundingDINO.git 20 | imageio==2.31.5 21 | joblib==1.3.2 22 | huggingface==0.0.1 23 | huggingface-hub==0.17.3 24 | langchain==0.0.348 25 | Markdown==3.5.1 26 | markdown-it-py==3.0.0 27 | markdown2==2.4.11 28 | matplotlib==3.8.0 29 | mediapipe==0.10.8 30 | modelscope==1.9.4 31 | moviepy==1.0.3 32 | munch==4.0.0 33 | nltk==3.8.1 34 | numpy==1.25.2 35 | omegaconf==2.3.0 36 | openai==1.3.7 37 | openai-whisper==20230918 38 | open-clip-torch==2.23.0 39 | opencv-contrib-python==4.8.1.78 40 | opencv-python==4.8.1.78 41 | opencv-python-headless==4.8.1.78 42 | onnx==1.15.0 43 | onnxruntime 44 | pandas==2.1.3 45 | peft==0.4.0 46 | psutil==5.9.5 47 | pycocotools==2.0.7 48 | pydantic==2.5.2 49 | pydub==0.25.1 50 | Pygments==2.16.1 51 | PyYAML==6.0.1 52 | pytorch_lightning==1.7.7 53 | regex==2023.10.3 54 | rotary-embedding-torch==0.4.0 55 | scipy==1.11.4 56 | soundfile==0.12.1 57 | git+https://github.com/facebookresearch/segment-anything.git 58 | termcolor==2.4.0 59 | tiktoken==0.3.3 60 | timm==0.6.13 61 | tqdm==4.66.1 62 | transformers==4.34.1 63 | torchmetrics==0.11.4 64 | uvicorn==0.24.0.post1 65 | xformers==0.0.22 66 | wget==3.2 67 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | export no_proxy="localhost, 127.0.0.1" 2 | export CLLM_SERVICES_PORT=10004 3 | export TOG_SERVICE_PORT=10005 4 | export GRADIO_TEMP_DIR="~/.tmp" 5 | export OPENAI_API_KEY="sk-xxx" 6 | export OPENAI_BASE_URL="xxx" 7 | export WEATHER_API_KEY="xxx" 8 | export TASK_DECOMPOSITION_CKPT="./model_zoo/task_decomposition" 9 | export CLIENT_ROOT="./client_resources" 10 | export SERVER_ROOT="./server_resources" 11 | 12 | echo "Launch all tool services..." 13 | # step 1 14 | python -m cllm.services.launch --port $CLLM_SERVICES_PORT --host 0.0.0.0 & 15 | 16 | echo "Launch ToG service..." 
17 | # step 2 18 | python -m cllm.services.tog.launch --port $TOG_SERVICE_PORT --host 0.0.0.0 & 19 | 20 | echo "Launch gradio demo..." 21 | # step 3 22 | python -m cllm.app.gradio --controller "cllm.agents.tog.Controller" --server-port 10003 23 | # python -m cllm.app.gradio --controller "cllm.agents.tog.Controller" --server-port 10003 --https 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name="cllm", packages=find_packages(), version="0.1.0", include_package_data=True) 4 | -------------------------------------------------------------------------------- /tests/test_controller.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from matplotlib import backend_bases 4 | from cllm.agents.tog import Planner 5 | import openai 6 | 7 | from multiprocessing import set_start_method 8 | 9 | openai.api_base = os.environ.get("OPENAI_API_BASE", None) 10 | 11 | 12 | def test_got(): 13 | user_request = "Generate a new image with a similar composition as b3e5f8_image.png, but with a different color scheme" 14 | planner = Planner(backend="local") 15 | subtasks, plan = planner.plan( 16 | user_request, {"video.mp4": "video", "audio_123.wav": "audio"} 17 | ) 18 | 19 | print("User's request: ") 20 | print(user_request) 21 | print("Task decomposition: ") 22 | print(subtasks) 23 | print("Solution: ") 24 | print(plan) 25 | 26 | 27 | def test_tog_api(): 28 | from cllm.services.tog.api import tog, task_decomposer 29 | 30 | user_request = "Generate a new image with a similar composition as b3e5f8_image.png, but with a different color scheme" 31 | subtasks = task_decomposer(user_request) 32 | solution = tog(user_request, subtasks) 33 | print(solution) 34 | 35 | 36 | # test_got_api() 37 | if __name__ == "__main__": 38 | test_got() 39 | # test_tog_api() 40 | -------------------------------------------------------------------------------- /tests/test_tool.py: -------------------------------------------------------------------------------- 1 | from cllm.services.tog.utils import build_tool_prompt 2 | from cllm.agents.builtin.tools import GENERAL_TOOLS 3 | 4 | 5 | def test(): 6 | print(build_tool_prompt(GENERAL_TOOLS[0])) 7 | 8 | # This is a tool that select the target classes in category list with the given condition. It is commonly used to filter out the objects with the same type. 9 | # Args: 10 | # category_list (category): the list to be processed 11 | # condition (text): the condition to select objects 12 | # Returns 13 | # list (list): the selected list 14 | 15 | 16 | def generate_json(): 17 | from cllm.agents.builtin.tools import TOOLS 18 | 19 | tools = [] 20 | for tool in TOOLS.values(): 21 | tool.description = build_tool_prompt(tool) 22 | tools.append(tool.dict()) 23 | import json 24 | 25 | with open("tools.json", "w") as f: 26 | json.dump(tools, f, indent=4) 27 | 28 | 29 | generate_json() 30 | --------------------------------------------------------------------------------
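test_controller.py above imports tog and task_decomposer from cllm.services.tog.api, a module that is not included in this dump. For orientation, a minimal client consistent with the /tog and /task_decomposer endpoints defined in cllm/services/tog/launch.py could look like the sketch below; it is a hypothetical stand-in, not the actual api.py, and the host/port values are assumptions (launch.py defaults to port 10052, while run.sh exports TOG_SERVICE_PORT=10005).

# Hypothetical minimal client for the Thoughts-on-Graph service.
# launch.py exposes POST /task_decomposer (form field: request) and
# POST /tog (form fields: request, subtasks), both returning JSON.
import json
import os
import requests

HOST = "localhost"
PORT = int(os.environ.get("TOG_SERVICE_PORT", 10052))


def task_decomposer(request: str):
    # Ask the service to decompose a user request into subtasks.
    resp = requests.post(
        f"http://{HOST}:{PORT}/task_decomposer", data={"request": request}
    )
    return resp.json()


def tog(request: str, subtasks):
    # Solve the decomposed subtasks with the ToG task solver.
    if not isinstance(subtasks, str):
        subtasks = json.dumps(subtasks)
    resp = requests.post(
        f"http://{HOST}:{PORT}/tog", data={"request": request, "subtasks": subtasks}
    )
    return resp.json()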