├── .gitignore ├── README.md ├── assets ├── arch.png ├── assistant.png ├── camparison.bmp ├── human.png ├── moti.png └── resources │ ├── image_1.png │ ├── image_2.png │ ├── image_3.png │ ├── image_4.png │ ├── image_5.png │ ├── image_6.png │ ├── image_7.png │ ├── image_8.png │ ├── image_9.png │ └── video_1.mp4 ├── builtin_plan.json ├── cllm ├── __init__.py ├── agents │ ├── __init__.py │ ├── base.py │ ├── builtin │ │ ├── __init__.py │ │ ├── plans.py │ │ ├── prompts.py │ │ └── tools.py │ ├── container.py │ └── tog │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── controller.py │ │ ├── interpretor.py │ │ ├── planner.py │ │ └── responser.py ├── app │ ├── __init__.py │ └── gradio.py ├── services │ ├── __init__.py │ ├── anything2image │ │ ├── __init__.py │ │ ├── api.py │ │ ├── imagebind │ │ │ ├── __init__.py │ │ │ ├── bpe │ │ │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── data.py │ │ │ └── models │ │ │ │ ├── __init__.py │ │ │ │ ├── helpers.py │ │ │ │ ├── imagebind_model.py │ │ │ │ ├── multimodal_preprocessors.py │ │ │ │ └── transformer.py │ │ ├── launch.py │ │ └── tools.py │ ├── audio │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── general │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── hf_pipeline.py │ ├── image_editing │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ ├── ldm_inpainting │ │ │ ├── __init__.py │ │ │ ├── config.yaml │ │ │ ├── ldm │ │ │ │ ├── __init__.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── autoencoder.py │ │ │ │ │ ├── diffusion │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── classifier.py │ │ │ │ │ │ ├── ddim.py │ │ │ │ │ │ ├── ddpm.py │ │ │ │ │ │ └── plms.py │ │ │ │ │ └── quantize.py │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── diffusionmodules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── model.py │ │ │ │ │ │ ├── openaimodel.py │ │ │ │ │ │ └── util.py │ │ │ │ │ ├── distributions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── distributions.py │ │ │ │ │ ├── ema.py │ │ │ │ │ ├── encoders │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── modules.py │ │ │ │ │ ├── image_degradation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bsrgan.py │ │ │ │ │ │ ├── bsrgan_light.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ └── test.png │ │ │ │ │ │ └── utils_image.py │ │ │ │ │ ├── losses │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── x_transformer.py │ │ │ │ └── util.py │ │ │ └── wrapper.py │ │ └── tools.py │ ├── image_generation │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── image_inpainting │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ ├── ldm_inpainting │ │ │ ├── __init__.py │ │ │ ├── config.yaml │ │ │ ├── ldm │ │ │ │ ├── __init__.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── autoencoder.py │ │ │ │ │ ├── diffusion │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── classifier.py │ │ │ │ │ │ ├── ddim.py │ │ │ │ │ │ ├── ddpm.py │ │ │ │ │ │ └── plms.py │ │ │ │ │ └── quantize.py │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── diffusionmodules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── model.py │ │ │ │ │ │ ├── openaimodel.py │ │ │ │ │ │ └── util.py │ │ │ │ │ ├── distributions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── distributions.py │ │ │ │ │ ├── ema.py │ │ │ │ │ ├── encoders │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── modules.py │ │ │ │ │ ├── image_degradation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bsrgan.py │ │ │ │ │ │ ├── bsrgan_light.py │ │ │ │ │ │ ├── 
utils │ │ │ │ │ │ │ └── test.png │ │ │ │ │ │ └── utils_image.py │ │ │ │ │ ├── losses │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── x_transformer.py │ │ │ │ └── util.py │ │ │ └── wrapper.py │ │ └── tools.py │ ├── image_perception │ │ ├── __init__.py │ │ ├── api.py │ │ ├── configs │ │ │ └── GroundingDINO_SwinT_OGC.py │ │ ├── launch.py │ │ ├── sam_preditor.py │ │ └── tools.py │ ├── image_processing │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ ├── launch.py │ ├── llama2 │ │ ├── api.py │ │ ├── launch.py │ │ └── llama2.py │ ├── nlp │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ ├── llms │ │ │ ├── __init__.py │ │ │ ├── chat_models.py │ │ │ └── memory │ │ │ │ ├── __init__.py │ │ │ │ ├── message_memory.py │ │ │ │ └── utils.py │ │ └── tools.py │ ├── pool.py │ ├── tog │ │ ├── __init__.py │ │ ├── api.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── resource_expert_prompts.py │ │ │ ├── solution_expert_prompts.py │ │ │ ├── task_decomposition_prompts.py │ │ │ ├── task_solver_prompts.py │ │ │ ├── tog_config.py │ │ │ └── tools.json │ │ ├── launch.py │ │ ├── tool.py │ │ └── utils.py │ ├── utils.py │ ├── video │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py │ └── vqa │ │ ├── __init__.py │ │ ├── api.py │ │ ├── launch.py │ │ └── tools.py └── utils.py ├── docker ├── Dockerfile ├── docker-compose-gradio.yml ├── docker-compose-tog.yml └── docker-compose-tool.yml ├── eval_data ├── tool100.json └── tool2k.json ├── requirements.txt ├── run.sh ├── setup.py └── tests ├── test_controller.py └── test_tool.py /.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | !tests/services/test.png 3 | model_zoo 4 | .vscode 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | .DS_Store 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | #.idea/ 166 | client_resources/ 167 | server_resources/ 168 | certificate/ 169 | logs/ 170 | bash.sh 171 | -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/arch.png -------------------------------------------------------------------------------- /assets/assistant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/assistant.png -------------------------------------------------------------------------------- /assets/camparison.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/camparison.bmp -------------------------------------------------------------------------------- /assets/human.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/human.png -------------------------------------------------------------------------------- /assets/moti.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/moti.png -------------------------------------------------------------------------------- /assets/resources/image_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_1.png -------------------------------------------------------------------------------- /assets/resources/image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_2.png -------------------------------------------------------------------------------- /assets/resources/image_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_3.png -------------------------------------------------------------------------------- /assets/resources/image_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_4.png -------------------------------------------------------------------------------- /assets/resources/image_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_5.png -------------------------------------------------------------------------------- /assets/resources/image_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_6.png 
-------------------------------------------------------------------------------- /assets/resources/image_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_7.png -------------------------------------------------------------------------------- /assets/resources/image_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_8.png -------------------------------------------------------------------------------- /assets/resources/image_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/image_9.png -------------------------------------------------------------------------------- /assets/resources/video_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/assets/resources/video_1.mp4 -------------------------------------------------------------------------------- /builtin_plan.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /cllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/__init__.py -------------------------------------------------------------------------------- /cllm/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Tool, Action 2 | from .container import * 3 | -------------------------------------------------------------------------------- /cllm/agents/base.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from enum import Enum 3 | from typing import Callable, List 4 | import json 5 | from pathlib import Path 6 | from collections import OrderedDict 7 | 8 | 9 | @dataclass 10 | class Action: 11 | """The action represent an assignment. 
12 | `output = tool_name(**inputs)` 13 | 14 | Examples: 15 | >>> mask = segmentation_by_mask(image=image, prompt_mask=prompt_mask) 16 | >>> image = image_inpainting(image=image, mask=mask) 17 | """ 18 | 19 | tool_name: str = (None,) 20 | inputs: dict = (None,) 21 | outputs: List[str] = (None,) 22 | 23 | def __str__(self) -> str: 24 | args = ", ".join([f"{k}={v}" for k, v in self.inputs.items()]) 25 | return "{} = {}(".format(", ".join(self.outputs), self.tool_name) + args + ")" 26 | 27 | def dict(self): 28 | args = {str(k): str(v) for k, v in self.inputs.items()} 29 | # args = {str(item["name"]): str(item["value"]) for item in self.inputs} 30 | rets = [o if isinstance(o, str) else str(o) for o in self.outputs] 31 | return { 32 | "tool": self.tool_name, 33 | "inputs": args, 34 | "outputs": rets, 35 | } 36 | 37 | 38 | class DataType(Enum): 39 | TEXT = "text" 40 | TAGS = "tags" 41 | TITLE = "title" 42 | # HTML = "text.html" 43 | HTML = "html" 44 | LOCATION = "location" 45 | WEATHER = "weather" 46 | TIME = "time" 47 | 48 | IMAGE = "image" 49 | VIDEO = "video" 50 | AUDIO = "audio" 51 | ANY = "any" 52 | NONE = "none" 53 | 54 | SEGMENTATION = "image.segmentation" 55 | EDGE = "image.edge" 56 | LINE = "image.line" 57 | HED = "image.hed" 58 | CANNY = "image.canny" 59 | SCRIBBLE = "image.scribble" 60 | POSE = "image.pose" 61 | DEPTH = "image.depth" 62 | NORMAL = "image.normal" 63 | 64 | MASK = "image.mask" # SAM mask 65 | POINT = "point" 66 | BBOX = "bbox" # {'label': 'dog', 'box': [1,2,3,4], 'score': 0.9} 67 | CATEGORY = "category" 68 | 69 | LIST = "list" 70 | 71 | def __str__(self): 72 | return self.value 73 | 74 | def __eq__(self, other): 75 | if isinstance(other, str): 76 | return self.value == other 77 | elif isinstance(other, self.__class__): 78 | return self.value == other.value 79 | else: 80 | return False 81 | 82 | 83 | @dataclass 84 | class Resource: 85 | name: str 86 | type: DataType 87 | value: None 88 | # description: str = None 89 | 90 | def dict(self): 91 | return { 92 | "name": self.name, 93 | "type": str(self.type), 94 | "value": str(self.value), 95 | # "description": self.description, 96 | } 97 | 98 | 99 | @dataclass 100 | class Tool: 101 | class Domain(Enum): 102 | IMAGE_PERCEPTION = "image-perception" 103 | IMAGE_GENERATION = "image-generation" 104 | IMAGE_EDITING = "image-editing" 105 | IMAGE_PROCESSING = "image-processing" 106 | AUDIO_PERCEPTION = "audio-perception" 107 | AUDIO_GENERATION = "audio-generation" 108 | VIDEO_PERCEPTION = "video-perception" 109 | VIDEO_GENERATION = "video-generation" 110 | VIDEO_PROCESSING = "video-processing" 111 | VIDEO_EDITING = "video-editing" 112 | VIDEO_CUTTING = "video-cutting" 113 | NATURAL_LANGUAGE_PROCESSING = "natural-language-processing" 114 | CODE_GENERATION = "code-generation" 115 | VISUAL_QUESTION_ANSWERING = "visual-question-answering" 116 | QUESTION_ANSWERING = "question-answering" 117 | GENERAL = "general" 118 | 119 | def __str__(self): 120 | return self.value 121 | 122 | @dataclass 123 | class Argument: 124 | name: str 125 | type: DataType 126 | description: str 127 | 128 | def dict(self): 129 | return { 130 | "name": self.name, 131 | "type": str(self.type), 132 | "description": self.description, 133 | } 134 | 135 | name: str 136 | description: str 137 | domain: Domain 138 | model: Callable 139 | 140 | usages: List[str] = field(default_factory=lambda: []) 141 | args: List[Argument] = field(default_factory=lambda: []) 142 | returns: List[Argument] = field(default_factory=lambda: []) 143 | 144 | def dict(self): 145 | return { 146 | 
"name": self.name, 147 | "description": self.description, 148 | "domain": str(self.domain), 149 | "args": [a.dict() for a in self.args], 150 | "returns": [r.dict() for r in self.returns], 151 | } 152 | 153 | 154 | NON_FILE_TYPES = [ 155 | DataType.TAGS, 156 | DataType.TEXT, 157 | DataType.TITLE, 158 | DataType.BBOX, 159 | DataType.CATEGORY, 160 | DataType.LIST, 161 | DataType.LOCATION, 162 | DataType.POINT, 163 | DataType.WEATHER, 164 | DataType.TIME, 165 | ] 166 | 167 | 168 | if __name__ == "__main__": 169 | s = [ 170 | [Action("a", {"aa": [Path("/a/d/e/t.txt")]}, [Path("/a/aa.txt")])], 171 | Action("b", {"bb": "bbb"}, ["bbb"]), 172 | ] 173 | print(json.dumps(s, indent=4, default=lambda o: o.dict())) 174 | -------------------------------------------------------------------------------- /cllm/agents/builtin/__init__.py: -------------------------------------------------------------------------------- 1 | from . import plans 2 | from . import prompts 3 | from . import tools 4 | from .plans import BUILTIN_PLANS, load_builtin_plans 5 | from .prompts import RUN_PROMPT_TEMPLATE, CODE_PROMPT_TEMPLATE 6 | from .tools import TOOLS 7 | -------------------------------------------------------------------------------- /cllm/agents/builtin/prompts.py: -------------------------------------------------------------------------------- 1 | RUN_PROMPT_TEMPLATE = """ 2 | I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. 3 | To help you, I will give you access to a set of tools that you can use. Each tool is a Python function and has a docstring explaining the task it performs, the inputs it expects and the outputs it returns. 4 | You should first explain which tool you will use to perform the task and for what reason, then write the code in Python. 5 | Each instruction in Python should be a simple assignment. 6 | 7 | Tools: 8 | <> 9 | 10 | Task: "How are you." 11 | 12 | I will use `question_answering` to answer the given question. 13 | 14 | Answer: 15 | ``` 16 | output = question_answering(text=_task_) 17 | ``` 18 | 19 | Task: "describe `image_1`." 20 | 21 | I will use `image_captioning` to answer the question on `image_1`. 22 | 23 | Answer: 24 | ``` 25 | output = image_captioning(image=image_1) 26 | ``` 27 | 28 | Task: "generate an image with a dog." 29 | 30 | I will use `text_to_image` to generate an image. 31 | 32 | Answer: 33 | ``` 34 | output = text_to_image(text="an image with a dog") 35 | ``` 36 | 37 | Task: "<>" 38 | 39 | """ 40 | 41 | CODE_PROMPT_TEMPLATE = ''' 42 | I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. 43 | - To help you, I will give you access to a set of tools that you can use. 44 | - Each tool is a Python function and has a docstring explaining the task it performs, the inputs it expects and the outputs it returns. 45 | - You should first explain which tool you will use to perform the task and for what reason, then write the code in Python. 46 | - Each instruction in Python should be a simple assignment. 47 | 48 | Tools: 49 | <> 50 | 51 | ======= 52 | 53 | History: 54 | ``` 55 | # Task: "How are you." 56 | ``` 57 | 58 | I will use `question_answering` to answer the given question. 59 | 60 | Answer: 61 | ``` 62 | # Task: "How are you." 63 | output = question_answering(text=_task_) 64 | ``` 65 | 66 | ======= 67 | 68 | History: 69 | ``` 70 | # Task: "describe the given image." 
71 | ``` 72 | 73 | I will use `image_question_answering` to answer the question on the input image. 74 | 75 | Answer: 76 | ``` 77 | # Task: "describe the given image." 78 | output = image_question_answering(text=_task_, image=image) 79 | ``` 80 | 81 | ======= 82 | 83 | History: 84 | ``` 85 | output = text_to_image(text="an image with a dog") 86 | # Task: "describe the given image." 87 | ``` 88 | 89 | I will use `image_question_answering` to answer the question on the input image. 90 | 91 | Answer: 92 | ``` 93 | # Task: "describe the given image." 94 | output = image_question_answering(text=_task_, image=output) 95 | ``` 96 | 97 | ======= 98 | 99 | History: 100 | ``` 101 | # Task: "generate an image with a dog." 102 | ``` 103 | 104 | I will use `text_to_image` to generate an image. 105 | 106 | Answer: 107 | ``` 108 | # Task: "generate an image with a dog." 109 | output = text_to_image(text="an image with a dog") 110 | ``` 111 | 112 | ======= 113 | 114 | History: 115 | ``` 116 | <> 117 | # Task: "<>" 118 | ``` 119 | 120 | 121 | ''' 122 | -------------------------------------------------------------------------------- /cllm/agents/container.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | from pathlib import Path 4 | import json 5 | from .base import DataType 6 | from cllm.utils import get_real_path 7 | 8 | FILE_EXT = { 9 | "image": ["png", "jpeg", "jpg", "gif", "bmp", "tiff", "webp"], 10 | "video": ["mp4", "mov", "avi", "mkv"], 11 | "audio": ["wav", "mp3"], 12 | } 13 | 14 | 15 | class Container: 16 | def __init__(self, name, rtype, value) -> None: 17 | self.name = name 18 | self.rtype = rtype 19 | self.value = value 20 | 21 | def to_chatbot(self): 22 | pass 23 | 24 | def __str__(self): 25 | pass 26 | 27 | def __repr__(self) -> str: 28 | return str(self) 29 | 30 | 31 | class File(Container): 32 | def to_chatbot(self): 33 | return str(self.value) 34 | 35 | @property 36 | def filename(self): 37 | return os.path.basename(self.value) 38 | 39 | def __str__(self): 40 | return f"`{self.filename}`" 41 | 42 | 43 | class HTML(File): 44 | def to_chatbot(self): 45 | return str(self.value) 46 | 47 | def __str__(self): 48 | return f"`{self.filename}`" 49 | 50 | 51 | class Image(File): 52 | def __str__(self): 53 | return f"`{self.filename}`" 54 | 55 | 56 | class Video(File): 57 | def __str__(self): 58 | return f"`{self.filename}`" 59 | 60 | 61 | class Audio(File): 62 | def __str__(self): 63 | return f"`{self.filename}`" 64 | 65 | 66 | class Text(Container): 67 | def to_chatbot(self): 68 | if isinstance(self.value, str): 69 | return self.value 70 | elif isinstance(self.value, (list, tuple, dict)): 71 | return json.dumps(self.value, indent=2) 72 | return self.value 73 | 74 | def __str__(self): 75 | if isinstance(self.value, (list, dict)): 76 | return json.dumps(self.value) 77 | elif isinstance(self.value, str): 78 | return self.value 79 | return str(self.value) 80 | 81 | 82 | def auto_type(name, rtype, value): 83 | if value is None: 84 | return None 85 | if "image" in str(rtype): 86 | return Image(name, rtype, get_real_path(value)) 87 | if DataType.VIDEO == rtype: 88 | return Video(name, rtype, get_real_path(value)) 89 | if DataType.AUDIO == rtype: 90 | return Audio(name, rtype, get_real_path(value)) 91 | if DataType.HTML == rtype: 92 | return HTML(name, rtype, get_real_path(value)) 93 | return Text(name, rtype, value) 94 | -------------------------------------------------------------------------------- 
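A minimal usage sketch for the `auto_type` helper defined in cllm/agents/container.py above. This sketch is illustrative only and is not a file in the repository; the resource names and printed values are assumptions, and it presumes the `cllm` package is importable:
# Hypothetical sketch (not part of the repo): wrap raw tool outputs into chat-friendly containers.
from cllm.agents.base import DataType
from cllm.agents.container import auto_type

# File-typed values become Image/Video/Audio/HTML containers; everything else becomes Text.
image_out = auto_type("output_0", DataType.IMAGE, "image_1.png")
bbox_out = auto_type("output_1", DataType.BBOX, [{"label": "dog", "box": [1, 2, 3, 4], "score": 0.9}])

print(str(image_out))         # file containers render as their basename, e.g. `image_1.png`
print(bbox_out.to_chatbot())  # non-file values are wrapped in Text and serialized as JSON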
/cllm/agents/tog/__init__.py: -------------------------------------------------------------------------------- 1 | from .planner import Planner 2 | from .controller import Controller -------------------------------------------------------------------------------- /cllm/agents/tog/compiler.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | import ast 3 | 4 | from cllm.agents.base import Action 5 | 6 | 7 | class Parser: 8 | def parse(self, plan) -> List[Action]: 9 | # ignore indent 10 | input = '\n'.join([line.strip() for line in plan.split('\n')]) 11 | actions = [] 12 | for stmt in ast.parse(input).body: 13 | if isinstance(stmt, ast.Assign): 14 | assign: ast.Assign = stmt 15 | output: ast.Name = assign.targets[0] 16 | func_call: ast.Call = assign.value 17 | func_name: ast.Name = func_call.func 18 | kwargs: List[ast.keyword] = func_call.keywords 19 | args = {} 20 | for kwarg in kwargs: 21 | k = kwarg.arg 22 | if isinstance(kwarg.value, ast.Name): 23 | v = kwarg.value.id 24 | else: 25 | v = ast.literal_eval(kwarg.value) 26 | args[k] = v 27 | action = Action(tool_name=func_name.id, outputs=[output.id], inputs=args) 28 | actions.append(action) 29 | return actions 30 | 31 | 32 | class Compiler: 33 | def __init__(self): 34 | self.parser = Parser() 35 | 36 | def compile(self, plan: Union[str, List[Union[Action, str]]]) -> List[Action]: 37 | """ The input can be a plain string, a list of structured `Action` objects, 38 | or a mix of structured `Action` objects and unstructured action strings. 39 | """ 40 | actions = self.parse(plan) 41 | actions = self.correct(actions) 42 | return actions 43 | 44 | def parse(self, plan) -> List[Action]: 45 | if isinstance(plan, str): 46 | return self.parser.parse(plan) 47 | 48 | actions = [] 49 | for action in plan: 50 | if isinstance(action, str): 51 | action = self.parser.parse(action)[0] 52 | actions.append(action) 53 | 54 | return actions 55 | 56 | def correct(self, actions): 57 | return actions 58 | -------------------------------------------------------------------------------- /cllm/agents/tog/controller.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import logging 3 | from typing import Tuple, List 4 | import copy 5 | from pathlib import Path 6 | from cllm.agents import container 7 | import json 8 | from collections import OrderedDict 9 | 10 | from cllm.agents.builtin import BUILTIN_PLANS, load_builtin_plans 11 | from cllm.agents.container import auto_type 12 | from cllm.agents.base import DataType, NON_FILE_TYPES 13 | 14 | from cllm.agents.tog.interpretor import Interpretor 15 | from cllm.agents.tog.planner import Planner 16 | from cllm.agents.tog.responser import generate_response 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class Controller: 22 | def __init__(self, stream=True, interpretor_kwargs={}): 23 | self.stream = stream 24 | self.planner = Planner(self.stream) 25 | self.interpretor = Interpretor(**interpretor_kwargs) 26 | self.SHORTCUT = "**Using builtin shortcut solution.**" 27 | BUILTIN_PLANS.update(load_builtin_plans("builtin_plan.json")) 28 | logger.info(BUILTIN_PLANS) 29 | 30 | def plan(self, request: str, state: dict): 31 | logger.info(request) 32 | 33 | resource_memory = state.get("resources", {}) 34 | raw_solution = None 35 | # shortcut for builtin plan 36 | for trigger_prompt, _ in BUILTIN_PLANS.items(): 37 | if request == trigger_prompt: 38 | return self.SHORTCUT 39 | 40 | # dynamic execution 41 |
if raw_solution is None: 42 | raw_solution = self.planner.plan(request, resource_memory) 43 | return raw_solution 44 | 45 | def parse_solution_from_stream(self, raw_solution): 46 | return self.planner.parse(raw_solution) 47 | 48 | def execute(self, raw_solution: str, state: dict): 49 | resource_memory = state.get("resources") 50 | request = state["request"] 51 | solution = None 52 | if raw_solution == self.SHORTCUT: 53 | for trigger_prompt, builtin_plan in BUILTIN_PLANS.items(): 54 | if request == trigger_prompt: 55 | solution = builtin_plan 56 | solution = self._fill_args(solution, resource_memory) 57 | else: 58 | solution = self.planner.parse(raw_solution) 59 | 60 | if not solution: 61 | return None 62 | try: 63 | history_msgs = state.get("history_msgs") 64 | return self.interpretor.interpret(solution, history_msgs) 65 | except Exception as e: 66 | traceback.print_exc() 67 | return None 68 | 69 | def reply(self, executed_plan: dict, outputs: list, state: dict): 70 | error_response = [ 71 | auto_type( 72 | "response", 73 | DataType.TEXT, 74 | "Sorry, I cannot understand your request due to an internal error.", 75 | ) 76 | ] 77 | state = copy.deepcopy(state) 78 | if ( 79 | executed_plan is None 80 | or len(executed_plan) == 0 81 | or outputs is None 82 | or len(outputs) == 0 83 | ): 84 | return error_response, state 85 | resources = state.get("resources", OrderedDict()) 86 | for o in outputs: 87 | if isinstance(o, container.File): 88 | resources[str(o.filename)] = str(o.rtype) 89 | state["resources"] = resources 90 | response = generate_response(state["request"], executed_plan, outputs) 91 | if len(response) == 0: 92 | return error_response, state 93 | logger.info(response) 94 | return response, state 95 | 96 | def run(self, task: str, state: dict) -> Tuple[List, str]: 97 | try: 98 | return self._run(task, state) 99 | except: 100 | traceback.print_exc() 101 | logger.info(traceback.format_exc()) 102 | return [ 103 | auto_type( 104 | "response", 105 | DataType.TEXT, 106 | "Sorry, I cannot understand your request due to an internal error.", 107 | ) 108 | ], "Internal Error" 109 | 110 | def _run(self, task: str, state: dict) -> Tuple[List, str]: 111 | state["request"] = task 112 | _, plan = self.plan(task, state) 113 | logger.info(plan) 114 | executed_plan = self.execute(plan, state) 115 | 116 | state["outputs"] = [] 117 | executed_plan = list(executed_plan) 118 | for result_per_step, executed_solutions, wrapped_outputs in executed_plan: 119 | state["executed_solutions"] = executed_solutions 120 | for _, output in enumerate(wrapped_outputs): 121 | if output is None or output.value is None: 122 | continue 123 | state["outputs"].extend(wrapped_outputs) 124 | 125 | outputs = self.reply(state["executed_solutions"], state["outputs"], state) 126 | 127 | logger.info(outputs) 128 | return outputs, executed_plan 129 | 130 | def _fill_args(self, plan, memory): 131 | plan = copy.deepcopy(plan) 132 | latest_resource = OrderedDict() 133 | for key, val in memory.items(): 134 | latest_resource[val] = key 135 | 136 | for actions in plan: 137 | for action in actions: 138 | for key, val in action.inputs.items(): 139 | if "" not in val: 140 | action.inputs[key] = latest_resource.get(val, val) 141 | return plan 142 | 143 | 144 | if __name__ == "__main__": 145 | controller = Controller(False) 146 | task = "describe the image in details." 
147 | state = { 148 | "resources": { 149 | "image_3.png": "image", 150 | }, 151 | "history_msgs": [], 152 | } 153 | outputs, executed_plan = controller.run(task, state) 154 | print(outputs[0]) 155 | print("*" * 40) 156 | print(executed_plan) 157 | print("Done!") 158 | -------------------------------------------------------------------------------- /cllm/agents/tog/planner.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | import logging 4 | 5 | from ..base import Action, NON_FILE_TYPES 6 | from cllm.services.tog import TaskSolver, TaskDecomposer, config 7 | from cllm.services.nlp.llms import ChatOpenAI, MessageMemory 8 | from cllm.services.tog.api import tog, task_decomposer 9 | from collections import OrderedDict 10 | import copy 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class Planner: 17 | def __init__( 18 | self, streaming=False, backend="remote", device="cuda:0", **llm_kwargs 19 | ): 20 | self.streaming = streaming 21 | if backend == "local": 22 | self.cfg = config 23 | self.device = device 24 | self.mem = MessageMemory(**self.cfg.memory) 25 | self.llm = ChatOpenAI(temperature=0.2, **llm_kwargs) 26 | self.tog = TaskSolver(self.llm, self.cfg.task_solver_config, device).solve 27 | self.decomposer = TaskDecomposer(device, self.cfg.task_decomposer_cfg).solve 28 | elif backend == "remote": 29 | self.decomposer = task_decomposer 30 | self.tog = tog 31 | else: 32 | raise ValueError("Backend should be chosen from [remote, local]") 33 | 34 | def _find_latest_resource(self, resources, type): 35 | for key, val in list(resources.items())[::-1]: 36 | if val == type: 37 | return key 38 | return None 39 | 40 | def _check_task_decomposition( 41 | self, task_decomposition: str | list, available_resources: dict 42 | ): 43 | copy_task_decomposition = copy.deepcopy(task_decomposition) 44 | available_resources = copy.deepcopy(available_resources) 45 | if isinstance(copy_task_decomposition, str): 46 | copy_task_decomposition = json.loads(copy_task_decomposition) 47 | 48 | for subtask in copy_task_decomposition: 49 | for arg in subtask["args"]: 50 | if arg["type"] in NON_FILE_TYPES: 51 | continue 52 | 53 | r_type = available_resources.get(arg["value"], "None").split(".")[-1] 54 | if arg["value"] not in available_resources or arg["type"] != r_type: 55 | new_value = self._find_latest_resource( 56 | available_resources, arg["type"] 57 | ) 58 | if new_value is None: 59 | logger.error( 60 | f"No available resource for {arg['value']} with type {arg['type']}" 61 | ) 62 | return None 63 | 64 | arg["value"] = new_value 65 | 66 | available_resources[subtask["returns"][0]["value"]] = subtask["returns"][0][ 67 | "type" 68 | ] 69 | return json.dumps(copy_task_decomposition, indent=2, ensure_ascii=False) 70 | 71 | def wrap_request(self, request, memory): 72 | logger.info(memory) 73 | resource_list = {k: v.split(".")[-1] for k, v in memory.items()} 74 | request = f"Resource list: {resource_list}\n{request}" 75 | logger.info(f"Input: {request}") 76 | # print(f"Input: {request}") 77 | return request 78 | 79 | def solve_streaming(self, request: str, memory: dict = OrderedDict()): 80 | request = self.wrap_request(request, memory) 81 | sub_tasks = self.decomposer(request, streaming=self.streaming) 82 | logger.info(f"Task decomposition: \n{sub_tasks}") 83 | sub_tasks = self._check_task_decomposition(sub_tasks, memory) 84 | yield sub_tasks 85 | if sub_tasks in [None, "", []]: 86 | yield None 87 | else: 88 | solutions = 
self.tog(request, sub_tasks, streaming=self.streaming) 89 | yield solutions 90 | 91 | def solve(self, request: str, memory: dict = OrderedDict()) -> List: 92 | request = self.wrap_request(request, memory) 93 | sub_tasks = self.decomposer(request) 94 | solutions = self.tog(request, sub_tasks) 95 | return sub_tasks, solutions 96 | 97 | def plan(self, task, memory: dict = OrderedDict()) -> List: 98 | if self.streaming: 99 | return self.solve_streaming(task, memory) 100 | else: 101 | return self.solve(task, memory) 102 | 103 | def _check_solutions(self, solution: List | str) -> bool: 104 | if isinstance(solution, str): 105 | solution = json.loads(solution) 106 | if len(solution) == 0: 107 | return False 108 | 109 | valid = True 110 | for i, stage_candiate in enumerate(solution): 111 | if len(stage_candiate) == 0: 112 | logger.error(f"No solution is found in {i}-th subtask.") 113 | valid = False 114 | elif ( 115 | "solution" in stage_candiate[0] 116 | and len(stage_candiate[0]["solution"]) == 0 117 | ): 118 | logger.error(f"No solution is found in {i+1}-th subtask.") 119 | valid = False 120 | else: 121 | logger.info(f"Solutions for {i+1}-th subtask:\n{stage_candiate}") 122 | return valid 123 | 124 | def parse(self, solution: List | str) -> List[List[Action]]: 125 | if isinstance(solution, str): 126 | solution = json.loads(solution) 127 | 128 | if not self._check_solutions(solution): 129 | return None 130 | 131 | if isinstance(solution[0], Action): 132 | return solution 133 | 134 | stages = [] 135 | for i, stage_candiate in enumerate(solution): 136 | stage = stage_candiate[0]["solution"] 137 | actions = [] 138 | for action in stage: 139 | inputs = {arg["name"]: arg["value"] for arg in action["args"]} 140 | outputs = [r["value"] for r in action["returns"]] 141 | actions.append( 142 | Action(action["tool_name"], inputs=inputs, outputs=outputs) 143 | ) 144 | stages.append(actions) 145 | return stages 146 | 147 | def __call__( 148 | self, request: str, memory: dict = OrderedDict() 149 | ) -> List[List[Action]]: 150 | solution = self.solve(request, memory) 151 | return self.parse(solution) 152 | -------------------------------------------------------------------------------- /cllm/agents/tog/responser.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import logging 3 | 4 | from cllm.services.nlp.llms.chat_models import ChatOpenAI 5 | from cllm.services.nlp.llms.memory import MessageMemory 6 | from langchain.schema import SystemMessage 7 | 8 | from cllm.agents.base import DataType 9 | from cllm.agents import container 10 | 11 | 12 | RESPONSE_GENERATION_PROMPT = """Your name is ControlLLM, an AI-powered assistant developed by OpenGV-lab from Shanghai Artificial Intelligence Laboratory. For user's request, the system executes the solution and collects the results based on the following workflow. You need to respond to user requests based on the following information. 13 | Here are the information for you reference. 14 | 15 | ## User Request 16 | {} 17 | 18 | ## Workflow and Execution Results 19 | {} 20 | 21 | Now you should pay attention to Collected Results. You first must answer the user’s request in a straightforward manner. Then you need to summarize the workflow and intermediate results friendly. Some of the results may not be accurate and need you to use your judgement in making decisions. If the results contain file names, you have to output the file name directly. 
Only if there is nothing returned by tools, you should tell user you can not finish the task. Now, please friendly summarize the results and answer the question for the user requests `{}`. 22 | """.strip() 23 | 24 | 25 | SIMPLE_RESPONSE_GENERATION_PROMPT = """Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai Artificial Intelligence Laboratory. You need to respond to user requests based on the following information. 26 | Here are the information for you reference. 27 | 28 | ## User Request 29 | {} 30 | 31 | ## Workflow and Execution Results 32 | {} 33 | 34 | Now, please friendly summarize the results and answer the question for the user requests `{}`. 35 | """.strip() 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | def generate_response(user_input, solution, output_files): 41 | if ( 42 | len(solution) <= 1 43 | and len(solution[0]) <= 1 44 | and solution[0][0].tool_name == "question_answering" 45 | ): 46 | content = SIMPLE_RESPONSE_GENERATION_PROMPT.format( 47 | user_input, solution, user_input 48 | ) 49 | else: 50 | content = RESPONSE_GENERATION_PROMPT.format(user_input, solution, user_input) 51 | 52 | logger.info("##### Response Generation #####") 53 | logger.info(content) 54 | 55 | chat = ChatOpenAI(model_name="gpt-3.5-turbo-16k") 56 | messages = [SystemMessage(content=content)] 57 | output = chat(messages) 58 | logger.info(output) 59 | 60 | # files = [output for output in output_files if isinstance(output, container.File)] 61 | # return [container.Text('Response', DataType.TEXT, output)] + files 62 | return [container.Text("Response", DataType.TEXT, output)] 63 | -------------------------------------------------------------------------------- /cllm/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/app/__init__.py -------------------------------------------------------------------------------- /cllm/services/__init__.py: -------------------------------------------------------------------------------- 1 | from cllm.services.image_editing.api import ( 2 | inpainting_ldm, 3 | inpainting_ldm_general, 4 | partial_image_editing, 5 | instruct_pix2pix, 6 | image_cropping, 7 | image_matting, 8 | draw_bbox_on_image, 9 | ) 10 | from cllm.services.image_generation.api import ( 11 | text2image, 12 | image2image, 13 | cannytext2image, 14 | linetext2image, 15 | hedtext2image, 16 | scribbletext2image, 17 | posetext2image, 18 | segtext2image, 19 | depthtext2image, 20 | normaltext2image, 21 | ) 22 | 23 | from cllm.services.image_processing.api import ( 24 | image2canny, 25 | image2line, 26 | image2hed, 27 | image2scribble, 28 | image2pose, 29 | image2depth, 30 | image2normal, 31 | ) 32 | from cllm.services.image_perception.api import ( 33 | object_detection, 34 | image_classification, 35 | ocr, 36 | segment_objects, 37 | visual_grounding, 38 | image_captioning, 39 | segment_by_mask, 40 | segment_by_points, 41 | set_image, 42 | segment_all, 43 | seg_by_mask, 44 | seg_by_points, 45 | ) 46 | from cllm.services.video.api import ( 47 | video_classification, 48 | video_captioning, 49 | image_audio_to_video, 50 | video_to_webpage, 51 | dub_video, 52 | image_to_video, 53 | ) 54 | from cllm.services.audio.api import ( 55 | text_to_music, 56 | text_to_speech, 57 | audio_classification, 58 | ) 59 | from cllm.services.general.api import ( 60 | select, 61 | count, 62 | remote_logging, 63 | ) 64 | from cllm.services.nlp.api 
import ( 65 | text_to_text_generation, 66 | title_generation, 67 | text_to_tags, 68 | question_answering_with_context, 69 | openai_chat_model, 70 | summarization, 71 | extract_location, 72 | sentiment_analysis, 73 | get_weather, 74 | summarize_weather_condition, 75 | get_time, 76 | ) 77 | from cllm.services.vqa.api import image_qa 78 | 79 | from fastapi import FastAPI 80 | from .pool import ModelPool 81 | 82 | app = FastAPI() 83 | pool = ModelPool() 84 | -------------------------------------------------------------------------------- /cllm/services/anything2image/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import Anything2Image -------------------------------------------------------------------------------- /cllm/services/anything2image/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 5 | 6 | 7 | def audio2image(audio): 8 | url = "http://localhost:10049/chat" 9 | # files = {"image": open("assets/ADE_val_00000529.jpg", "rb")} 10 | data = {"audio": audio} 11 | response = requests.post(url, data=data) 12 | return response.json() 13 | -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import load_and_transform_text, load_and_transform_audio_data, load_and_transform_video_data, load_and_transform_vision_data, load_and_transform_thermal_data 2 | from .models.imagebind_model import imagebind_huge, ModalityType -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/anything2image/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/anything2image/imagebind/models/__init__.py -------------------------------------------------------------------------------- /cllm/services/anything2image/imagebind/models/helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | import math 9 | 10 | import einops 11 | import numpy as np 12 | import torch 13 | 14 | import torch.nn as nn 15 | 16 | 17 | class Normalize(nn.Module): 18 | def __init__(self, dim: int) -> None: 19 | super().__init__() 20 | self.dim = dim 21 | 22 | def forward(self, x): 23 | return torch.nn.functional.normalize(x, dim=self.dim, p=2) 24 | 25 | 26 | class LearnableLogitScaling(nn.Module): 27 | def __init__( 28 | self, 29 | logit_scale_init: float = 1 / 0.07, 30 | learnable: bool = True, 31 | max_logit_scale: float = 100, 32 | ) -> None: 33 | super().__init__() 34 | self.max_logit_scale = max_logit_scale 35 | self.logit_scale_init = logit_scale_init 36 | self.learnable = learnable 37 | log_logit_scale = torch.ones([]) * np.log(self.logit_scale_init) 38 | if learnable: 39 | self.log_logit_scale = nn.Parameter(log_logit_scale) 40 | else: 41 | self.register_buffer("log_logit_scale", log_logit_scale) 42 | 43 | def forward(self, x): 44 | return torch.clip(self.log_logit_scale.exp(), max=self.max_logit_scale) * x 45 | 46 | def extra_repr(self): 47 | st = f"logit_scale_init={self.logit_scale_init},learnable={self.learnable}, max_logit_scale={self.max_logit_scale}" 48 | return st 49 | 50 | 51 | class EinOpsRearrange(nn.Module): 52 | def __init__(self, rearrange_expr: str, **kwargs) -> None: 53 | super().__init__() 54 | self.rearrange_expr = rearrange_expr 55 | self.kwargs = kwargs 56 | 57 | def forward(self, x): 58 | assert isinstance(x, torch.Tensor) 59 | return einops.rearrange(x, self.rearrange_expr, **self.kwargs) 60 | 61 | 62 | class VerboseNNModule(nn.Module): 63 | """ 64 | Wrapper around nn.Module that prints registered buffers and parameter names. 65 | """ 66 | 67 | @staticmethod 68 | def get_readable_tensor_repr(name: str, tensor: torch.Tensor) -> str: 69 | st = ( 70 | "(" 71 | + name 72 | + "): " 73 | + "tensor(" 74 | + str(tuple(tensor[1].shape)) 75 | + ", requires_grad=" 76 | + str(tensor[1].requires_grad) 77 | + ")\n" 78 | ) 79 | return st 80 | 81 | def extra_repr(self) -> str: 82 | named_modules = set() 83 | for p in self.named_modules(): 84 | named_modules.update([p[0]]) 85 | named_modules = list(named_modules) 86 | 87 | string_repr = "" 88 | for p in self.named_parameters(): 89 | name = p[0].split(".")[0] 90 | if name not in named_modules: 91 | string_repr += self.get_readable_tensor_repr(name, p) 92 | 93 | for p in self.named_buffers(): 94 | name = p[0].split(".")[0] 95 | string_repr += self.get_readable_tensor_repr(name, p) 96 | 97 | return string_repr 98 | 99 | 100 | def cast_if_src_dtype( 101 | tensor: torch.Tensor, src_dtype: torch.dtype, tgt_dtype: torch.dtype 102 | ): 103 | updated = False 104 | if tensor.dtype == src_dtype: 105 | tensor = tensor.to(dtype=tgt_dtype) 106 | updated = True 107 | return tensor, updated 108 | 109 | 110 | class QuickGELU(nn.Module): 111 | # From https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py#L166 112 | def forward(self, x: torch.Tensor): 113 | return x * torch.sigmoid(1.702 * x) 114 | 115 | 116 | class SelectElement(nn.Module): 117 | def __init__(self, index) -> None: 118 | super().__init__() 119 | self.index = index 120 | 121 | def forward(self, x): 122 | assert x.ndim >= 3 123 | return x[:, self.index, ...] 
124 | 125 | 126 | class SelectEOSAndProject(nn.Module): 127 | """ 128 | Text Pooling used in OpenCLIP 129 | """ 130 | 131 | def __init__(self, proj: nn.Module) -> None: 132 | super().__init__() 133 | self.proj = proj 134 | 135 | def forward(self, x, seq_len): 136 | assert x.ndim == 3 137 | # x is of shape B x L x D 138 | # take features from the eot embedding (eot_token is the highest number in each sequence) 139 | x = x[torch.arange(x.shape[0]), seq_len] 140 | x = self.proj(x) 141 | return x 142 | -------------------------------------------------------------------------------- /cllm/services/anything2image/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import uvicorn 5 | from fastapi import FastAPI, UploadFile, File, Form, Response 6 | from PIL import Image 7 | import io 8 | import uuid 9 | 10 | from .tools import Anything2Image 11 | 12 | parser = argparse.ArgumentParser(description="Anything2Image API") 13 | parser.add_argument("--port", type=int, default=10049, help="Port") 14 | args = parser.parse_args() 15 | 16 | app = FastAPI() 17 | model = Anything2Image('cuda:0') 18 | 19 | TMP_DIR = 'anything2image_tmp' 20 | os.makedirs(TMP_DIR, exist_ok=True) 21 | 22 | 23 | def get_bytes_value(image): 24 | img_byte_arr = io.BytesIO() 25 | image.save(img_byte_arr, format='png') 26 | return img_byte_arr.getvalue() 27 | 28 | 29 | @app.post("/audio2image") 30 | async def audio2image(audio: UploadFile = File(None)): 31 | audio_bytes = audio.file.read() 32 | audio_path = os.path.join(TMP_DIR, f"{str(uuid.uuid4())[:6]}_{audio.filename}") 33 | with open(audio_path, "wb") as fout: 34 | fout.write(audio_bytes) 35 | output = model.audio2image(audio_path) 36 | buffer = get_bytes_value(output) 37 | return Response(content=buffer, media_type="image/png") 38 | 39 | 40 | if __name__ == '__main__': 41 | uvicorn.run(app, host="0.0.0.0", port=args.port) 42 | -------------------------------------------------------------------------------- /cllm/services/anything2image/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from diffusers import StableUnCLIPImg2ImgPipeline 3 | from .
import imagebind as ib 4 | 5 | 6 | class Anything2Image: 7 | def __init__(self, device): 8 | pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( 9 | "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16" 10 | ) 11 | self.device = device 12 | self.pipe = pipe.to(device) 13 | self.pipe.enable_model_cpu_offload() 14 | self.pipe.enable_vae_slicing() 15 | 16 | self.model = ib.imagebind_huge(pretrained=True) 17 | self.model.eval() 18 | self.model.to(device) 19 | 20 | def audio2image(self, audio_path): 21 | embeddings = self.model.forward({ 22 | ib.ModalityType.AUDIO: ib.load_and_transform_audio_data([audio_path], self.device), 23 | }) 24 | embeddings = embeddings[ib.ModalityType.AUDIO] 25 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 26 | return images[0] 27 | 28 | def thermal2image(self, thermal_path): 29 | embeddings = self.model.forward({ 30 | ib.ModalityType.THERMAL: ib.load_and_transform_thermal_data([thermal_path], self.device), 31 | }) 32 | embeddings = embeddings[ib.ModalityType.THERMAL] 33 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 34 | return images[0] 35 | 36 | def audioimage2image(self, image_path, audio_path): 37 | embeddings = self.model.forward({ 38 | ib.ModalityType.VISION: ib.load_and_transform_vision_data([image_path], self.device), 39 | }, normalize=False) 40 | img_embeddings = embeddings[ib.ModalityType.VISION] 41 | embeddings = self.model.forward({ 42 | ib.ModalityType.AUDIO: ib.load_and_transform_audio_data([audio_path], self.device), 43 | }) 44 | audio_embeddings = embeddings[ib.ModalityType.AUDIO] 45 | embeddings = (img_embeddings + audio_embeddings) / 2 46 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 47 | return images[0] 48 | 49 | def audiotext2image(self, audio_path, text): 50 | embeddings = self.model.forward({ 51 | ib.ModalityType.TEXT: ib.load_and_transform_text([text], self.device), 52 | }, normalize=False) 53 | text_embeddings = embeddings[ib.ModalityType.TEXT] 54 | 55 | embeddings = self.model.forward({ 56 | ib.ModalityType.AUDIO: ib.load_and_transform_audio_data([audio_path], self.device), 57 | }) 58 | audio_embeddings = embeddings[ib.ModalityType.AUDIO] 59 | embeddings = text_embeddings * 0.5 + audio_embeddings * 0.5 60 | images = self.pipe(image_embeds=embeddings.half(), width=512, height=512).images 61 | return images[0] 62 | -------------------------------------------------------------------------------- /cllm/services/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/audio/__init__.py -------------------------------------------------------------------------------- /cllm/services/audio/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import uuid 4 | import numpy as np 5 | import os.path as osp 6 | import whisper 7 | import uvicorn 8 | from fastapi import UploadFile, File, Form 9 | from fastapi.responses import JSONResponse, StreamingResponse, FileResponse 10 | 11 | from .tools import * 12 | 13 | from cllm.services import app, pool 14 | from cllm.services.utils import AudioResponse 15 | from ..hf_pipeline import HuggingfacePipeline 16 | 17 | 18 | parser = argparse.ArgumentParser(description="Audio API") 19 | parser.add_argument("--host", type=str, default="localhost", 
help="Host") 20 | parser.add_argument("--port", type=int, default=10049, help="Port") 21 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 22 | args = parser.parse_args() 23 | 24 | RESOURCE_ROOT = os.environ.get("SERVER_ROOT", "./server_resources") 25 | os.makedirs(RESOURCE_ROOT, exist_ok=True) 26 | 27 | 28 | @app.post("/audio_classification") 29 | @pool.register(lambda: HuggingfacePipeline("audio-classification", args.device)) 30 | async def audio_classification(audio: UploadFile = File(None)): 31 | bytes = audio.file.read() 32 | model = audio_classification.__wrapped__.model 33 | output = model(bytes) 34 | return JSONResponse(output) 35 | 36 | 37 | @app.post("/automatic_speech_recognition") 38 | @pool.register(lambda: HuggingfacePipeline("automatic-speech-recognition", args.device)) 39 | async def automatic_speech_recognition(audio: UploadFile = File(None)): 40 | bytes = audio.file.read() 41 | model = automatic_speech_recognition.__wrapped__.model 42 | output = model(bytes) 43 | return JSONResponse(output) 44 | 45 | 46 | @app.post("/text_to_music") 47 | @pool.register(lambda: Text2Music(args.device)) 48 | async def text_to_music(text: str = Form(...)): 49 | model = text_to_music.__wrapped__.model 50 | output = model(text) 51 | return AudioResponse(output) 52 | 53 | 54 | @app.post("/text_to_speech") 55 | @pool.register( 56 | lambda: HuggingfacePipeline("text-to-speech", args.device, model="suno/bark") 57 | ) 58 | async def text_to_speech(text: str = Form(...)): 59 | model = text_to_speech.__wrapped__.model 60 | speech = model(text) 61 | save_path = osp.join(RESOURCE_ROOT, f"{str(uuid.uuid4())[:6]}_audio.wav") 62 | scipy.io.wavfile.write( 63 | save_path, 64 | rate=speech["sampling_rate"], 65 | data=speech["audio"][0].astype(np.float32), 66 | ) 67 | return AudioResponse(save_path) 68 | 69 | 70 | @app.post("/speech_to_text") 71 | @pool.register(lambda: whisper.load_model("base", args.device)) 72 | async def speech_to_text(audio: UploadFile = File(None)): 73 | model = speech_to_text.__wrapped__.model 74 | save_path = osp.join(RESOURCE_ROOT, f"{str(uuid.uuid4())[:6]}_audio.wav") 75 | with open(save_path, "wb") as fout: 76 | fout.write(audio.file.read()) 77 | result = model.transcribe(save_path) 78 | text = result["text"] 79 | return JSONResponse(text) 80 | 81 | 82 | if __name__ == "__main__": 83 | uvicorn.run(app, host=args.host, port=args.port) 84 | -------------------------------------------------------------------------------- /cllm/services/audio/tools.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | pipeline, 3 | AutoModel, 4 | AutoProcessor, 5 | MusicgenForConditionalGeneration, 6 | ) 7 | from PIL import Image 8 | import torch 9 | import scipy 10 | import io 11 | import numpy as np 12 | 13 | 14 | ''' 15 | class Text2Speech: 16 | def __init__(self, device): 17 | self.device = device 18 | self.processor = AutoProcessor.from_pretrained("suno/bark-small") 19 | self.model = AutoModel.from_pretrained("suno/bark-small") 20 | # self.model.to(self.device) 21 | 22 | def __call__(self, text): 23 | inputs = self.processor( 24 | text = [text], 25 | padding=True, 26 | return_tensors="pt", 27 | ).to(self.device) 28 | audio_values = self.model.generate(**inputs, do_sample=True) 29 | 30 | # TODO 31 | save_path = 'resources/test.wav' 32 | sampling_rate = self.model.config.audio_encoder.sampling_rate 33 | scipy.io.wavfile.write(save_path, rate=sampling_rate, data=audio_values[0, 0].numpy()) 34 | return 
save_path 35 | 36 | def to(self, device): 37 | self.model.to(device) 38 | ''' 39 | 40 | 41 | class Text2Music: 42 | def __init__(self, device): 43 | self.device = device 44 | self.dtype = torch.float16 45 | self.processor = AutoProcessor.from_pretrained( 46 | "facebook/musicgen-small" 47 | ) 48 | self.model = MusicgenForConditionalGeneration.from_pretrained( 49 | "facebook/musicgen-small", torch_dtype=self.dtype 50 | ) 51 | self.model.to(device=self.device) 52 | 53 | def __call__(self, text: str): 54 | inputs = self.processor( 55 | text=[text], 56 | padding=True, 57 | return_tensors="pt", 58 | ).to(self.device) 59 | audio_values = self.model.generate(**inputs, max_new_tokens=512) 60 | 61 | # TODO 62 | stream = io.BytesIO() 63 | sampling_rate = self.model.config.audio_encoder.sampling_rate 64 | scipy.io.wavfile.write( 65 | stream, 66 | rate=sampling_rate, 67 | data=audio_values[0, 0].cpu().numpy().astype(np.float32), 68 | ) 69 | stream.seek(0) 70 | return stream 71 | 72 | def to(self, device): 73 | self.device = device 74 | self.model.to(device) 75 | 76 | 77 | if __name__ == "__main__": 78 | model = Text2Music('auto') 79 | print( 80 | model( 81 | "An 80s driving pop song with heavy drums and synth pads in the background" 82 | ) 83 | ) 84 | -------------------------------------------------------------------------------- /cllm/services/general/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/general/__init__.py -------------------------------------------------------------------------------- /cllm/services/general/api.py: -------------------------------------------------------------------------------- 1 | from re import I 2 | from typing import List 3 | from pathlib import Path 4 | import os 5 | import requests 6 | 7 | __ALL__ = ["remote_logging", "select", "count"] 8 | 9 | HOST = "localhost" 10 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 11 | 12 | 13 | def setup(host="localhost", port=10056): 14 | global HOST, PORT 15 | HOST = host 16 | PORT = port 17 | 18 | 19 | def select(**kwargs): 20 | if "bbox_list" in kwargs: 21 | list = kwargs["bbox_list"] 22 | condition = kwargs["condition"] 23 | return [l for l in list if l["label"] == condition] 24 | if "mask_list" in kwargs: 25 | list = kwargs["mask_list"] 26 | condition = kwargs["condition"] 27 | # return combine_masks([l for l in list if l['label'] == condition]) 28 | return [l for l in list if l["label"] == condition] 29 | if "category_list" in kwargs: 30 | list = kwargs["category_list"] 31 | condition = kwargs["condition"] 32 | # return combine_masks([l for l in list if l['label'] == condition]) 33 | return [l for l in list if l["label"] == condition] 34 | 35 | 36 | def count(**kwargs): 37 | len_of_list = 0 38 | if "bbox_list" in kwargs: 39 | len_of_list = len(kwargs["bbox_list"]) 40 | elif "mask_list" in kwargs: 41 | len_of_list = len(kwargs["mask_list"]) 42 | 43 | return f"The length of the given list is {len_of_list}" 44 | 45 | 46 | def remote_logging( 47 | history_msgs: list, 48 | task_decomposition: list, 49 | solution: list, 50 | record: str, 51 | like: bool, 52 | **kwargs, 53 | ): 54 | host = kwargs.get("host", HOST) 55 | port = kwargs.get("port", PORT) 56 | url = f"http://{host}:{port}/remote_logging" 57 | data = { 58 | "history_msgs": history_msgs, 59 | "task_decomposition": task_decomposition, 60 | "solution": solution, 61 | "record": record, 62 | "like": like, 63 | } 64 | 
response = requests.post(url, data=data) 65 | return response.content 66 | -------------------------------------------------------------------------------- /cllm/services/general/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from .tools import * 3 | from fastapi import Form, Body 4 | from fastapi.responses import JSONResponse 5 | from cllm.services import app, pool 6 | import uvicorn 7 | 8 | 9 | parser = argparse.ArgumentParser(description="Image Perception API") 10 | parser.add_argument("--host", type=str, default="localhost", help="Host") 11 | parser.add_argument("--port", type=int, default=10049, help="Port") 12 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 13 | args = parser.parse_args() 14 | 15 | 16 | @app.post("/remote_logging") 17 | @pool.register(lambda: Logger(args.device)) 18 | async def remote_logging( 19 | history_msgs: list = Body(...), 20 | task_decomposition: list = Body(...), 21 | solution: list = Body(...), 22 | record: str = Form(...), 23 | like: bool = Form(...), 24 | ): 25 | model = remote_logging.__wrapped__.model 26 | output = model(history_msgs, task_decomposition, solution, record, like) 27 | return JSONResponse(output) 28 | 29 | 30 | if __name__ == "__main__": 31 | uvicorn.run(app, host=args.host, port=args.port) 32 | -------------------------------------------------------------------------------- /cllm/services/general/tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import uuid 5 | 6 | 7 | class Logger: 8 | def __init__(self, device): 9 | self.device = device 10 | 11 | def __call__( 12 | self, 13 | history_msgs: list, 14 | task_decomposition: list, 15 | solution: list, 16 | record: str, 17 | like: bool, 18 | ): 19 | os.makedirs("logs", exist_ok=True) 20 | print(f"solution: {solution}") 21 | print(f"solution: {type(solution)}") 22 | state = { 23 | "history": history_msgs, 24 | "task_decomposition": task_decomposition, 25 | "solution": solution, 26 | "record": record, 27 | "like": like, 28 | } 29 | file_name = f'logs/{time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())}_{str(uuid.uuid4())[:6]}.json' 30 | json.dump(state, open(file_name, "w"), indent=4) 31 | 32 | def to(self, device): 33 | return self 34 | -------------------------------------------------------------------------------- /cllm/services/hf_pipeline.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from PIL import Image 3 | import torch 4 | 5 | 6 | class HuggingfacePipeline: 7 | def __init__(self, task, device="cpu", **kwargs): 8 | # dtype=None 9 | self.device = device 10 | self.task = task 11 | self.pipeline = pipeline(task, device=device, **kwargs) 12 | 13 | def __call__(self, *args, **kwargs): 14 | # print(f'HuggingfacePipeline. type(image): {type(image)}') 15 | output = self.pipeline(*args, **kwargs) 16 | # print(f'end HuggingfacePipeline. 
output: {output}') 17 | return output 18 | 19 | def to(self, device): 20 | self.pipeline.model.to(device=device) 21 | 22 | 23 | class HuggingfacePipelineNLP: 24 | def __init__(self, task=None, device="cpu", **kwargs): 25 | # dtype=None 26 | self.device = device 27 | self.task = task 28 | self.model = pipeline(task, device=device, **kwargs) 29 | 30 | def __call__(self, text: str, *args, **kwargs): 31 | if self.task == "summarization": 32 | output = self.model(text, *args, **kwargs) 33 | elif self.task == "text2text-generation": 34 | output = self.model(text, *args, **kwargs) 35 | else: 36 | output = self.model(text, *args, **kwargs) 37 | if self.task in ["summarization", "text2text-generation"]: 38 | return list(output[0].values())[0] 39 | if self.task == "question-answering": 40 | return output["answer"] 41 | return output 42 | 43 | def to(self, device): 44 | self.model.model.to(device) 45 | return self 46 | -------------------------------------------------------------------------------- /cllm/services/image_editing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import FastAPI, UploadFile, File, Form 5 | from fastapi.responses import StreamingResponse 6 | 7 | from PIL import Image 8 | import io 9 | 10 | from .tools import * 11 | from cllm.services import app, pool 12 | 13 | parser = argparse.ArgumentParser(description="Image Editing API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument("--port", type=int, default=10049, help="Port") 16 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 17 | args = parser.parse_args() 18 | 19 | 20 | def ImageResponse(image): 21 | img_stream = io.BytesIO() 22 | image.save(img_stream, format="png") 23 | img_stream.seek(0) 24 | return StreamingResponse(img_stream, media_type="image/png") 25 | 26 | 27 | @app.post("/instruct_pix2pix") 28 | @pool.register(lambda: InstructPix2Pix(args.device)) 29 | async def instruct_pix2pix(image: UploadFile = File(None), text: str = Form(...)): 30 | image_bytes = image.file.read() 31 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 32 | model = instruct_pix2pix.__wrapped__.model 33 | output = model(image, text) 34 | return ImageResponse(output) 35 | 36 | 37 | @app.post("/partial_image_editing") 38 | @pool.register(lambda: PartialImageEditing(args.device)) 39 | async def partial_image_editing( 40 | image: UploadFile = File(None), 41 | mask: UploadFile = File(None), 42 | prompt: str = Form(...), 43 | ): 44 | print(f"image: {image}; \n\nmask: {mask}; \n\nprompt: {prompt}") 45 | image_bytes = image.file.read() 46 | image = Image.open(io.BytesIO(image_bytes)) 47 | mask_bytes = mask.file.read() 48 | mask = Image.open(io.BytesIO(mask_bytes)) 49 | model = partial_image_editing.__wrapped__.model 50 | output = model(image, mask, prompt) 51 | return ImageResponse(output) 52 | 53 | 54 | @app.post("/inpainting_ldm") 55 | @pool.register(lambda: LDMInpainting(args.device)) 56 | async def inpainting_ldm(image: UploadFile = File(None), mask: UploadFile = File(None)): 57 | image = 
Image.open(io.BytesIO(image.file.read())).convert("RGB") 58 | mask = Image.open(io.BytesIO(mask.file.read())) 59 | model = inpainting_ldm.__wrapped__.model 60 | output = model(image, mask) 61 | return ImageResponse(output) 62 | 63 | 64 | if __name__ == "__main__": 65 | uvicorn.run(app, host=args.host, port=args.port) 66 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/__init__.py: -------------------------------------------------------------------------------- 1 | from .wrapper import LDMInpainter -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | monitor: val/loss 16 | scheduler_config: 17 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 18 | params: 19 | verbosity_interval: 0 20 | warm_up_steps: 1000 21 | max_decay_steps: 50000 22 | lr_start: 0.001 23 | lr_max: 0.1 24 | lr_min: 0.0001 25 | unet_config: 26 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 27 | params: 28 | image_size: 64 29 | in_channels: 7 30 | out_channels: 3 31 | model_channels: 256 32 | attention_resolutions: 33 | - 8 34 | - 4 35 | - 2 36 | num_res_blocks: 2 37 | channel_mult: 38 | - 1 39 | - 2 40 | - 3 41 | - 4 42 | num_heads: 8 43 | resblock_updown: true 44 | first_stage_config: 45 | target: ldm.models.autoencoder.VQModelInterface 46 | params: 47 | embed_dim: 3 48 | n_embed: 8192 49 | monitor: val/rec_loss 50 | ddconfig: 51 | attn_type: none 52 | double_z: false 53 | z_channels: 3 54 | resolution: 256 55 | in_channels: 3 56 | out_ch: 3 57 | ch: 128 58 | ch_mult: 59 | - 1 60 | - 2 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: ldm.modules.losses.DummyLoss 67 | cond_stage_config: __is_first_stage__ 68 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 
15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/models/__init__.py 
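A note on LambdaWarmUpCosineScheduler above: it returns a learning-rate multiplier rather than a learning rate, which is why its docstring says to use it with a base_lr of 1.0. Below is a minimal usage sketch (not from the repository; the AdamW optimizer and the throwaway torch.nn.Linear module are assumed only for illustration), using the hyperparameters from scheduler_config in the config.yaml shown earlier:

import torch
from cllm.services.image_editing.ldm_inpainting.ldm.lr_scheduler import LambdaWarmUpCosineScheduler

# Warm up linearly for 1000 steps, then cosine-decay toward lr_min by step 50000
# (values mirror scheduler_config.params in config.yaml above).
sched_fn = LambdaWarmUpCosineScheduler(
    warm_up_steps=1000, lr_min=0.0001, lr_max=0.1, lr_start=0.001, max_decay_steps=50000
)
optimizer = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=1.0)  # base_lr of 1.0
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=sched_fn)
# per training step: optimizer.step(); lr_scheduler.step()

The same pattern applies to LambdaWarmUpCosineScheduler2 and LambdaLinearScheduler, which take per-cycle lists in place of the scalar arguments.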
-------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 
| logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 
61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_editing/ldm_inpainting/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class DummyLoss(nn.Module): 4 | def __init__(self): 5 | super().__init__() -------------------------------------------------------------------------------- /cllm/services/image_editing/ldm_inpainting/wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from omegaconf import OmegaConf 6 | import numpy as np 7 | 8 | from .ldm.models.diffusion.ddim import DDIMSampler 9 | from .ldm.util import instantiate_from_config 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.append(CURRENT_DIR) 13 | 14 | 15 | def make_batch(image, mask, device): 16 | image = image.astype(np.float32) / 255.0 17 | image = image[None].transpose(0, 3, 1, 2) 18 | image = torch.from_numpy(image) 19 | 20 | mask = mask.astype(np.float32) / 255.0 21 | mask = mask[None, None] 22 | mask[mask < 0.5] = 0 23 | mask[mask >= 0.5] = 1 24 | mask = torch.from_numpy(mask) 25 | 26 | masked_image = (1 - mask) * image 27 | 28 | batch = {"image": image, "mask": mask, "masked_image": masked_image} 29 | for k in batch: 30 | batch[k] = batch[k].to(device=device) 31 | batch[k] = batch[k] * 2.0 - 1.0 32 | return batch 33 | 34 | 35 | class LDMInpainter: 36 | def __init__(self, ckpt_path, 
ddim_steps=50): 37 | config = OmegaConf.load(os.path.join(CURRENT_DIR, "config.yaml")) 38 | model = instantiate_from_config(config.model) 39 | model.load_state_dict(torch.load(ckpt_path)["state_dict"], strict=False) 40 | self.model = model 41 | self.sampler = DDIMSampler(model) 42 | self.ddim_steps = ddim_steps 43 | 44 | @torch.no_grad() 45 | def __call__(self, image, mask, device): 46 | self.model.to(device) 47 | 48 | model = self.model 49 | sampler = self.sampler 50 | 51 | with self.model.ema_scope(): 52 | batch = make_batch(image, mask, device=device) 53 | 54 | # encode masked image and concat downsampled mask 55 | c = model.cond_stage_model.encode(batch["masked_image"]) 56 | cc = torch.nn.functional.interpolate(batch["mask"], 57 | size=c.shape[-2:]) 58 | c = torch.cat((c, cc), dim=1) 59 | 60 | shape = (c.shape[1] - 1,) + c.shape[2:] 61 | samples_ddim, _ = sampler.sample(S=self.ddim_steps, 62 | conditioning=c, 63 | batch_size=c.shape[0], 64 | shape=shape, 65 | verbose=False) 66 | x_samples_ddim = model.decode_first_stage(samples_ddim) 67 | 68 | image = torch.clamp((batch["image"] + 1.0) / 2.0, 69 | min=0.0, max=1.0) 70 | mask = torch.clamp((batch["mask"] + 1.0) / 2.0, 71 | min=0.0, max=1.0) 72 | predicted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, 73 | min=0.0, max=1.0) 74 | 75 | inpainted = (1 - mask) * image + mask * predicted_image 76 | inpainted = inpainted.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 77 | 78 | # offload to cpu to save memory 79 | self.model.to(torch.device('cpu')) 80 | return inpainted.astype(np.uint8) 81 | -------------------------------------------------------------------------------- /cllm/services/image_generation/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import ( 2 | Text2Image, CannyText2Image, LineText2Image, 3 | HedText2Image, ScribbleText2Image, PoseText2Image, SegText2Image, 4 | DepthText2Image, NormalText2Image 5 | ) 6 | -------------------------------------------------------------------------------- /cllm/services/image_generation/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import requests 5 | from PIL import Image 6 | from cllm.services.utils import get_bytes_value 7 | 8 | 9 | __ALL__ = [ 10 | "text2image", 11 | "cannytext2image", 12 | "linetext2image", 13 | "hedtext2image", 14 | "scribbletext2image", 15 | "posetext2image", 16 | "segtext2image", 17 | "depthtext2image", 18 | "normaltext2image" "image2image", 19 | ] 20 | 21 | 22 | HOST = "localhost" 23 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 24 | 25 | 26 | def setup(host="localhost", port=10049): 27 | global HOST, PORT 28 | HOST = host 29 | PORT = port 30 | 31 | 32 | def text2image(text, **kwargs): 33 | host = kwargs.get("host", HOST) 34 | port = kwargs.get("port", PORT) 35 | url = f"http://{host}:{port}/text2image" 36 | data = {"text": text} 37 | response = requests.post(url, data=data) 38 | return response.content 39 | 40 | 41 | def image2image(image, **kwargs): 42 | host = kwargs.get("host", HOST) 43 | port = kwargs.get("port", PORT) 44 | url = f"http://{host}:{port}/image2image" 45 | files = {"image": (image, get_bytes_value(image))} 46 | response = requests.post(url, files=files) 47 | return response.content 48 | 49 | 50 | def _imagetext2image(image, text, endpoint, **kwargs): 51 | host = kwargs.get("host", HOST) 52 | port = kwargs.get("port", PORT) 53 | url = f"http://{host}:{port}/{endpoint}" 54 | data = {"text": text} 55 | files = 
{"image": (image, get_bytes_value(image))} 56 | response = requests.post(url, files=files, data=data) 57 | # image = Image.open(io.BytesIO(response.content)) 58 | # image = io.BytesIO(response.content) 59 | # return image 60 | return response.content 61 | 62 | 63 | def cannytext2image(edge, text, **kwargs): 64 | return _imagetext2image(edge, text, endpoint="cannytext2image", **kwargs) 65 | 66 | 67 | def linetext2image(line, text, **kwargs): 68 | return _imagetext2image(line, text, endpoint="linetext2image", **kwargs) 69 | 70 | 71 | def hedtext2image(hed, text, **kwargs): 72 | return _imagetext2image(hed, text, endpoint="hedtext2image", **kwargs) 73 | 74 | 75 | def scribbletext2image(scribble, text, **kwargs): 76 | return _imagetext2image(scribble, text, endpoint="scribbletext2image", **kwargs) 77 | 78 | 79 | def posetext2image(pose, text, **kwargs): 80 | return _imagetext2image(pose, text, endpoint="posetext2image", **kwargs) 81 | 82 | 83 | def segtext2image(segmentation, text, **kwargs): 84 | return _imagetext2image(segmentation, text, endpoint="segtext2image", **kwargs) 85 | 86 | 87 | def depthtext2image(depth, text, **kwargs): 88 | return _imagetext2image(depth, text, endpoint="depthtext2image", **kwargs) 89 | 90 | 91 | def normaltext2image(normal, text, **kwargs): 92 | return _imagetext2image(normal, text, endpoint="normaltext2image", **kwargs) 93 | -------------------------------------------------------------------------------- /cllm/services/image_generation/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import UploadFile, File, Form 5 | from PIL import Image 6 | import io 7 | 8 | from .tools import * 9 | from cllm.services import app, pool 10 | from cllm.services.utils import ImageResponse 11 | 12 | 13 | parser = argparse.ArgumentParser(description="Image Generation API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument("--port", type=int, default=10049, help="Port") 16 | parser.add_argument("--device", type=str, default="cuda:0", help="Port") 17 | args = parser.parse_args() 18 | 19 | 20 | # def ImageResponse(image): 21 | # img_stream = io.BytesIO() 22 | # image.save(img_stream, format="png") 23 | # img_stream.seek(0) 24 | 25 | # return StreamingResponse(img_stream, media_type="image/png") 26 | 27 | 28 | # @app.post("/text2image") 29 | # @pool.register(lambda: Text2Image(args.device)) 30 | # async def text2image(text: str = Form(...)): 31 | # model = text2image.__wrapped__.model 32 | # output = model(text) 33 | # return ImageResponse(output) 34 | 35 | 36 | @app.post("/text2image") 37 | @pool.register(lambda: PixArtAlpha(args.device)) 38 | async def text2image(text: str = Form(...)): 39 | model = text2image.__wrapped__.model 40 | output = model(text) 41 | return ImageResponse(output) 42 | 43 | 44 | @app.post("/image2image") 45 | @pool.register(lambda: Image2Image(args.device)) 46 | async def image2image(image: UploadFile = File(None)): 47 | image_bytes = image.file.read() 48 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 49 | model = image2image.__wrapped__.model 50 | output = model(image) 51 | return ImageResponse(output) 52 | 53 | 54 | @app.post("/cannytext2image") 55 | @pool.register(lambda: CannyText2Image(args.device)) 56 | async def cannytext2image(image: UploadFile = File(None), text: str = Form(...)): 57 | image_bytes = image.file.read() 58 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 59 | 
model = cannytext2image.__wrapped__.model 60 | output = model(image, text) 61 | return ImageResponse(output) 62 | 63 | 64 | @app.post("/linetext2image") 65 | @pool.register(lambda: LineText2Image(args.device)) 66 | async def linetext2image(image: UploadFile = File(None), text: str = Form(...)): 67 | image_bytes = image.file.read() 68 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 69 | model = linetext2image.__wrapped__.model 70 | output = model(image, text) 71 | return ImageResponse(output) 72 | 73 | 74 | @app.post("/hedtext2image") 75 | @pool.register(lambda: HedText2Image(args.device)) 76 | async def hedtext2image(image: UploadFile = File(None), text: str = Form(...)): 77 | image_bytes = image.file.read() 78 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 79 | model = hedtext2image.__wrapped__.model 80 | output = model(image, text) 81 | return ImageResponse(output) 82 | 83 | 84 | @app.post("/scribbletext2image") 85 | @pool.register(lambda: ScribbleText2Image(args.device)) 86 | async def scribbletext2image(image: UploadFile = File(None), text: str = Form(...)): 87 | image_bytes = image.file.read() 88 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 89 | model = scribbletext2image.__wrapped__.model 90 | output = model(image, text) 91 | return ImageResponse(output) 92 | 93 | 94 | @app.post("/posetext2image") 95 | @pool.register(lambda: PoseText2Image(args.device)) 96 | async def posetext2image(image: UploadFile = File(None), text: str = Form(...)): 97 | image_bytes = image.file.read() 98 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 99 | model = posetext2image.__wrapped__.model 100 | output = model(image, text) 101 | return ImageResponse(output) 102 | 103 | 104 | @app.post("/segtext2image") 105 | @pool.register(lambda: SegText2Image(args.device)) 106 | async def segtext2image(image: UploadFile = File(None), text: str = Form(...)): 107 | image.file.seek(0) 108 | image_bytes = image.file.read() 109 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 110 | model = segtext2image.__wrapped__.model 111 | output = model(image, text) 112 | return ImageResponse(output) 113 | 114 | 115 | @app.post("/depthtext2image") 116 | @pool.register(lambda: DepthText2Image(args.device)) 117 | async def depthtext2image(image: UploadFile = File(None), text: str = Form(...)): 118 | image_bytes = image.file.read() 119 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 120 | model = depthtext2image.__wrapped__.model 121 | output = model(image, text) 122 | return ImageResponse(output) 123 | 124 | 125 | @app.post("/normaltext2image") 126 | @pool.register(lambda: NormalText2Image(args.device)) 127 | async def normaltext2image(image: UploadFile = File(None), text: str = Form(...)): 128 | image_bytes = image.file.read() 129 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 130 | model = normaltext2image.__wrapped__.model 131 | output = model(image, text) 132 | return ImageResponse(output) 133 | 134 | 135 | if __name__ == "__main__": 136 | uvicorn.run(app, host=args.host, port=args.port) 137 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/__init__.py --------------------------------------------------------------------------------
/cllm/services/image_inpainting/api.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Union, List, Dict 3 | from PIL import Image, ImageChops 4 | import io 5 | import os 6 | 7 | import requests 8 | from cllm.services.utils import get_bytes_value 9 | 10 | __ALL__ = [ 11 | "inpainting_ldm", 12 | ] 13 | 14 | 15 | HOST = "localhost" 16 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 17 | 18 | 19 | def setup(host="localhost", port=10052): 20 | global HOST, PORT 21 | HOST = host 22 | PORT = port 23 | 24 | 25 | def combine_masks(mask_images): 26 | if mask_images is None or len(mask_images) == 0: 27 | return None 28 | 29 | # Create a new blank image to store the combined mask 30 | combined_mask = Image.open(io.BytesIO(mask_images[0])).convert("1") 31 | 32 | # Iterate through each mask image and combine them 33 | for mask_image in mask_images: 34 | mask = Image.open(io.BytesIO(mask_image)).convert("1") 35 | combined_mask = ImageChops.logical_or(combined_mask, mask) 36 | stream = io.BytesIO() 37 | combined_mask.save(stream, "png") 38 | stream.seek(0) 39 | # return {"label": mask_images[0]["label"], "mask": stream.getvalue()} 40 | return stream.getvalue() 41 | 42 | 43 | def inpainting_ldm_general(image, mask: Union[bytes, List], **kwargs): 44 | if mask in [None, b"", []]: 45 | return get_bytes_value(image) 46 | 47 | mask = copy.deepcopy(mask) 48 | if isinstance(mask, List): 49 | if not isinstance(mask[0], dict): 50 | mask_list = get_bytes_value(mask) 51 | else: 52 | mask_list = [] 53 | for m in mask: 54 | mask_list.append(get_bytes_value(m["mask"])) 55 | mask = combine_masks(mask_list) 56 | 57 | return inpainting_ldm(image, mask, **kwargs) 58 | 59 | 60 | def inpainting_ldm(image, mask, **kwargs): 61 | if mask in [None, b""]: 62 | return get_bytes_value(image) 63 | 64 | host = kwargs.get("host", HOST) 65 | port = kwargs.get("port", PORT) 66 | url = f"http://{host}:{port}/inpainting_ldm" 67 | files = { 68 | "image": (image, get_bytes_value(image)), 69 | "mask": get_bytes_value(mask), 70 | } 71 | response = requests.post(url, files=files) 72 | return response.content 73 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import FastAPI, UploadFile, File 5 | from fastapi.responses import StreamingResponse 6 | from PIL import Image 7 | import io 8 | 9 | from .tools import * 10 | from cllm.services import app, pool 11 | 12 | parser = argparse.ArgumentParser(description="Image Inpainting API") 13 | parser.add_argument("--host", type=str, default="localhost", help="Host") 14 | parser.add_argument("--port", type=int, default=10049, help="Port") 15 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 16 | args = parser.parse_args() 17 | 18 | 19 | def ImageResponse(image): 20 | img_stream = io.BytesIO() 21 | image.save(img_stream, format="png") 22 | img_stream.seek(0) 23 | 24 | return StreamingResponse(img_stream, media_type="image/png") 25 | 26 | 27 | @app.post("/inpainting_ldm") 28 | @pool.register(lambda: LDMInpainting(args.device)) 29 | async def inpainting_ldm(image: UploadFile = File(None), mask: UploadFile = File(None)): 30 | image = Image.open(io.BytesIO(image.file.read())) 31 | mask = Image.open(io.BytesIO(mask.file.read())) 32 | model = inpainting_ldm.__wrapped__.model 33 | output = 
model(image, mask) 34 | return ImageResponse(output) 35 | 36 | 37 | if __name__ == "__main__": 38 | uvicorn.run(app, host=args.host, port=args.port) 39 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/__init__.py: -------------------------------------------------------------------------------- 1 | from .wrapper import LDMInpainter -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | monitor: val/loss 16 | scheduler_config: 17 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 18 | params: 19 | verbosity_interval: 0 20 | warm_up_steps: 1000 21 | max_decay_steps: 50000 22 | lr_start: 0.001 23 | lr_max: 0.1 24 | lr_min: 0.0001 25 | unet_config: 26 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 27 | params: 28 | image_size: 64 29 | in_channels: 7 30 | out_channels: 3 31 | model_channels: 256 32 | attention_resolutions: 33 | - 8 34 | - 4 35 | - 2 36 | num_res_blocks: 2 37 | channel_mult: 38 | - 1 39 | - 2 40 | - 3 41 | - 4 42 | num_heads: 8 43 | resblock_updown: true 44 | first_stage_config: 45 | target: ldm.models.autoencoder.VQModelInterface 46 | params: 47 | embed_dim: 3 48 | n_embed: 8192 49 | monitor: val/rec_loss 50 | ddconfig: 51 | attn_type: none 52 | double_z: false 53 | z_channels: 3 54 | resolution: 256 55 | in_channels: 3 56 | out_ch: 3 57 | ch: 128 58 | ch_mult: 59 | - 1 60 | - 2 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: ldm.modules.losses.DummyLoss 67 | cond_stage_config: __is_first_stage__ 68 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 
15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/models/__init__.py 
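A brief clarification of unet_config.in_channels: 7 in the config.yaml above: with concat_mode: true, the UNet input is the noisy 3-channel VQ latent concatenated with the conditioning built in wrapper.py, namely the 3-channel latent of the masked image plus the 1-channel downsampled mask. A small sanity-check sketch, assuming that reading of the config:

noisy_latent_channels = 3   # ddconfig.z_channels of the VQ first stage
masked_image_latent = 3     # cond_stage encodes masked_image into the same latent space
downsampled_mask = 1        # single-channel mask appended with torch.cat in wrapper.py
assert noisy_latent_channels + masked_image_latent + downsampled_mask == 7  # unet_config.in_channels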
-------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | 
return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 
61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_inpainting/ldm_inpainting/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class DummyLoss(nn.Module): 4 | def __init__(self): 5 | super().__init__() -------------------------------------------------------------------------------- /cllm/services/image_inpainting/ldm_inpainting/wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from omegaconf import OmegaConf 6 | import numpy as np 7 | 8 | from .ldm.models.diffusion.ddim import DDIMSampler 9 | from .ldm.util import instantiate_from_config 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.append(CURRENT_DIR) 13 | 14 | 15 | def make_batch(image, mask, device): 16 | image = image.astype(np.float32) / 255.0 17 | image = image[None].transpose(0, 3, 1, 2) 18 | image = torch.from_numpy(image) 19 | 20 | mask = mask.astype(np.float32) / 255.0 21 | mask = mask[None, None] 22 | mask[mask < 0.5] = 0 23 | mask[mask >= 0.5] = 1 24 | mask = torch.from_numpy(mask) 25 | 26 | masked_image = (1 - mask) * image 27 | 28 | batch = {"image": image, "mask": mask, "masked_image": masked_image} 29 | for k in batch: 30 | batch[k] = batch[k].to(device=device) 31 | batch[k] = batch[k] * 2.0 - 1.0 32 | return batch 33 | 34 | 35 | class LDMInpainter: 36 | def __init__(self, 
ckpt_path, ddim_steps=50): 37 | config = OmegaConf.load(os.path.join(CURRENT_DIR, "config.yaml")) 38 | model = instantiate_from_config(config.model) 39 | model.load_state_dict(torch.load(ckpt_path)["state_dict"], strict=False) 40 | self.model = model 41 | self.sampler = DDIMSampler(model) 42 | self.ddim_steps = ddim_steps 43 | 44 | @torch.no_grad() 45 | def __call__(self, image, mask, device): 46 | self.model.to(device) 47 | 48 | model = self.model 49 | sampler = self.sampler 50 | 51 | with self.model.ema_scope(): 52 | batch = make_batch(image, mask, device=device) 53 | 54 | # encode masked image and concat downsampled mask 55 | c = model.cond_stage_model.encode(batch["masked_image"]) 56 | cc = torch.nn.functional.interpolate(batch["mask"], 57 | size=c.shape[-2:]) 58 | c = torch.cat((c, cc), dim=1) 59 | 60 | shape = (c.shape[1] - 1,) + c.shape[2:] 61 | samples_ddim, _ = sampler.sample(S=self.ddim_steps, 62 | conditioning=c, 63 | batch_size=c.shape[0], 64 | shape=shape, 65 | verbose=False) 66 | x_samples_ddim = model.decode_first_stage(samples_ddim) 67 | 68 | image = torch.clamp((batch["image"] + 1.0) / 2.0, 69 | min=0.0, max=1.0) 70 | mask = torch.clamp((batch["mask"] + 1.0) / 2.0, 71 | min=0.0, max=1.0) 72 | predicted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, 73 | min=0.0, max=1.0) 74 | 75 | inpainted = (1 - mask) * image + mask * predicted_image 76 | inpainted = inpainted.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 77 | 78 | # offload to cpu to save memory 79 | self.model.to(torch.device('cpu')) 80 | return inpainted.astype(np.uint8) 81 | -------------------------------------------------------------------------------- /cllm/services/image_inpainting/tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | import wget 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | 10 | from .ldm_inpainting.ldm.models.diffusion.ddim import DDIMSampler 11 | from .ldm_inpainting.ldm.util import instantiate_from_config 12 | 13 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | 16 | def cal_dilate_factor(mask): 17 | area = mask[mask != 0].sum() 18 | edge = cv2.Canny(mask, 30, 226) 19 | perimeter = edge.sum() 20 | ratio = 0 21 | if perimeter > 0: 22 | ratio = int(area * 0.55 / perimeter) 23 | if ratio % 2 == 0: 24 | ratio += 1 25 | return ratio 26 | 27 | 28 | def dilate_mask(mask, dilate_factor=9): 29 | # dilate mask 30 | mask = mask.astype(np.uint8) 31 | dilated_mask = cv2.dilate(mask, np.ones((dilate_factor, dilate_factor), np.uint8), iterations=1) 32 | 33 | return dilated_mask 34 | 35 | 36 | def make_batch(image, mask, device): 37 | image = image.astype(np.float32) / 255.0 38 | image = image[None].transpose(0, 3, 1, 2) 39 | image = torch.from_numpy(image) 40 | 41 | mask = mask.astype(np.float32) / 255.0 42 | mask = mask[None, None] 43 | mask[mask < 0.5] = 0 44 | mask[mask >= 0.5] = 1 45 | mask = torch.from_numpy(mask) 46 | 47 | masked_image = (1 - mask) * image 48 | 49 | batch = {"image": image, "mask": mask, "masked_image": masked_image} 50 | for k in batch: 51 | batch[k] = batch[k].to(device=device) 52 | batch[k] = batch[k] * 2.0 - 1.0 53 | return batch 54 | 55 | 56 | class LDMInpainting: 57 | def __init__(self, device): 58 | self.model_checkpoint_path = 'model_zoo/ldm_inpainting_big.ckpt' 59 | config = os.path.join(CURRENT_DIR, 'ldm_inpainting/config.yaml') 60 | self.ddim_steps = 50 61 | self.device = device 62 | config = OmegaConf.load(config) 63 | 
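# Editor's note (hedged sketch): `instantiate_from_config` from ldm/util.py builds the model
# object dynamically from the OmegaConf node loaded above. Conceptually it does roughly the
# following, assuming the config node carries a dotted `target` class path plus optional
# `params` (names follow the usual ldm convention and are not re-verified here):
#
#     import importlib
#     def instantiate_from_config(cfg):
#         module_name, cls_name = cfg["target"].rsplit(".", 1)
#         cls = getattr(importlib.import_module(module_name), cls_name)
#         return cls(**cfg.get("params", dict()))
#
# so the next line constructs the latent-diffusion inpainting model declared in
# ldm_inpainting/config.yaml without this file importing that class directly.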
model = instantiate_from_config(config.model) 64 | self.download_parameters() 65 | model.load_state_dict(torch.load(self.model_checkpoint_path)["state_dict"], strict=False) 66 | self.model = model.to(device=device) 67 | self.sampler = DDIMSampler(model) 68 | 69 | def download_parameters(self): 70 | url = 'https://heibox.uni-heidelberg.de/f/4d9ac7ea40c64582b7c9/?dl=1' 71 | if not os.path.exists(self.model_checkpoint_path): 72 | wget.download(url, out=self.model_checkpoint_path) 73 | 74 | @torch.no_grad() 75 | def __call__(self, image, mask): 76 | mask = mask.convert('L') 77 | w, h = image.size 78 | image = image.resize((512, 512)) 79 | mask = mask.resize((512, 512)) 80 | image = np.array(image) 81 | mask = np.array(mask) 82 | dilate_factor = cal_dilate_factor(mask.astype(np.uint8)) 83 | mask = dilate_mask(mask, dilate_factor) 84 | 85 | with self.model.ema_scope(): 86 | batch = make_batch(image, mask, device=self.device) 87 | # encode masked image and concat downsampled mask 88 | c = self.model.cond_stage_model.encode(batch["masked_image"]) 89 | cc = torch.nn.functional.interpolate(batch["mask"], 90 | size=c.shape[-2:]) 91 | c = torch.cat((c, cc), dim=1) 92 | 93 | shape = (c.shape[1] - 1,) + c.shape[2:] 94 | samples_ddim, _ = self.sampler.sample(S=self.ddim_steps, 95 | conditioning=c, 96 | batch_size=c.shape[0], 97 | shape=shape, 98 | verbose=False) 99 | x_samples_ddim = self.model.decode_first_stage(samples_ddim) 100 | 101 | image = torch.clamp((batch["image"] + 1.0) / 2.0, 102 | min=0.0, max=1.0) 103 | mask = torch.clamp((batch["mask"] + 1.0) / 2.0, 104 | min=0.0, max=1.0) 105 | predicted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, 106 | min=0.0, max=1.0) 107 | 108 | inpainted = (1 - mask) * image + mask * predicted_image 109 | inpainted = inpainted.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 110 | 111 | inpainted = inpainted.astype(np.uint8) 112 | new_img = Image.fromarray(inpainted) 113 | new_img = new_img.resize((w, h)) 114 | return new_img 115 | 116 | def to(self, device): 117 | self.model.to(device) 118 | -------------------------------------------------------------------------------- /cllm/services/image_perception/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/image_perception/__init__.py -------------------------------------------------------------------------------- /cllm/services/image_perception/configs/GroundingDINO_SwinT_OGC.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_T_224_1k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = 
"bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /cllm/services/image_perception/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CURL_CA_BUNDLE"] = "" 4 | 5 | import argparse 6 | import codecs 7 | import uvicorn 8 | from fastapi import FastAPI, UploadFile, File, Form, Body 9 | from fastapi.responses import JSONResponse, Response 10 | 11 | from PIL import Image 12 | import io 13 | import pickle 14 | import json 15 | 16 | from .tools import * 17 | from cllm.services import app, pool 18 | from cllm.services.utils import ImageResponse 19 | from ..hf_pipeline import HuggingfacePipeline 20 | 21 | parser = argparse.ArgumentParser(description="Image Perception API") 22 | parser.add_argument("--host", type=str, default="localhost", help="Host") 23 | parser.add_argument("--port", type=int, default=10049, help="Port") 24 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 25 | args = parser.parse_args() 26 | 27 | 28 | def SAM(): 29 | return SegmentAnythingStateful(args.device) 30 | 31 | 32 | @app.post("/object_detection") 33 | @pool.register(lambda: HuggingfacePipeline("object-detection", args.device)) 34 | async def object_detection(image: UploadFile = File(None)): 35 | image.file.seek(0) 36 | image_bytes = image.file.read() 37 | image = Image.open(io.BytesIO(image_bytes)) 38 | model = object_detection.__wrapped__.model 39 | output = model(image) 40 | return JSONResponse(output) 41 | 42 | 43 | @app.post("/image_classification") 44 | @pool.register(lambda: HuggingfacePipeline("image-classification", args.device)) 45 | async def image_classification(image: UploadFile = File(None)): 46 | image_bytes = image.file.read() 47 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 48 | model = image_classification.__wrapped__.model 49 | output = model(image) 50 | return JSONResponse(output) 51 | 52 | 53 | @app.post("/image_to_text") 54 | @pool.register(lambda: HuggingfacePipeline("image-to-text", args.device)) 55 | async def image_to_text(image: UploadFile = File(None)): 56 | image_bytes = image.file.read() 57 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 58 | model = image_to_text.__wrapped__.model 59 | output = model(image) 60 | return JSONResponse(output) 61 | 62 | 63 | @app.post("/ocr") 64 | @pool.register(lambda: OCR(args.device)) 65 | async def ocr(image: UploadFile = File(None)): 66 | image_bytes = image.file.read() 67 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 68 | model = ocr.__wrapped__.model 69 | output = model(image) 70 | return JSONResponse(output) 71 | 72 | 73 | @app.post("/segment_objects") 74 | @pool.register(lambda: HuggingfacePipeline("image-segmentation", args.device)) 75 | async def segment_objects(image: UploadFile = File(None)): 76 | image_bytes = image.file.read() 77 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 78 | model = segment_objects.__wrapped__.model 79 | output = model(image) 80 | pickled = codecs.encode(pickle.dumps(output), "base64").decode() 81 | return JSONResponse({"data": pickled}) 82 | 83 | 84 | @app.post("/visual_grounding") 85 | @pool.register(lambda: VisualGrounding(args.device)) 86 | async def 
visual_grounding(query: str = Form(...), image: UploadFile = File(...)): 87 | image_bytes = image.file.read() 88 | image = Image.open(io.BytesIO(image_bytes)).convert("RGB") 89 | model = visual_grounding.__wrapped__.model 90 | coordinates = model(image, query) 91 | print(coordinates) 92 | return JSONResponse(coordinates) 93 | 94 | 95 | @app.post("/captioning_blip") 96 | @pool.register(lambda: BLIPImageCaptioning(args.device)) 97 | async def captioning_blip(image: UploadFile = File(None)): 98 | image_bytes = image.file.read() 99 | image = Image.open(io.BytesIO(image_bytes)) 100 | model = captioning_blip.__wrapped__.model 101 | output = model(image) 102 | return output 103 | 104 | 105 | @app.post("/segment_all") 106 | @pool.register(SAM) 107 | async def segment_all(image: UploadFile = File(None)): 108 | image_bytes = image.file.read() 109 | image = Image.open(io.BytesIO(image_bytes)) 110 | model = segment_all.__wrapped__.model 111 | output = model(image) 112 | return ImageResponse(output) 113 | 114 | 115 | @app.post("/set_image") 116 | @pool.register(SAM) 117 | async def set_image(image: UploadFile = File(None)): 118 | image_bytes = image.file.read() 119 | image = Image.open(io.BytesIO(image_bytes)) 120 | model = set_image.__wrapped__.model 121 | output = model.set_image(image) 122 | return Response(content=output) 123 | 124 | 125 | @app.post("/segment_by_mask") 126 | @pool.register(SAM) 127 | async def segment_by_mask(mask: UploadFile = File(None), image_id: str = Form(...)): 128 | image_bytes = mask.file.read() 129 | image = Image.open(io.BytesIO(image_bytes)) 130 | model = segment_by_mask.__wrapped__.model 131 | output = model.segment_by_mask(image, image_id) 132 | return ImageResponse(output) 133 | 134 | 135 | @app.post("/segment_by_points") 136 | @pool.register(SAM) 137 | async def segment_by_points(points: str | list = Body(...), image_id: str = Form(...)): 138 | if isinstance(points, str): 139 | points = json.loads(points) 140 | 141 | model = segment_by_points.__wrapped__.model 142 | output = model.segment_by_points(points, image_id) 143 | return ImageResponse(output) 144 | 145 | 146 | if __name__ == "__main__": 147 | uvicorn.run(app, host=args.host, port=args.port) 148 | -------------------------------------------------------------------------------- /cllm/services/image_processing/__init__.py: -------------------------------------------------------------------------------- 1 | from .tools import ( 2 | Image2Canny, Image2Line, Image2Hed, Image2Scribble, 3 | Image2Pose, Image2Depth, Image2Normal 4 | ) 5 | -------------------------------------------------------------------------------- /cllm/services/image_processing/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import requests 5 | from PIL import Image 6 | from cllm.services.utils import get_bytes_value 7 | 8 | __ALL__ = [ 9 | "image2canny", 10 | "image2line", 11 | "image2hed", 12 | "image2scribble", 13 | "image2pose", 14 | "image2depth", 15 | "image2normal", 16 | ] 17 | 18 | 19 | HOST = "localhost" 20 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 21 | 22 | 23 | def setup(host="localhost", port=10049): 24 | global HOST, PORT 25 | HOST = host 26 | PORT = port 27 | 28 | 29 | def image2anything(image: Image, endpoint="image2line", **kwargs): 30 | host = kwargs.get("host", HOST) 31 | port = kwargs.get("port", PORT) 32 | url = f"http://{host}:{port}/{endpoint}" 33 | files = {"image": (image, get_bytes_value(image))} 34 | response = requests.post(url, 
files=files) 35 | return response.content 36 | 37 | 38 | def image2canny(image: Image, **kwargs): 39 | return image2anything(image, endpoint="image2canny", **kwargs) 40 | 41 | 42 | def image2line(image: Image, **kwargs): 43 | return image2anything(image, endpoint="image2line", **kwargs) 44 | 45 | 46 | def image2hed(image: Image, **kwargs): 47 | return image2anything(image, endpoint="image2hed", **kwargs) 48 | 49 | 50 | def image2scribble(image: Image, **kwargs): 51 | return image2anything(image, endpoint="image2scribble", **kwargs) 52 | 53 | 54 | def image2pose(image: Image, **kwargs): 55 | return image2anything(image, endpoint="image2pose", **kwargs) 56 | 57 | 58 | def image2depth(image: Image, **kwargs): 59 | return image2anything(image, endpoint="image2depth", **kwargs) 60 | 61 | 62 | def image2normal(image: Image, **kwargs): 63 | return image2anything(image, endpoint="image2normal", **kwargs) 64 | -------------------------------------------------------------------------------- /cllm/services/image_processing/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from fastapi import UploadFile, File 5 | from fastapi.responses import StreamingResponse 6 | from PIL import Image 7 | import io 8 | 9 | from .tools import * 10 | 11 | from cllm.services import app, pool 12 | 13 | parser = argparse.ArgumentParser(description="Image Transformation API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument("--port", type=int, default=10049, help="Port") 16 | args = parser.parse_args() 17 | 18 | 19 | def ImageResponse(image): 20 | img_stream = io.BytesIO() 21 | image.save(img_stream, format="png") 22 | img_stream.seek(0) 23 | 24 | return StreamingResponse(img_stream, media_type="image/png") 25 | 26 | 27 | @app.post("/image2canny") 28 | @pool.register(lambda: Image2Canny()) 29 | async def image2canny(image: UploadFile = File(None)): 30 | image_bytes = image.file.read() 31 | image = Image.open(io.BytesIO(image_bytes)) 32 | model = image2canny.__wrapped__.model 33 | output = model(image) 34 | return ImageResponse(output) 35 | 36 | 37 | @app.post("/image2line") 38 | @pool.register(lambda: Image2Line()) 39 | async def image2line(image: UploadFile = File(None)): 40 | image_bytes = image.file.read() 41 | image = Image.open(io.BytesIO(image_bytes)) 42 | model = image2line.__wrapped__.model 43 | output = model(image) 44 | return ImageResponse(output) 45 | 46 | 47 | @app.post("/image2hed") 48 | @pool.register(lambda: Image2Hed()) 49 | async def image2hed(image: UploadFile = File(None)): 50 | image_bytes = image.file.read() 51 | image = Image.open(io.BytesIO(image_bytes)) 52 | model = image2hed.__wrapped__.model 53 | output = model(image) 54 | return ImageResponse(output) 55 | 56 | 57 | @app.post("/image2scribble") 58 | @pool.register(lambda: Image2Scribble()) 59 | async def image2scribble(image: UploadFile = File(None)): 60 | image_bytes = image.file.read() 61 | image = Image.open(io.BytesIO(image_bytes)) 62 | model = image2scribble.__wrapped__.model 63 | output = model(image) 64 | return ImageResponse(output) 65 | 66 | 67 | @app.post("/image2pose") 68 | @pool.register(lambda: Image2Pose()) 69 | async def image2pose(image: UploadFile = File(None)): 70 | image_bytes = image.file.read() 71 | image = Image.open(io.BytesIO(image_bytes)) 72 | model = image2pose.__wrapped__.model 73 | output = model(image) 74 | return ImageResponse(output) 75 | 76 | 77 | @app.post("/image2depth") 
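# Editor's note (hedged): every endpoint in this file follows the same two-decorator pattern.
# `app.post` registers the FastAPI route, while `pool.register` (see cllm/services/pool.py)
# defers construction of the tool -- here Image2Depth -- until the first request, caches it in
# the shared ModelPool, and moves the least recently used models back to CPU whenever a
# RuntimeError (typically CUDA out-of-memory) is raised. The constructed instance is what each
# handler then reads back through `<endpoint>.__wrapped__.model`.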
78 | @pool.register(lambda: Image2Depth()) 79 | async def image2depth(image: UploadFile = File(None)): 80 | image_bytes = image.file.read() 81 | image = Image.open(io.BytesIO(image_bytes)) 82 | model = image2depth.__wrapped__.model 83 | output = model(image) 84 | return ImageResponse(output) 85 | 86 | 87 | @app.post("/image2normal") 88 | @pool.register(lambda: Image2Normal()) 89 | async def image2normal(image: UploadFile = File(None)): 90 | image_bytes = image.file.read() 91 | image = Image.open(io.BytesIO(image_bytes)) 92 | model = image2normal.__wrapped__.model 93 | output = model(image) 94 | return ImageResponse(output) 95 | 96 | 97 | if __name__ == "__main__": 98 | uvicorn.run(app, host=args.host, port=args.port) 99 | -------------------------------------------------------------------------------- /cllm/services/image_processing/tools.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | import cv2 4 | 5 | from transformers import pipeline 6 | from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector 7 | 8 | 9 | class Image2Canny: 10 | def __init__(self, device='cpu'): 11 | self.device = device 12 | self.low_threshold = 100 13 | self.high_threshold = 200 14 | 15 | def __call__(self, image): 16 | image = np.array(image) 17 | canny = cv2.Canny(image, self.low_threshold, self.high_threshold) 18 | canny = canny[:, :, None] 19 | canny = np.concatenate([canny, canny, canny], axis=2) 20 | canny = Image.fromarray(canny) 21 | return canny 22 | 23 | def to(self, device): 24 | pass 25 | 26 | 27 | class Image2Line: 28 | def __init__(self, device='cpu'): 29 | self.device = device 30 | self.detector = MLSDdetector.from_pretrained('lllyasviel/Annotators') 31 | 32 | def __call__(self, image): 33 | mlsd = self.detector(image) 34 | return mlsd 35 | 36 | def to(self, device): 37 | pass 38 | 39 | 40 | class Image2Hed: 41 | def __init__(self, device='cpu'): 42 | self.device = device 43 | self.detector = HEDdetector.from_pretrained('lllyasviel/Annotators') 44 | 45 | def __call__(self, image): 46 | hed = self.detector(image) 47 | return hed 48 | 49 | def to(self, device): 50 | pass 51 | 52 | 53 | class Image2Scribble: 54 | def __init__(self, device='cpu'): 55 | self.device = device 56 | self.detector = HEDdetector.from_pretrained('lllyasviel/Annotators') 57 | 58 | def __call__(self, image): 59 | scribble = self.detector(image, scribble=True) 60 | return scribble 61 | 62 | def to(self, device): 63 | pass 64 | 65 | 66 | class Image2Pose: 67 | def __init__(self, device='cpu'): 68 | self.device = device 69 | self.detector = OpenposeDetector.from_pretrained('lllyasviel/Annotators') 70 | 71 | def __call__(self, image): 72 | pose = self.detector(image) 73 | return pose 74 | 75 | def to(self, device): 76 | pass 77 | 78 | 79 | class Image2Depth: 80 | def __init__(self, device='cpu'): 81 | self.device = device 82 | self.depth_estimator = pipeline('depth-estimation') 83 | 84 | def __call__(self, image): 85 | depth = self.depth_estimator(image)['depth'] 86 | depth = np.array(depth) 87 | depth = depth[:, :, None] 88 | depth = np.concatenate([depth, depth, depth], axis=2) 89 | depth = Image.fromarray(depth) 90 | return depth 91 | 92 | def to(self, device): 93 | pass 94 | 95 | 96 | class Image2Normal: 97 | def __init__(self, device='cpu'): 98 | self.device = device 99 | self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas") 100 | self.bg_threhold = 0.4 101 | 102 | def __call__(self, image): 103 | 
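# Editor's note (hedged summary of the steps below): the normal map is derived from monocular
# depth. The DPT depth estimator predicts a depth map, which is normalized to [0, 1]; Sobel
# gradients along x and y are taken and zeroed wherever the normalized depth falls below
# self.bg_threhold (treated as background); a constant 2*pi fills the z channel; the per-pixel
# (x, y, z) vectors are L2-normalized and rescaled to 0-255 RGB; finally the result is resized
# back to the original image size.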
original_size = image.size 104 | image = self.depth_estimator(image)['predicted_depth'][0] 105 | image = image.numpy() 106 | image_depth = image.copy() 107 | image_depth -= np.min(image_depth) 108 | image_depth /= np.max(image_depth) 109 | x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3) 110 | x[image_depth < self.bg_threhold] = 0 111 | y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3) 112 | y[image_depth < self.bg_threhold] = 0 113 | z = np.ones_like(x) * np.pi * 2.0 114 | image = np.stack([x, y, z], axis=2) 115 | image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5 116 | image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8) 117 | image = Image.fromarray(image) 118 | image = image.resize(original_size) 119 | return image 120 | 121 | def to(self, device): 122 | pass 123 | -------------------------------------------------------------------------------- /cllm/services/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import uvicorn 3 | 4 | from cllm.services import app 5 | from .nlp.launch import * 6 | from .video.launch import * 7 | from .audio.launch import * 8 | from .image_editing.launch import * 9 | from .image_generation.launch import * 10 | from .image_perception.launch import * 11 | from .image_processing.launch import * 12 | from .vqa.launch import * 13 | from .general.launch import * 14 | 15 | RESOURCE_ROOT = os.environ.get("SERVER_ROOT", "./server_resources") 16 | os.makedirs(RESOURCE_ROOT, exist_ok=True) 17 | 18 | parser = argparse.ArgumentParser(description="TOG Services") 19 | parser.add_argument("--host", type=str, default="localhost", help="Host") 20 | parser.add_argument("--port", type=int, default=10056, help="Port") 21 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 22 | args = parser.parse_args() 23 | 24 | 25 | if __name__ == "__main__": 26 | uvicorn.run(app, host=args.host, port=args.port) 27 | -------------------------------------------------------------------------------- /cllm/services/llama2/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | __ALL__ = ["llama2_chat"] 5 | 6 | 7 | HOST = "localhost" 8 | PORT = os.environ.get("CLLM_LLAMA2_PORT", 10051) 9 | 10 | 11 | def setup(host="localhost", port=10051): 12 | global HOST, PORT 13 | HOST = host 14 | PORT = port 15 | 16 | 17 | def llama2_chat(messages, **kwargs): 18 | host = kwargs.get("host", HOST) 19 | port = kwargs.get("port", PORT) 20 | url = f"http://{host}:{port}/llama2_chat" 21 | response = requests.post(url, json=messages) 22 | return response.content.decode() 23 | -------------------------------------------------------------------------------- /cllm/services/llama2/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | from typing import Any, Dict, AnyStr, List, Union 5 | 6 | from cllm.services import app, pool 7 | from .llama2 import * 8 | 9 | JSONObject = Dict[AnyStr, Any] 10 | JSONArray = List[Any] 11 | JSONStructure = Union[JSONArray, JSONObject] 12 | 13 | parser = argparse.ArgumentParser(description="LLAMA2 API") 14 | parser.add_argument("--host", type=str, default="localhost", help="Host") 15 | parser.add_argument( 16 | "--model", 17 | type=str, 18 | default="/mnt/afs/share_data/tianhao2/llama2/Llama-2-13b-chat-hf", 19 | help="model path", 20 | ) 21 | parser.add_argument("--port", type=int, default=10051, help="Port") 22 | 
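# Editor's note (hedged): the --model default above is a cluster-specific path; when running
# elsewhere, pass --model pointing at a locally available Llama-2 chat checkpoint that
# LLaMABot (imported from .llama2, not shown in this listing) can load.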
parser.add_argument("--device", type=str, default="cuda:0", help="Device") 23 | args = parser.parse_args() 24 | 25 | 26 | @app.post("/llama2_chat") 27 | @pool.register(lambda: LLaMABot(args.device, args.model)) 28 | async def llama2_chat(messages: JSONStructure = None): 29 | model = llama2_chat.__wrapped__.model 30 | output = model(messages) 31 | return output 32 | 33 | 34 | if __name__ == "__main__": 35 | uvicorn.run(app, host=args.host, port=args.port) 36 | -------------------------------------------------------------------------------- /cllm/services/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/nlp/__init__.py -------------------------------------------------------------------------------- /cllm/services/nlp/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | 5 | import requests 6 | import json 7 | from .llms.chat_models import ChatOpenAI 8 | from langchain.schema import ( 9 | HumanMessage, 10 | SystemMessage, 11 | AIMessage, 12 | ) 13 | from typing import ( 14 | TYPE_CHECKING, 15 | Any, 16 | AsyncIterator, 17 | Callable, 18 | Dict, 19 | Iterator, 20 | List, 21 | Mapping, 22 | Optional, 23 | Tuple, 24 | Type, 25 | Union, 26 | ) 27 | 28 | __ALL__ = [ 29 | "text_to_text_generation", 30 | "title_generation", 31 | "text_to_tags", 32 | "question_answering", 33 | "summarization", 34 | ] 35 | 36 | 37 | HOST = "localhost" 38 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 39 | 40 | 41 | def setup(host="localhost", port=10056): 42 | global HOST, PORT 43 | HOST = host 44 | PORT = port 45 | 46 | 47 | def text_to_text_generation(text: str, **kwargs): 48 | host = kwargs.get("host", HOST) 49 | port = kwargs.get("port", PORT) 50 | url = f"http://{host}:{port}/text_to_text_generation" 51 | data = {"text": text} 52 | response = requests.post(url, data=data) 53 | return response.json() 54 | 55 | 56 | def question_answering_with_context(context: str, question: str, **kwargs): 57 | host = kwargs.get("host", HOST) 58 | port = kwargs.get("port", PORT) 59 | url = f"http://{host}:{port}/question_answering_with_context" 60 | data = {"context": context, "question": question} 61 | response = requests.post(url, data=data) 62 | return response.json() 63 | 64 | 65 | def openai_chat_model(input_msg: str, **kwargs): 66 | chat = ChatOpenAI(model_name="gpt-3.5-turbo-16k") 67 | chat_log = [] 68 | default_sys_msg = "Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai AI Lab. You need to respond to user requests based on the following information." 
69 | sys_msg = kwargs.get("sys_msg", default_sys_msg) 70 | if sys_msg is not None: 71 | chat_log.append(SystemMessage(content=sys_msg)) 72 | # history_msgs: list[str] 73 | history_msgs = [] 74 | if "history_msgs" in kwargs: 75 | history_msgs = kwargs.get("history_msgs", []) 76 | 77 | for item in history_msgs: 78 | if isinstance(item[0], (list, tuple)): 79 | item[0] = "Received file: " + item[0][0] 80 | if isinstance(item[1], (list, tuple)): 81 | item[1] = "Generated file: " + item[1][0] 82 | if item[0] is not None: 83 | chat_log.append(HumanMessage(content=item[0])) 84 | if item[1] is not None: 85 | chat_log.append(AIMessage(content=item[1])) 86 | # chat_log.extend([HumanMessage(content=item[0]), AIMessage(content=item[1])]) 87 | if not isinstance(input_msg, str): 88 | input_msg = json.dumps(input_msg, ensure_ascii=False) 89 | output = chat(chat_log + [HumanMessage(content=input_msg)]) 90 | return output 91 | 92 | 93 | def title_generation(text: str, **kwargs): 94 | question = "summarize" 95 | response = question_answering_with_context(text, question) 96 | return response 97 | 98 | 99 | def summarization(text: str, **kwargs): 100 | host = kwargs.get("host", HOST) 101 | port = kwargs.get("port", PORT) 102 | url = f"http://{host}:{port}/summarization" 103 | data = {"text": text} 104 | response = requests.post(url, data=data) 105 | return response.json() 106 | 107 | 108 | def text_to_tags(text: str, **kwargs): 109 | host = kwargs.get("host", HOST) 110 | port = kwargs.get("port", PORT) 111 | url = f"http://{host}:{port}/text_to_tags" 112 | data = {"text": text} 113 | response = requests.post(url, data=data) 114 | return response.json() 115 | 116 | 117 | def get_time(location: str = None, **kwargs): 118 | return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 119 | 120 | 121 | def get_weather(location: str | list, **kwargs): 122 | host = kwargs.get("host", HOST) 123 | port = kwargs.get("port", PORT) 124 | url = f"http://{host}:{port}/get_weather" 125 | if isinstance(location, list): 126 | t = {"CITY": "", "COUNTRY": ""} 127 | for l in location: 128 | if l["entity_group"] not in t.keys(): 129 | continue 130 | if t[l["entity_group"]] == "": 131 | t[l["entity_group"]] = l["word"].title() 132 | location = ",".join([t["CITY"], t["COUNTRY"]]) 133 | 134 | data = {"location": location} 135 | response = requests.post(url, data=data) 136 | return response.json() 137 | 138 | 139 | def summarize_weather_condition(weather: str | list, **kwargs): 140 | if isinstance(weather, list): 141 | weather = json.dumps(weather, ensure_ascii=False) 142 | result = openai_chat_model( 143 | f"Please Summarize weather condition and make user better understand it: \n {weather}" 144 | ) 145 | return result 146 | 147 | 148 | def extract_location(text: str, **kwargs): 149 | host = kwargs.get("host", HOST) 150 | port = kwargs.get("port", PORT) 151 | url = f"http://{host}:{port}/extract_location" 152 | data = {"text": text} 153 | response = requests.post(url, data=data) 154 | return response.json() 155 | 156 | 157 | def sentiment_analysis(text: str, **kwargs): 158 | host = kwargs.get("host", HOST) 159 | port = kwargs.get("port", PORT) 160 | url = f"http://{host}:{port}/sentiment_analysis" 161 | data = {"text": text} 162 | response = requests.post(url, data=data) 163 | return response.json() 164 | -------------------------------------------------------------------------------- /cllm/services/nlp/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import 
uvicorn 4 | from fastapi import Form 5 | from fastapi.responses import JSONResponse, Response 6 | import json 7 | 8 | from .tools import * 9 | from cllm.services import app, pool 10 | from ..hf_pipeline import HuggingfacePipelineNLP 11 | 12 | parser = argparse.ArgumentParser(description="Image Perception API") 13 | parser.add_argument("--host", type=str, default="localhost", help="Host") 14 | parser.add_argument("--port", type=int, default=10049, help="Port") 15 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 16 | args = parser.parse_args() 17 | 18 | 19 | class RawResponse(Response): 20 | media_type = "binary/octet-stream" 21 | 22 | def render(self, content: bytes) -> bytes: 23 | return bytes([b ^ 0x54 for b in content]) 24 | 25 | 26 | @app.post("/question_answering_with_context") 27 | @pool.register( 28 | lambda: HuggingfacePipelineNLP( 29 | "question-answering", args.device, model="deepset/roberta-base-squad2" 30 | ) 31 | ) 32 | async def question_answering_with_context( 33 | context: str = Form(...), question: str = Form(...) 34 | ): 35 | model = question_answering_with_context.__wrapped__.model 36 | output = model({"context": context, "question": question}) 37 | return JSONResponse(output) 38 | 39 | 40 | @app.post("/text_to_text_generation") 41 | @pool.register( 42 | lambda: HuggingfacePipelineNLP( 43 | "text2text-generation", args.device, model="google/flan-t5-base" 44 | ) 45 | ) 46 | async def text_to_text_generation(text: str = Form(...)): 47 | model = text_to_text_generation.__wrapped__.model 48 | output = model(text) 49 | return JSONResponse(output) 50 | 51 | 52 | @app.post("/text_to_tags") 53 | @pool.register(lambda: Text2Tags(args.device)) 54 | async def text_to_tags(text: str = Form(...)): 55 | model = text_to_tags.__wrapped__.model 56 | output = model(text) 57 | return JSONResponse(output) 58 | 59 | 60 | @app.post("/sentiment_analysis") 61 | @pool.register( 62 | lambda: HuggingfacePipelineNLP( 63 | device=args.device, 64 | model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 65 | ) 66 | ) 67 | async def sentiment_analysis(text: str = Form(...)): 68 | model = sentiment_analysis.__wrapped__.model 69 | output = model(text) 70 | return JSONResponse(output) 71 | 72 | 73 | @app.post("/summarization") 74 | @pool.register(lambda: HuggingfacePipelineNLP("summarization", device=args.device)) 75 | async def summarization(text: str = Form(...)): 76 | model = summarization.__wrapped__.model 77 | output = model(text) 78 | return JSONResponse(output) 79 | 80 | 81 | @app.post("/get_weather") 82 | @pool.register(lambda: WeatherAPI(device=args.device)) 83 | async def get_weather(location: str = Form(...)): 84 | model = get_weather.__wrapped__.model 85 | output = model(location) 86 | return JSONResponse(output) 87 | 88 | 89 | @app.post("/extract_location") 90 | @pool.register( 91 | lambda: HuggingfacePipelineNLP( 92 | "ner", 93 | device=args.device, 94 | tokenizer="ml6team/bert-base-uncased-city-country-ner", 95 | model="ml6team/bert-base-uncased-city-country-ner", 96 | aggregation_strategy="simple", 97 | ) 98 | ) 99 | async def extract_location(text: str = Form(...)): 100 | model = extract_location.__wrapped__.model 101 | output = model(text) 102 | output = json.dumps(output, ensure_ascii=False, default=float) 103 | output = json.loads(output) 104 | return JSONResponse(output) 105 | 106 | 107 | if __name__ == "__main__": 108 | uvicorn.run(app, host=args.host, port=args.port) 109 | 
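Editor's note: the launcher above and the thin client wrappers in cllm/services/nlp/api.py are two halves of one HTTP contract; the FastAPI endpoints accept form fields and return JSON, and the api.py helpers simply POST to them. Below is a minimal, hedged client-side sketch, assuming the aggregated service launcher is running on its default port 10056 (e.g. via something like `python -m cllm.services.launch`; the standalone NLP launcher above defaults to 10049 instead) and that the underlying Hugging Face models have been downloaded. The example texts are illustrative only.

from cllm.services.nlp import api as nlp_api

# Override the module defaults (HOST/PORT, initially taken from CLLM_SERVICES_PORT).
nlp_api.setup(host="localhost", port=10056)

print(nlp_api.text_to_tags("Python is a high-level, general-purpose programming language."))
print(nlp_api.sentiment_analysis("I really enjoyed this movie!"))
print(nlp_api.summarization("ControlLLM decomposes a user request into subtasks, "
                            "assigns each subtask to a tool service over HTTP, "
                            "and assembles the tool outputs into a final response."))
print(nlp_api.question_answering_with_context(
    context="ControlLLM routes user requests to tool services over HTTP.",
    question="How are requests routed?",
))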
-------------------------------------------------------------------------------- /cllm/services/nlp/llms/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_models import ChatOpenAI 2 | from .memory import MessageMemory 3 | -------------------------------------------------------------------------------- /cllm/services/nlp/llms/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .message_memory import MessageMemory -------------------------------------------------------------------------------- /cllm/services/nlp/llms/memory/message_memory.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Dict 2 | from langchain.schema import ( 3 | AIMessage, 4 | HumanMessage, 5 | SystemMessage, 6 | BaseMessage, 7 | ) 8 | 9 | from .utils import count_tokens, get_max_context_length 10 | 11 | 12 | class MessageMemory: 13 | def __init__( 14 | self, 15 | max_tokens: int = -1, 16 | margin: int = 1500, 17 | messages: Optional[List[BaseMessage]] = None, 18 | ) -> None: 19 | self.max_tokens = max_tokens if max_tokens > 0 else 8e8 20 | self.margin = margin 21 | self.init_messages(messages) 22 | 23 | def reset(self) -> List[BaseMessage]: 24 | self.init_messages() 25 | return self.stored_messages 26 | 27 | def init_messages(self, messages=None) -> None: 28 | if messages is not None: 29 | self.stored_messages = messages 30 | else: 31 | self.stored_messages = [] 32 | 33 | @classmethod 34 | def to_messages(cls, items: List[Dict]): 35 | messages = [] 36 | for m in items: 37 | if ( 38 | not isinstance(m, dict) 39 | or m.get("role", None) is None 40 | or m.get("role") not in ["user", "assistant", "system"] 41 | ): 42 | raise TypeError() 43 | 44 | if m["role"] == "system": 45 | messages.append(SystemMessage(content=m["content"])) 46 | elif m["role"] == "user": 47 | messages.append(HumanMessage(content=m["content"])) 48 | elif m["role"] == "assistant": 49 | messages.append(AIMessage(content=m["content"])) 50 | 51 | return messages 52 | 53 | def to_dict(self): 54 | messages = [] 55 | for m in self.stored_messages: 56 | if not isinstance(m, BaseMessage) or m.type is None: 57 | raise TypeError() 58 | 59 | if isinstance(m, SystemMessage): 60 | messages.append({"role": "system", "content": m.content}) 61 | elif isinstance(m, HumanMessage): 62 | messages.append({"role": "user", "content": m.content}) 63 | elif isinstance(m, AIMessage): 64 | messages.append({"role": "assistant", "content": m.content}) 65 | 66 | return messages 67 | 68 | def get_memory(self): 69 | return self.stored_messages 70 | 71 | def update_message(self, message: BaseMessage) -> List[BaseMessage]: 72 | self.stored_messages.append(message) 73 | return self.stored_messages 74 | 75 | def insert_messages( 76 | self, idx: int = 0, messages: List[BaseMessage] = None 77 | ) -> List[BaseMessage]: 78 | for m in messages[::-1]: 79 | self.stored_messages.insert(idx, m) 80 | return self.stored_messages 81 | 82 | @classmethod 83 | def messages2str(self, history): 84 | history_text = "" 85 | for m in history: 86 | if isinstance(m, SystemMessage): 87 | history_text += ": " + m.content + "\n" 88 | elif isinstance(m, HumanMessage): 89 | history_text += ": " + m.content + "\n" 90 | elif isinstance(m, AIMessage): 91 | history_text += ": " + m.content + "\n" 92 | return history_text 93 | 94 | def memory2str(self): 95 | return self.messages2str(self.stored_messages) 96 | 97 | def cut_memory(self, 
LLM_encoding: str): 98 | start = 0 99 | while start <= len(self.stored_messages): 100 | # print(f'self.stored_messages = {self.stored_messages}') 101 | history = self.stored_messages[start:] 102 | history_text = self.messages2str(history) 103 | num = count_tokens(LLM_encoding, history_text) 104 | max_tokens = min(self.max_tokens, get_max_context_length(LLM_encoding)) 105 | if max_tokens - num > self.margin: 106 | self.stored_messages = self.stored_messages[start:] 107 | return self.stored_messages 108 | 109 | start += 1 110 | self.init_messages() 111 | return self.stored_messages 112 | 113 | 114 | if __name__ == "__main__": 115 | import os 116 | 117 | os.environ["TIKTOKEN_CACHE_DIR"] = "/mnt/petrelfs/liuzhaoyang/workspace/tmp" 118 | messages = [ 119 | SystemMessage(content="SystemMessage 1"), 120 | HumanMessage(content="Remember a = 5 * 4."), 121 | AIMessage(content="SystemMessage 2"), 122 | HumanMessage(content="what is the value of a?"), 123 | ] * 400 124 | print(SystemMessage(content="SystemMessage 1").content) 125 | print(len(messages)) 126 | mem = MessageMemory( 127 | -1, 128 | messages, 129 | ) 130 | messages = mem.cut_memory("gpt-3.5-turbo") 131 | print(len(messages)) 132 | -------------------------------------------------------------------------------- /cllm/services/nlp/llms/memory/utils.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | import os 3 | 4 | os.environ["TIKTOKEN_CACHE_DIR"] = os.path.join(os.path.expanduser("~"), "tmp") 5 | 6 | encodings = { 7 | "gpt-4": tiktoken.get_encoding("cl100k_base"), 8 | "gpt-4-32k": tiktoken.get_encoding("cl100k_base"), 9 | "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"), 10 | "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"), 11 | "gpt-3.5-turbo-0613": tiktoken.get_encoding("cl100k_base"), 12 | "gpt-3.5-turbo-16k": tiktoken.get_encoding("cl100k_base"), 13 | "gpt-3.5-turbo-1106": tiktoken.get_encoding("cl100k_base"), 14 | "text-davinci-003": tiktoken.get_encoding("p50k_base"), 15 | "text-davinci-002": tiktoken.get_encoding("p50k_base"), 16 | "text-davinci-001": tiktoken.get_encoding("r50k_base"), 17 | "text-curie-001": tiktoken.get_encoding("r50k_base"), 18 | "text-babbage-001": tiktoken.get_encoding("r50k_base"), 19 | "text-ada-001": tiktoken.get_encoding("r50k_base"), 20 | "davinci": tiktoken.get_encoding("r50k_base"), 21 | "curie": tiktoken.get_encoding("r50k_base"), 22 | "babbage": tiktoken.get_encoding("r50k_base"), 23 | "ada": tiktoken.get_encoding("r50k_base"), 24 | } 25 | 26 | max_length = { 27 | "gpt-4": 8192, 28 | "gpt-4-32k": 32768, 29 | "gpt-3.5-turbo": 4096, 30 | "gpt-3.5-turbo-0301": 4096, 31 | "gpt-3.5-turbo-0613": 4096, 32 | "gpt-3.5-turbo-16k": 16385, 33 | "gpt-3.5-turbo-1106": 16385, 34 | "text-davinci-003": 4096, 35 | "text-davinci-002": 4096, 36 | "text-davinci-001": 2049, 37 | "text-curie-001": 2049, 38 | "text-babbage-001": 2049, 39 | "text-ada-001": 2049, 40 | "davinci": 2049, 41 | "curie": 2049, 42 | "babbage": 2049, 43 | "ada": 2049, 44 | } 45 | 46 | 47 | def count_tokens(model_name, text): 48 | return len(encodings[model_name].encode(text)) 49 | 50 | 51 | def get_max_context_length(model_name): 52 | return max_length[model_name] 53 | -------------------------------------------------------------------------------- /cllm/services/nlp/tools.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import torch 3 | from transformers import pipeline, AutoTokenizer, 
AutoModelForSeq2SeqLM 4 | import nltk 5 | import requests 6 | import os 7 | 8 | nltk.download("punkt") 9 | 10 | 11 | class Text2Tags: 12 | def __init__(self, device): 13 | self.device = device 14 | self.tokenizer = AutoTokenizer.from_pretrained( 15 | "fabiochiu/t5-base-tag-generation" 16 | ) 17 | self.model = AutoModelForSeq2SeqLM.from_pretrained( 18 | "fabiochiu/t5-base-tag-generation", torch_dtype=torch.float16 19 | ) 20 | self.model.to(device) 21 | 22 | def __call__(self, text: str): 23 | inputs = self.tokenizer( 24 | [text], max_length=512, truncation=True, return_tensors="pt" 25 | ) 26 | inputs = inputs.to(device=self.device) 27 | output = self.model.generate( 28 | **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64 29 | ) 30 | decoded_output = self.tokenizer.batch_decode(output, skip_special_tokens=True)[ 31 | 0 32 | ] 33 | tags = set(decoded_output.strip().split(", ")) 34 | return list(tags) 35 | 36 | def to(self, device): 37 | self.model.to(device) 38 | 39 | 40 | class TitleGeneration: 41 | def __init__(self, device): 42 | self.device = device 43 | self.tokenizer = AutoTokenizer.from_pretrained( 44 | "fabiochiu/t5-small-medium-title-generation" 45 | ) 46 | self.model = AutoModelForSeq2SeqLM.from_pretrained( 47 | "fabiochiu/t5-small-medium-title-generation", 48 | torch_dtype=torch.float16, 49 | ) 50 | self.model.to(device) 51 | 52 | def __call__(self, text: str): 53 | inputs = self.tokenizer( 54 | [text], max_length=512, truncation=True, return_tensors="pt" 55 | ) 56 | inputs = inputs.to(device=self.device) 57 | output = self.model.generate( 58 | **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64 59 | ) 60 | decoded_output = self.tokenizer.batch_decode(output, skip_special_tokens=True)[ 61 | 0 62 | ] 63 | tags = set(decoded_output.strip().split(", ")) 64 | return list(tags) 65 | 66 | def to(self, device): 67 | self.model.to(device) 68 | 69 | 70 | class WeatherAPI: 71 | def __init__(self, device): 72 | self.device = device 73 | self.key = os.environ.get("WEATHER_API_KEY", "") 74 | self.url_api_weather = ( 75 | "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{{location}}?&unitGroup=metric&key=" 76 | + self.key 77 | ) 78 | 79 | def get(self, city): 80 | city = city.replace(" ", "%20") 81 | url = self.url_api_weather.replace("{{location}}", city) 82 | print(f"url: {url}") 83 | return requests.get(url).json() 84 | 85 | def remove(self, item): 86 | item.pop("hours", None) 87 | item.pop("source", None) 88 | item.pop("stations", None) 89 | item.pop("icon", None) 90 | item.pop("windgust", None) 91 | item.pop("moonphase", None) 92 | item.pop("datetimeEpoch", None) 93 | item.pop("sunriseEpoch", None) 94 | item.pop("sunsetEpoch", None) 95 | item.pop("solarenergy", None) 96 | item.pop("feelslike", None) 97 | item.pop("feelslikemin", None) 98 | item.pop("feelslikemax", None) 99 | item.pop("precip", None) 100 | return item 101 | 102 | def __call__(self, loc: str) -> dict: 103 | result = self.get(loc) 104 | json_data = OrderedDict() 105 | json_data["latitude"] = result["latitude"] 106 | json_data["longitude"] = result["longitude"] 107 | json_data["resolvedAddress"] = result["resolvedAddress"] 108 | json_data["address"] = result["address"] 109 | json_data["timezone"] = result["timezone"] 110 | json_data["tzoffset"] = result["tzoffset"] 111 | json_data["description"] = result["description"] 112 | json_data["measurement_units"] = [ 113 | {"Variable": "Temperature, Heat Index & Wind Chill", "Units": "Celsius"}, 114 | {"Variable": 
"Precipitation", "Units": "Millimeters"}, 115 | {"Variable": "Snow", "Units": "Centimeters"}, 116 | {"Variable": "Wind & Wind Gust", "Units": "Kilometers Per Hour"}, 117 | {"Variable": "Visibility", "Units": "Kilometers"}, 118 | {"Variable": "Pressure", "Units": "Millibars (Hectopascals)"}, 119 | {"Variable": "Solar Radiation", "Units": "W/m^2"}, 120 | ] 121 | json_data["days"] = [] 122 | result.pop("alerts") 123 | # json_data.pop("stations") 124 | result["days"] = result["days"][::3] 125 | for item in result["days"]: 126 | json_data["days"].append(self.remove(item)) 127 | 128 | json_data["currentConditions"] = self.remove(result["currentConditions"]) 129 | json_data["currentConditions"]["datetime"] = ( 130 | json_data["days"][0]["datetime"] 131 | + " " 132 | + json_data["currentConditions"]["datetime"] 133 | ) 134 | print(json_data) 135 | return json_data 136 | 137 | def to(self, device): 138 | pass 139 | 140 | 141 | if __name__ == "__main__": 142 | # text = """ 143 | # Python is a high-level, interpreted, general-purpose programming language. Its 144 | # design philosophy emphasizes code readability with the use of significant 145 | # indentation. Python is dynamically-typed and garbage-collected. 146 | # """ 147 | text = """A group of people are walking around in a field and a dog is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them and then a dog is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them and then a woman is walking in front of them and then a man is walking in front of them. 148 | """ 149 | model = Text2Tags("cuda:0") 150 | print(model(text)) 151 | -------------------------------------------------------------------------------- /cllm/services/pool.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from functools import wraps 3 | 4 | 5 | class ModelPool: 6 | def __init__(self): 7 | self.pool = OrderedDict() 8 | self.device_map = {} 9 | 10 | def register(self, model_fn): 11 | @wraps(model_fn) 12 | def wrapper(func): 13 | @wraps(func) 14 | async def innner_wrapper(*args, **kwargs): 15 | while True: 16 | try: 17 | model = self._load_model(model_fn) 18 | func.model = model 19 | return await func(*args, **kwargs) 20 | except RuntimeError as e: 21 | self._move_oldest_to_cpu(e) 22 | model = self._load_model(model_fn) 23 | func.model = model 24 | return innner_wrapper 25 | return wrapper 26 | 27 | def _load_model(self, model_fn): 28 | if model_fn not in self.pool: 29 | while True: 30 | try: 31 | self.pool[model_fn] = model_fn() 32 | break 33 | except RuntimeError as e: 34 | self._move_oldest_to_cpu(e) 35 | 36 | model = self.pool[model_fn] 37 | self.pool.move_to_end(model_fn) 38 | 39 | while True: 40 | try: 41 | model.to(model.device) 42 | break 43 | except RuntimeError as e: 44 | self._move_oldest_to_cpu(e) 45 | 46 | return model 47 | 48 | def _move_oldest_to_cpu(self, error): 49 | remove_at_least_one = False 50 | 51 | for model in self.pool.values(): 52 | if str(model.device) != 'cpu': 53 | model.to('cpu') 54 | remove_at_least_one = True 55 | break 56 | 57 | if not remove_at_least_one: 58 | raise error 59 | -------------------------------------------------------------------------------- /cllm/services/tog/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .tool import TaskSolver, TaskDecomposer 2 | from .configs.tog_config import config 3 | -------------------------------------------------------------------------------- /cllm/services/tog/api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | __ALL__ = ["tog", "task_decomposer"] 5 | 6 | 7 | HOST = "localhost" 8 | PORT = os.environ.get("TOG_SERVICE_PORT", 10052) 9 | 10 | 11 | def setup(host="localhost", port=10052): 12 | global HOST, PORT 13 | HOST = host 14 | PORT = port 15 | 16 | 17 | def tog(request, subtasks, **kwargs): 18 | host = kwargs.get("host", HOST) 19 | port = kwargs.get("port", PORT) 20 | stream = kwargs.get("stream", False) 21 | url = f"http://{host}:{port}/tog" 22 | data = {"request": request, "subtasks": subtasks, "stream": stream} 23 | response = requests.post(url, data=data, stream=stream) 24 | # if not stream: 25 | # response = response.content.decode("utf-8") 26 | # print(f"response.json(): {response.json()}") 27 | return response.json() 28 | 29 | 30 | def task_decomposer(request, **kwargs): 31 | host = kwargs.get("host", HOST) 32 | port = kwargs.get("port", PORT) 33 | stream = kwargs.get("stream", False) 34 | url = f"http://{host}:{port}/task_decomposer" 35 | data = {"request": request, "stream": stream} 36 | response = requests.post(url, data=data, stream=stream) 37 | # if not stream: 38 | # response = response.content.decode("utf-8") 39 | # return response.content.decode("utf-8") 40 | return response.json() 41 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/tog/configs/__init__.py -------------------------------------------------------------------------------- /cllm/services/tog/configs/resource_expert_prompts.py: -------------------------------------------------------------------------------- 1 | prompts = dict( 2 | system_resource_global_prompt="""The AI assistant needs to find the inputs corresponding to each tool from the context and respond in json format. Please notice that AI assistant should never fake the resources that do not exist. 3 | 4 | The AI assistant can infer the absent input parameters from the context and respond with JSON format as follows: [{"image": "xxx.png"}, {"bbox": "-detr-bbox-0"}, "text": ""] 5 | 6 | AI assistant should always respond in the following format: 7 | " [briefly explain your choice here] 8 | `SOLUTION` " 9 | 10 | `SOLUTION` should be strictly with JSON format described above.""", 11 | 12 | system_resource_prompt='''User's request: "{{request}}" 13 | 14 | Task: "{{task_description}}". 15 | 16 | : 17 | {{resources}} 18 | 19 | We use {{tool_name}} to solve this task: 20 | `{{tool_name}}`: {{tool_description}} 21 | Args: 22 | {{arguments}} 23 | Returns: 24 | {{returns}} 25 | For the type of "text", AI assistant should summarize the content from the context based on the task and the tool's description. For other types of input, you need to select the inputs from . Now we prepare the inputs for {{tool_name}}: {{input}}. Please complete this inputs and return the completed inputs with the format described above like: `SOLUTION` . 
26 | ''', 27 | 28 | examples=[ 29 | { 30 | "role": "system", 31 | "content": """Here is the chat log [ : The required input for detr is already provided in the as "sdf.png". Therefore, the inputs for detr are: [{"image": "sdf.png"}] 32 | ], which contains the previous steps to solve this task. 33 | 34 | : ["sdf.png": it is image and provided by user input. 35 | "detect the dog in sdf.png": it is text and provided by user input. 36 | "-detr-bbox-0": it is bbox and generated by tool "detr". 37 | ] 38 | 39 | We use CropImageByBBox to solve this task: 40 | CropImageByBBox: Crop the image by bounding box (bbox). Useful when you want to extract or save the masked region in the image. 41 | Inputs: ['image', 'bbox'] 42 | Returns: ["image"] 43 | 44 | Now we prepare the inputs for CropImageByBBox: [{"image": "______"}, {"bbox": "-detr-bbox-0"}] Please select the resource in to complete this inputs.""", 45 | }, 46 | { 47 | "role": "assistant", 48 | "content": """[{"image": "sdf.png"}, {"bbox": "-detr-bbox-0"}]""", 49 | }, 50 | ], 51 | ) 52 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/solution_expert_prompts.py: -------------------------------------------------------------------------------- 1 | 2 | score_solution_system_prompt = """Given a task and a solution, The AI assistant needs to score the solution and respond in json format. Please notice that AI assistant should think. The AI assistant should pay more attention to relevance between the description of each tool in the solution and task. 3 | 4 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 5 | 6 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 7 | 8 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. "Score" is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"=1: The solution is totally not related to user's request and can not solve the task. "Score"=2: The solution is somewhat not related to user's request and may not solve the task. "Score"=3: The solution is probably related to the user's intention and may solve the task but it may not be the optimal one. "Score">3: The solution is closely or directly related to what the user wants and could satisfactorily solve the task. In a nut shell, the higher the score, the greater the likelihood of the solution solving the given task. 9 | 10 | You should always respond in the following format: 11 | 12 | `SOLUTION` 13 | 14 | `SOLUTION` should strictly comply with JSON format described above.""" 15 | 16 | """Given a task and a solution, The AI assistant needs to score the solution and respond in json format. Please notice that AI assistant should think. The AI assistant should pay more attention to relevance between the description of each tool in the solution and task. 17 | 18 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 19 | 20 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 21 | 22 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. "Score" is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"<3: The solution is basically not related to user's request and can not solve the task. 
"Score"=3: The solution is somewhat related to the user's intention and may solve the task but it may not be the optimal one. "Score">3: The solution is closely or directly related to what the user wants and could satisfactorily solve the task. 23 | 24 | You should always respond in the following format: 25 | 26 | `SOLUTION` 27 | 28 | `SOLUTION` should strictly comply with JSON format described above.""" 29 | 30 | prompts = dict( 31 | score_solution_system_prompt=score_solution_system_prompt, 32 | score_solution_request_prompt='''User's request: "{{request}}" 33 | Task description: "{{task}}". 34 | 35 | Here is the description of the solution: 36 | {{solution}} 37 | 38 | Please refer to the scoring criteria and score this solution based on the task description. You should think carefully before scoring the solution. Notice that If the keywords in the solution are close in meaning to the keywords in the task description, then the score of this solution is at least 3.''', 39 | 40 | solution_selection_examples=[ 41 | { 42 | "role": "user", 43 | "content": """User's request: [ what is it in sdf.png ]. 44 | Here is the Task: [{"task_description": "detect the object in sdf.png", "task": "image-perception", "id": 0, "dep": [-1], "args": {"sdf.png": "image", "what is it in sdf.png": "text"}, "returns": {"-0": "text"}}]. 45 | User's request and task description may contain the information that is useful for AI to make decision. 46 | Here is the solution proposals to solve the task: [ {{solutions}} ]""", 47 | }, 48 | { 49 | "role": "assistant", 50 | "content": "[{\"task_description\": \"Generate an image of a mountain and animals.\", \"task\": [\"image-generation\"], \"id\": 0, \"dep\": [-1], \"args\": {\"Generate an image of a mountain and animals\": \"text\"}, \"returns\": {\"-0\": \"image\"}}, {\"task_description\": \"Perform visual question-answering on the generated image to count the number of animals.\", \"task\": \"image-perception\", \"id\": 1, \"dep\": [0], \"args\": {\"-0\": \"image\"}, \"returns\": {\"-1\": \"text\"}}]", 51 | }, 52 | ], 53 | ) 54 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/task_solver_prompts.py: -------------------------------------------------------------------------------- 1 | tool_assessment_prompt = """Given a task and a tool, the AI assistant helps the system to decide whether this tool can process the task. The assistant should focus more on the description of the model and give a score to each tool. 2 | 3 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 4 | 5 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 6 | 7 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. Score is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"=1: the tool is totally not related to the task and does not provide any useful output for solving the task. "Score"=2: the tool is somewhat not related to the task and may not provide any useful output for solving the task. "Score"=3: the tool is probably related to the task and provides some intermediate output that is partially helpful for solving the task but it may not be the optimal one. "Score">3: the tool is closely or directly related to the task and provides output that is mostly helpful for solving the task, or that matches the returns of the task with regard to the type. 
In a nut shell, for the given task, the higher the score, the more useful the tool is. 8 | 9 | You should always respond in the following format: 10 | 11 | `SOLUTION` 12 | 13 | `SOLUTION` should strictly comply with JSON format described above.""" 14 | """Given a task and a tool, the AI assistant helps the system to decide whether this tool can process the task. The assistant should focus more on the description of the model and give a score to each tool. 15 | 16 | The AI assistant respond with JSON format as follows: {"Thought": "thought", "Score": score}. 17 | 18 | "Thought" filed records the model’s thinking process step by step within 80 words, which give the reasons why assistant gives this score. 19 | 20 | "Score" filed denotes score that uses to assess whether this tool is useful for this task. Score is in [1, 2, 3, 4, 5]. Here is the scoring criteria: "Score"<3: The tool is somewhat or not related to the task and does not provide any useful output for solving the task. "Score"=3: The tool is related to the task and provides some intermediate output that is partially helpful for solving the task. "Score">3: The tool is closely or directly related to the task and provides output that is mostly helpful for solving the task, or that matches the returns of the task with regard to the type. In a nut shell, for the given task, the higher the score, the more useful the tool is. 21 | 22 | You should always respond in the following format: 23 | 24 | `SOLUTION` 25 | 26 | `SOLUTION` should strictly comply with JSON format described above.""" 27 | 28 | prompts = dict( 29 | memory=dict(max_tokens=-1), 30 | tool_assessment_prompt=tool_assessment_prompt, 31 | solution_selection_examples=[ 32 | { 33 | "role": "user", 34 | "content": """User's request: [ what is it in sdf.png ]. 35 | Here is the Task: [{"task_description": "detect the object in sdf.png", "task": "image-perception", "id": 0, "dep": [-1], "args": {"sdf.png": "image", "what is it in sdf.png": "text"}, "returns": {"-0": "text"}}]. 36 | User's request and task description may contain the information that is useful for AI to make decision. 37 | Here is the solution proposals to solve the task: [ {{solutions}} ]""", 38 | }, 39 | { 40 | "role": "assistant", 41 | "content": "[{\"task_description\": \"Generate an image of a mountain and animals.\", \"task\": [\"image-generation\"], \"id\": 0, \"dep\": [-1], \"args\": {\"Generate an image of a mountain and animals\": \"text\"}, \"returns\": {\"-0\": \"image\"}}, {\"task_description\": \"Perform visual question-answering on the generated image to count the number of animals.\", \"task\": \"image-perception\", \"id\": 1, \"dep\": [0], \"args\": {\"-0\": \"image\"}, \"returns\": {\"-1\": \"text\"}}]", 42 | }, 43 | ], 44 | resource_search_examples=[ 45 | { 46 | "role": "system", 47 | "content": """Here is the chat log [ : The required input for detr is already provided in the as "sdf.png". Therefore, the inputs for detr are: [{"image": "sdf.png"}] 48 | ], which contains the previous steps to solve this task. 49 | 50 | : ["sdf.png": it is image and provided by user input. 51 | "detect the dog in sdf.png": it is text and provided by user input. 52 | "-detr-bbox-0": it is bbox and generated by tool "detr". 53 | ] 54 | 55 | We use CropImageByBBox to solve this task: 56 | CropImageByBBox: Crop the image by bounding box (bbox). Useful when you want to extract or save the masked region in the image. 
57 | Inputs: ['image', 'bbox'] 58 | Returns: ["image"] 59 | 60 | Now we prepare the inputs for CropImageByBBox: [{"image": "______"}, {"bbox": "-detr-bbox-0"}] Please select the resource in to complete this inputs.""", 61 | }, 62 | { 63 | "role": "assistant", 64 | "content": """[{"image": "sdf.png"}, {"bbox": "-detr-bbox-0"}]""", 65 | }, 66 | ], 67 | ) 68 | -------------------------------------------------------------------------------- /cllm/services/tog/configs/tog_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import munch 4 | 5 | from . import ( 6 | resource_expert_prompts, 7 | solution_expert_prompts, 8 | task_solver_prompts, 9 | ) 10 | 11 | CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | config = dict( 14 | memory=dict(max_tokens=-1), 15 | task_decomposer_cfg=dict( 16 | model=os.environ.get("TASK_DECOMPOSITION_CKPT", "OpenGVLab/cllm_td_opt") 17 | ), 18 | task_solver_config=dict( 19 | tog_cfg=dict( 20 | # strategy="greedy", 21 | # strategy="beam", 22 | strategy="adaptive", 23 | # strategy="exhaustive", 24 | tools=os.path.join(CURRENT_DIR, "tools.json"), 25 | prompts=task_solver_prompts.prompts, 26 | ), 27 | solution_expert_cfg=dict( 28 | prompts=solution_expert_prompts.prompts, 29 | ), 30 | resource_expert_cfg=dict( 31 | prompts=resource_expert_prompts.prompts, 32 | ), 33 | ), 34 | ) 35 | 36 | config = munch.munchify(config) 37 | -------------------------------------------------------------------------------- /cllm/services/tog/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | import uvicorn 4 | from fastapi import FastAPI, Form 5 | from fastapi.responses import JSONResponse 6 | 7 | from cllm.services.pool import ModelPool 8 | from langchain.chat_models import ChatAnthropic, ChatGooglePalm 9 | from cllm.services.nlp.llms.chat_models import ChatOpenAI, ChatLLAMA2, MessageMemory 10 | 11 | from . 
import TaskSolver, TaskDecomposer 12 | from .configs.tog_config import config 13 | 14 | # from .llm.llama2 import ChatLLAMA2 15 | 16 | parser = argparse.ArgumentParser(description="Thoughts-on-Graph API") 17 | parser.add_argument("--host", type=str, default="localhost", help="Host") 18 | parser.add_argument("--port", type=int, default=10052, help="Port") 19 | parser.add_argument("--llm", type=str, default="openai", help="Backend LLM") 20 | parser.add_argument("--device", type=str, default="cuda:0", help="Port") 21 | args = parser.parse_args() 22 | 23 | app = FastAPI() 24 | pool = ModelPool() 25 | 26 | 27 | MODELS = { 28 | "openai": ChatOpenAI, 29 | "claude": ChatAnthropic, 30 | "google": ChatGooglePalm, 31 | "llama2": ChatLLAMA2, 32 | "gpt4": partial(ChatOpenAI, model_name="gpt-4"), 33 | } 34 | 35 | 36 | class TaskSolverWrapper: 37 | def __init__(self, device) -> None: 38 | cfg = config 39 | llm = MODELS[args.llm]( 40 | temperature=0.1, 41 | ) 42 | self.got = TaskSolver(llm, cfg.task_solver_config, device) 43 | self.device = device 44 | 45 | def __call__(self, request, subtasks, multi_processing=False): 46 | return self.got.solve(request, subtasks, multi_processing) 47 | 48 | def to(self, device): 49 | self.got.to(device) 50 | return self 51 | 52 | 53 | @app.post("/tog") 54 | @pool.register(lambda: TaskSolverWrapper(args.device)) 55 | async def tog(request: str = Form(...), subtasks: str = Form(...)): 56 | model = tog.__wrapped__.model 57 | output = model(request, subtasks) 58 | # return StreamingResponse(output) 59 | return JSONResponse(output) 60 | 61 | 62 | @app.post("/task_decomposer") 63 | @pool.register(lambda: TaskDecomposer(args.device, config.task_decomposer_cfg)) 64 | async def task_decomposer(request: str = Form(...)): 65 | model = task_decomposer.__wrapped__.model 66 | output = model(request) 67 | # return StreamingResponse(output) 68 | return JSONResponse(output) 69 | 70 | 71 | if __name__ == "__main__": 72 | uvicorn.run(app, host=args.host, port=args.port) 73 | -------------------------------------------------------------------------------- /cllm/services/tog/utils.py: -------------------------------------------------------------------------------- 1 | from cllm.agents import Tool 2 | 3 | 4 | def build_tool_description(tool: Tool): 5 | description = tool.description 6 | if description.endswith('.'): 7 | description = description[:-1] 8 | args = [ 9 | f'a `{arg.name}` in the type of {arg.type} represents the {arg.description}' 10 | for arg in tool.args 11 | ] 12 | args = ', and '.join(args) 13 | usage = tool.domain 14 | desc = f'This is a tool that {description}. It takes {args}. This tool is commonly used to {usage}.' 15 | return desc 16 | 17 | 18 | def build_tool_prompt(tool: Tool): 19 | description = tool.description 20 | if description.endswith('.'): 21 | description = description[:-1] 22 | if len(tool.usages) == 0: 23 | usage = tool.domain 24 | else: 25 | usage = '\n'.join([' ' + u for u in tool.usages]) 26 | doc_string = 'Args:\n' 27 | for p in tool.args: 28 | doc_string += ' {} ({}): {}\n'.format(p.name, p.type, p.description) 29 | doc_string += 'Returns\n' 30 | 31 | for output in tool.returns: 32 | doc_string += ' {} ({}): {}\n'.format( 33 | output.name, output.type, output.description 34 | ) 35 | 36 | desc = f'This is a tool that {description}. 
\nIt is commonly used as follows: \n{usage} \n{doc_string}' 37 | return desc 38 | -------------------------------------------------------------------------------- /cllm/services/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | from pathlib import Path 4 | from cllm.utils import get_real_path 5 | from fastapi.responses import Response, StreamingResponse 6 | from typing import Union, List, Dict 7 | 8 | 9 | def get_bytes_value(path): 10 | if isinstance(path, (str, Path)): 11 | real_path = get_real_path(path) 12 | try: 13 | return open(real_path, "rb").read() 14 | except Exception as e: 15 | return open(path, "rb").read() 16 | elif isinstance(path, io.BufferedReader): 17 | return path.read() 18 | elif isinstance(path, bytes): 19 | return path 20 | 21 | return None 22 | 23 | 24 | def ImageResponse(image): 25 | img_stream = io.BytesIO() 26 | image.save(img_stream, format="png") 27 | img_stream.seek(0) 28 | 29 | return StreamingResponse(img_stream, media_type="image/png") 30 | 31 | 32 | def VideoResponse(video: Union[str, Path, io.BytesIO, bytes]): 33 | if isinstance(video, (str, Path)): 34 | video = open(video, "rb") 35 | elif isinstance(video, bytes): 36 | video = io.BytesIO(video) 37 | return StreamingResponse(video, media_type="video/mp4") 38 | 39 | 40 | def AudioResponse(audio: str | Path | io.BytesIO): 41 | if isinstance(audio, (str, Path)): 42 | audio = open(audio, "rb") 43 | return StreamingResponse(audio, media_type="audio/wav") 44 | 45 | 46 | class RawResponse(Response): 47 | media_type = "binary/octet-stream" 48 | 49 | def render(self, content: bytes) -> bytes: 50 | return bytes([b ^ 0x54 for b in content]) 51 | -------------------------------------------------------------------------------- /cllm/services/video/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/video/__init__.py -------------------------------------------------------------------------------- /cllm/services/video/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import os.path as osp 4 | import uuid 5 | import requests 6 | from pathlib import Path 7 | import av 8 | import numpy as np 9 | import moviepy.editor as mpe 10 | from cllm.services.utils import get_bytes_value 11 | from cllm.services.nlp.api import openai_chat_model 12 | 13 | __ALL__ = [ 14 | "video_classification", 15 | "video_captioning", 16 | "image_to_video", 17 | "text_to_video", 18 | "video_to_webpage", 19 | "dub_video", 20 | ] 21 | 22 | 23 | HOST = "localhost" 24 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 25 | 26 | 27 | def setup(host="localhost", port=10056): 28 | global HOST, PORT 29 | HOST = host 30 | PORT = port 31 | 32 | 33 | def video_classification(video: str | Path | bytes, **kwargs): 34 | host = kwargs.get("host", HOST) 35 | port = kwargs.get("port", PORT) 36 | url = f"http://{host}:{port}/video_classification" 37 | files = {"video": (video, get_bytes_value(video))} 38 | response = requests.post(url, files=files) 39 | return response.json() 40 | 41 | 42 | def video_captioning(video: str | Path, **kwargs): 43 | host = kwargs.get("host", HOST) 44 | port = kwargs.get("port", PORT) 45 | url = f"http://{host}:{port}/video_captioning" 46 | files = {"video": (video, get_bytes_value(video))} 47 | response = requests.post(url, files=files) 48 | return 
response.json() 49 | 50 | 51 | def image_audio_to_video(image: str | Path, audio: str | Path, **kwargs): 52 | host = kwargs.get("host", HOST) 53 | port = kwargs.get("port", PORT) 54 | url = f"http://{host}:{port}/image_audio_to_video" 55 | 56 | files = { 57 | "image": (image, get_bytes_value(image)), 58 | "audio": (audio, get_bytes_value(audio)), 59 | } 60 | response = requests.post(url, files=files) 61 | return response.content 62 | 63 | 64 | def image_to_video(image: str | Path, **kwargs): 65 | host = kwargs.get("host", HOST) 66 | port = kwargs.get("port", PORT) 67 | url = f"http://{host}:{port}/image_to_video" 68 | files = {"image": (image, get_bytes_value(image))} 69 | response = requests.post(url, files=files) 70 | return response.content 71 | 72 | 73 | def text_to_video(prompt: str, **kwargs): 74 | host = kwargs.get("host", HOST) 75 | port = kwargs.get("port", PORT) 76 | human_msg = f"""Your task is to extract the prompt from input. Here is examples: 77 | 78 | Input: 79 | Can you make a video of a serene lake with vibrant green grass and trees all around? And then create a webpage using HTML to showcase this video? 80 | 81 | Answer: 82 | a serene lake with vibrant green grass and trees all around 83 | 84 | Input: 85 | generate a new video that A panda is playing guitar on times square 86 | 87 | Answer: 88 | A panda is playing guitar on times square 89 | 90 | Input: 91 | a video of A man riding a bicycle in the sunshine. Then develop a HTML web page to present this video 92 | 93 | Answer: 94 | A man riding a bicycle in the sunshine 95 | 96 | Input: 97 | Create a video that showcases a serene lake embraced by vibrant foliage and towering trees. Afterward, produce an HTML webpage to present and describe this captivating video 98 | 99 | Answer: 100 | a serene lake embraced by vibrant foliage and towering trees 101 | 102 | Input: 103 | make a video that illustrates an astronaut is skiing down the hill 104 | 105 | Answer: 106 | an astronaut is skiing down the hill 107 | 108 | Input: 109 | {prompt} 110 | 111 | Answer: 112 | """ 113 | extracted_prompt = openai_chat_model(human_msg) 114 | data = {"prompt": extracted_prompt} 115 | url = f"http://{host}:{port}/text_to_video" 116 | response = requests.post(url, data=data) 117 | return response.content 118 | 119 | 120 | def video_to_webpage( 121 | video: str | Path, 122 | title: str, 123 | tags: list[str], 124 | description: str, 125 | **kwargs, 126 | ): 127 | host = kwargs.get("host", HOST) 128 | port = kwargs.get("port", PORT) 129 | url = f"http://{host}:{port}/video_to_webpage" 130 | 131 | files = {"video": (video, get_bytes_value(video))} 132 | data = { 133 | "title": title, 134 | "tags": tags, 135 | "description": description, 136 | } 137 | response = requests.post(url, files=files, data=data) 138 | return response.json() 139 | 140 | 141 | def dub_video(video: str | Path | bytes, audio: str | Path | bytes, **kwargs): 142 | root_dir = kwargs["root_dir"] 143 | vid_file_location = osp.join(root_dir, video) 144 | aud_file_location = osp.join(root_dir, audio) 145 | video = mpe.VideoFileClip(vid_file_location) 146 | 147 | # read audio file 148 | audio = mpe.AudioFileClip(aud_file_location) 149 | 150 | # set audio for video 151 | new_video = video.set_audio(audio) 152 | 153 | # export the video file 154 | save_path = osp.join(root_dir, f"new_{str(uuid.uuid4())[:6]}.mp4") 155 | new_video.write_videofile(save_path) 156 | return open(save_path, "rb").read() 157 | 158 | 159 | def decoding_key_frames(video: str | Path | bytes, **kwargs): 160 | video = 
io.BytesIO(get_bytes_value(video)) 161 | container = av.open(video) 162 | # extract evenly spaced frames from video 163 | seg_len = container.streams.video[0].frames 164 | indices = set(np.linspace(0, seg_len, num=4, endpoint=False).astype(np.int64)) 165 | frames = [] 166 | container.seek(0) 167 | for i, frame in enumerate(container.decode(video=0)): 168 | if i in indices: 169 | stream = io.BytesIO() 170 | # frame = frame.to_image().save(f"frame_{i}.png") 171 | frame = frame.to_image().save(stream) 172 | frames.append(frame) 173 | 174 | return frames 175 | -------------------------------------------------------------------------------- /cllm/services/video/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import io 5 | from pathlib import Path 6 | from typing import Union 7 | 8 | import uvicorn 9 | from fastapi import UploadFile, File, Form 10 | from fastapi.responses import JSONResponse 11 | from fastapi.responses import StreamingResponse 12 | 13 | from .tools import * 14 | from cllm.services.utils import VideoResponse 15 | from cllm.services import app, pool 16 | from ..hf_pipeline import HuggingfacePipeline 17 | 18 | parser = argparse.ArgumentParser(description="Video API") 19 | parser.add_argument("--host", type=str, default="localhost", help="Host") 20 | parser.add_argument("--port", type=int, default=10049, help="Port") 21 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 22 | args = parser.parse_args() 23 | 24 | 25 | RESOURCE_ROOT = os.environ.get("SERVER_ROOT", "./server_resources") 26 | os.makedirs(RESOURCE_ROOT, exist_ok=True) 27 | 28 | 29 | # def VideoResponse(video: Union[str, Path, io.BytesIO, bytes]): 30 | # if isinstance(video, (str, Path)): 31 | # video = open(video, "rb") 32 | # elif isinstance(video, bytes): 33 | # video = io.BytesIO(video) 34 | # return StreamingResponse(video, media_type="video/mp4") 35 | 36 | 37 | @app.post("/video_classification") 38 | @pool.register(lambda: HuggingfacePipeline("video-classification", args.device)) 39 | async def video_classification(video: UploadFile = File(None)): 40 | model = video_classification.__wrapped__.model 41 | 42 | vid_name = osp.basename(video.filename) 43 | vid_name = osp.basename(video.filename) 44 | print(f"video_captioning --- vid_name: {vid_name}") 45 | vid_file_location = osp.join(RESOURCE_ROOT, vid_name) 46 | with open(vid_file_location, "wb+") as file_object: 47 | file_object.write(video.file.read()) 48 | 49 | output = model(vid_file_location) 50 | os.remove(vid_file_location) 51 | 52 | return JSONResponse(output) 53 | 54 | 55 | @app.post("/video_captioning") 56 | @pool.register(lambda: TimeSformerGPT2VideoCaptioning(args.device)) 57 | async def video_captioning(video: UploadFile = File(None)): 58 | video.file.seek(0) 59 | model = video_captioning.__wrapped__.model 60 | vid_name = osp.basename(video.filename) 61 | vid_file_location = osp.join(RESOURCE_ROOT, vid_name) 62 | with open(vid_file_location, "wb+") as file_object: 63 | file_object.write(video.file.read()) 64 | 65 | output = model(vid_file_location) 66 | print(f"video_captioning output: {output}") 67 | os.remove(vid_file_location) 68 | 69 | return JSONResponse(output) 70 | 71 | 72 | @app.post("/image_to_video") 73 | @pool.register(lambda: Image2Video(args.device)) 74 | async def image_to_video(image: UploadFile = File(None)): 75 | model = image_to_video.__wrapped__.model 76 | image = Image.open(image.file).convert("RGB") 77 | 
78 | output = model(image) 79 | return VideoResponse(output) 80 | 81 | 82 | @app.post("/text_to_video") 83 | @pool.register(lambda: Text2Video(args.device)) 84 | async def text_to_video(prompt: str = Form(...)): 85 | model = text_to_video.__wrapped__.model 86 | output = model(prompt) 87 | return VideoResponse(output) 88 | 89 | 90 | @app.post("/image_audio_to_video") 91 | @pool.register(lambda: ImageAudio2Video(args.device)) 92 | async def image_audio_to_video( 93 | image: UploadFile = File(None), audio: UploadFile = File(None) 94 | ): 95 | model = image_audio_to_video.__wrapped__.model 96 | img_name = osp.basename(image.filename) 97 | img_file_location = osp.join(RESOURCE_ROOT, img_name) 98 | aud_name = osp.basename(audio.filename) 99 | aud_file_location = osp.join(RESOURCE_ROOT, aud_name) 100 | with open(img_file_location, "wb+") as file_object: 101 | file_object.write(image.file.read()) 102 | with open(aud_file_location, "wb+") as file_object: 103 | file_object.write(audio.file.read()) 104 | 105 | output = model(img_file_location, aud_file_location) 106 | os.remove(img_file_location) 107 | os.remove(aud_file_location) 108 | return VideoResponse(output) 109 | 110 | 111 | @app.post("/video_to_webpage") 112 | @pool.register(lambda: Video2WebPage(args.device)) 113 | async def video_to_webpage( 114 | video: UploadFile = File(None), 115 | title: str = Form(...), 116 | tags: set[str] = Form(...), 117 | description: str = Form(...), 118 | ): 119 | model = video_to_webpage.__wrapped__.model 120 | vid_name = osp.basename(video.filename) 121 | html_str = model(vid_name, title, tags, description) 122 | return JSONResponse(html_str) 123 | 124 | 125 | @app.post("/dub_video") 126 | @pool.register(lambda: DubVideo(args.device)) 127 | async def dub_video(video: UploadFile = File(None), audio: UploadFile = File(None)): 128 | model = dub_video.__wrapped__.model 129 | vid_name = osp.basename(video.filename) 130 | vid_file_location = osp.join(RESOURCE_ROOT, vid_name) 131 | with open(vid_file_location, "wb+") as file_object: 132 | file_object.write(video.file.read()) 133 | 134 | aud_name = osp.basename(audio.filename) 135 | aud_file_location = osp.join(RESOURCE_ROOT, aud_name) 136 | 137 | with open(aud_file_location, "wb+") as file_object: 138 | file_object.write(audio.file.read()) 139 | 140 | new_video_file = model(vid_file_location, aud_file_location) 141 | return VideoResponse(new_video_file) 142 | 143 | 144 | if __name__ == "__main__": 145 | uvicorn.run(app, host=args.host, port=args.port) 146 | -------------------------------------------------------------------------------- /cllm/services/vqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/ControlLLM/9013d4def4fa18c1501cbd5ae0997dbce0b0dba6/cllm/services/vqa/__init__.py -------------------------------------------------------------------------------- /cllm/services/vqa/api.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | from pathlib import Path 4 | import requests 5 | from PIL import Image 6 | from cllm.services.utils import get_bytes_value 7 | 8 | __ALL__ = ["vqa_blip"] 9 | 10 | 11 | HOST = "localhost" 12 | PORT = os.environ.get("CLLM_SERVICES_PORT", 10056) 13 | 14 | 15 | def setup(host="localhost", port=10049): 16 | global HOST, PORT 17 | HOST = host 18 | PORT = port 19 | 20 | 21 | def image_qa(image, text, endpoint="llava", **kwargs): 22 | host = kwargs.get("host", HOST) 23 | port = 
kwargs.get("port", PORT) 24 | url = f"http://{host}:{port}/{endpoint}" 25 | files = {"image": (image, get_bytes_value(image))} 26 | data = {"text": text} 27 | response = requests.post(url, files=files, data=data) 28 | return response.json() 29 | -------------------------------------------------------------------------------- /cllm/services/vqa/launch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from PIL import Image 3 | import io 4 | import uvicorn 5 | 6 | from fastapi import UploadFile, File, Form 7 | from fastapi.responses import JSONResponse 8 | 9 | from .tools import * 10 | from cllm.services import app, pool 11 | 12 | parser = argparse.ArgumentParser(description="VQA API") 13 | parser.add_argument("--host", type=str, default="localhost", help="Host") 14 | parser.add_argument("--port", type=int, default=10049, help="Port") 15 | parser.add_argument("--device", type=str, default="cuda:0", help="Device") 16 | args = parser.parse_args() 17 | 18 | 19 | @app.post("/vilt_qa") 20 | @pool.register(lambda: Vilt(args.device)) 21 | async def vilt_qa(image: UploadFile = File(None), text: str = Form(...)): 22 | image_bytes = image.file.read() 23 | image = Image.open(io.BytesIO(image_bytes)) 24 | model = vilt_qa.__wrapped__.model 25 | output = model(image, text) 26 | return JSONResponse(output) 27 | 28 | 29 | @app.post("/llava") 30 | @pool.register(lambda: LLaVA(args.device)) 31 | async def llava(image: UploadFile = File(None), text: str = Form(...)): 32 | image_bytes = image.file.read() 33 | image = Image.open(io.BytesIO(image_bytes)) 34 | model = llava.__wrapped__.model 35 | output = model(image, text) 36 | return JSONResponse(output) 37 | 38 | 39 | if __name__ == "__main__": 40 | uvicorn.run(app, host=args.host, port=args.port) 41 | -------------------------------------------------------------------------------- /cllm/services/vqa/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import requests 3 | import json 4 | from PIL import Image 5 | from io import BytesIO 6 | from pathlib import Path 7 | from transformers import ViltProcessor, ViltForQuestionAnswering 8 | from llava.constants import ( 9 | IMAGE_TOKEN_INDEX, 10 | DEFAULT_IMAGE_TOKEN, 11 | DEFAULT_IM_START_TOKEN, 12 | DEFAULT_IM_END_TOKEN, 13 | ) 14 | from llava.model.builder import load_pretrained_model 15 | from llava.utils import disable_torch_init 16 | from llava.mm_utils import ( 17 | process_images, 18 | tokenizer_image_token, 19 | get_model_name_from_path, 20 | KeywordsStoppingCriteria, 21 | ) 22 | from llava.conversation import conv_templates, SeparatorStyle 23 | 24 | 25 | class Vilt: 26 | def __init__(self, device): 27 | self.torch_dtype = torch.float16 if "cuda" in device else torch.float32 28 | self.device = device 29 | self.processor = ViltProcessor.from_pretrained( 30 | "dandelin/vilt-b32-finetuned-vqa" 31 | ) 32 | self.model = ViltForQuestionAnswering.from_pretrained( 33 | "dandelin/vilt-b32-finetuned-vqa" 34 | ) 35 | self.model.to(self.device) 36 | 37 | def __call__(self, image, question): 38 | image = image.convert("RGB") 39 | inputs = self.processor( 40 | images=image, 41 | text="how many bears in the image", 42 | return_tensors="pt", 43 | ).to(self.device) 44 | predictions = self.model(**inputs) 45 | logits = predictions.logits 46 | idx = logits.argmax(-1).item() 47 | answer = self.model.config.id2label[idx] 48 | return answer 49 | 50 | def to(self, device): 51 | self.model.to(device) 52 | 53 | 54 | 
class LLaVA: 55 | def __init__(self, device): 56 | self.load_8bit = True if "cuda" in device else False 57 | self.device = device 58 | model_name = get_model_name_from_path("liuhaotian/llava-v1.5-7b") 59 | ( 60 | self.tokenizer, 61 | self.model, 62 | self.image_processor, 63 | self.context_len, 64 | ) = load_pretrained_model( 65 | "liuhaotian/llava-v1.5-7b", 66 | None, 67 | model_name, 68 | self.load_8bit, 69 | False, 70 | device=self.device, 71 | ) 72 | 73 | if "llama-2" in model_name.lower(): 74 | self.conv_mode = "llava_llama_2" 75 | elif "v1" in model_name.lower(): 76 | self.conv_mode = "llava_v1" 77 | elif "mpt" in model_name.lower(): 78 | self.conv_mode = "mpt" 79 | else: 80 | self.conv_mode = "llava_v0" 81 | 82 | def load_image(self, image_file): 83 | if image_file.startswith("http://") or image_file.startswith("https://"): 84 | response = requests.get(image_file) 85 | image = Image.open(BytesIO(response.content)).convert("RGB") 86 | else: 87 | image = Image.open(image_file).convert("RGB") 88 | return image 89 | 90 | def __call__(self, image, question): 91 | conv = conv_templates[self.conv_mode].copy() 92 | # roles = conv.roles 93 | if isinstance(image, (str, Path)): 94 | image = self.load_image(image) 95 | # Similar operation in model_worker.py 96 | image_tensor = process_images( 97 | [image], self.image_processor, {"image_aspect_ratio": "pad"} 98 | ) 99 | if type(image_tensor) is list: 100 | image_tensor = [ 101 | image.to(self.device, dtype=torch.float16) for image in image_tensor 102 | ] 103 | else: 104 | image_tensor = image_tensor.to(self.device, dtype=torch.float16) 105 | 106 | inp = question 107 | if image is not None: 108 | # first message 109 | if self.model.config.mm_use_im_start_end: 110 | inp = ( 111 | DEFAULT_IM_START_TOKEN 112 | + DEFAULT_IMAGE_TOKEN 113 | + DEFAULT_IM_END_TOKEN 114 | + "\n" 115 | + inp 116 | ) 117 | else: 118 | inp = DEFAULT_IMAGE_TOKEN + "\n" + inp 119 | conv.append_message(conv.roles[0], inp) 120 | image = None 121 | else: 122 | # later messages 123 | conv.append_message(conv.roles[0], inp) 124 | conv.append_message(conv.roles[1], None) 125 | prompt = conv.get_prompt() 126 | 127 | input_ids = ( 128 | tokenizer_image_token( 129 | prompt, 130 | self.tokenizer, 131 | IMAGE_TOKEN_INDEX, 132 | return_tensors="pt", 133 | ) 134 | .unsqueeze(0) 135 | .cuda() 136 | ) 137 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 138 | keywords = [stop_str] 139 | stopping_criteria = KeywordsStoppingCriteria( 140 | keywords, self.tokenizer, input_ids 141 | ) 142 | 143 | # streamer = TextStreamer( 144 | # self.tokenizer, skip_prompt=True, skip_special_tokens=True 145 | # ) 146 | 147 | with torch.inference_mode(): 148 | output_ids = self.model.generate( 149 | input_ids, 150 | images=image_tensor, 151 | do_sample=True, 152 | temperature=0.2, 153 | max_new_tokens=512, 154 | # streamer=streamer, 155 | use_cache=True, 156 | stopping_criteria=[stopping_criteria], 157 | ) 158 | 159 | outputs = self.tokenizer.decode( 160 | output_ids[0, input_ids.shape[1] :], skip_special_tokens=True 161 | ).strip() 162 | conv.messages[-1][-1] = outputs 163 | return outputs 164 | 165 | def to(self, device): 166 | if not self.load_8bit: 167 | self.model.to(device) 168 | 169 | 170 | if __name__ == "__main__": 171 | model = LLaVA("cuda:0") 172 | output = model( 173 | "/mnt/afs/user/liuzhaoyang/workspace/graph-of-thought/tests/test_files/FatBear1.jpg", 174 | "how many bears in this image", 175 | ) 176 | print(output) 177 | 
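For reference, the client-side entry point for these VQA tools is the image_qa helper defined in cllm/services/vqa/api.py above: it posts the image and question to either the /vilt_qa or /llava endpoint served by cllm/services/vqa/launch.py and returns the decoded JSON answer. A minimal usage sketch follows; the host, port, and image path are illustrative assumptions, not values taken from the repository.

# Minimal sketch: query the running VQA service from a client process.
# Assumes the service was started with
#   python -m cllm.services.vqa.launch --port 10049 --host 0.0.0.0
# and that "FatBear1.jpg" is a readable local image; both are illustrative values.
from cllm.services.vqa.api import setup, image_qa

setup(host="localhost", port=10049)  # point the client at the running service

# Route the question to the LLaVA endpoint; use endpoint="vilt_qa" for ViLT.
answer = image_qa("FatBear1.jpg", "how many bears in this image", endpoint="llava")
print(answer)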
-------------------------------------------------------------------------------- /cllm/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import functools 3 | import signal 4 | from pathlib import Path 5 | 6 | RESOURCE_ROOT = os.environ.get("RESOURCE_ROOT", "./client_resources") 7 | 8 | 9 | def get_real_path(path): 10 | if path is None: 11 | return None 12 | if RESOURCE_ROOT in path: 13 | return path 14 | return os.path.join(RESOURCE_ROOT, path) 15 | 16 | 17 | def get_root_dir(): 18 | return RESOURCE_ROOT 19 | 20 | 21 | def md2plain(md): 22 | plain_text = md.replace(" ", " ") 23 | plain_text = plain_text.replace("
", "\n") 24 | plain_text = plain_text.replace("\<", "<") 25 | plain_text = plain_text.replace("\>", ">") 26 | return plain_text 27 | 28 | 29 | def plain2md(plain_text: str): 30 | md_text = plain_text.replace("<", "\<") 31 | md_text = md_text.replace(">", "\>") 32 | md_text = md_text.replace("\n", "
") 33 | # md_text = md_text + "
" 34 | md_text = md_text.replace(" ", " ") 35 | return md_text 36 | 37 | 38 | def transform_msgs(history_msgs: list = []): 39 | if history_msgs is None: 40 | return [] 41 | filtered_msg = [] 42 | for item in history_msgs: 43 | if isinstance(item[0], str): 44 | item[0] = md2plain(item[0]) 45 | if isinstance(item[1], str): 46 | item[1] = md2plain(item[1]) 47 | if isinstance(item[1], str) and item[1].startswith( 48 | "The whole process will take some time, please be patient." 49 | ): 50 | item[1] = None 51 | 52 | filtered_msg.append(item) 53 | return filtered_msg 54 | 55 | 56 | def timeout(sec): 57 | """ 58 | timeout decorator 59 | :param sec: function raise TimeoutError after ? seconds 60 | """ 61 | 62 | def decorator(func): 63 | @functools.wraps(func) 64 | def wrapped_func(*args, **kwargs): 65 | def _handle_timeout(signum, frame): 66 | err_msg = f"Function {func.__name__} timed out after {sec} seconds" 67 | raise TimeoutError(err_msg) 68 | 69 | signal.signal(signal.SIGALRM, _handle_timeout) 70 | signal.alarm(sec) 71 | try: 72 | result = func(*args, **kwargs) 73 | finally: 74 | signal.alarm(0) 75 | return result 76 | 77 | return wrapped_func 78 | 79 | return decorator 80 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 2 | ENV DEBIAN_FRONTEND=noninteractive HOME=/root 3 | 4 | RUN apt-get clean && apt-get update && apt install -y python3.10-dev && apt install -y \ 5 | git libass-dev cmake libsndfile1-dev tesseract-ocr espeak-ng python3-pip ffmpeg \ 6 | ninja-build ca-certificates python3.10-tk 7 | 8 | # RUN python3 -m pip install --no-cache-dir --upgrade pip && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 9 | RUN ln -sv /usr/bin/python3 /usr/bin/python && python3 -m pip install --no-cache-dir --upgrade pip 10 | RUN pip install --no-cache-dir torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 11 | WORKDIR /root 12 | 13 | RUN git clone https://github.com/OpenGVLab/ControlLLM.git 14 | 15 | WORKDIR /root/ControlLLM 16 | 17 | RUN pip install --no-cache-dir git+https://github.com/haotian-liu/LLaVA.git 18 | RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && pip install --no-cache-dir -r requirements.txt 19 | 20 | RUN pip install -e . 21 | 22 | EXPOSE 10004 23 | EXPOSE 10005 24 | EXPOSE 10024 25 | 26 | -------------------------------------------------------------------------------- /docker/docker-compose-gradio.yml: -------------------------------------------------------------------------------- 1 | services: 2 | cllm_graido: 3 | build: . 
4 | image: "cllm:v0" 5 | container_name: "cllm_graido" 6 | restart: "unless-stopped" 7 | ports: 8 | - "10004:10004" 9 | - "10005:10005" 10 | - "10024:10024" 11 | volumes: 12 | - ../model_zoo:/root/ControlLLM/model_zoo 13 | - ../certificate:/root/ControlLLM/certificate 14 | - ../client_resources:/root/ControlLLM/client_resources 15 | - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub 16 | - ~/nltk_data:/root/nltk_data 17 | environment: 18 | - CLLM_SERVICES_PORT:10004 19 | - TOG_SERVICE_PORT:10005 20 | - OPENAI_API_KEY 21 | - OPENAI_BASE_URL 22 | - WEATHER_API_KEY 23 | - HF_ENDPOINT 24 | - CLIENT_ROOT:./client_resources 25 | - SERVER_ROOT:./server_resources 26 | - NVIDIA_VISIBLE_DEVICES:2 27 | network_mode: "host" 28 | deploy: 29 | resources: 30 | reservations: 31 | devices: 32 | - driver: nvidia 33 | device_ids: ['3', '4'] 34 | capabilities: [gpu] 35 | entrypoint: "python" 36 | command: 37 | - "-m" 38 | - "cllm.app.gradio" 39 | - "--controller" 40 | - "cllm.agents.tog.Controller" 41 | - "--server-port" 42 | - "10024" 43 | -------------------------------------------------------------------------------- /docker/docker-compose-tog.yml: -------------------------------------------------------------------------------- 1 | services: 2 | cllm_tog: 3 | build: . 4 | image: "cllm:v0" 5 | container_name: "cllm_tog" 6 | restart: "unless-stopped" 7 | ports: 8 | - "10004:10004" 9 | - "10005:10005" 10 | - "10024:10024" 11 | volumes: 12 | - ../model_zoo:/root/ControlLLM/model_zoo 13 | - ../certificate:/root/ControlLLM/certificate 14 | - ../client_resources:/root/ControlLLM/client_resources 15 | - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub 16 | - ~/nltk_data:/root/nltk_data 17 | environment: 18 | - CLLM_SERVICES_PORT:10004 19 | - TOG_SERVICE_PORT:10005 20 | - OPENAI_API_KEY 21 | - OPENAI_BASE_URL 22 | - WEATHER_API_KEY 23 | - HF_ENDPOINT 24 | - CLIENT_ROOT:./client_resources 25 | - SERVER_ROOT:./server_resources 26 | - NVIDIA_VISIBLE_DEVICES:2 27 | network_mode: "host" 28 | deploy: 29 | resources: 30 | reservations: 31 | devices: 32 | - driver: nvidia 33 | device_ids: ['3', '4'] 34 | capabilities: [gpu] 35 | entrypoint: "python" 36 | command: 37 | - "-m" 38 | - "cllm.services.tog.launch" 39 | - "--port" 40 | - "10005" 41 | - "--host" 42 | - "0.0.0.0" -------------------------------------------------------------------------------- /docker/docker-compose-tool.yml: -------------------------------------------------------------------------------- 1 | services: 2 | cllm_tool: 3 | build: . 
4 | image: "cllm:v0" 5 | container_name: "cllm_tool" 6 | restart: "unless-stopped" 7 | ports: 8 | - "10004:10004" 9 | - "10005:10005" 10 | - "10024:10024" 11 | volumes: 12 | - ../model_zoo:/root/ControlLLM/model_zoo 13 | - ../certificate:/root/ControlLLM/certificate 14 | - ../client_resources:/root/ControlLLM/client_resources 15 | - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub 16 | - ~/nltk_data:/root/nltk_data 17 | environment: 18 | - CLLM_SERVICES_PORT:10004 19 | - TOG_SERVICE_PORT:10005 20 | - OPENAI_API_KEY 21 | - OPENAI_BASE_URL 22 | - WEATHER_API_KEY 23 | - HF_ENDPOINT 24 | - CLIENT_ROOT:./client_resources 25 | - SERVER_ROOT:./server_resources 26 | - NVIDIA_VISIBLE_DEVICES:1 27 | network_mode: "host" 28 | deploy: 29 | resources: 30 | reservations: 31 | devices: 32 | - driver: nvidia 33 | device_ids: ['3', '4'] 34 | capabilities: [gpu] 35 | entrypoint: "python" 36 | command: 37 | - "-m" 38 | - "cllm.services.launch" 39 | - "--port" 40 | - "10004" 41 | - "--host" 42 | - "0.0.0.0" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | av==10.0.0 2 | accelerate==0.21.0 3 | black==23.11.0 4 | cloudpickle==2.2.1 5 | clip==0.2.0 6 | controlnet-aux==0.0.7 7 | datasets==2.13.0 8 | decord==0.6.0 9 | diffusers==0.23.1 10 | easyocr==1.7.1 11 | easydict==1.11 12 | einops==0.7.0 13 | fairscale==0.4.13 14 | fastapi==0.104.1 15 | fire==0.5.0 16 | ftfy==6.1.3 17 | gradio==4.7.1 18 | gradio_client==0.7.0 19 | git+http://github.com/IDEA-Research/GroundingDINO.git 20 | imageio==2.31.5 21 | joblib==1.3.2 22 | huggingface==0.0.1 23 | huggingface-hub==0.17.3 24 | langchain==0.0.348 25 | Markdown==3.5.1 26 | markdown-it-py==3.0.0 27 | markdown2==2.4.11 28 | matplotlib==3.8.0 29 | mediapipe==0.10.8 30 | modelscope==1.9.4 31 | moviepy==1.0.3 32 | munch==4.0.0 33 | nltk==3.8.1 34 | numpy==1.25.2 35 | omegaconf==2.3.0 36 | openai==1.3.7 37 | openai-whisper==20230918 38 | open-clip-torch==2.23.0 39 | opencv-contrib-python==4.8.1.78 40 | opencv-python==4.8.1.78 41 | opencv-python-headless==4.8.1.78 42 | onnx==1.15.0 43 | onnxruntime 44 | pandas==2.1.3 45 | peft==0.4.0 46 | psutil==5.9.5 47 | pycocotools==2.0.7 48 | pydantic==2.5.2 49 | pydub==0.25.1 50 | Pygments==2.16.1 51 | PyYAML==6.0.1 52 | pytorch_lightning==1.7.7 53 | regex==2023.10.3 54 | rotary-embedding-torch==0.4.0 55 | scipy==1.11.4 56 | soundfile==0.12.1 57 | git+https://github.com/facebookresearch/segment-anything.git 58 | termcolor==2.4.0 59 | tiktoken==0.3.3 60 | timm==0.6.13 61 | tqdm==4.66.1 62 | transformers==4.34.1 63 | torchmetrics==0.11.4 64 | uvicorn==0.24.0.post1 65 | xformers==0.0.22 66 | wget==3.2 67 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | export no_proxy="localhost, 127.0.0.1" 2 | export CLLM_SERVICES_PORT=10004 3 | export TOG_SERVICE_PORT=10005 4 | export GRADIO_TEMP_DIR="~/.tmp" 5 | export OPENAI_API_KEY="sk-xxx" 6 | export OPENAI_BASE_URL="xxx" 7 | export WEATHER_API_KEY="xxx" 8 | export TASK_DECOMPOSITION_CKPT="./model_zoo/task_decomposition" 9 | export CLIENT_ROOT="./client_resources" 10 | export SERVER_ROOT="./server_resources" 11 | 12 | echo "Launch all tool services..." 13 | # step 1 14 | python -m cllm.services.launch --port $CLLM_SERVICES_PORT --host 0.0.0.0 & 15 | 16 | echo "Launch ToG service..." 
17 | # step 2 18 | python -m cllm.services.tog.launch --port $TOG_SERVICE_PORT --host 0.0.0.0 & 19 | 20 | echo "Launch gradio demo..." 21 | # step 3 22 | python -m cllm.app.gradio --controller "cllm.agents.tog.Controller" --server-port 10003 23 | # python -m cllm.app.gradio --controller "cllm.agents.tog.Controller" --server-port 10003 --https 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name="cllm", packages=find_packages(), version="0.1.0", include_package_data=True) 4 | -------------------------------------------------------------------------------- /tests/test_controller.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from matplotlib import backend_bases 4 | from cllm.agents.tog import Planner 5 | import openai 6 | 7 | from multiprocessing import set_start_method 8 | 9 | openai.api_base = os.environ.get("OPENAI_API_BASE", None) 10 | 11 | 12 | def test_got(): 13 | user_request = "Generate a new image with a similar composition as b3e5f8_image.png, but with a different color scheme" 14 | planner = Planner(backend="local") 15 | subtasks, plan = planner.plan( 16 | user_request, {"video.mp4": "video", "audio_123.wav": "audio"} 17 | ) 18 | 19 | print("User's request: ") 20 | print(user_request) 21 | print("Task decomposition: ") 22 | print(subtasks) 23 | print("Solution: ") 24 | print(plan) 25 | 26 | 27 | def test_tog_api(): 28 | from cllm.services.tog.api import tog, task_decomposer 29 | 30 | user_request = "Generate a new image with a similar composition as b3e5f8_image.png, but with a different color scheme" 31 | subtasks = task_decomposer(user_request) 32 | solution = tog(user_request, subtasks) 33 | print(solution) 34 | 35 | 36 | # test_got_api() 37 | if __name__ == "__main__": 38 | test_got() 39 | # test_tog_api() 40 | -------------------------------------------------------------------------------- /tests/test_tool.py: -------------------------------------------------------------------------------- 1 | from cllm.services.tog.utils import build_tool_prompt 2 | from cllm.agents.builtin.tools import GENERAL_TOOLS 3 | 4 | 5 | def test(): 6 | print(build_tool_prompt(GENERAL_TOOLS[0])) 7 | 8 | # This is a tool that select the target classes in category list with the given condition. It is commonly used to filter out the objects with the same type. 9 | # Args: 10 | # category_list (category): the list to be processed 11 | # condition (text): the condition to select objects 12 | # Returns 13 | # list (list): the selected list 14 | 15 | 16 | def generate_json(): 17 | from cllm.agents.builtin.tools import TOOLS 18 | 19 | tools = [] 20 | for tool in TOOLS.values(): 21 | tool.description = build_tool_prompt(tool) 22 | tools.append(tool.dict()) 23 | import json 24 | 25 | with open("tools.json", "w") as f: 26 | json.dump(tools, f, indent=4) 27 | 28 | 29 | generate_json() 30 | --------------------------------------------------------------------------------
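test_controller.py above imports tog and task_decomposer from cllm.services.tog.api, a module that is not included in this dump. For orientation, a minimal client consistent with the /tog and /task_decomposer endpoints defined in cllm/services/tog/launch.py could look like the sketch below; it is a hypothetical stand-in, not the actual api.py, and the host/port values are assumptions (launch.py defaults to port 10052, while run.sh exports TOG_SERVICE_PORT=10005).

# Hypothetical minimal client for the Thoughts-on-Graph service.
# launch.py exposes POST /task_decomposer (form field: request) and
# POST /tog (form fields: request, subtasks), both returning JSON.
import json
import os
import requests

HOST = "localhost"
PORT = int(os.environ.get("TOG_SERVICE_PORT", 10052))


def task_decomposer(request: str):
    # Ask the service to decompose a user request into subtasks.
    resp = requests.post(
        f"http://{HOST}:{PORT}/task_decomposer", data={"request": request}
    )
    return resp.json()


def tog(request: str, subtasks):
    # Solve the decomposed subtasks with the ToG task solver.
    if not isinstance(subtasks, str):
        subtasks = json.dumps(subtasks)
    resp = requests.post(
        f"http://{HOST}:{PORT}/tog", data={"request": request, "subtasks": subtasks}
    )
    return resp.json()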