├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── README.md ├── agora-banner.png ├── pyproject.toml ├── setup.py └── src └── distiller ├── __init__.py ├── __main__.py ├── agents ├── abstract.py └── huggingface.py ├── base.py ├── cli.py ├── conversations.py ├── outputs.py └── texts.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 | 
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # macOS local attributes
156 | .DS_Store
157 | 
158 | # Local experimentation
159 | lab
160 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Agora
2 | Agora is a new open-source Multi-Modality AI Research Organization devoted to advancing Humanity!
3 | 
4 | ![Agora banner](agora-banner.png)
5 | 
6 | [Join the Agora Discord and contribute to this project or 40+ others!](https://discord.gg/qUtxnK2NMf)
7 | 
8 | 
9 | # The Distiller
10 | 
11 | `The Distiller` is a command-line interface and a Python library for running inference on Large Language Models (LLMs) to generate textual or multi-modal datasets.
12 | 
13 | Possible use cases include:
14 | 
15 | - Constructing textual corpora to train/fine-tune detectors for content written by AI.
16 | - Collecting datasets of LLM-produced conversations for research purposes, analysis of AI performance/impact/ethics, etc.
17 | - Automating a task that an LLM can handle over large amounts of input text. For example, using GPT-3 to summarize 1000 paragraphs with a single CLI command.
18 | - Leveraging the APIs of especially large LLMs to produce diverse texts for a specific task and then fine-tune a smaller model on them.
19 | 
20 | > This tool is distributed freely and doesn't imply any restrictions on the downstream use cases.
21 | > However, you should make sure to follow the **Terms of use** of the backend APIs (OpenAI, Cohere, Petals, etc.) in your specific context.
22 | 
23 | 
24 | # Agora
25 | The Distiller is brought to you by Agora, an open-source multi-modality AI research organization devoted to advancing Humanity!
26 | 27 | 28 | [Join our Discord and help work on this project or 30+ others!](https://discord.gg/qUtxnK2NMf) 29 | 30 | 31 | ## Installation 32 | 33 | ``` 34 | pip install distiller 35 | ``` 36 | 37 | Most of the generation features rely on third-party APIs. Install their respective packages: 38 | 39 | ``` 40 | pip install openai cohere petals 41 | ``` 42 | 43 | ## Usage examples 44 | 45 | ### Inference LLMs at scale 46 | 47 | ```bash 48 | export OPENAI_API_KEY="..." 49 | export COHERE_API_KEY="..." 50 | 51 | distiller texts \ 52 | --prompt "If {country} was a planet in the Star Wars universe it would be called" \ 53 | --backend "openai|text-davinci-003" \ 54 | --backend "cohere|medium" \ 55 | --temperature 0.9 \ 56 | --option country Germany \ 57 | --option country France \ 58 | --max-length 50 \ 59 | --num-samples 1 \ 60 | --single-file 61 | ``` 62 | 63 | The command above should produce a dataset file with 4 texts. Each possible combination of options is used for each of the backend LLMs. Check out the [CLI reference](#cli-reference) for more details. A dataset file looks like this: 64 | 65 | ```json 66 | [ 67 | { 68 | "sample_id": 0, 69 | "backend": "openai|text-davinci-003", 70 | "max_length": 50, 71 | "temperature": 0.9, 72 | "country": "Germany", 73 | "prompt": "If Germany was a planet in the Star Wars universe it would be called", 74 | "output": " Euron. The planet would be home to a powerful and diverse species of aliens, known as the Eurons, that have evolved to a higher level of understanding and technological advancement compared to many of the other planets in the galaxy. The planet would be" 75 | }, 76 | { 77 | "sample_id": 0, 78 | "backend": "openai|text-davinci-003", 79 | "max_length": 50, 80 | "temperature": 0.9, 81 | "country": "France", 82 | "prompt": "If France was a planet in the Star Wars universe it would be called", 83 | "output": " The Empire of Liberty. It would be a peaceful, democratic planet with a strong sense of justice and equality. The planet would be home to many different species of aliens but the majority of its population would be humans. It would have a strong military and" 84 | }, 85 | { 86 | "sample_id": 0, 87 | "backend": "cohere|medium", 88 | "max_length": 50, 89 | "temperature": 0.9, 90 | "country": "Germany", 91 | "prompt": "If Germany was a planet in the Star Wars universe it would be called", 92 | "output": " the Hoth of the universe.\nAfter the Soviet invasion of Eastern Europe and the subsequent Western anti-Soviet sentiment, Germany's arms manufacturers went into hyperdrive and the country churned out guns at a frightening pace. By the early 1930" 93 | }, 94 | ... 95 | ] 96 | ``` 97 | 98 | Alternatively, you can use our [`TextsGenerator`](https://github.com/radi-cho/distiller/blob/main/distiller/texts.py#L27) to produce texts in Python. 99 | 100 | ### Generate conversations with the ChatGPT API 101 | 102 | ```bash 103 | export OPENAI_API_KEY="..." 104 | distiller conversations \ 105 | --length 4 \ 106 | --length 5 \ 107 | --agent1 "You're a shop assistant in a pet store. Answer to customer questions politely." \ 108 | --agent2 "You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store." \ 109 | --temperature 0.1 \ 110 | --temperature 0.2 \ 111 | --option n 2 \ 112 | --option n 3 \ 113 | --path dataset 114 | ``` 115 | 116 | The command above should produce a dataset directory with 16 conversations saved as JSON files. 
You can also choose to save all of them to a single file (see `--single-file`). But why 16? Because `num_samples` dialogues are generated for each possible combination of parameters (conversation length, LLM temperature, and custom prompt options). A dataset item looks like this:
117 | 
118 | ```json
119 | {
120 |     "length": 5,
121 |     "temperature": 0.1,
122 |     "n": "2",
123 |     "agent1": "You're a shop assistant in a pet store. Answer to customer questions politely. When the whole conversation is over end with \"Goodbye\".",
124 |     "agent2": "You're a customer in a pet store. You should behave like a human. You want to buy 2 pets. Ask questions about the pets in the store. When the whole conversation is over end with \"Goodbye\".",
125 |     "utterances": [
126 |         [
127 |             "agent1",
128 |             "Hello! How can I assist you today?"
129 |         ],
130 |         [
131 |             "agent2",
132 |             "Hi! I'm interested in buying two pets. Can you tell me what kind of pets you have available in the store?"
133 |         ],
134 |         [
135 |             "agent1",
136 |             "Certainly! We have a variety of pets available, including dogs, cats, birds, fish, hamsters, guinea pigs, rabbits, and reptiles. Is there a specific type of pet you're interested in?"
137 |         ],
138 |         [
139 |             "agent2",
140 |             "I'm not sure yet. Can you tell me more about the dogs and cats you have available? What breeds do you have?"
141 |         ],
142 |         ...
143 |     ]
144 | }
145 | ```
146 | 
147 | You can also use our [`ConversationsGenerator`](https://github.com/radi-cho/distiller/blob/main/distiller/conversations.py#L43) to produce conversations with the `gpt-3.5-turbo` API programmatically.
148 | 
149 | ```python
150 | from distiller import ConversationsGenerator, ConversationsGeneratorConfig, DatasetWriter
151 | 
152 | dataset_writer = DatasetWriter() # single_file=True
153 | 
154 | generator_config = ConversationsGeneratorConfig(agent1="You're a shop assistant in a pet store. Answer to customer questions politely.",
155 |                                                 agent2="You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store.",
156 |                                                 num_samples=2,
157 |                                                 interruption="length",
158 |                                                 lengths=[4, 5],
159 |                                                 temperatures=[0.1, 0.2],
160 |                                                 options=[("n", "2"), ("n", "3")])
161 | 
162 | conversations_generator = ConversationsGenerator(generator_config)
163 | 
164 | for conversation in conversations_generator:
165 |     dataset_writer.save_intermediate_result(conversation)
166 | ```
167 | 
168 | ## Contributing
169 | 
170 | > Still under active development.
171 | 
172 | Contributions are highly appreciated. The following features are currently under development:
173 | - [x] `distiller conversations` - Make two ChatGPT agents talk with one another and record the conversation history.
174 | - [x] `distiller texts` - Inference different LLMs with a given input prompt and generate multiple outputs by varying parameters.
175 | - [ ] `distiller transformations` - Apply a list of transformations to a list of texts. For example, summarizing a list of texts for a {child | university student | PhD candidate} to understand.
176 | - [ ] Support more backend LLMs.
177 | 
178 | To set up a local development environment:
179 | 
180 | ```bash
181 | git clone -b Epoch2 https://github.com/kyegomez/The-Distiller.git
182 | cd The-Distiller
183 | pip install -e .
184 | ```
185 | 
186 | ## CLI Reference
187 | 
188 | ```
189 | distiller [OPTIONS] COMMAND [ARGS]...
190 | 
191 |   Command line interface that generates datasets with LLMs.
192 | 
193 | Options:
194 |   --help  Show this message and exit.
195 | 196 | Commands: 197 | conversations Produce conversations between two gpt-3.5-turbo agents... 198 | texts Inference multiple LLMs at scale. 199 | ``` 200 | 201 | ``` 202 | distiller texts [OPTIONS] 203 | 204 | Inference multiple LLMs at scale. 205 | 206 | Options: 207 | -p, --prompt TEXT Input prompt. [required] 208 | -b, --backend TEXT LLM APIs to use as backends. Use 209 | "backend|model_name" notation. For example: 210 | "openai|text-davinci-003". 211 | -l, --max-length INTEGER Maximum number of tokens to generate for each 212 | prompt. 213 | -t, --temperature FLOAT Possible temperature values for the backend 214 | language model. 215 | -n, --num-samples INTEGER Number of conversations for each configuration. 216 | -o, --option ... Values for additional options denoted in your 217 | prompts by {OPTION_NAME}. 218 | -f, --path PATH Where to save the dataset. Either a file or a 219 | directory (folder). 220 | -s, --single-file Either save the whole dataset to a single file 221 | or create multiple files. 222 | --help Show this message and exit. 223 | ``` 224 | 225 | - You can specify multiple variants for the following options: `--length`, `--temperature`, `--num-samples`, `--option`. A dataset item will be generated for each possible combination of the supplied values. 226 | - Each `--option` provided must be formatted as follows: `--option option_name "Some option value"`. 227 | - Currently supported backends: GPT-3 model variants by [OpenAI](https://openai.com/blog/openai-api), the language models by [Cohere](https://pypi.org/project/cohere/), BLOOM through the [Petals API](https://petals.ml/). 228 | 229 | ``` 230 | distiller conversations [OPTIONS] 231 | 232 | Produce conversations between two gpt-3.5-turbo agents with given roles. 233 | 234 | Options: 235 | 236 | -k, --openai-api-key TEXT OpenAI API key. 237 | -hf, --hf-api-token TEXT Hugging Face API token. 238 | -a, --agent1 TEXT Agent role description. [required] 239 | -b, --agent2 TEXT Agent role description. [required] 240 | -m, --model-agent [openai|hf] Backend to use for the AI agents. Can be 'openai' for OpenAI's gpt-3.5-turbo or 'hf' for a Hugging Face model. Defaults to 'openai'. 241 | -r, --repo-id TEXT Hugging Face model repo id, required if 'hf' is chosen as model-agent. 242 | -i, --interruption [length|end_phrase] 243 | Interruption mode. 244 | -e, --end-phrase TEXT Interrupt after this phrase is outputted by 245 | one of the agents. 246 | -d, --end-agent [agent1|agent2|both] 247 | In which agent's messages to look for the 248 | end phrase. 249 | -l, --length INTEGER Maximum number of utterances for each agent. 250 | A conversation sample will be generated for 251 | each length. 252 | -t, --temperature FLOAT Possible temperature values for the backend 253 | language model. 254 | -n, --num-samples INTEGER Number of conversations for each 255 | configuration. 256 | -o, --option ... Values for additional options denoted in 257 | your prompts by {OPTION_NAME}. 258 | -f, --path PATH Where to save the dataset. Either a file or 259 | a directory (folder). 260 | -s, --single-file Either save the whole dataset to a single 261 | file or create multiple files. 262 | --help Show this message and exit. 263 | ``` 264 | 265 | - The length parameter specifies how many utterances each agent should make. A length of 4 typically produces 8 utterances in total. 266 | - You can specify either `length` (default) or `end_phrase` as an interruption strategy. 
When using `end_phrase`, a conversation will be interrupted once the `--end-phrase` appears in the messages of the `--end-agent` (which can also be set to both agents). In this case, the lengths provided will be treated as maximum conversation lengths.
267 | 
268 | 
269 | # Roadmap
270 | 
271 | * Conversation transformations like chain-of-thought prompting, conversational summaries, step-by-step breakdowns of conversations, and explain-your-rationale prompts
272 | 
273 | * Dynamically scale up agent instances (2, 4, 100, 1000), all conversing with each other
274 | 
275 | * Add a polymorphic Hugging Face function to use any model on the Hugging Face Hub
276 | 
277 | * Add a CLI argument for which Hugging Face model to use
278 | 
279 | * Integrate tree-of-thoughts prompting
280 | 
281 | * Integrate multi-modality models from Hugging Face
282 | 
283 | * Integrate multi-modal conversations => store images in a folder, then agent2 looks up the image address
284 | 
285 | * Enable multi-modal dataset generation using generate and critique steps
--------------------------------------------------------------------------------
/agora-banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Agora-Lab-AI/The-Distiller/0a4ad46c65e102e90696851c5dbcfa1923161f17/agora-banner.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ['setuptools>=61.0']
3 | build-backend = 'setuptools.build_meta'
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from setuptools import find_packages
3 | 
4 | with open("README.md", "r", encoding = "utf-8") as readme:
5 |     long_description = readme.read()
6 | 
7 | setup(
8 |     name="The Distiller",
9 |     version="0.0.2",
10 |     description="Generate textual and conversational datasets with LLMs.",
11 |     long_description = long_description,
12 |     long_description_content_type = "text/markdown",
13 |     author="Kye Gomez",
14 |     author_email="Kye@apac.ai",
15 |     url="https://github.com/kyegomez/The-Distiller",
16 |     keywords=["dataset", "llm", "langchain", "openai"],
17 |     package_dir={"": "src"},
18 |     packages = find_packages(where="src"),
19 |     install_requires=[
20 |         "langchain>=0.0.113",
21 |         "click>=8.1"
22 |     ],
23 |     entry_points={
24 |         "console_scripts": [
25 |             "distiller=distiller:distiller"
26 |         ],
27 |     },
28 | )
29 | 
--------------------------------------------------------------------------------
/src/distiller/__init__.py:
--------------------------------------------------------------------------------
1 | from .cli import distiller
2 | from .conversations import *
3 | from .texts import *
4 | from .outputs import *
5 | 
--------------------------------------------------------------------------------
/src/distiller/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import main
2 | 
3 | main()
4 | 
--------------------------------------------------------------------------------
/src/distiller/agents/abstract.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | class AbstractLanguageModel(ABC):
4 |     @abstractmethod
5 |     def generate_thoughts(self, state, k):
6 |         pass
7 | 
8 |     @abstractmethod
9 |     def evaluate_states(self, states):
10 |         pass
11 | 
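12 | 
13 | # Illustrative note (not part of the original file): a concrete backend only has
14 | # to implement the two abstract methods above. The class below is a hypothetical,
15 | # minimal sketch; its name and scoring scheme are placeholders.
16 | #
17 | # class DummyLanguageModel(AbstractLanguageModel):
18 | #     def generate_thoughts(self, state, k):
19 | #         # Produce k candidate "thoughts" derived from the current state.
20 | #         return [f"{state} -> candidate {i}" for i in range(k)]
21 | #
22 | #     def evaluate_states(self, states):
23 | #         # Assign every candidate state a neutral score of 0.5.
24 | #         return {state: 0.5 for state in states}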
--------------------------------------------------------------------------------
/src/distiller/agents/huggingface.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | 
4 | from transformers import AutoModelForCausalLM, AutoTokenizer
5 | from .abstract import AbstractLanguageModel  # relative import so the module resolves inside the package
6 | 
7 | 
8 | class HuggingLanguageModel(AbstractLanguageModel):
9 |     def __init__(self, model_name, model_tokenizer=None, verbose=False):
10 |         self.model = AutoModelForCausalLM.from_pretrained(model_name)
11 |         self.tokenizer = AutoTokenizer.from_pretrained(model_tokenizer or model_name)
12 |         self.verbose = verbose
13 | 
14 |     def generate_thoughts(self, state, k, max_length=100):
15 |         state_text = ' '.join(state)
16 |         prompt = f"Write down your observations in format 'Observation:xxxx', then write down your thoughts in format 'Thoughts:xxxx'. Given the current state of reasoning: '{state_text}', generate {k} coherent solutions to achieve {state_text}"
17 | 
18 |         if self.verbose:
19 |             print(f"Generating thoughts for state: {state_text}")
20 | 
21 |         try:
22 |             inputs = self.tokenizer(prompt, return_tensors="pt")
23 |             outputs = self.model.generate(**inputs, max_length=max_length, num_return_sequences=k)
24 |             thoughts = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
25 |         except Exception as e:
26 |             if self.verbose:
27 |                 print(f"Error generating thoughts for state: {state_text}")
28 |                 print(f"Error: {e}")
29 |             thoughts = []
30 | 
31 |         return thoughts
32 | 
33 |     def evaluate_states(self, states, initial_prompt, max_length=10):
34 |         state_values = {}
35 |         for state in states:
36 |             state_text = ' '.join(state)
37 |             prompt = f"Given the current state of reasoning: '{state_text}', pessimistically evaluate its value as a float between 0 and 1 based on its potential to achieve {initial_prompt}"
38 | 
39 |             if self.verbose:
40 |                 print(f"Evaluating state: {state_text}")
41 | 
42 |             try:
43 |                 inputs = self.tokenizer(prompt, return_tensors="pt")
44 |                 outputs = self.model.generate(**inputs, num_return_sequences=1, max_length=max_length)
45 |                 value_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
46 |                 value = float(value_text)
47 |             except ValueError:
48 |                 if self.verbose:
49 |                     print(f"Error converting value to float for state: {state_text}")
50 |                 value = 0  # Assign a default value if the conversion fails
51 |             except Exception as e:
52 |                 if self.verbose:
53 |                     print(f"Error evaluating state: {state_text}")
54 |                     print(f"Error: {e}")
55 |                 value = 0
56 | 
57 |             state_values[state] = value
58 | 
59 |         return state_values
--------------------------------------------------------------------------------
/src/distiller/base.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import List, Any, Dict, Tuple, Generator, Iterator, Protocol
3 | 
4 | OPTIONS_CONFIG_KEYS = ["temperature"]
5 | GENERATOR_CONFIG_KEYS = ["temperatures"]
6 | 
7 | 
8 | class DatasetGeneratorConfig(Protocol):
9 |     """Base generator configuration protocol."""
10 |     openai_api_key: str
11 |     """OpenAI API key."""
12 |     num_samples: int
13 |     """Number of texts to generate for each options combination."""
14 |     options: List[Tuple[str, str]]
15 |     """Additional options defined in the text prompt with curly brackets."""
16 | 
17 | 
18 | class DatasetGenerator:
19 |     """Abstraction of a dataset generator."""
20 | 
21 |     config: DatasetGeneratorConfig
22 |     """Generator configuration."""
23 |     options_configs:
List[Dict[str, Any]] 24 | """Possible combinations of the provided options.""" 25 | generator_index: int = 0 26 | """Index of the next item to be returned by the generator.""" 27 | 28 | def __init__(self, config: DatasetGeneratorConfig) -> None: 29 | self.config = config 30 | self.initialize_options_configs() 31 | 32 | def initialize_options_configs( 33 | self, 34 | options_config_keys: List[str] = OPTIONS_CONFIG_KEYS, 35 | generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS 36 | ) -> None: 37 | """Prepare options combinations.""" 38 | options_keys = ["sample_id", *options_config_keys] 39 | options_values = [range(self.config.num_samples)] 40 | options_values += [getattr(self.config, key) for key in generator_config_keys] 41 | 42 | for option in self.config.options: 43 | if option[0] not in options_keys: 44 | options_keys.append(option[0]) 45 | options_values.append([option[1]]) 46 | else: 47 | index = options_keys.index(option[0]) 48 | if option[1] not in options_values[index]: 49 | options_values[index].append(option[1]) 50 | 51 | self.options_configs = list(map(lambda x: dict(zip(options_keys, x)), 52 | itertools.product(*options_values))) 53 | 54 | def generate_item(self) -> Dict[str, Any]: 55 | """Produce a data item.""" 56 | return {} 57 | 58 | def __next__(self) -> Generator[Dict[str, Any], None, None]: 59 | return self.generate_item() 60 | 61 | def __iter__(self) -> Iterator: 62 | return self 63 | -------------------------------------------------------------------------------- /src/distiller/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from typing import List, Tuple 3 | 4 | from .conversations import ConversationsGeneratorConfig, ConversationsGenerator 5 | from .texts import TextsGeneratorConfig, TextsGenerator 6 | from .outputs import DatasetWriter 7 | 8 | 9 | @click.group() 10 | def distiller() -> None: 11 | """Command line interface that generates datasets with LLMs.""" 12 | pass 13 | 14 | 15 | click_options = click.option("--option", 16 | "-o", 17 | "options", 18 | type=(str, str), 19 | multiple=True, 20 | help="Values for additional options denoted in your prompts by {OPTION_NAME}.") 21 | 22 | click_path = click.option("--path", 23 | "-f", 24 | "path", 25 | type=click.Path(), 26 | help="Where to save the dataset. Either a file or a directory (folder).") 27 | 28 | click_single_file = click.option("--single-file", 29 | "-s", 30 | "single_file", 31 | type=bool, 32 | is_flag=True, 33 | help="Either save the whole dataset to a single file or create multiple files.") 34 | 35 | click_num_samples = click.option("--num-samples", 36 | "-n", 37 | "num_samples", 38 | type=int, 39 | default=1, 40 | help="Number of conversations for each configuration.") 41 | 42 | click_temperatures = click.option("--temperature", 43 | "-t", 44 | "temperatures", 45 | type=float, 46 | multiple=True, 47 | default=[0.5], 48 | help="Possible temperature values for the backend language model.") 49 | 50 | 51 | @click.command() 52 | @click.option("--agents", 53 | "-a", 54 | "agents", 55 | type=str, 56 | multiple=True, 57 | required=True, 58 | help="Agent role descriptions.") 59 | 60 | @click.option("--hf-api-token", 61 | "-hf", 62 | type=str, 63 | envvar="HF_API_KEY", 64 | help="Hugging Face API key.") 65 | 66 | @click.option("--model-agent", 67 | "-m", 68 | type=click.Choice(["openai", "hf"]), 69 | default="openai", 70 | help="Backend to use for the AI agents. Can be 'openai' for OpenAI's gpt-3.5-turbo or 'hf' for a Hugging Face model. 
Defaults to 'openai'.") 71 | 72 | @click.option("--repo-id", 73 | "-r", 74 | type=str, 75 | help="Hugging Face model repo id, required if 'hf' is chosen as model-agent.") 76 | 77 | @click.option("--openai-api-key", 78 | "-k", 79 | "openai_api_key", 80 | type=str, 81 | envvar="OPENAI_API_KEY", 82 | help="OpenAI API key.") 83 | @click.option("--agent1", 84 | "-a", 85 | "agent1", 86 | type=str, 87 | required=True, 88 | help="Agent role description.") 89 | @click.option("--agent2", 90 | "-b", 91 | "agent2", 92 | type=str, 93 | required=True, 94 | help="Agent role description.") 95 | @click.option("--initial-utterance", 96 | "-u", 97 | "initial_utterances", 98 | type=str, 99 | default=["Hello."], 100 | multiple=True, 101 | help="Utterance to be provisioned to the first agent. For many use cases a \"Hello\" is enough.") 102 | @click.option("--interruption", 103 | "-i", 104 | "interruption", 105 | type=click.Choice(["length", "end_phrase"]), 106 | default="length", 107 | help="Interruption mode.") 108 | @click.option("--end-phrase", 109 | "-e", 110 | "end_phrase", 111 | type=str, 112 | default="Goodbye", 113 | help="Interrupt after this phrase is outputted by one of the agents.") 114 | @click.option("--end-agent", 115 | "-d", 116 | "end_agent", 117 | type=click.Choice(["agent1", "agent2", "both"]), 118 | default="both", 119 | help="In which agent's messages to look for the end phrase.") 120 | @click.option("--length", 121 | "-l", 122 | "lengths", 123 | type=int, 124 | multiple=True, 125 | default=[5], 126 | help="Maximum number of utterances for each agent. A conversation sample will be generated for each length.") 127 | @click_temperatures 128 | @click_num_samples 129 | @click_options 130 | @click_path 131 | @click_single_file 132 | def conversations( 133 | agents: List[str], 134 | repo_id: str, 135 | openai_api_key: str, 136 | agent1: str, 137 | agent2: str, 138 | initial_utterances: List[str], 139 | num_samples: int, 140 | interruption: str, 141 | end_phrase: str, 142 | end_agent: str, 143 | lengths: List[int], 144 | temperatures: List[int], 145 | options: List[Tuple[str, str]], 146 | path: str, 147 | single_file: bool 148 | ) -> None: 149 | """Produce conversations between two gpt-3.5-turbo agents with given roles.""" 150 | dataset_writer = DatasetWriter(path, single_file) 151 | 152 | generator_config = ConversationsGeneratorConfig(agents=agents, 153 | openai_api_key=openai_api_key, 154 | agent1=agent1, 155 | agent2=agent2, 156 | initial_utterances=initial_utterances, 157 | num_samples=num_samples, 158 | interruption=interruption, 159 | end_phrase=end_phrase, 160 | end_agent=end_agent, 161 | lengths=lengths, 162 | temperatures=temperatures, 163 | options=options) 164 | 165 | conversations_generator = ConversationsGenerator(generator_config) 166 | 167 | for conversation in conversations_generator: 168 | dataset_writer.save_intermediate_result(conversation) 169 | 170 | 171 | @click.command() 172 | @click.option("--prompt", 173 | "-p", 174 | "prompt", 175 | type=str, 176 | required=True, 177 | help="Input prompt.") 178 | @click.option("--backend", 179 | "-b", 180 | "backends", 181 | type=str, 182 | multiple=True, 183 | default=["openai|text-davinci-003"], 184 | help="LLM APIs to use as backends. Use \"backend|model_name\" notation. 
For example: \"openai|text-davinci-003\".") 185 | @click.option("--max-length", 186 | "-l", 187 | "max_lengths", 188 | type=int, 189 | multiple=True, 190 | default=[100], 191 | help="Maximum number of tokens to generate for each prompt.") 192 | @click_temperatures 193 | @click_num_samples 194 | @click_options 195 | @click_path 196 | @click_single_file 197 | def texts( 198 | agents: List[str], 199 | prompt: str, 200 | num_samples: int, 201 | max_lengths: List[int], 202 | temperatures: List[int], 203 | backends: List[str], 204 | options: List[Tuple[str, str]], 205 | path: str, 206 | single_file: bool 207 | ) -> None: 208 | """Inference multiple LLMs at scale.""" 209 | dataset_writer = DatasetWriter(path, single_file) 210 | 211 | generator_config = TextsGeneratorConfig(agents=agents, 212 | prompt=prompt, 213 | backends=backends, 214 | num_samples=num_samples, 215 | max_lengths=max_lengths, 216 | temperatures=temperatures, 217 | options=options) 218 | 219 | texts_generator = TextsGenerator(generator_config) 220 | 221 | for text_object in texts_generator: 222 | dataset_writer.save_intermediate_result(text_object) 223 | 224 | 225 | distiller.add_command(texts) 226 | distiller.add_command(conversations) 227 | 228 | 229 | def main() -> None: 230 | """Run the distiller CLI.""" 231 | distiller() 232 | -------------------------------------------------------------------------------- /src/distiller/conversations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Any, Dict, Tuple, Union 3 | 4 | from langchain.prompts import ( 5 | ChatPromptTemplate, 6 | MessagesPlaceholder, 7 | SystemMessagePromptTemplate, 8 | HumanMessagePromptTemplate 9 | ) 10 | 11 | from langchain.chains import ConversationChain 12 | from langchain.chat_models import ChatOpenAI 13 | from langchain.memory import ConversationBufferMemory 14 | from langchain.schema import SystemMessage 15 | from langchain import HuggingFaceHub 16 | from .base import DatasetGenerator 17 | 18 | OPTIONS_CONFIG_KEYS = ["length", "temperature", "initial_utterance"] 19 | GENERATOR_CONFIG_KEYS = ["lengths", "temperatures", "initial_utterances"] 20 | 21 | 22 | @dataclass 23 | class ConversationsGeneratorConfig: 24 | agents: List[str] 25 | """List of agent descriptions to construct their system message""" 26 | agent_type: str 27 | """type of language odel either openai or huggingface""" 28 | hf_id: str 29 | """repo id for the hf model""" 30 | openai_api_key: str 31 | """OpenAI API key.""" 32 | agent1: str 33 | """Description of the first agent used to construct its system message.""" 34 | agent2: str 35 | """Description of the second agent used to construct its system message.""" 36 | initial_utterances: List[str] = "Hello." 37 | """Utterances to be provisioned to the first agent.""" 38 | num_samples: int = 1 39 | """Number of conversations to generate for each options combination.""" 40 | interruption: str = "length" 41 | """Interruption mode.""" 42 | end_phrase: str = "Goodbye!" 43 | """Phrase to look for when checking whether to interrupt a conversation.""" 44 | end_agent: str = "both" 45 | """Agent whose messages to check for the interruption phrase.""" 46 | lengths: List[int] = field(default_factory=lambda: [5]) 47 | """Possible lengths of the conversations. 
If end_phrase interruption is enabled these will be used for maximum lengths.""" 48 | temperatures: List[float] = field(default_factory=lambda: [0]) 49 | """Possible temperatures for the backend LLM.""" 50 | options: List[Tuple[str, str]] = field(default_factory=lambda: []) 51 | """Additional options defined in the system prompts with curly brackets.""" 52 | 53 | 54 | class ConversationsGenerator(DatasetGenerator): 55 | """Generator producing conversations between two AI agents.""" 56 | 57 | config: ConversationsGeneratorConfig 58 | """Configuration for a ConversationsGenerator.""" 59 | 60 | def __init__(self, config: ConversationsGeneratorConfig) -> None: 61 | """Initialize ConversationsGenerator.""" 62 | super().__init__(config) 63 | 64 | def initialize_options_configs( 65 | self, 66 | options_config_keys: List[str] = OPTIONS_CONFIG_KEYS, 67 | generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS 68 | ) -> None: 69 | """Prepare options combinations.""" 70 | super().initialize_options_configs(options_config_keys, generator_config_keys) 71 | 72 | def initialize_chain( 73 | self, 74 | agent: str, 75 | system_prompt: str, 76 | conversation_config: Dict[str, Any] 77 | ) -> Tuple[ConversationChain, str]: 78 | """Initialize a conversation and return a chain and a formatted system prompt.""" 79 | if self.config.interruption == "end_phrase": 80 | if self.config.end_agent == agent or self.config.end_agent == "both": 81 | system_prompt += f" When the whole conversation is over end with \"{self.config.end_phrase}\"." 82 | 83 | system_template = SystemMessagePromptTemplate.from_template( 84 | system_prompt) 85 | template_params = {key: conversation_config[key] 86 | for key in system_template.input_variables} 87 | system_message = system_template.format(**template_params).content 88 | 89 | prompt = ChatPromptTemplate.from_messages([ 90 | SystemMessage(content=system_message), 91 | MessagesPlaceholder(variable_name="history"), 92 | HumanMessagePromptTemplate.from_template("{input}") 93 | ]) 94 | 95 | memory = ConversationBufferMemory(return_messages=True) 96 | 97 | 98 | if self.config.agent_type == "openai": 99 | llm = ChatOpenAI(temperature=conversation_config["temperature"], 100 | openai_api_key=self.config.openai_api_key) 101 | elif self.config.agent_type == "huggingface": 102 | llm = HuggingFaceHub(repo_id=self.config.repo_id, model_kwargs={"temperature": conversation_config["temperature"], "max_length": 64}) 103 | 104 | 105 | chain = ConversationChain(memory=memory, prompt=prompt, llm=llm) 106 | 107 | return chain, system_message 108 | 109 | def initialize_chains( 110 | self, 111 | conversation_config: Dict[str, Any] 112 | ) -> Tuple[List[ConversationChain], List[str]]: 113 | chains = [] 114 | system_prompts = [] 115 | for agent in self.config.agents: 116 | chain, system_prompt = self.initialize_chain(agent, agent, conversation_config) 117 | chains.append(chain) 118 | system_prompts.append(system_prompt) 119 | return chains, system_prompts 120 | 121 | def end_phrase_interruption(self, agent: str, message: str) -> bool: 122 | """Check whether to interrupt conversation generation.""" 123 | if self.config.interruption == "end_phrase": 124 | if self.config.end_agent == agent or self.config.end_agent == "both": 125 | if self.config.end_phrase in message: 126 | return True 127 | 128 | return False 129 | 130 | def generate_item(self) -> Dict[str, Union[List[List[Any]], float, int]]: 131 | """Run two chains to talk with one another and record the chat history.""" 132 | if self.generator_index >= 
len(self.options_configs):
133 |             raise StopIteration()
134 | 
135 |         conversation_config = self.options_configs[self.generator_index]
136 |         self.generator_index += 1
137 | 
138 |         chains, system_prompts = self.initialize_chains(conversation_config)  # one chain per configured agent
139 | 
140 |         utterances = []
141 | 
142 | 
143 |         chain_inp = conversation_config["initial_utterance"]
144 |         for _ in range(conversation_config["length"]):
145 |             for i, chain in enumerate(chains):
146 |                 agent = f"agent{i + 1}"
147 |                 chain_out = chain.predict(input=chain_inp)
148 |                 utterances.append([agent, chain_out])
149 | 
150 |                 if self.end_phrase_interruption(agent, chain_out):
151 |                     break
152 | 
153 |                 chain_inp = chain_out
154 | 
155 | 
156 |         return {**conversation_config,
157 |                 **{f"agent{i + 1}": system_prompts[i] for i in range(len(system_prompts))},
158 |                 "utterances": utterances}
--------------------------------------------------------------------------------
/src/distiller/outputs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | 
4 | from uuid import uuid4
5 | from typing import Dict, Any, List
6 | 
7 | 
8 | class DatasetWriter:
9 |     """Handle outputting dataset items."""
10 | 
11 |     single_file: bool
12 |     """Whether to save all dataset items in a single file."""
13 |     path: str
14 |     """Path of the output file or directory."""
15 |     dataset_items: List[Dict[str, Any]]
16 |     """Collection of all the items in the current dataset."""
17 | 
18 |     def __init__(self, path: str = None, single_file: bool = False) -> None:
19 |         """Initialize DatasetWriter."""
20 |         if path is None and single_file:
21 |             path = self.get_unique_filename(os.getcwd())
22 |         elif path is None and not single_file:
23 |             path = self.get_unique_dirname(os.getcwd())
24 |         elif os.path.isdir(path) and single_file:
25 |             path = self.get_unique_filename(path)
26 |         elif os.path.isfile(path) and not single_file:
27 |             raise ValueError(
28 |                 "Cannot write to a file with the single_file mode disabled.
Try setting --single-file.") 29 | 30 | self.single_file = single_file 31 | self.path = path 32 | self.dataset_items = [] 33 | 34 | def get_unique_dirname(self, base_path): 35 | """Get a unique dirname.""" 36 | return os.path.join(base_path, str(uuid4())) 37 | 38 | def get_unique_filename(self, base_path): 39 | """Get a unique filename.""" 40 | return os.path.join(base_path, f"{uuid4()}.json") 41 | 42 | def save_intermediate_result(self, result: Dict[str, Any]): 43 | """Either save an item to its own file or concatenate it with all dataset items in a single file.""" 44 | if self.single_file: 45 | self.dataset_items.append(result) 46 | 47 | current_directory = os.path.dirname(self.path) 48 | if current_directory != "" and current_directory != ".": 49 | os.makedirs(current_directory, exist_ok=True) 50 | 51 | with open(self.path, "w") as output_file: 52 | json.dump(self.dataset_items, output_file) 53 | else: 54 | current_filepath = self.get_unique_filename(self.path) 55 | 56 | os.makedirs(self.path, exist_ok=True) 57 | with open(current_filepath, "w") as output_file: 58 | json.dump(result, output_file) 59 | -------------------------------------------------------------------------------- /src/distiller/texts.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Any, Dict, Tuple, Union 3 | 4 | from langchain.prompts import PromptTemplate 5 | from langchain.llms import BaseLLM 6 | from langchain.chains import LLMChain 7 | 8 | from .base import DatasetGenerator 9 | 10 | OPTIONS_CONFIG_KEYS = ["backend", "max_length", "temperature"] 11 | GENERATOR_CONFIG_KEYS = ["backends", "max_lengths", "temperatures"] 12 | 13 | 14 | @dataclass 15 | class TextsGeneratorConfig: 16 | agents: List[str] 17 | """An array that allows you to dynamically scale up agents""" 18 | prompt: str 19 | """Text prompt.""" 20 | backends: List[Tuple[str, str, str]] 21 | """LLM APIs to use as backends.""" 22 | num_samples: int = 1 23 | """Number of texts to generate for each options combination.""" 24 | max_lengths: List[int] = field(default_factory=lambda: [5]) 25 | """Maximum lengths in tokens for the output of each generation.""" 26 | temperatures: List[float] = field(default_factory=lambda: [0]) 27 | """Possible temperatures for the backend LLM.""" 28 | options: List[Tuple[str, str]] = field(default_factory=lambda: []) 29 | """Additional options defined in the system prompts with curly brackets.""" 30 | 31 | 32 | class TextsGenerator(DatasetGenerator): 33 | """Generator producing texts by varying model parameters and prompt options.""" 34 | 35 | config: TextsGeneratorConfig 36 | """Configuration for a TextsGenerator.""" 37 | 38 | def __init__(self, config: TextsGeneratorConfig) -> None: 39 | """Initialize TextsGenerator.""" 40 | super().__init__(config) 41 | 42 | def initialize_options_configs( 43 | self, 44 | options_config_keys: List[str] = OPTIONS_CONFIG_KEYS, 45 | generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS 46 | ) -> None: 47 | """Prepare options combinations.""" 48 | super().initialize_options_configs(options_config_keys, generator_config_keys) 49 | 50 | def initialize_backend(self, text_config: Dict[str, Any]) -> BaseLLM: 51 | """Initialize a specific LLM.""" 52 | backend_str = text_config["backend"] 53 | temperature = text_config["temperature"] 54 | max_length = text_config["max_length"] 55 | 56 | backend, model = backend_str.split("|") 57 | 58 | if backend.lower() == "openai": 59 | from langchain.llms 
import OpenAI
60 |             llm = OpenAI(model_name=model,
61 |                          temperature=temperature,
62 |                          max_tokens=max_length)
63 |         elif backend.lower() == "cohere":
64 |             from langchain.llms import Cohere
65 |             llm = Cohere(model=model,
66 |                          temperature=temperature,
67 |                          max_tokens=max_length)
68 |         elif backend.lower() == "petals":
69 |             from langchain.llms import Petals
70 |             llm = Petals(model_name=model,
71 |                          temperature=temperature,
72 |                          max_new_tokens=max_length)
73 |         elif backend.lower() == "huggingface":
74 |             from langchain import HuggingFaceHub
75 |             llm = HuggingFaceHub(repo_id=model,
76 |                                  model_kwargs={"temperature": temperature,
77 |                                                "max_length": max_length})
78 |         else:
79 |             raise ValueError("Cannot use the specified backend.")
80 | 
81 |         return llm
82 | 
83 |     def initialize_backends(self, text_config: Dict[str, Any]) -> List[BaseLLM]:
84 |         backends = []
85 |         for _ in self.config.agents:
86 |             backend = self.initialize_backend(text_config)
87 |             backends.append(backend)
88 |         return backends
89 | 
90 |     def generate_item(self) -> Dict[str, Union[List[List[Any]], float, int]]:
91 |         """Produce text with an LLM chain."""
92 |         if self.generator_index >= len(self.options_configs):
93 |             raise StopIteration()
94 | 
95 |         text_config = self.options_configs[self.generator_index]
96 |         self.generator_index += 1
97 | 
98 |         input_variables = text_config.keys() - ["sample_id",
99 |                                                 "backend",
100 |                                                 "temperature",
101 |                                                 "max_length"]
102 | 
103 |         prompt_template = PromptTemplate(template=self.config.prompt,
104 |                                          input_variables=input_variables)
105 | 
106 |         prompt_params = {k: text_config[k] for k in input_variables}
107 |         input_prompt = prompt_template.format(**prompt_params)
108 | 
109 |         # One backend LLM is initialized per configured agent, and the same
110 |         # prompt is run through each of them.
111 |         backends = self.initialize_backends(text_config)
112 | 
113 |         outputs = []
114 |         for backend in backends:
115 |             chain = LLMChain(prompt=prompt_template, llm=backend)
116 |             output = chain.predict(**prompt_params)
117 |             outputs.append(output)
118 | 
119 |         return {**text_config,
120 |                 "prompt": input_prompt,
121 |                 "outputs": outputs}
122 | 
123 | 
124 | if __name__ == "__main__":  # run the example only when executed directly, not on package import
125 |     agents = [
126 |         "You're a shop assistant in a pet store. Answer to customer questions politely.",
127 |         "You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store.",
128 |         "You're another customer in the pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store."
129 |     ]
130 | 
131 |     generator_config = TextsGeneratorConfig(prompt="Your prompt with an {n} placeholder",
132 |                                             agents=agents,
133 |                                             backends=["huggingface|distilgpt2"],
134 |                                             num_samples=2,
135 |                                             max_lengths=[49],
136 |                                             temperatures=[0.1, 0.2],
137 |                                             options=[("n", "2"), ("n", "3")])
138 | 
139 |     texts_generator = TextsGenerator(generator_config)
140 | 
141 |     for text in texts_generator:
142 |         print(text)
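143 | 
144 |     # Optional sketch (hypothetical path/values): instead of printing, the generated
145 |     # items could be persisted with the DatasetWriter from distiller.outputs:
146 |     #
147 |     # from distiller import DatasetWriter
148 |     # writer = DatasetWriter(path="texts-dataset.json", single_file=True)
149 |     # for item in texts_generator:
150 |     #     writer.save_intermediate_result(item)
--------------------------------------------------------------------------------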