├── .gitignore ├── README.md ├── pyproject.toml ├── setup.py └── src └── datasetGPT ├── __init__.py ├── __main__.py ├── base.py ├── cli.py ├── conversations.py ├── outputs.py └── texts.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # macOS local attributes 156 | .DS_Store 157 | 158 | # Local experimentation 159 | lab 160 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # datasetGPT 2 | 3 | `datasetGPT` is a command-line interface and a Python library for inferencing Large Language Models to generate textual datasets. 4 | 5 | Possible use cases may include: 6 | 7 | - Constructing textual corpora to train/fine-tune detectors for content written by AI. 8 | - Collecting datasets of LLM-produced conversations for research purposes, analysis of AI performance/impact/ethics, etc. 9 | - Automating a task that a LLM can handle over big amounts of input texts. For example, using GPT-3 to summarize 1000 paragraphs with a single CLI command. 10 | - Leveraging APIs of especially big LLMs to produce diverse texts for a specific task and then fine-tune a smaller model with them. 11 | 12 | > This tool is distributed freely and doesn't imply any restrictions on the downstream use cases. 13 | > However, you should make sure to follow the **Terms of use** of the backend APIs (OpenAI, Cohere, Petals, etc.) in your specific context. 14 | 15 | ## Installation 16 | 17 | ``` 18 | pip install datasetGPT 19 | ``` 20 | 21 | Most of the generation features rely on third-party APIs. Install their respective packages: 22 | 23 | ``` 24 | pip install openai cohere petals 25 | ``` 26 | 27 | ## Usage examples 28 | 29 | ### Inference LLMs at scale 30 | 31 | ```bash 32 | export OPENAI_API_KEY="..." 33 | export COHERE_API_KEY="..." 34 | 35 | datasetGPT texts \ 36 | --prompt "If {country} was a planet in the Star Wars universe it would be called" \ 37 | --backend "openai|text-davinci-003" \ 38 | --backend "cohere|medium" \ 39 | --temperature 0.9 \ 40 | --option country Germany \ 41 | --option country France \ 42 | --max-length 50 \ 43 | --num-samples 1 \ 44 | --single-file 45 | ``` 46 | 47 | The command above should produce a dataset file with 4 texts. Each possible combination of options is used for each of the backend LLMs. Check out the [CLI reference](#cli-reference) for more details. A dataset file looks like this: 48 | 49 | ```json 50 | [ 51 | { 52 | "sample_id": 0, 53 | "backend": "openai|text-davinci-003", 54 | "max_length": 50, 55 | "temperature": 0.9, 56 | "country": "Germany", 57 | "prompt": "If Germany was a planet in the Star Wars universe it would be called", 58 | "output": " Euron. The planet would be home to a powerful and diverse species of aliens, known as the Eurons, that have evolved to a higher level of understanding and technological advancement compared to many of the other planets in the galaxy. The planet would be" 59 | }, 60 | { 61 | "sample_id": 0, 62 | "backend": "openai|text-davinci-003", 63 | "max_length": 50, 64 | "temperature": 0.9, 65 | "country": "France", 66 | "prompt": "If France was a planet in the Star Wars universe it would be called", 67 | "output": " The Empire of Liberty. It would be a peaceful, democratic planet with a strong sense of justice and equality. The planet would be home to many different species of aliens but the majority of its population would be humans. It would have a strong military and" 68 | }, 69 | { 70 | "sample_id": 0, 71 | "backend": "cohere|medium", 72 | "max_length": 50, 73 | "temperature": 0.9, 74 | "country": "Germany", 75 | "prompt": "If Germany was a planet in the Star Wars universe it would be called", 76 | "output": " the Hoth of the universe.\nAfter the Soviet invasion of Eastern Europe and the subsequent Western anti-Soviet sentiment, Germany's arms manufacturers went into hyperdrive and the country churned out guns at a frightening pace. By the early 1930" 77 | }, 78 | ... 79 | ] 80 | ``` 81 | 82 | Alternatively, you can use our [`TextsGenerator`](https://github.com/radi-cho/datasetGPT/blob/main/datasetGPT/texts.py#L27) to produce texts in Python. 83 | 84 | ### Generate conversations with the ChatGPT API 85 | 86 | ```bash 87 | export OPENAI_API_KEY="..." 88 | datasetGPT conversations \ 89 | --length 4 \ 90 | --length 5 \ 91 | --agent1 "You're a shop assistant in a pet store. Answer to customer questions politely." \ 92 | --agent2 "You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store." \ 93 | --temperature 0.1 \ 94 | --temperature 0.2 \ 95 | --option n 2 \ 96 | --option n 3 \ 97 | --path dataset 98 | ``` 99 | 100 | The command above should produce a dataset directory with 16 conversations saved as JSON files. You can specify if you want all of them to be saved in a single file. But why 16? Because `num_samples` dialogues are generated for each possible combination of parameters (conversation length, LLM temperature, and custom prompt options). A dataset item looks like this: 101 | 102 | ```json 103 | { 104 | "length": 5, 105 | "temperature": 0.1, 106 | "n": "2", 107 | "agent1": "You're a shop assistant in a pet store. Answer to customer questions politely. When the whole conversation is over end with \"Goodbye\".", 108 | "agent2": "You're a customer in a pet store. You should behave like a human. You want to buy 2 pets. Ask questions about the pets in the store. When the whole conversation is over end with \"Goodbye\".", 109 | "utterances": [ 110 | [ 111 | "agent1", 112 | "Hello! How can I assist you today?" 113 | ], 114 | [ 115 | "agent2", 116 | "Hi! I'm interested in buying two pets. Can you tell me what kind of pets you have available in the store?" 117 | ], 118 | [ 119 | "agent1", 120 | "Certainly! We have a variety of pets available, including dogs, cats, birds, fish, hamsters, guinea pigs, rabbits, and reptiles. Is there a specific type of pet you're interested in?" 121 | ], 122 | [ 123 | "agent2", 124 | "I'm not sure yet. Can you tell me more about the dogs and cats you have available? What breeds do you have?" 125 | ], 126 | ... 127 | ] 128 | } 129 | ``` 130 | 131 | You can also use our [`ConversationsGenerator`](https://github.com/radi-cho/datasetGPT/blob/main/datasetGPT/conversations.py#L43) to produce texts with the `gpt-3.5-turbo` API programatically. 132 | 133 | ```python 134 | from datasetGPT import ConversationsGenerator, ConversationsGeneratorConfig, DatasetWriter 135 | 136 | dataset_writer = DatasetWriter() # single_file=True 137 | 138 | generator_config = ConversationsGeneratorConfig(agent1="You're a shop assistant in a pet store. Answer to customer questions politely.", 139 | agent2="You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store.", 140 | num_samples=2, 141 | interruption="length", 142 | lengths=[4, 5], 143 | temperatures=[0.1, 0.2], 144 | options=[("n", "2"), ("n", "3")], 145 | model="gpt-3.5-turbo", # or model="gpt-4" 146 | # Specific models per agent can be set. If one of the parameters is set, the other must also be provided, otherwise both agents use the "model" value. 147 | model_agent_one="gpt-3.5-turbo", 148 | model_agent_two="gpt-4" 149 | 150 | ) 151 | 152 | conversations_generator = ConversationsGenerator(generator_config) 153 | 154 | for conversation in conversations_generator: 155 | dataset_writer.save_intermediate_result(conversation) 156 | ``` 157 | 158 | ## Contributing 159 | 160 | > Still under active development. 161 | 162 | Contributions will be highly appreciated. Currently these features are under development: 163 | 164 | - [x] `datasetGPT conversations` - Make two ChatGPT agents talk with one another and record the conversation history. 165 | - [x] `datasetGPT texts` - Inference different LLMs with a given input prompt and generate multiple outputs by varying parameters. 166 | - [ ] `datasetGPT transformations` - Apply a list of transformations to a list of texts. For example, summarizing a list of texts for a {child | university student | PhD candidate} to understand. 167 | - [ ] Support more backend LLMs. 168 | 169 | To set up a local development environment: 170 | 171 | ```bash 172 | git clone https://github.com/radi-cho/datasetGPT/ 173 | cd datasetGPT 174 | pip install -e . 175 | ``` 176 | 177 | ## CLI Reference 178 | 179 | ``` 180 | datasetGPT [OPTIONS] COMMAND [ARGS]... 181 | 182 | Command line interface that generates datasets with LLMs. 183 | 184 | Options: 185 | --help Show this message and exit. 186 | 187 | Commands: 188 | conversations Produce conversations between two gpt-3.5-turbo agents... 189 | texts Inference multiple LLMs at scale. 190 | ``` 191 | 192 | ``` 193 | datasetGPT texts [OPTIONS] 194 | 195 | Inference multiple LLMs at scale. 196 | 197 | Options: 198 | -p, --prompt TEXT Input prompt. [required] 199 | -b, --backend TEXT LLM APIs to use as backends. Use 200 | "backend|model_name" notation. For example: 201 | "openai|text-davinci-003". 202 | -l, --max-length INTEGER Maximum number of tokens to generate for each 203 | prompt. 204 | -t, --temperature FLOAT Possible temperature values for the backend 205 | language model. 206 | -n, --num-samples INTEGER Number of conversations for each configuration. 207 | -o, --option ... Values for additional options denoted in your 208 | prompts by {OPTION_NAME}. 209 | -f, --path PATH Where to save the dataset. Either a file or a 210 | directory (folder). 211 | -s, --single-file Either save the whole dataset to a single file 212 | or create multiple files. 213 | --help Show this message and exit. 214 | ``` 215 | 216 | - You can specify multiple variants for the following options: `--length`, `--temperature`, `--num-samples`, `--option`. A dataset item will be generated for each possible combination of the supplied values. 217 | - Each `--option` provided must be formatted as follows: `--option option_name "Some option value"`. 218 | - Currently supported backends: GPT-3 and GPT-4 model variants by [OpenAI](https://openai.com/blog/openai-api), the language models by [Cohere](https://pypi.org/project/cohere/), BLOOM through the [Petals API](https://petals.ml/). 219 | 220 | ``` 221 | datasetGPT conversations [OPTIONS] 222 | 223 | Produce conversations between two gpt-3.5-turbo agents with given roles. 224 | 225 | Options: 226 | -k, --openai-api-key TEXT OpenAI API key. 227 | -a, --agent1 TEXT Agent role description. [required] 228 | -b, --agent2 TEXT Agent role description. [required] 229 | -i, --interruption [length|end_phrase] 230 | Interruption mode. 231 | -e, --end-phrase TEXT Interrupt after this phrase is outputted by 232 | one of the agents. 233 | -d, --end-agent [agent1|agent2|both] 234 | In which agent's messages to look for the 235 | end phrase. 236 | -l, --length INTEGER Maximum number of utterances for each agent. 237 | A conversation sample will be generated for 238 | each length. 239 | -t, --temperature FLOAT Possible temperature values for the backend 240 | language model. 241 | -m, --model [gpt-3.5-turbo|gpt-4] 242 | OpenAI Chat model to use. GPT-4 is only 243 | supported if provided API key has access to 244 | GPT4. Defaults to GPT-3.5-Turbo. 245 | -m1, --model-agent1 [gpt-3.5-turbo|gpt-4] 246 | OpenAI Chat model to use for agent1. GPT-4 247 | is only supported if given API key has 248 | access to GPT4. Defaults to GPT-3.5-Turbo. 249 | If set, --model-agent2 must also be 250 | provided, otherwise --model value will be 251 | used. 252 | -m2, --model-agent2 [gpt-3.5-turbo|gpt-4] 253 | OpenAI Chat model to use for agent2. GPT-4 254 | is only supported if given API key has 255 | access to GPT4. Defaults to GPT-3.5-Turbo If 256 | set, --model-agent1 must also be provided, 257 | otherwise --model value will be used. 258 | -n, --num-samples INTEGER Number of conversations for each 259 | configuration. 260 | -o, --option ... Values for additional options denoted in 261 | your prompts by {OPTION_NAME}. 262 | -f, --path PATH Where to save the dataset. Either a file or 263 | a directory (folder). 264 | -s, --single-file Either save the whole dataset to a single 265 | file or create multiple files. 266 | --help Show this message and exit. 267 | ``` 268 | 269 | - The length parameter specifies how many utterances each agent should make. A length of 4 typically produces 8 utterances in total. 270 | - You can specify either `length` (default) or `end_phrase` as an interruption strategy. When using `end_phrase` a conversation will be interrupted once the `--end-phrase` has appeared in the messages of the `--end-agent` (could be both). In this case, the lengths provided will be treated as maximum conversation lengths. 271 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ['setuptools>=61.0'] 3 | build-backend = 'setuptools.build_meta' -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | with open("README.md", "r", encoding = "utf-8") as readme: 5 | long_description = readme.read() 6 | 7 | setup( 8 | name="datasetGPT", 9 | version="0.0.4", 10 | description="Generate textual and conversational datasets with LLMs.", 11 | long_description = long_description, 12 | long_description_content_type = "text/markdown", 13 | author="Radostin Cholakov", 14 | author_email="radicho123@gmail.com", 15 | url="https://github.com/radi-cho/datasetGPT", 16 | # download_url="https://github.com/radi-cho/datasetGPT/archive/v0.0.1.tar.gz", 17 | keywords=["dataset", "llm", "langchain", "openai"], 18 | package_dir={"": "src"}, 19 | packages = find_packages(where="src"), 20 | install_requires=[ 21 | "langchain>=0.0.113", 22 | "click>=8.1" 23 | ], 24 | entry_points={ 25 | "console_scripts": [ 26 | "datasetGPT=datasetGPT:datasetGPT" 27 | ], 28 | }, 29 | ) 30 | -------------------------------------------------------------------------------- /src/datasetGPT/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import datasetGPT 2 | from .conversations import * 3 | from .texts import * 4 | from .outputs import * 5 | -------------------------------------------------------------------------------- /src/datasetGPT/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import main 2 | 3 | main() 4 | -------------------------------------------------------------------------------- /src/datasetGPT/base.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, Any, Dict, Tuple, Generator, Iterator, Protocol 3 | 4 | OPTIONS_CONFIG_KEYS = ["temperature"] 5 | GENERATOR_CONFIG_KEYS = ["temperatures"] 6 | 7 | 8 | class DatasetGeneratorConfig(Protocol): 9 | """Base generator configuration protocol.""" 10 | openai_api_key: str 11 | """OpenAI API key.""" 12 | num_samples: int 13 | """Number of texts to generate for each options combination.""" 14 | options: List[Tuple[str, str]] 15 | """Additional options defined in the text prompt with curly brackets.""" 16 | 17 | 18 | class DatasetGenerator: 19 | """Abstraction of a dataset generator.""" 20 | 21 | config: DatasetGeneratorConfig 22 | """Generator configuration.""" 23 | options_configs: List[Dict[str, Any]] 24 | """Possible combinations of the provided options.""" 25 | generator_index: int = 0 26 | """Index of the next item to be returned by the generator.""" 27 | 28 | def __init__(self, config: DatasetGeneratorConfig) -> None: 29 | self.config = config 30 | self.initialize_options_configs() 31 | 32 | def initialize_options_configs( 33 | self, 34 | options_config_keys: List[str] = OPTIONS_CONFIG_KEYS, 35 | generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS 36 | ) -> None: 37 | """Prepare options combinations.""" 38 | options_keys = ["sample_id", *options_config_keys] 39 | options_values = [range(self.config.num_samples)] 40 | options_values += [getattr(self.config, key) for key in generator_config_keys] 41 | 42 | for option in self.config.options: 43 | if option[0] not in options_keys: 44 | options_keys.append(option[0]) 45 | options_values.append([option[1]]) 46 | else: 47 | index = options_keys.index(option[0]) 48 | if option[1] not in options_values[index]: 49 | options_values[index].append(option[1]) 50 | 51 | self.options_configs = list(map(lambda x: dict(zip(options_keys, x)), 52 | itertools.product(*options_values))) 53 | 54 | def generate_item(self) -> Dict[str, Any]: 55 | """Produce a data item.""" 56 | return {} 57 | 58 | def __next__(self) -> Generator[Dict[str, Any], None, None]: 59 | return self.generate_item() 60 | 61 | def __iter__(self) -> Iterator: 62 | return self 63 | -------------------------------------------------------------------------------- /src/datasetGPT/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from typing import List, Tuple 3 | 4 | from .conversations import ConversationsGeneratorConfig, ConversationsGenerator 5 | from .texts import TextsGeneratorConfig, TextsGenerator 6 | from .outputs import DatasetWriter 7 | 8 | 9 | @click.group() 10 | def datasetGPT() -> None: 11 | """Command line interface that generates datasets with LLMs.""" 12 | pass 13 | 14 | 15 | click_options = click.option("--option", 16 | "-o", 17 | "options", 18 | type=(str, str), 19 | multiple=True, 20 | help="Values for additional options denoted in your prompts by {OPTION_NAME}.") 21 | 22 | click_path = click.option("--path", 23 | "-f", 24 | "path", 25 | type=click.Path(), 26 | help="Where to save the dataset. Either a file or a directory (folder).") 27 | 28 | click_single_file = click.option("--single-file", 29 | "-s", 30 | "single_file", 31 | type=bool, 32 | is_flag=True, 33 | help="Either save the whole dataset to a single file or create multiple files.") 34 | 35 | click_num_samples = click.option("--num-samples", 36 | "-n", 37 | "num_samples", 38 | type=int, 39 | default=1, 40 | help="Number of conversations for each configuration.") 41 | 42 | click_temperatures = click.option("--temperature", 43 | "-t", 44 | "temperatures", 45 | type=float, 46 | multiple=True, 47 | default=[0.5], 48 | help="Possible temperature values for the backend language model.") 49 | 50 | 51 | @click.command() 52 | @click.option("--openai-api-key", 53 | "-k", 54 | "openai_api_key", 55 | type=str, 56 | envvar="OPENAI_API_KEY", 57 | help="OpenAI API key.") 58 | @click.option("--agent1", 59 | "-a", 60 | "agent1", 61 | type=str, 62 | required=True, 63 | help="Agent role description.") 64 | @click.option("--agent2", 65 | "-b", 66 | "agent2", 67 | type=str, 68 | required=True, 69 | help="Agent role description.") 70 | @click.option("--initial-utterance", 71 | "-u", 72 | "initial_utterances", 73 | type=str, 74 | default=["Hello."], 75 | multiple=True, 76 | help="Utterance to be provisioned to the first agent. For many use cases a \"Hello\" is enough.") 77 | @click.option("--interruption", 78 | "-i", 79 | "interruption", 80 | type=click.Choice(["length", "end_phrase"]), 81 | default="length", 82 | help="Interruption mode.") 83 | @click.option("--end-phrase", 84 | "-e", 85 | "end_phrase", 86 | type=str, 87 | default="Goodbye", 88 | help="Interrupt after this phrase is outputted by one of the agents.") 89 | @click.option("--end-agent", 90 | "-d", 91 | "end_agent", 92 | type=click.Choice(["agent1", "agent2", "both"]), 93 | default="both", 94 | help="In which agent's messages to look for the end phrase.") 95 | @click.option("--length", 96 | "-l", 97 | "lengths", 98 | type=int, 99 | multiple=True, 100 | default=[5], 101 | help="Maximum number of utterances for each agent. A conversation sample will be generated for each length.") 102 | @click.option("--model", 103 | "-m", 104 | "model", 105 | type=click.Choice(["gpt-3.5-turbo", "gpt-4"]), 106 | multiple=False, 107 | default="gpt-3.5-turbo", 108 | help="OpenAI Chat model to use. GPT-4 is only supported if given API key has access to GPT4. Defaults to GPT-3.5-Turbo.") 109 | @click.option("--model-agent1", 110 | "-m1", 111 | "model_agent_one", 112 | type=click.Choice(["gpt-3.5-turbo", "gpt-4"]), 113 | multiple=False, 114 | help="OpenAI Chat model to use for agent1. GPT-4 is only supported if given API key has access to GPT4. Defaults to GPT-3.5-Turbo. If set, --model-agent2 must also be provided, otherwise --model value will be used.") 115 | @click.option("--model-agent2", 116 | "-m2", 117 | "model_agent_two", 118 | type=click.Choice(["gpt-3.5-turbo", "gpt-4"]), 119 | multiple=False, 120 | help="OpenAI Chat model to use for agent2. GPT-4 is only supported if given API key has access to GPT4. Defaults to GPT-3.5-Turbo If set, --model-agent1 must also be provided, otherwise --model value will be used.") 121 | @click_temperatures 122 | @click_num_samples 123 | @click_options 124 | @click_path 125 | @click_single_file 126 | def conversations( 127 | openai_api_key: str, 128 | agent1: str, 129 | agent2: str, 130 | initial_utterances: List[str], 131 | num_samples: int, 132 | interruption: str, 133 | end_phrase: str, 134 | end_agent: str, 135 | lengths: List[int], 136 | temperatures: List[int], 137 | options: List[Tuple[str, str]], 138 | path: str, 139 | single_file: bool, 140 | model: str, 141 | model_agent_one: str, 142 | model_agent_two: str 143 | ) -> None: 144 | """Produce conversations between two gpt-3.5-turbo agents with given roles.""" 145 | dataset_writer = DatasetWriter(path, single_file) 146 | 147 | generator_config = ConversationsGeneratorConfig(openai_api_key=openai_api_key, 148 | agent1=agent1, 149 | agent2=agent2, 150 | initial_utterances=initial_utterances, 151 | num_samples=num_samples, 152 | interruption=interruption, 153 | end_phrase=end_phrase, 154 | end_agent=end_agent, 155 | lengths=lengths, 156 | temperatures=temperatures, 157 | options=options, 158 | model=model, 159 | model_agent_one=model_agent_one, 160 | model_agent_two=model_agent_two) 161 | 162 | conversations_generator = ConversationsGenerator(generator_config) 163 | 164 | for conversation in conversations_generator: 165 | dataset_writer.save_intermediate_result(conversation) 166 | 167 | 168 | @click.command() 169 | @click.option("--prompt", 170 | "-p", 171 | "prompt", 172 | type=str, 173 | required=True, 174 | help="Input prompt.") 175 | @click.option("--backend", 176 | "-b", 177 | "backends", 178 | type=str, 179 | multiple=True, 180 | default=["openai|text-davinci-003"], 181 | help="LLM APIs to use as backends. Use \"backend|model_name\" notation. For example: \"openai|text-davinci-003\".") 182 | @click.option("--max-length", 183 | "-l", 184 | "max_lengths", 185 | type=int, 186 | multiple=True, 187 | default=[100], 188 | help="Maximum number of tokens to generate for each prompt.") 189 | @click_temperatures 190 | @click_num_samples 191 | @click_options 192 | @click_path 193 | @click_single_file 194 | def texts( 195 | prompt: str, 196 | num_samples: int, 197 | max_lengths: List[int], 198 | temperatures: List[int], 199 | backends: List[str], 200 | options: List[Tuple[str, str]], 201 | path: str, 202 | single_file: bool 203 | ) -> None: 204 | """Inference multiple LLMs at scale.""" 205 | dataset_writer = DatasetWriter(path, single_file) 206 | 207 | generator_config = TextsGeneratorConfig(prompt=prompt, 208 | backends=backends, 209 | num_samples=num_samples, 210 | max_lengths=max_lengths, 211 | temperatures=temperatures, 212 | options=options) 213 | 214 | texts_generator = TextsGenerator(generator_config) 215 | 216 | for text_object in texts_generator: 217 | dataset_writer.save_intermediate_result(text_object) 218 | 219 | 220 | datasetGPT.add_command(texts) 221 | datasetGPT.add_command(conversations) 222 | 223 | 224 | def main() -> None: 225 | """Run the datasetGPT CLI.""" 226 | datasetGPT() 227 | -------------------------------------------------------------------------------- /src/datasetGPT/conversations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Any, Dict, Tuple, Union 3 | 4 | from langchain.prompts import ( 5 | ChatPromptTemplate, 6 | MessagesPlaceholder, 7 | SystemMessagePromptTemplate, 8 | HumanMessagePromptTemplate 9 | ) 10 | 11 | from langchain.chains import ConversationChain 12 | from langchain.chat_models import ChatOpenAI 13 | from langchain.memory import ConversationBufferMemory 14 | from langchain.schema import SystemMessage 15 | 16 | from .base import DatasetGenerator 17 | 18 | OPTIONS_CONFIG_KEYS = ["length", "temperature", "initial_utterance"] 19 | GENERATOR_CONFIG_KEYS = ["lengths", "temperatures", "initial_utterances"] 20 | 21 | 22 | @dataclass 23 | class ConversationsGeneratorConfig: 24 | openai_api_key: str 25 | """OpenAI API key.""" 26 | agent1: str 27 | """Description of the first agent used to construct its system message.""" 28 | agent2: str 29 | """Description of the second agent used to construct its system message.""" 30 | initial_utterances: List[str] = field(default_factory=lambda: ["Hello."]) 31 | """Utterances to be provisioned to the first agent.""" 32 | num_samples: int = 1 33 | """Number of conversations to generate for each options combination.""" 34 | interruption: str = "length" 35 | """Interruption mode.""" 36 | end_phrase: str = "Goodbye!" 37 | """Phrase to look for when checking whether to interrupt a conversation.""" 38 | end_agent: str = "both" 39 | """Agent whose messages to check for the interruption phrase.""" 40 | lengths: List[int] = field(default_factory=lambda: [5]) 41 | """Possible lengths of the conversations. If end_phrase interruption is enabled these will be used for maximum lengths.""" 42 | temperatures: List[float] = field(default_factory=lambda: [0]) 43 | """Possible temperatures for the backend LLM.""" 44 | options: List[Tuple[str, str]] = field(default_factory=lambda: []) 45 | """Additional options defined in the system prompts with curly brackets.""" 46 | model: str = "gpt-3.5-turbo" 47 | """Model to select for both agents""" 48 | model_agent_one: str = "gpt-3.5-turbo" 49 | """Model to select for agent1""" 50 | model_agent_two: str = "gpt-3.5-turbo" 51 | """Model to select for agent2""" 52 | 53 | 54 | class ConversationsGenerator(DatasetGenerator): 55 | """Generator producing conversations between two AI agents.""" 56 | 57 | config: ConversationsGeneratorConfig 58 | """Configuration for a ConversationsGenerator.""" 59 | 60 | def __init__(self, config: ConversationsGeneratorConfig) -> None: 61 | """Initialize ConversationsGenerator.""" 62 | super().__init__(config) 63 | 64 | def initialize_options_configs( 65 | self, 66 | options_config_keys: List[str] = OPTIONS_CONFIG_KEYS, 67 | generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS 68 | ) -> None: 69 | """Prepare options combinations.""" 70 | super().initialize_options_configs(options_config_keys, generator_config_keys) 71 | 72 | def initialize_chain( 73 | self, 74 | agent: str, 75 | system_prompt: str, 76 | conversation_config: Dict[str, Any] 77 | ) -> Tuple[ConversationChain, str]: 78 | """Initialize a conversation and return a chain and a formatted system prompt.""" 79 | if self.config.interruption == "end_phrase": 80 | if self.config.end_agent == agent or self.config.end_agent == "both": 81 | system_prompt += f" When the whole conversation is over end with \"{self.config.end_phrase}\"." 82 | 83 | system_template = SystemMessagePromptTemplate.from_template( 84 | system_prompt) 85 | template_params = {key: conversation_config[key] 86 | for key in system_template.input_variables} 87 | system_message = system_template.format(**template_params).content 88 | 89 | prompt = ChatPromptTemplate.from_messages([ 90 | SystemMessage(content=system_message), 91 | MessagesPlaceholder(variable_name="history"), 92 | HumanMessagePromptTemplate.from_template("{input}") 93 | ]) 94 | 95 | # Select model for each agent. Only if specific model for both agents is provided, value will be used. 96 | model_for_llm = self.config.model 97 | if(self.config.model_agent_one and self.config.model_agent_one): 98 | if(agent == "agent1"): 99 | model_for_llm = self.config.model_agent_one 100 | elif(agent == "agent2"): 101 | model_for_llm = self.config.model_agent_two 102 | 103 | memory = ConversationBufferMemory(return_messages=True) 104 | llm = ChatOpenAI(temperature=conversation_config["temperature"], 105 | openai_api_key=self.config.openai_api_key, model=model_for_llm) 106 | chain = ConversationChain(memory=memory, prompt=prompt, llm=llm) 107 | 108 | return chain, system_message 109 | 110 | def end_phrase_interruption(self, agent: str, message: str) -> bool: 111 | """Check whether to interrupt conversation generation.""" 112 | if self.config.interruption == "end_phrase": 113 | if self.config.end_agent == agent or self.config.end_agent == "both": 114 | if self.config.end_phrase in message: 115 | return True 116 | 117 | return False 118 | 119 | def generate_item(self) -> Dict[str, Union[List[List[Any]], float, int]]: 120 | """Run two chains to talk with one another and record the chat history.""" 121 | if self.generator_index >= len(self.options_configs): 122 | raise StopIteration() 123 | 124 | conversation_config = self.options_configs[self.generator_index] 125 | self.generator_index += 1 126 | 127 | chain1, system_prompt1 = self.initialize_chain("agent1", 128 | self.config.agent1, 129 | conversation_config) 130 | 131 | chain2, system_prompt2 = self.initialize_chain("agent2", 132 | self.config.agent2, 133 | conversation_config) 134 | 135 | utterances = [] 136 | 137 | chain1_inp = conversation_config["initial_utterance"] 138 | for _ in range(conversation_config["length"]): 139 | chain1_out = chain1.predict(input=chain1_inp) 140 | utterances.append(["agent1", chain1_out]) 141 | 142 | if self.end_phrase_interruption("agent1", chain1_out): 143 | break 144 | 145 | chain2_out = chain2.predict(input=chain1_out) 146 | utterances.append(["agent2", chain2_out]) 147 | 148 | if self.end_phrase_interruption("agent2", chain2_out): 149 | break 150 | 151 | chain1_inp = chain2_out 152 | 153 | return {**conversation_config, 154 | "agent1": system_prompt1, 155 | "agent2": system_prompt2, 156 | "utterances": utterances} 157 | -------------------------------------------------------------------------------- /src/datasetGPT/outputs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from uuid import uuid4 5 | from typing import Dict, Any, List 6 | 7 | 8 | class DatasetWriter: 9 | """Handle outputting dataset items.""" 10 | 11 | single_file: bool 12 | """Whether to save all dataset items in a single file.""" 13 | path: str 14 | """Path of the output file or directory.""" 15 | dataset_items: List[Dict[str, Any]] 16 | """Collection of all the items in the current dataset.""" 17 | 18 | def __init__(self, path: str = None, single_file: bool = False) -> None: 19 | """Initialize DatasetWriter.""" 20 | if path == None and single_file: 21 | path = self.get_unique_filename(os.getcwd()) 22 | elif path == None and not single_file: 23 | path = self.get_unique_dirname(os.getcwd()) 24 | elif os.path.isdir(path) and single_file: 25 | path = self.get_unique_filename(path) 26 | elif os.path.isfile(path) and not single_file: 27 | raise ValueError( 28 | "Cannot write to a file with the single_file mode disabled. Try setting --single-file.") 29 | 30 | self.single_file = single_file 31 | self.path = path 32 | self.dataset_items = [] 33 | 34 | def get_unique_dirname(self, base_path): 35 | """Get a unique dirname.""" 36 | return os.path.join(base_path, str(uuid4())) 37 | 38 | def get_unique_filename(self, base_path): 39 | """Get a unique filename.""" 40 | return os.path.join(base_path, f"{uuid4()}.json") 41 | 42 | def save_intermediate_result(self, result: Dict[str, Any]): 43 | """Either save an item to its own file or concatenate it with all dataset items in a single file.""" 44 | if self.single_file: 45 | self.dataset_items.append(result) 46 | 47 | current_directory = os.path.dirname(self.path) 48 | if current_directory != "" and current_directory != ".": 49 | os.makedirs(current_directory, exist_ok=True) 50 | 51 | with open(self.path, "w") as output_file: 52 | json.dump(self.dataset_items, output_file) 53 | else: 54 | current_filepath = self.get_unique_filename(self.path) 55 | 56 | os.makedirs(self.path, exist_ok=True) 57 | with open(current_filepath, "w") as output_file: 58 | json.dump(result, output_file) 59 | -------------------------------------------------------------------------------- /src/datasetGPT/texts.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Any, Dict, Tuple, Union 3 | 4 | from langchain.prompts import PromptTemplate 5 | from langchain.llms import BaseLLM 6 | from langchain.chains import LLMChain 7 | 8 | from .base import DatasetGenerator 9 | 10 | OPTIONS_CONFIG_KEYS = ["backend", "max_length", "temperature"] 11 | GENERATOR_CONFIG_KEYS = ["backends", "max_lengths", "temperatures"] 12 | 13 | 14 | @dataclass 15 | class TextsGeneratorConfig: 16 | prompt: str 17 | """Text prompt.""" 18 | backends: List[Tuple[str, str, str]] 19 | """LLM APIs to use as backends.""" 20 | num_samples: int = 1 21 | """Number of texts to generate for each options combination.""" 22 | max_lengths: List[int] = field(default_factory=lambda: [5]) 23 | """Maximum lengths in tokens for the output of each generation.""" 24 | temperatures: List[float] = field(default_factory=lambda: [0]) 25 | """Possible temperatures for the backend LLM.""" 26 | options: List[Tuple[str, str]] = field(default_factory=lambda: []) 27 | """Additional options defined in the system prompts with curly brackets.""" 28 | 29 | 30 | class TextsGenerator(DatasetGenerator): 31 | """Generator producing texts by varying model parameters and prompt options.""" 32 | 33 | config: TextsGeneratorConfig 34 | """Configuration for a TextsGenerator.""" 35 | 36 | def __init__(self, config: TextsGeneratorConfig) -> None: 37 | """Initialize TextsGenerator.""" 38 | super().__init__(config) 39 | 40 | def initialize_options_configs( 41 | self, 42 | options_config_keys: List[str] = OPTIONS_CONFIG_KEYS, 43 | generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS 44 | ) -> None: 45 | """Prepare options combinations.""" 46 | super().initialize_options_configs(options_config_keys, generator_config_keys) 47 | 48 | def initialize_backend(self, text_config: Dict[str, Any]) -> BaseLLM: 49 | """Initialize a specific LLM.""" 50 | backend_str = text_config["backend"] 51 | temperature = text_config["temperature"] 52 | max_length = text_config["max_length"] 53 | 54 | backend, model = backend_str.split("|") 55 | 56 | if backend.lower() == "openai": 57 | from langchain.llms import OpenAI 58 | llm = OpenAI(model_name=model, 59 | temperature=temperature, 60 | max_tokens=max_length) 61 | elif backend.lower() == "cohere": 62 | from langchain.llms import Cohere 63 | llm = Cohere(model=model, 64 | temperature=temperature, 65 | max_tokens=max_length) 66 | elif backend.lower() == "petals": 67 | from langchain.llms import Petals 68 | llm = Petals(model_name=model, 69 | temperature=temperature, 70 | max_new_tokens=max_length) 71 | else: 72 | raise ValueError("Cannot use the specified backend.") 73 | 74 | return llm 75 | 76 | def generate_item(self) -> Dict[str, Union[List[List[Any]], float, int]]: 77 | """Produce text with a LLM Chain.""" 78 | if self.generator_index >= len(self.options_configs): 79 | raise StopIteration() 80 | 81 | text_config = self.options_configs[self.generator_index] 82 | self.generator_index += 1 83 | 84 | input_variables = text_config.keys() - ["sample_id", 85 | "backend", 86 | "temperature", 87 | "max_length"] 88 | 89 | prompt_template = PromptTemplate(template=self.config.prompt, 90 | input_variables=input_variables) 91 | 92 | llm = self.initialize_backend(text_config) 93 | 94 | prompt_params = {k: text_config[k] for k in input_variables} 95 | input_prompt = prompt_template.format(**prompt_params) 96 | 97 | chain = LLMChain(prompt=prompt_template, llm=llm) 98 | output = chain.predict(**prompt_params) 99 | 100 | return {**text_config, 101 | "prompt": input_prompt, 102 | "output": output} 103 | --------------------------------------------------------------------------------