├── .gitignore
├── README.md
├── pyproject.toml
├── setup.py
└── src
    └── datasetGPT
        ├── __init__.py
        ├── __main__.py
        ├── base.py
        ├── cli.py
        ├── conversations.py
        ├── outputs.py
        └── texts.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # macOS local attributes
156 | .DS_Store
157 | 
158 | # Local experimentation
159 | lab
160 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # datasetGPT
  2 | 
  3 | `datasetGPT` is a command-line interface and a Python library for inferencing Large Language Models to generate textual datasets.
  4 | 
  5 | Possible use cases may include:
  6 | 
  7 | - Constructing textual corpora to train/fine-tune detectors for content written by AI.
  8 | - Collecting datasets of LLM-produced conversations for research purposes, analysis of AI performance/impact/ethics, etc.
  9 | - Automating a task that a LLM can handle over big amounts of input texts. For example, using GPT-3 to summarize 1000 paragraphs with a single CLI command.
 10 | - Leveraging APIs of especially big LLMs to produce diverse texts for a specific task and then fine-tune a smaller model with them.
 11 | 
 12 | > This tool is distributed freely and doesn't imply any restrictions on the downstream use cases.
 13 | > However, you should make sure to follow the **Terms of use** of the backend APIs (OpenAI, Cohere, Petals, etc.) in your specific context.
 14 | 
 15 | ## Installation
 16 | 
 17 | ```
 18 | pip install datasetGPT
 19 | ```
 20 | 
 21 | Most of the generation features rely on third-party APIs. Install their respective packages:
 22 | 
 23 | ```
 24 | pip install openai cohere petals
 25 | ```
 26 | 
 27 | ## Usage examples
 28 | 
 29 | ### Inference LLMs at scale
 30 | 
 31 | ```bash
 32 | export OPENAI_API_KEY="..."
 33 | export COHERE_API_KEY="..."
 34 | 
 35 | datasetGPT texts \
 36 |     --prompt "If {country} was a planet in the Star Wars universe it would be called" \
 37 |     --backend "openai|text-davinci-003" \
 38 |     --backend "cohere|medium" \
 39 |     --temperature 0.9 \
 40 |     --option country Germany \
 41 |     --option country France \
 42 |     --max-length 50 \
 43 |     --num-samples 1 \
 44 |     --single-file
 45 | ```
 46 | 
 47 | The command above should produce a dataset file with 4 texts. Each possible combination of options is used for each of the backend LLMs. Check out the [CLI reference](#cli-reference) for more details. A dataset file looks like this:
 48 | 
 49 | ```json
 50 | [
 51 |     {
 52 |         "sample_id": 0,
 53 |         "backend": "openai|text-davinci-003",
 54 |         "max_length": 50,
 55 |         "temperature": 0.9,
 56 |         "country": "Germany",
 57 |         "prompt": "If Germany was a planet in the Star Wars universe it would be called",
 58 |         "output": " Euron. The planet would be home to a powerful and diverse species of aliens, known as the Eurons, that have evolved to a higher level of understanding and technological advancement compared to many of the other planets in the galaxy. The planet would be"
 59 |     },
 60 |     {
 61 |         "sample_id": 0,
 62 |         "backend": "openai|text-davinci-003",
 63 |         "max_length": 50,
 64 |         "temperature": 0.9,
 65 |         "country": "France",
 66 |         "prompt": "If France was a planet in the Star Wars universe it would be called",
 67 |         "output": " The Empire of Liberty. It would be a peaceful, democratic planet with a strong sense of justice and equality. The planet would be home to many different species of aliens but the majority of its population would be humans. It would have a strong military and"
 68 |     },
 69 |     {
 70 |         "sample_id": 0,
 71 |         "backend": "cohere|medium",
 72 |         "max_length": 50,
 73 |         "temperature": 0.9,
 74 |         "country": "Germany",
 75 |         "prompt": "If Germany was a planet in the Star Wars universe it would be called",
 76 |         "output": " the Hoth of the universe.\nAfter the Soviet invasion of Eastern Europe and the subsequent Western anti-Soviet sentiment, Germany's arms manufacturers went into hyperdrive and the country churned out guns at a frightening pace. By the early 1930"
 77 |     },
 78 |     ...
 79 | ]
 80 | ```
 81 | 
 82 | Alternatively, you can use our [`TextsGenerator`](https://github.com/radi-cho/datasetGPT/blob/main/datasetGPT/texts.py#L27) to produce texts in Python.
 83 | 
 84 | ### Generate conversations with the ChatGPT API
 85 | 
 86 | ```bash
 87 | export OPENAI_API_KEY="..."
 88 | datasetGPT conversations \
 89 |     --length 4 \
 90 |     --length 5 \
 91 |     --agent1 "You're a shop assistant in a pet store. Answer to customer questions politely." \
 92 |     --agent2 "You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store." \
 93 |     --temperature 0.1 \
 94 |     --temperature 0.2 \
 95 |     --option n 2 \
 96 |     --option n 3 \
 97 |     --path dataset
 98 | ```
 99 | 
100 | The command above should produce a dataset directory with 16 conversations saved as JSON files. You can specify if you want all of them to be saved in a single file. But why 16? Because `num_samples` dialogues are generated for each possible combination of parameters (conversation length, LLM temperature, and custom prompt options). A dataset item looks like this:
101 | 
102 | ```json
103 | {
104 |     "length": 5,
105 |     "temperature": 0.1,
106 |     "n": "2",
107 |     "agent1": "You're a shop assistant in a pet store. Answer to customer questions politely. When the whole conversation is over end with \"Goodbye\".",
108 |     "agent2": "You're a customer in a pet store. You should behave like a human. You want to buy 2 pets. Ask questions about the pets in the store. When the whole conversation is over end with \"Goodbye\".",
109 |     "utterances": [
110 |         [
111 |             "agent1",
112 |             "Hello! How can I assist you today?"
113 |         ],
114 |         [
115 |             "agent2",
116 |             "Hi! I'm interested in buying two pets. Can you tell me what kind of pets you have available in the store?"
117 |         ],
118 |         [
119 |             "agent1",
120 |             "Certainly! We have a variety of pets available, including dogs, cats, birds, fish, hamsters, guinea pigs, rabbits, and reptiles. Is there a specific type of pet you're interested in?"
121 |         ],
122 |         [
123 |             "agent2",
124 |             "I'm not sure yet. Can you tell me more about the dogs and cats you have available? What breeds do you have?"
125 |         ],
126 |         ...
127 |     ]
128 | }
129 | ```
130 | 
131 | You can also use our [`ConversationsGenerator`](https://github.com/radi-cho/datasetGPT/blob/main/datasetGPT/conversations.py#L43) to produce texts with the `gpt-3.5-turbo` API programatically.
132 | 
133 | ```python
134 | from datasetGPT import ConversationsGenerator, ConversationsGeneratorConfig, DatasetWriter
135 | 
136 | dataset_writer = DatasetWriter() # single_file=True
137 | 
138 | generator_config = ConversationsGeneratorConfig(agent1="You're a shop assistant in a pet store. Answer to customer questions politely.",
139 |                                                 agent2="You're a customer in a pet store. You should behave like a human. You want to buy {n} pets. Ask questions about the pets in the store.",
140 |                                                 num_samples=2,
141 |                                                 interruption="length",
142 |                                                 lengths=[4, 5],
143 |                                                 temperatures=[0.1, 0.2],
144 |                                                 options=[("n", "2"), ("n", "3")],
145 |                                                 model="gpt-3.5-turbo", # or model="gpt-4"
146 |                                                 # Specific models per agent can be set. If one of the parameters is set, the other must also be provided, otherwise both agents use the "model" value.
147 |                                                 model_agent_one="gpt-3.5-turbo",
148 |                                                 model_agent_two="gpt-4"
149 | 
150 |                                              )
151 | 
152 | conversations_generator = ConversationsGenerator(generator_config)
153 | 
154 | for conversation in conversations_generator:
155 |     dataset_writer.save_intermediate_result(conversation)
156 | ```
157 | 
158 | ## Contributing
159 | 
160 | > Still under active development.
161 | 
162 | Contributions will be highly appreciated. Currently these features are under development:
163 | 
164 | - [x] `datasetGPT conversations` - Make two ChatGPT agents talk with one another and record the conversation history.
165 | - [x] `datasetGPT texts` - Inference different LLMs with a given input prompt and generate multiple outputs by varying parameters.
166 | - [ ] `datasetGPT transformations` - Apply a list of transformations to a list of texts. For example, summarizing a list of texts for a {child | university student | PhD candidate} to understand.
167 | - [ ] Support more backend LLMs.
168 | 
169 | To set up a local development environment:
170 | 
171 | ```bash
172 | git clone https://github.com/radi-cho/datasetGPT/
173 | cd datasetGPT
174 | pip install -e .
175 | ```
176 | 
177 | ## CLI Reference
178 | 
179 | ```
180 | datasetGPT [OPTIONS] COMMAND [ARGS]...
181 | 
182 |   Command line interface that generates datasets with LLMs.
183 | 
184 | Options:
185 |   --help  Show this message and exit.
186 | 
187 | Commands:
188 |   conversations  Produce conversations between two gpt-3.5-turbo agents...
189 |   texts          Inference multiple LLMs at scale.
190 | ```
191 | 
192 | ```
193 | datasetGPT texts [OPTIONS]
194 | 
195 |   Inference multiple LLMs at scale.
196 | 
197 | Options:
198 |   -p, --prompt TEXT            Input prompt.  [required]
199 |   -b, --backend TEXT           LLM APIs to use as backends. Use
200 |                                "backend|model_name" notation. For example:
201 |                                "openai|text-davinci-003".
202 |   -l, --max-length INTEGER     Maximum number of tokens to generate for each
203 |                                prompt.
204 |   -t, --temperature FLOAT      Possible temperature values for the backend
205 |                                language model.
206 |   -n, --num-samples INTEGER    Number of conversations for each configuration.
207 |   -o, --option <TEXT TEXT>...  Values for additional options denoted in your
208 |                                prompts by {OPTION_NAME}.
209 |   -f, --path PATH              Where to save the dataset. Either a file or a
210 |                                directory (folder).
211 |   -s, --single-file            Either save the whole dataset to a single file
212 |                                or create multiple files.
213 |   --help                       Show this message and exit.
214 | ```
215 | 
216 | - You can specify multiple variants for the following options: `--length`, `--temperature`, `--num-samples`, `--option`. A dataset item will be generated for each possible combination of the supplied values.
217 | - Each `--option` provided must be formatted as follows: `--option option_name "Some option value"`.
218 | - Currently supported backends: GPT-3 and GPT-4 model variants by [OpenAI](https://openai.com/blog/openai-api), the language models by [Cohere](https://pypi.org/project/cohere/), BLOOM through the [Petals API](https://petals.ml/).
219 | 
220 | ```
221 | datasetGPT conversations [OPTIONS]
222 | 
223 |   Produce conversations between two gpt-3.5-turbo agents with given roles.
224 | 
225 | Options:
226 |   -k, --openai-api-key TEXT       OpenAI API key.
227 |   -a, --agent1 TEXT               Agent role description.  [required]
228 |   -b, --agent2 TEXT               Agent role description.  [required]
229 |   -i, --interruption [length|end_phrase]
230 |                                   Interruption mode.
231 |   -e, --end-phrase TEXT           Interrupt after this phrase is outputted by
232 |                                   one of the agents.
233 |   -d, --end-agent [agent1|agent2|both]
234 |                                   In which agent's messages to look for the
235 |                                   end phrase.
236 |   -l, --length INTEGER            Maximum number of utterances for each agent.
237 |                                   A conversation sample will be generated for
238 |                                   each length.
239 |   -t, --temperature FLOAT         Possible temperature values for the backend
240 |                                   language model.
241 |   -m, --model [gpt-3.5-turbo|gpt-4]
242 |                                   OpenAI Chat model to use. GPT-4 is only
243 |                                   supported if provided API key has access to
244 |                                   GPT4. Defaults to GPT-3.5-Turbo.
245 |   -m1, --model-agent1 [gpt-3.5-turbo|gpt-4]
246 |                                   OpenAI Chat model to use for agent1. GPT-4
247 |                                   is only supported if given API key has
248 |                                   access to GPT4. Defaults to GPT-3.5-Turbo.
249 |                                   If set, --model-agent2 must also be
250 |                                   provided, otherwise --model value will be
251 |                                   used.
252 |   -m2, --model-agent2 [gpt-3.5-turbo|gpt-4]
253 |                                   OpenAI Chat model to use for agent2. GPT-4
254 |                                   is only supported if given API key has
255 |                                   access to GPT4. Defaults to GPT-3.5-Turbo If
256 |                                   set, --model-agent1 must also be provided,
257 |                                   otherwise --model value will be used.
258 |   -n, --num-samples INTEGER       Number of conversations for each
259 |                                   configuration.
260 |   -o, --option <TEXT TEXT>...     Values for additional options denoted in
261 |                                   your prompts by {OPTION_NAME}.
262 |   -f, --path PATH                 Where to save the dataset. Either a file or
263 |                                   a directory (folder).
264 |   -s, --single-file               Either save the whole dataset to a single
265 |                                   file or create multiple files.
266 |   --help                          Show this message and exit.
267 | ```
268 | 
269 | - The length parameter specifies how many utterances each agent should make. A length of 4 typically produces 8 utterances in total.
270 | - You can specify either `length` (default) or `end_phrase` as an interruption strategy. When using `end_phrase` a conversation will be interrupted once the `--end-phrase` has appeared in the messages of the `--end-agent` (could be both). In this case, the lengths provided will be treated as maximum conversation lengths.
271 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ['setuptools>=61.0']
3 | build-backend = 'setuptools.build_meta'


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | from setuptools import find_packages
 3 | 
 4 | with open("README.md", "r", encoding = "utf-8") as readme:
 5 |     long_description = readme.read()
 6 | 
 7 | setup(
 8 |     name="datasetGPT",
 9 |     version="0.0.4",
10 |     description="Generate textual and conversational datasets with LLMs.",
11 |     long_description = long_description,
12 |     long_description_content_type = "text/markdown",
13 |     author="Radostin Cholakov",
14 |     author_email="radicho123@gmail.com",
15 |     url="https://github.com/radi-cho/datasetGPT",
16 |     # download_url="https://github.com/radi-cho/datasetGPT/archive/v0.0.1.tar.gz",
17 |     keywords=["dataset", "llm", "langchain", "openai"],
18 |     package_dir={"": "src"},
19 |     packages = find_packages(where="src"),
20 |     install_requires=[
21 |         "langchain>=0.0.113",
22 |         "click>=8.1"
23 |     ],
24 |     entry_points={
25 |         "console_scripts": [
26 |             "datasetGPT=datasetGPT:datasetGPT"
27 |         ],
28 |     },
29 | )
30 | 


--------------------------------------------------------------------------------
/src/datasetGPT/__init__.py:
--------------------------------------------------------------------------------
1 | from .cli import datasetGPT
2 | from .conversations import *
3 | from .texts import *
4 | from .outputs import *
5 | 


--------------------------------------------------------------------------------
/src/datasetGPT/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import main
2 | 
3 | main()
4 | 


--------------------------------------------------------------------------------
/src/datasetGPT/base.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | from typing import List, Any, Dict, Tuple, Generator, Iterator, Protocol
 3 | 
 4 | OPTIONS_CONFIG_KEYS = ["temperature"]
 5 | GENERATOR_CONFIG_KEYS =  ["temperatures"]
 6 | 
 7 | 
 8 | class DatasetGeneratorConfig(Protocol):
 9 |     """Base generator configuration protocol."""
10 |     openai_api_key: str
11 |     """OpenAI API key."""
12 |     num_samples: int
13 |     """Number of texts to generate for each options combination."""
14 |     options: List[Tuple[str, str]]
15 |     """Additional options defined in the text prompt with curly brackets."""
16 | 
17 | 
18 | class DatasetGenerator:
19 |     """Abstraction of a dataset generator."""
20 | 
21 |     config: DatasetGeneratorConfig
22 |     """Generator configuration."""
23 |     options_configs: List[Dict[str, Any]]
24 |     """Possible combinations of the provided options."""
25 |     generator_index: int = 0
26 |     """Index of the next item to be returned by the generator."""
27 | 
28 |     def __init__(self, config: DatasetGeneratorConfig) -> None:
29 |         self.config = config
30 |         self.initialize_options_configs()
31 | 
32 |     def initialize_options_configs(
33 |         self,
34 |         options_config_keys: List[str] = OPTIONS_CONFIG_KEYS,
35 |         generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS
36 |     ) -> None:
37 |         """Prepare options combinations."""
38 |         options_keys = ["sample_id", *options_config_keys]
39 |         options_values = [range(self.config.num_samples)]
40 |         options_values += [getattr(self.config, key) for key in generator_config_keys]
41 | 
42 |         for option in self.config.options:
43 |             if option[0] not in options_keys:
44 |                 options_keys.append(option[0])
45 |                 options_values.append([option[1]])
46 |             else:
47 |                 index = options_keys.index(option[0])
48 |                 if option[1] not in options_values[index]:
49 |                     options_values[index].append(option[1])
50 | 
51 |         self.options_configs = list(map(lambda x: dict(zip(options_keys, x)),
52 |                                         itertools.product(*options_values)))
53 | 
54 |     def generate_item(self) -> Dict[str, Any]:
55 |         """Produce a data item."""
56 |         return {}
57 | 
58 |     def __next__(self) -> Generator[Dict[str, Any], None, None]:
59 |         return self.generate_item()
60 | 
61 |     def __iter__(self) -> Iterator:
62 |         return self
63 | 


--------------------------------------------------------------------------------
/src/datasetGPT/cli.py:
--------------------------------------------------------------------------------
  1 | import click
  2 | from typing import List, Tuple
  3 | 
  4 | from .conversations import ConversationsGeneratorConfig, ConversationsGenerator
  5 | from .texts import TextsGeneratorConfig, TextsGenerator
  6 | from .outputs import DatasetWriter
  7 | 
  8 | 
  9 | @click.group()
 10 | def datasetGPT() -> None:
 11 |     """Command line interface that generates datasets with LLMs."""
 12 |     pass
 13 | 
 14 | 
 15 | click_options = click.option("--option",
 16 |                              "-o",
 17 |                              "options",
 18 |                              type=(str, str),
 19 |                              multiple=True,
 20 |                              help="Values for additional options denoted in your prompts by {OPTION_NAME}.")
 21 | 
 22 | click_path = click.option("--path",
 23 |                           "-f",
 24 |                           "path",
 25 |                           type=click.Path(),
 26 |                           help="Where to save the dataset. Either a file or a directory (folder).")
 27 | 
 28 | click_single_file = click.option("--single-file",
 29 |                                  "-s",
 30 |                                  "single_file",
 31 |                                  type=bool,
 32 |                                  is_flag=True,
 33 |                                  help="Either save the whole dataset to a single file or create multiple files.")
 34 | 
 35 | click_num_samples = click.option("--num-samples",
 36 |                                  "-n",
 37 |                                  "num_samples",
 38 |                                  type=int,
 39 |                                  default=1,
 40 |                                  help="Number of conversations for each configuration.")
 41 | 
 42 | click_temperatures = click.option("--temperature",
 43 |                                   "-t",
 44 |                                   "temperatures",
 45 |                                   type=float,
 46 |                                   multiple=True,
 47 |                                   default=[0.5],
 48 |                                   help="Possible temperature values for the backend language model.")
 49 | 
 50 | 
 51 | @click.command()
 52 | @click.option("--openai-api-key",
 53 |               "-k",
 54 |               "openai_api_key",
 55 |               type=str,
 56 |               envvar="OPENAI_API_KEY",
 57 |               help="OpenAI API key.")
 58 | @click.option("--agent1",
 59 |               "-a",
 60 |               "agent1",
 61 |               type=str,
 62 |               required=True,
 63 |               help="Agent role description.")
 64 | @click.option("--agent2",
 65 |               "-b",
 66 |               "agent2",
 67 |               type=str,
 68 |               required=True,
 69 |               help="Agent role description.")
 70 | @click.option("--initial-utterance",
 71 |               "-u",
 72 |               "initial_utterances",
 73 |               type=str,
 74 |               default=["Hello."],
 75 |               multiple=True,
 76 |               help="Utterance to be provisioned to the first agent. For many use cases a \"Hello\" is enough.")
 77 | @click.option("--interruption",
 78 |               "-i",
 79 |               "interruption",
 80 |               type=click.Choice(["length", "end_phrase"]),
 81 |               default="length",
 82 |               help="Interruption mode.")
 83 | @click.option("--end-phrase",
 84 |               "-e",
 85 |               "end_phrase",
 86 |               type=str,
 87 |               default="Goodbye",
 88 |               help="Interrupt after this phrase is outputted by one of the agents.")
 89 | @click.option("--end-agent",
 90 |               "-d",
 91 |               "end_agent",
 92 |               type=click.Choice(["agent1", "agent2", "both"]),
 93 |               default="both",
 94 |               help="In which agent's messages to look for the end phrase.")
 95 | @click.option("--length",
 96 |               "-l",
 97 |               "lengths",
 98 |               type=int,
 99 |               multiple=True,
100 |               default=[5],
101 |               help="Maximum number of utterances for each agent. A conversation sample will be generated for each length.")
102 | @click.option("--model",
103 |               "-m",
104 |               "model",
105 |               type=click.Choice(["gpt-3.5-turbo", "gpt-4"]),
106 |               multiple=False,
107 |               default="gpt-3.5-turbo",
108 |               help="OpenAI Chat model to use. GPT-4 is only supported if given API key has access to GPT4. Defaults to GPT-3.5-Turbo.")
109 | @click.option("--model-agent1",
110 |               "-m1",
111 |               "model_agent_one",
112 |               type=click.Choice(["gpt-3.5-turbo", "gpt-4"]),
113 |               multiple=False,
114 |               help="OpenAI Chat model to use for agent1. GPT-4 is only supported if given API key has access to GPT4. Defaults to GPT-3.5-Turbo. If set, --model-agent2 must also be provided, otherwise --model value will be used.")
115 | @click.option("--model-agent2",
116 |               "-m2",
117 |               "model_agent_two",
118 |               type=click.Choice(["gpt-3.5-turbo", "gpt-4"]),
119 |               multiple=False,
120 |               help="OpenAI Chat model to use for agent2. GPT-4 is only supported if given API key has access to GPT4. Defaults to GPT-3.5-Turbo If set, --model-agent1 must also be provided, otherwise --model value will be used.")
121 | @click_temperatures
122 | @click_num_samples
123 | @click_options
124 | @click_path
125 | @click_single_file
126 | def conversations(
127 |     openai_api_key: str,
128 |     agent1: str,
129 |     agent2: str,
130 |     initial_utterances: List[str],
131 |     num_samples: int,
132 |     interruption: str,
133 |     end_phrase: str,
134 |     end_agent: str,
135 |     lengths: List[int],
136 |     temperatures: List[int],
137 |     options: List[Tuple[str, str]],
138 |     path: str,
139 |     single_file: bool,
140 |     model: str,
141 |     model_agent_one: str,
142 |     model_agent_two: str
143 | ) -> None:
144 |     """Produce conversations between two gpt-3.5-turbo agents with given roles."""
145 |     dataset_writer = DatasetWriter(path, single_file)
146 | 
147 |     generator_config = ConversationsGeneratorConfig(openai_api_key=openai_api_key,
148 |                                                     agent1=agent1,
149 |                                                     agent2=agent2,
150 |                                                     initial_utterances=initial_utterances,
151 |                                                     num_samples=num_samples,
152 |                                                     interruption=interruption,
153 |                                                     end_phrase=end_phrase,
154 |                                                     end_agent=end_agent,
155 |                                                     lengths=lengths,
156 |                                                     temperatures=temperatures,
157 |                                                     options=options,
158 |                                                     model=model,
159 |                                                     model_agent_one=model_agent_one,
160 |                                                     model_agent_two=model_agent_two)
161 | 
162 |     conversations_generator = ConversationsGenerator(generator_config)
163 | 
164 |     for conversation in conversations_generator:
165 |         dataset_writer.save_intermediate_result(conversation)
166 | 
167 | 
168 | @click.command()
169 | @click.option("--prompt",
170 |               "-p",
171 |               "prompt",
172 |               type=str,
173 |               required=True,
174 |               help="Input prompt.")
175 | @click.option("--backend",
176 |               "-b",
177 |               "backends",
178 |               type=str,
179 |               multiple=True,
180 |               default=["openai|text-davinci-003"],
181 |               help="LLM APIs to use as backends. Use \"backend|model_name\" notation. For example: \"openai|text-davinci-003\".")
182 | @click.option("--max-length",
183 |               "-l",
184 |               "max_lengths",
185 |               type=int,
186 |               multiple=True,
187 |               default=[100],
188 |               help="Maximum number of tokens to generate for each prompt.")
189 | @click_temperatures
190 | @click_num_samples
191 | @click_options
192 | @click_path
193 | @click_single_file
194 | def texts(
195 |     prompt: str,
196 |     num_samples: int,
197 |     max_lengths: List[int],
198 |     temperatures: List[int],
199 |     backends: List[str],
200 |     options: List[Tuple[str, str]],
201 |     path: str,
202 |     single_file: bool
203 | ) -> None:
204 |     """Inference multiple LLMs at scale."""
205 |     dataset_writer = DatasetWriter(path, single_file)
206 | 
207 |     generator_config = TextsGeneratorConfig(prompt=prompt,
208 |                                             backends=backends,
209 |                                             num_samples=num_samples,
210 |                                             max_lengths=max_lengths,
211 |                                             temperatures=temperatures,
212 |                                             options=options)
213 | 
214 |     texts_generator = TextsGenerator(generator_config)
215 | 
216 |     for text_object in texts_generator:
217 |         dataset_writer.save_intermediate_result(text_object)
218 | 
219 | 
220 | datasetGPT.add_command(texts)
221 | datasetGPT.add_command(conversations)
222 | 
223 | 
224 | def main() -> None:
225 |     """Run the datasetGPT CLI."""
226 |     datasetGPT()
227 | 


--------------------------------------------------------------------------------
/src/datasetGPT/conversations.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass, field
  2 | from typing import List, Any, Dict, Tuple, Union
  3 | 
  4 | from langchain.prompts import (
  5 |     ChatPromptTemplate,
  6 |     MessagesPlaceholder,
  7 |     SystemMessagePromptTemplate,
  8 |     HumanMessagePromptTemplate
  9 | )
 10 | 
 11 | from langchain.chains import ConversationChain
 12 | from langchain.chat_models import ChatOpenAI
 13 | from langchain.memory import ConversationBufferMemory
 14 | from langchain.schema import SystemMessage
 15 | 
 16 | from .base import DatasetGenerator
 17 | 
 18 | OPTIONS_CONFIG_KEYS = ["length", "temperature", "initial_utterance"]
 19 | GENERATOR_CONFIG_KEYS = ["lengths", "temperatures", "initial_utterances"]
 20 | 
 21 | 
 22 | @dataclass
 23 | class ConversationsGeneratorConfig:
 24 |     openai_api_key: str
 25 |     """OpenAI API key."""
 26 |     agent1: str
 27 |     """Description of the first agent used to construct its system message."""
 28 |     agent2: str
 29 |     """Description of the second agent used to construct its system message."""
 30 |     initial_utterances: List[str] = field(default_factory=lambda: ["Hello."])
 31 |     """Utterances to be provisioned to the first agent."""
 32 |     num_samples: int = 1
 33 |     """Number of conversations to generate for each options combination."""
 34 |     interruption: str = "length"
 35 |     """Interruption mode."""
 36 |     end_phrase: str = "Goodbye!"
 37 |     """Phrase to look for when checking whether to interrupt a conversation."""
 38 |     end_agent: str = "both"
 39 |     """Agent whose messages to check for the interruption phrase."""
 40 |     lengths: List[int] = field(default_factory=lambda: [5])
 41 |     """Possible lengths of the conversations. If end_phrase interruption is enabled these will be used for maximum lengths."""
 42 |     temperatures: List[float] = field(default_factory=lambda: [0])
 43 |     """Possible temperatures for the backend LLM."""
 44 |     options: List[Tuple[str, str]] = field(default_factory=lambda: [])
 45 |     """Additional options defined in the system prompts with curly brackets."""
 46 |     model: str = "gpt-3.5-turbo"
 47 |     """Model to select for both agents"""
 48 |     model_agent_one: str = "gpt-3.5-turbo"
 49 |     """Model to select for agent1"""
 50 |     model_agent_two: str = "gpt-3.5-turbo"
 51 |     """Model to select for agent2"""
 52 | 
 53 | 
 54 | class ConversationsGenerator(DatasetGenerator):
 55 |     """Generator producing conversations between two AI agents."""
 56 | 
 57 |     config: ConversationsGeneratorConfig
 58 |     """Configuration for a ConversationsGenerator."""
 59 | 
 60 |     def __init__(self, config: ConversationsGeneratorConfig) -> None:
 61 |         """Initialize ConversationsGenerator."""
 62 |         super().__init__(config)
 63 | 
 64 |     def initialize_options_configs(
 65 |         self,
 66 |         options_config_keys: List[str] = OPTIONS_CONFIG_KEYS,
 67 |         generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS
 68 |     ) -> None:
 69 |         """Prepare options combinations."""
 70 |         super().initialize_options_configs(options_config_keys, generator_config_keys)
 71 | 
 72 |     def initialize_chain(
 73 |         self,
 74 |         agent: str,
 75 |         system_prompt: str,
 76 |         conversation_config: Dict[str, Any]
 77 |     ) -> Tuple[ConversationChain, str]:
 78 |         """Initialize a conversation and return a chain and a formatted system prompt."""
 79 |         if self.config.interruption == "end_phrase":
 80 |             if self.config.end_agent == agent or self.config.end_agent == "both":
 81 |                 system_prompt += f" When the whole conversation is over end with \"{self.config.end_phrase}\"."
 82 | 
 83 |         system_template = SystemMessagePromptTemplate.from_template(
 84 |             system_prompt)
 85 |         template_params = {key: conversation_config[key]
 86 |                            for key in system_template.input_variables}
 87 |         system_message = system_template.format(**template_params).content
 88 | 
 89 |         prompt = ChatPromptTemplate.from_messages([
 90 |             SystemMessage(content=system_message),
 91 |             MessagesPlaceholder(variable_name="history"),
 92 |             HumanMessagePromptTemplate.from_template("{input}")
 93 |         ])
 94 | 
 95 |         # Select model for each agent. Only if specific model for both agents is provided, value will be used.
 96 |         model_for_llm = self.config.model
 97 |         if(self.config.model_agent_one and self.config.model_agent_one):
 98 |             if(agent == "agent1"):
 99 |                 model_for_llm = self.config.model_agent_one
100 |             elif(agent == "agent2"):
101 |                 model_for_llm = self.config.model_agent_two
102 | 
103 |         memory = ConversationBufferMemory(return_messages=True)
104 |         llm = ChatOpenAI(temperature=conversation_config["temperature"],
105 |                          openai_api_key=self.config.openai_api_key, model=model_for_llm)
106 |         chain = ConversationChain(memory=memory, prompt=prompt, llm=llm)
107 | 
108 |         return chain, system_message
109 | 
110 |     def end_phrase_interruption(self, agent: str, message: str) -> bool:
111 |         """Check whether to interrupt conversation generation."""
112 |         if self.config.interruption == "end_phrase":
113 |             if self.config.end_agent == agent or self.config.end_agent == "both":
114 |                 if self.config.end_phrase in message:
115 |                     return True
116 | 
117 |         return False
118 | 
119 |     def generate_item(self) -> Dict[str, Union[List[List[Any]], float, int]]:
120 |         """Run two chains to talk with one another and record the chat history."""
121 |         if self.generator_index >= len(self.options_configs):
122 |             raise StopIteration()
123 | 
124 |         conversation_config = self.options_configs[self.generator_index]
125 |         self.generator_index += 1
126 | 
127 |         chain1, system_prompt1 = self.initialize_chain("agent1",
128 |                                                        self.config.agent1,
129 |                                                        conversation_config)
130 | 
131 |         chain2, system_prompt2 = self.initialize_chain("agent2",
132 |                                                        self.config.agent2,
133 |                                                        conversation_config)
134 | 
135 |         utterances = []
136 | 
137 |         chain1_inp = conversation_config["initial_utterance"]
138 |         for _ in range(conversation_config["length"]):
139 |             chain1_out = chain1.predict(input=chain1_inp)
140 |             utterances.append(["agent1", chain1_out])
141 | 
142 |             if self.end_phrase_interruption("agent1", chain1_out):
143 |                 break
144 | 
145 |             chain2_out = chain2.predict(input=chain1_out)
146 |             utterances.append(["agent2", chain2_out])
147 | 
148 |             if self.end_phrase_interruption("agent2", chain2_out):
149 |                 break
150 | 
151 |             chain1_inp = chain2_out
152 | 
153 |         return {**conversation_config,
154 |                 "agent1": system_prompt1,
155 |                 "agent2": system_prompt2,
156 |                 "utterances": utterances}
157 | 


--------------------------------------------------------------------------------
/src/datasetGPT/outputs.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | 
 4 | from uuid import uuid4
 5 | from typing import Dict, Any, List
 6 | 
 7 | 
 8 | class DatasetWriter:
 9 |     """Handle outputting dataset items."""
10 | 
11 |     single_file: bool
12 |     """Whether to save all dataset items in a single file."""
13 |     path: str
14 |     """Path of the output file or directory."""
15 |     dataset_items: List[Dict[str, Any]]
16 |     """Collection of all the items in the current dataset."""
17 | 
18 |     def __init__(self, path: str = None, single_file: bool = False) -> None:
19 |         """Initialize DatasetWriter."""
20 |         if path == None and single_file:
21 |             path = self.get_unique_filename(os.getcwd())
22 |         elif path == None and not single_file:
23 |             path = self.get_unique_dirname(os.getcwd())
24 |         elif os.path.isdir(path) and single_file:
25 |             path = self.get_unique_filename(path)
26 |         elif os.path.isfile(path) and not single_file:
27 |             raise ValueError(
28 |                 "Cannot write to a file with the single_file mode disabled. Try setting --single-file.")
29 | 
30 |         self.single_file = single_file
31 |         self.path = path
32 |         self.dataset_items = []
33 | 
34 |     def get_unique_dirname(self, base_path):
35 |         """Get a unique dirname."""
36 |         return os.path.join(base_path, str(uuid4()))
37 | 
38 |     def get_unique_filename(self, base_path):
39 |         """Get a unique filename."""
40 |         return os.path.join(base_path, f"{uuid4()}.json")
41 | 
42 |     def save_intermediate_result(self, result: Dict[str, Any]):
43 |         """Either save an item to its own file or concatenate it with all dataset items in a single file."""
44 |         if self.single_file:
45 |             self.dataset_items.append(result)
46 | 
47 |             current_directory = os.path.dirname(self.path)
48 |             if current_directory != "" and current_directory != ".":
49 |                 os.makedirs(current_directory, exist_ok=True)
50 | 
51 |             with open(self.path, "w") as output_file:
52 |                 json.dump(self.dataset_items, output_file)
53 |         else:
54 |             current_filepath = self.get_unique_filename(self.path)
55 | 
56 |             os.makedirs(self.path, exist_ok=True)
57 |             with open(current_filepath, "w") as output_file:
58 |                 json.dump(result, output_file)
59 | 


--------------------------------------------------------------------------------
/src/datasetGPT/texts.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass, field
  2 | from typing import List, Any, Dict, Tuple, Union
  3 | 
  4 | from langchain.prompts import PromptTemplate
  5 | from langchain.llms import BaseLLM
  6 | from langchain.chains import LLMChain
  7 | 
  8 | from .base import DatasetGenerator
  9 | 
 10 | OPTIONS_CONFIG_KEYS = ["backend", "max_length", "temperature"]
 11 | GENERATOR_CONFIG_KEYS =  ["backends", "max_lengths", "temperatures"]
 12 | 
 13 | 
 14 | @dataclass
 15 | class TextsGeneratorConfig:
 16 |     prompt: str
 17 |     """Text prompt."""
 18 |     backends: List[Tuple[str, str, str]]
 19 |     """LLM APIs to use as backends."""
 20 |     num_samples: int = 1
 21 |     """Number of texts to generate for each options combination."""
 22 |     max_lengths: List[int] = field(default_factory=lambda: [5])
 23 |     """Maximum lengths in tokens for the output of each generation."""
 24 |     temperatures: List[float] = field(default_factory=lambda: [0])
 25 |     """Possible temperatures for the backend LLM."""
 26 |     options: List[Tuple[str, str]] = field(default_factory=lambda: [])
 27 |     """Additional options defined in the system prompts with curly brackets."""
 28 | 
 29 | 
 30 | class TextsGenerator(DatasetGenerator):
 31 |     """Generator producing texts by varying model parameters and prompt options."""
 32 | 
 33 |     config: TextsGeneratorConfig
 34 |     """Configuration for a TextsGenerator."""
 35 | 
 36 |     def __init__(self, config: TextsGeneratorConfig) -> None:
 37 |         """Initialize TextsGenerator."""
 38 |         super().__init__(config)
 39 | 
 40 |     def initialize_options_configs(
 41 |         self,
 42 |         options_config_keys: List[str] = OPTIONS_CONFIG_KEYS,
 43 |         generator_config_keys: List[str] = GENERATOR_CONFIG_KEYS
 44 |     ) -> None:
 45 |         """Prepare options combinations."""
 46 |         super().initialize_options_configs(options_config_keys, generator_config_keys)
 47 | 
 48 |     def initialize_backend(self, text_config: Dict[str, Any]) -> BaseLLM:
 49 |         """Initialize a specific LLM."""
 50 |         backend_str = text_config["backend"]
 51 |         temperature = text_config["temperature"]
 52 |         max_length = text_config["max_length"]
 53 | 
 54 |         backend, model = backend_str.split("|")
 55 | 
 56 |         if backend.lower() == "openai":
 57 |             from langchain.llms import OpenAI
 58 |             llm = OpenAI(model_name=model,
 59 |                          temperature=temperature,
 60 |                          max_tokens=max_length)
 61 |         elif backend.lower() == "cohere":
 62 |             from langchain.llms import Cohere
 63 |             llm = Cohere(model=model,
 64 |                          temperature=temperature,
 65 |                          max_tokens=max_length)
 66 |         elif backend.lower() == "petals":
 67 |             from langchain.llms import Petals
 68 |             llm = Petals(model_name=model,
 69 |                          temperature=temperature,
 70 |                          max_new_tokens=max_length)
 71 |         else:
 72 |             raise ValueError("Cannot use the specified backend.")
 73 | 
 74 |         return llm
 75 | 
 76 |     def generate_item(self) -> Dict[str, Union[List[List[Any]], float, int]]:
 77 |         """Produce text with a LLM Chain."""
 78 |         if self.generator_index >= len(self.options_configs):
 79 |             raise StopIteration()
 80 | 
 81 |         text_config = self.options_configs[self.generator_index]
 82 |         self.generator_index += 1
 83 | 
 84 |         input_variables = text_config.keys() - ["sample_id",
 85 |                                                 "backend",
 86 |                                                 "temperature",
 87 |                                                 "max_length"]
 88 | 
 89 |         prompt_template = PromptTemplate(template=self.config.prompt,
 90 |                                          input_variables=input_variables)
 91 | 
 92 |         llm = self.initialize_backend(text_config)
 93 | 
 94 |         prompt_params = {k: text_config[k] for k in input_variables}
 95 |         input_prompt = prompt_template.format(**prompt_params)
 96 | 
 97 |         chain = LLMChain(prompt=prompt_template, llm=llm)
 98 |         output = chain.predict(**prompt_params)
 99 | 
100 |         return {**text_config,
101 |                 "prompt": input_prompt,
102 |                 "output": output}
103 | 


--------------------------------------------------------------------------------