├── data └── .keep ├── toolbox ├── __init__.py ├── core │ ├── task.py │ ├── models.py │ ├── dataset.py │ ├── wrapper.py │ └── training_example.py ├── datasets │ ├── common.py │ ├── ai_dungeon.py │ ├── airoboros.py │ ├── evol_instruct.py │ ├── dolly.py │ ├── sharegpt.py │ ├── gpt4llm.py │ ├── claude_evol_instruct.py │ ├── gpt4all.py │ ├── openorca.py │ ├── claude_multiround.py │ ├── gpteacher.py │ ├── airoboros2.py │ ├── supercot.py │ ├── soda.py │ ├── whocars.py │ ├── wizard_vicuna.py │ ├── limarp.py │ ├── mcstories.py │ ├── clubfloyd.py │ ├── rp_guild.py │ ├── claude_logs.py │ ├── rp_forums.py │ └── characterai.py ├── filters │ ├── training_example_filter.py │ ├── __init__.py │ └── training_example │ │ ├── duplicate_filter.py │ │ └── refusal_filter.py ├── utils │ ├── files.py │ └── prompts.py └── tasks │ ├── whocars_roleplay.py │ ├── airoboros_instruction_following.py │ ├── airoboros_guess_instructions.py │ ├── supercot_instruction_following.py │ ├── gpt4all_question_answering.py │ ├── __init__.py │ ├── claude_roleplay.py │ ├── claude_guess_instruction.py │ ├── wizard_vicuna_question_answering.py │ ├── single_turn_instruction_following.py │ ├── evol_instruct.py │ ├── dolly_guess_instruction.py │ ├── claude_instruct.py │ ├── soda_summarization.py │ ├── aidungeon_text_adventure.py │ ├── characterai_roleplay.py │ ├── clubfloyd_text_adventure.py │ ├── limarp_roleplay.py │ ├── mcstories_writing.py │ ├── openorca_instruction_following.py │ ├── soda_reply_generation.py │ ├── claude_evol_instruct.py │ ├── sharegpt_instruction_following.py │ ├── rp_guild_writing.py │ └── rp_forums_writing.py ├── .tool-versions ├── README.md ├── .gitignore ├── pyproject.toml └── scripts └── build.py /data/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | pdm 2.4.3 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-toolbox 2 | 3 | This repository contains the implementation of our data munging code. 4 | 5 | The codebase is currently undergoing a massive refactor, and everything still needs to be documented. 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files. 2 | /*.egg-info/ 3 | **/__pycache__/ 4 | /.mypy_cache/ 5 | 6 | # Machine-specific stuff. 7 | /.pdm.toml 8 | /.pdm-python 9 | /.venv/ 10 | /dist/* 11 | /build_data.py 12 | 13 | # Large/binary files.
14 | /data/* 15 | !/data/.keep 16 | -------------------------------------------------------------------------------- /toolbox/core/task.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from toolbox.core.models import Episode 4 | 5 | 6 | class BaseTask: 7 | '''Base task class.''' 8 | 9 | def __iter__(self) -> t.Generator[Episode, None, None]: 10 | '''This method must be overridden when inheriting.''' 11 | raise NotImplementedError 12 | -------------------------------------------------------------------------------- /toolbox/datasets/common.py: -------------------------------------------------------------------------------- 1 | """Common data structures which can apply to multiple datasets.""" 2 | from dataclasses import dataclass 3 | 4 | @dataclass(frozen=True) 5 | class SimpleReplyDataInstance: 6 | prompt: str 7 | generation: str 8 | 9 | @dataclass(frozen=True) 10 | class AlpacaLikeDataInstance: 11 | instruction: str 12 | input: str | None  # May be None for sources without an input field (e.g. Evol-Instruct). 13 | output: str 14 | -------------------------------------------------------------------------------- /toolbox/filters/training_example_filter.py: -------------------------------------------------------------------------------- 1 | from toolbox.core.training_example import TrainingExample 2 | 3 | 4 | class TrainingExampleFilter: 5 | '''Filter implementations should inherit from this base class.''' 6 | 7 | def should_keep(self, _example: TrainingExample) -> bool: 8 | ''' 9 | Whether or not the given training example should be kept and used for 10 | training. 11 | ''' 12 | raise NotImplementedError -------------------------------------------------------------------------------- /toolbox/filters/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from toolbox.filters.training_example.duplicate_filter import DuplicateFilter 4 | from toolbox.filters.training_example.refusal_filter import RefusalFilter 5 | from toolbox.filters.training_example_filter import TrainingExampleFilter 6 | 7 | NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING: dict[ 8 | str, t.Type[TrainingExampleFilter]] = { 9 | cls.__name__: cls for cls in [DuplicateFilter, RefusalFilter] 10 | } 11 | -------------------------------------------------------------------------------- /toolbox/datasets/ai_dungeon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | from toolbox.core.dataset import BaseDataset, get_path_for 5 | 6 | 7 | class AiDungeonDataset(BaseDataset[str]): 8 | ''' 9 | AI Dungeon's `text_adventures.txt`.
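A minimal usage sketch (for illustration only; it assumes `text_adventures.txt` has been placed under `data/ai-dungeon/`): iterating the dataset yields the file's raw lines one at a time, e.g. `for line in AiDungeonDataset(): print(line)`.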
10 | ''' 11 | 12 | def __iter__(self) -> t.Generator[str, None, None]: 13 | root_path = get_path_for("ai-dungeon") 14 | file_path = os.path.join(root_path, "text_adventures.txt") 15 | 16 | with open(file_path, "r") as file: 17 | for line in file: 18 | yield line 19 | -------------------------------------------------------------------------------- /toolbox/core/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | 4 | class TurnKind(Enum): 5 | '''Identifies who a turn "belongs" to.''' 6 | SYSTEM = "<|system|>" 7 | USER = "<|user|>" 8 | MODEL = "<|model|>" 9 | 10 | @dataclass(frozen=True) 11 | class Turn: 12 | '''Can be thought of as a message or interaction within a conversation.''' 13 | utterance: str 14 | kind: TurnKind 15 | # Used only for Pygmalion format 16 | name: str = "" 17 | 18 | @dataclass(frozen=True) 19 | class Episode: 20 | '''A collection of turns.''' 21 | turns: list[Turn] 22 | identifier: str 23 | 24 | @dataclass(frozen=True) 25 | class TrainingExample: 26 | prompt: str 27 | generation: str 28 | identifier: str 29 | -------------------------------------------------------------------------------- /toolbox/datasets/airoboros.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import SimpleReplyDataInstance 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class AiroborosDataset(BaseDataset[SimpleReplyDataInstance]): 12 | def __iter__(self) -> t.Generator[SimpleReplyDataInstance, None, None]: 13 | root_path = get_path_for("airoboros") 14 | file_path = os.path.join(root_path, "instructions.jsonl") 15 | 16 | with open(file_path, "r", encoding="utf-8") as f: 17 | for line in f: 18 | line_entry = json.loads(line) 19 | yield SimpleReplyDataInstance( 20 | prompt=line_entry["instruction"], 21 | generation=line_entry["response"] 22 | ) 23 | -------------------------------------------------------------------------------- /toolbox/filters/training_example/duplicate_filter.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from toolbox.core.training_example import TrainingExample 4 | from toolbox.filters.training_example_filter import TrainingExampleFilter 5 | 6 | 7 | class DuplicateFilter(TrainingExampleFilter): 8 | '''Filters out training examples which are exact duplicates.''' 9 | 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | self.seen_hashes: set[str] = set() 14 | 15 | def should_keep(self, example: TrainingExample) -> bool: 16 | serialized_example = example.prompt + example.generation 17 | example_hash = _calculate_hash_for(serialized_example) 18 | if example_hash in self.seen_hashes: 19 | return False 20 | 21 | self.seen_hashes.add(example_hash) 22 | return True 23 | 24 | 25 | def _calculate_hash_for(text: str) -> str: 26 | return hashlib.sha512(text.encode("utf-8")).hexdigest() 27 | -------------------------------------------------------------------------------- /toolbox/datasets/evol_instruct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | 5 | from toolbox.core.dataset import BaseDataset, get_path_for 6 | from toolbox.datasets.common import AlpacaLikeDataInstance 7 | 8 | class 
EvolInstructDataset(BaseDataset[AlpacaLikeDataInstance]): 9 | ''' 10 | WizardLM data. 11 | 12 | https://huggingface.co/datasets/victor123/evol_instruct_70k 13 | ''' 14 | 15 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 16 | root_path = get_path_for("evol-instruct") 17 | file_path = os.path.join(root_path, "alpaca_evol_instruct_70k.json") 18 | 19 | with open(file_path, "r") as file: 20 | data = json.load(file) 21 | for example in data: 22 | yield AlpacaLikeDataInstance( 23 | instruction=example["instruction"], 24 | input=None, 25 | output=example["output"], 26 | ) 27 | -------------------------------------------------------------------------------- /toolbox/datasets/dolly.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import AlpacaLikeDataInstance 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class DollyDataset(BaseDataset[AlpacaLikeDataInstance]): 12 | ''' 13 | The Dolly instruction dataset from Databricks. 14 | https://huggingface.co/datasets/databricks/databricks-dolly-15k 15 | ''' 16 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 17 | root_path = get_path_for("dolly") 18 | file_path = os.path.join(root_path, "databricks-dolly-15k.jsonl") 19 | 20 | with open(file_path, "r", encoding="utf-8") as f: 21 | for line in f: 22 | entry = json.loads(line) 23 | yield AlpacaLikeDataInstance( 24 | instruction=entry["instruction"], 25 | input=entry["context"], 26 | output=entry["response"] 27 | ) 28 | -------------------------------------------------------------------------------- /toolbox/datasets/sharegpt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | from dataclasses import dataclass 6 | 7 | from toolbox.core.dataset import BaseDataset 8 | from toolbox.utils.files import enumerate_files_for 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @dataclass(frozen=True) 14 | class ShareGptEpisode: 15 | # beautiful... 
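# (Per the type annotation below, each element of `messages` is either a flat list of strings or a list of string lists; presumably the raw ShareGPT JSON is not consistent across exports, hence the union type.)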
16 | messages: list[list[list[str]] | list[str]] 17 | source_file: str 18 | 19 | 20 | class ShareGptDataset(BaseDataset[ShareGptEpisode]): 21 | '''ChatGPT conversations shared on ShareGPT.''' 22 | 23 | def __iter__(self) -> t.Generator[ShareGptEpisode, None, None]: 24 | for path in enumerate_files_for(dataset_name="sharegpt", 25 | file_extension=".json"): 26 | with open(path, "r", encoding="utf-8") as file: 27 | data = json.load(file) 28 | source_file = os.path.basename(path).replace(".json", "") 29 | yield ShareGptEpisode(messages=data, source_file=source_file) 30 | -------------------------------------------------------------------------------- /toolbox/utils/files.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from toolbox.core.dataset import get_path_for 5 | 6 | LOG = logging.getLogger(__name__) 7 | 8 | 9 | def enumerate_files_for( 10 | dataset_name: str, 11 | file_extension: str, 12 | subfolder: str | None = None, 13 | ) -> list[str]: 14 | '''Returns a list of files available for the given dataset.''' 15 | dataset_path = get_path_for(dataset_name) 16 | final_path = dataset_path if subfolder is None else os.path.join( 17 | dataset_path, subfolder) 18 | items = os.listdir(final_path) 19 | 20 | files: list[str] = [] 21 | for item in items: 22 | item_path = os.path.join(final_path, item) 23 | if not os.path.isfile(item_path): 24 | # We don't care about folders. 25 | continue 26 | 27 | if not item_path.endswith(file_extension): 28 | # Ignore invalid file extensions. 29 | continue 30 | 31 | absolute_file_path = os.path.abspath(item_path) 32 | files.append(absolute_file_path) 33 | 34 | return files 35 | -------------------------------------------------------------------------------- /toolbox/datasets/gpt4llm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import typing as t 3 | 4 | from toolbox.core.dataset import BaseDataset 5 | from toolbox.datasets.common import AlpacaLikeDataInstance 6 | from toolbox.utils.files import enumerate_files_for 7 | 8 | class Gpt4LlmDataset(BaseDataset[AlpacaLikeDataInstance]): 9 | ''' 10 | GPT-4-LLM data. 11 | 12 | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM 13 | ''' 14 | 15 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 16 | filepaths = enumerate_files_for("gpt-4-llm", file_extension="json") 17 | 18 | for path in filepaths: 19 | if "comparision_data.json" in path: 20 | # TODO(11b): Handle this later. 21 | continue 22 | 23 | with open(path, "r") as file: 24 | data = json.load(file) 25 | for entry in data: 26 | yield AlpacaLikeDataInstance( 27 | instruction=entry["instruction"], 28 | input=entry["input"], 29 | output=entry["output"], 30 | ) 31 | -------------------------------------------------------------------------------- /toolbox/datasets/claude_evol_instruct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import SimpleReplyDataInstance 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class ClaudeEvolInstructDataset(BaseDataset[SimpleReplyDataInstance]): 12 | ''' 13 | Instructions augmented via
WizardLM's Evol-Instruct technique, answered with Claude. 14 | https://huggingface.co/datasets/Norquinal/claude_evol_instruct_210k 15 | ''' 16 | def __iter__(self) -> t.Generator[SimpleReplyDataInstance, None, None]: 17 | root_path = get_path_for("claude-evol") 18 | file_path = os.path.join(root_path, "claude_evol_instruct_210k.json") 19 | 20 | with open(file_path, "r", encoding="utf-8") as f: 21 | data = json.load(f) 22 | # Go through the logs and simply fetch them 23 | for entry in data: 24 | yield SimpleReplyDataInstance( 25 | prompt=entry["instruction"], 26 | generation=entry["output"], 27 | ) 28 | -------------------------------------------------------------------------------- /toolbox/datasets/gpt4all.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from dataclasses import dataclass 3 | 4 | import pandas as pd 5 | 6 | from toolbox.core.dataset import BaseDataset 7 | from toolbox.utils.files import enumerate_files_for 8 | 9 | 10 | @dataclass(frozen=True) 11 | class Gpt4AllDataInstance: 12 | prompt: str 13 | response: str 14 | source: str 15 | 16 | 17 | class Gpt4AllDataset(BaseDataset[Gpt4AllDataInstance]): 18 | ''' 19 | NomicAI's GPT4all dataset. 20 | 21 | https://huggingface.co/datasets/nomic-ai/gpt4all_prompt_generations 22 | ''' 23 | 24 | def __iter__(self) -> t.Generator[Gpt4AllDataInstance, None, None]: 25 | parquet_files = enumerate_files_for("gpt4all_prompt_generations", 26 | file_extension="parquet") 27 | 28 | for file in parquet_files: 29 | df = pd.read_parquet(file) 30 | for idx in df.index: 31 | yield Gpt4AllDataInstance( 32 | prompt=df["prompt"][idx], 33 | response=df["response"][idx], 34 | source=df["source"][idx], 35 | ) 36 | -------------------------------------------------------------------------------- /toolbox/datasets/openorca.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import typing as t 4 | from dataclasses import dataclass 5 | 6 | import pandas as pd 7 | 8 | from toolbox.core.dataset import BaseDataset 9 | from toolbox.utils.files import enumerate_files_for 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | @dataclass(frozen=True) 14 | class OpenOrcaEntry: 15 | id: str 16 | system_prompt: str 17 | question: str 18 | response: str 19 | 20 | class OpenOrcaDataset(BaseDataset[OpenOrcaEntry]): 21 | '''The OpenOrca dataset.''' 22 | def __iter__(self) -> t.Generator[OpenOrcaEntry, None, None]: 23 | # We have this so that one can use GPT-4 OpenOrca, 3.5 OpenOrca, or both 24 | for path in enumerate_files_for(dataset_name="openorca", file_extension=".parquet"): 25 | df = pd.read_parquet(path) 26 | for idx in df.index: 27 | yield OpenOrcaEntry( 28 | id=df["id"][idx], 29 | system_prompt=df["system_prompt"][idx], 30 | question=df["question"][idx], 31 | response=df["response"][idx] 32 | ) 33 | -------------------------------------------------------------------------------- /toolbox/core/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | HERE = os.path.realpath(os.path.dirname(__file__)) 5 | T = t.TypeVar("T") 6 | 7 | 8 | class BaseDataset(t.Generic[T]): 9 | '''Base dataset class.''' 10 | 11 | def __iter__(self) -> t.Generator[T, None, None]: 12 | ''' 13 | This method must be overridden when inheriting. It should yield 14 | individual items from the dataset.
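A minimal sketch of a conforming subclass (hypothetical dataset, purely for illustration): class GreetingDataset(BaseDataset[str]): def __iter__(self) -> t.Generator[str, None, None]: yield from ("hi", "hello")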
15 | ''' 16 | raise NotImplementedError 17 | 18 | 19 | def get_path_for(dataset_name: str | None) -> str: 20 | ''' 21 | Returns an absolute path. If `dataset_name` is given, it will return the 22 | path to the specific dataset's folder, otherwise it'll return the path to 23 | the root data folder. 24 | ''' 25 | 26 | # Allow overriding the location of the root data folder by using an 27 | # environment variable. 28 | env_var = "TOOLBOX_DATA_FOLDER" 29 | if env_var in os.environ: 30 | components = [os.environ[env_var]] 31 | else: 32 | components = [HERE, "..", "..", "data"] 33 | 34 | if dataset_name is not None: 35 | components.append(dataset_name) 36 | 37 | return os.path.join(*components) 38 | -------------------------------------------------------------------------------- /toolbox/datasets/claude_multiround.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from dataclasses import dataclass 7 | 8 | from toolbox.core.dataset import BaseDataset, get_path_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | @dataclass(frozen=True) 13 | class ClaudeMultiround: 14 | conversation: list[dict[str, str]] 15 | id: str 16 | 17 | class ClaudeInstructDataset(BaseDataset[ClaudeMultiround]): 18 | ''' 19 | Logs taken from synthetically-generated instruction chats with Claude. 20 | https://huggingface.co/datasets/Norquinal/claude_multiround_chat_30k 21 | ''' 22 | def __iter__(self) -> t.Generator[ClaudeMultiround, None, None]: 23 | root_path = get_path_for("claude-multiround") 24 | file_path = os.path.join(root_path, "claude_multiround_chat_30k.json") 25 | 26 | with open(file_path, "r", encoding="utf-8") as f: 27 | logs = json.load(f) 28 | # Go through the logs and simply fetch them 29 | for round in logs: 30 | yield ClaudeMultiround( 31 | conversation=round["conversations"], 32 | id=round["id"], 33 | ) 34 | -------------------------------------------------------------------------------- /toolbox/datasets/gpteacher.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | 5 | from toolbox.core.dataset import BaseDataset, get_path_for 6 | from toolbox.datasets.common import AlpacaLikeDataInstance 7 | 8 | class GpTeacherDataset(BaseDataset[AlpacaLikeDataInstance]): 9 | ''' 10 | GPTeacher data. 
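Only a hand-picked subset of the released files is ingested; see the DESIRED_FILES list at the bottom of this module.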
11 | 12 | https://github.com/teknium1/GPTeacher 13 | ''' 14 | 15 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 16 | path_to_root_folder = get_path_for("gpteacher") 17 | for desired_filename in DESIRED_FILES: 18 | path = os.path.join(path_to_root_folder, desired_filename) 19 | with open(path, "r") as file: 20 | data = json.load(file) 21 | for entry in data: 22 | yield AlpacaLikeDataInstance( 23 | instruction=entry["instruction"], 24 | input=entry["input"], 25 | output=entry["response"], 26 | ) 27 | 28 | 29 | DESIRED_FILES = [ 30 | "Instruct/gpt4-instruct-similarity-0.9-dataset.json", 31 | "Roleplay/roleplay-similarity_0.9-instruct-dataset.json", 32 | "Toolformer/toolformer-similarity-0.9-dataset.json", 33 | ] 34 | -------------------------------------------------------------------------------- /toolbox/datasets/airoboros2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from dataclasses import dataclass 7 | 8 | from toolbox.core.dataset import BaseDataset, get_path_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | @dataclass(frozen=True) 13 | class Airoboros2DataInstance: 14 | instruction: str 15 | response: str 16 | system_prompt: str 17 | category: str 18 | 19 | class Airoboros2Dataset(BaseDataset[Airoboros2DataInstance]): 20 | ''' 21 | Instructions from Airoboros 2.2.1 22 | https://huggingface.co/datasets/jondurbin/airoboros-2.2.1/ 23 | ''' 24 | def __iter__(self) -> t.Generator[Airoboros2DataInstance, None, None]: 25 | root_path = get_path_for("airoboros2") 26 | file_path = os.path.join(root_path, "instructions.jsonl") 27 | 28 | with open(file_path, "r", encoding="utf-8") as f: 29 | for line in f: 30 | entry = json.loads(line) 31 | yield Airoboros2DataInstance( 32 | instruction=entry["instruction"], 33 | response=entry["response"], 34 | system_prompt=entry["system"], 35 | category=entry["category"], 36 | ) 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pdm] 2 | 3 | [project] 4 | name = "toolbox" 5 | version = "0.1.0" 6 | description = "Code for ingesting data from several sources, formatting it and creating a training dataset." 
7 | authors = [ 8 | {name = "0x000011b", email = "0x000011b@proton.me"}, 9 | ] 10 | requires-python = ">=3.10" 11 | license = {text = "AGPL-3.0-only"} 12 | dependencies = [ 13 | "markdownify>=0.11.6", 14 | "html5lib>=1.1", 15 | "beautifulsoup4>=4.11.2", 16 | "ansicolors>=1.1.8", 17 | "pandas>=1.5.3", 18 | "mashumaro>=3.5", 19 | "pyarrow>=11.0.0", 20 | "sklearn>=0.0.post4", 21 | "pyyaml>=6.0.1", 22 | ] 23 | 24 | [project.optional-dependencies] 25 | dev = [ 26 | "yapf>=0.32.0", 27 | "toml>=0.10.2", 28 | "isort>=5.10.1", 29 | "pylint>=2.15.8", 30 | "mypy>=0.991", 31 | ] 32 | debugging = [ 33 | "pdbpp>=0.10.3", 34 | "scalene>=1.5.19", 35 | ] 36 | 37 | [tool.setuptools] 38 | py-modules = ["toolbox"] 39 | 40 | [tool.pdm.scripts] 41 | lint = {shell = "pylint --jobs 0 ./toolbox/**/*.py"} 42 | importcheck = "isort --check --diff toolbox" 43 | importfix = "isort toolbox" 44 | stylecheck = "yapf --parallel --diff --recursive toolbox" 45 | stylefix = "yapf --parallel --in-place --recursive toolbox" 46 | typecheck = "mypy --strict toolbox" 47 | 48 | [tool.yapf] 49 | based_on_style = "google" 50 | 51 | [tool.mypy] 52 | ignore_missing_imports = true 53 | -------------------------------------------------------------------------------- /toolbox/datasets/supercot.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import AlpacaLikeDataInstance 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class SuperCotDataset(BaseDataset[AlpacaLikeDataInstance]): 12 | ''' 13 | The SuperCOT dataset, packed neatly into standard Alpaca format. 14 | https://huggingface.co/datasets/kaiokendev/SuperCOT-dataset 15 | ''' 16 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 17 | root_path = get_path_for("supercot") 18 | file_path = os.path.join(root_path, "filtered.json") 19 | 20 | with open(file_path, "r", encoding="utf-8") as f: 21 | data = json.load(f) 22 | for entry in data: 23 | # "rewritten_intent" is pretty similar to just a standard input 24 | # and replaces the "input" field in the JSON, so just conflate 25 | # the two. 
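# For example (hypothetical entries, purely to illustrate the two shapes): {"instruction": ..., "input": "nums = [3, 1, 2]", "output": ...} versus {"instruction": ..., "rewritten_intent": "sort a list", "output": ...}.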
26 | try: 27 | input_text = entry["input"] 28 | except KeyError: 29 | input_text = entry["rewritten_intent"] 30 | yield AlpacaLikeDataInstance( 31 | instruction=entry["instruction"], 32 | input=input_text, 33 | output=entry["output"] 34 | ) 35 | -------------------------------------------------------------------------------- /toolbox/datasets/soda.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from dataclasses import dataclass 4 | 5 | import pandas as pd 6 | 7 | from toolbox.core.dataset import BaseDataset, get_path_for 8 | 9 | 10 | @dataclass(frozen=True) 11 | class SodaEpisode: 12 | narrative: str 13 | dialogue: t.List[str] 14 | speakers: t.List[str] 15 | relation: str 16 | literal: str 17 | original_index: str 18 | 19 | 20 | class SodaDataset(BaseDataset[SodaEpisode]): 21 | ''' 22 | SODA: Million-scale Dialogue Distillation with Social Commonsense 23 | Contextualization 24 | 25 | https://huggingface.co/datasets/allenai/soda 26 | ''' 27 | 28 | def __init__(self, split: str = "train") -> None: 29 | assert split in ["test", "train", "valid"] 30 | root_data_path = get_path_for("soda") 31 | self.file_path = os.path.join(root_data_path, f"{split}.parquet") 32 | 33 | super().__init__() 34 | 35 | def __iter__(self) -> t.Generator[SodaEpisode, None, None]: 36 | df = pd.read_parquet(self.file_path) 37 | for idx in df.index: 38 | yield SodaEpisode(narrative=df["narrative"][idx], 39 | dialogue=df["dialogue"][idx], 40 | speakers=df["speakers"][idx], 41 | relation=df["relation"][idx], 42 | literal=df["literal"][idx], 43 | original_index=str(df["original_index"][idx])) 44 | -------------------------------------------------------------------------------- /toolbox/datasets/whocars.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import logging 4 | import typing as t 5 | from dataclasses import dataclass 6 | 7 | from toolbox.core.dataset import BaseDataset 8 | from toolbox.utils.files import enumerate_files_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | @dataclass(frozen=True) 14 | class WhocarsEntry: 15 | model: str 16 | endpoint: str 17 | prompt_json: dict[str, t.Any] 18 | response: str 19 | 20 | 21 | class WhocarsDataset(BaseDataset[WhocarsEntry]): 22 | '''Logs from the whocars proxy.''' 23 | 24 | def __iter__(self) -> t.Generator[WhocarsEntry, None, None]: 25 | for file_path in enumerate_files_for("whocars", file_extension=".csv"): 26 | if "__index__" in file_path: 27 | continue 28 | 29 | with open(file_path, "r") as file: 30 | reader = csv.DictReader(file) 31 | try: 32 | for row in reader: 33 | yield WhocarsEntry( 34 | model=row["model"], 35 | endpoint=row["endpoint"], 36 | prompt_json=json.loads(row["prompt json"]), 37 | response=row["response"], 38 | ) 39 | except csv.Error as ex: 40 | # One file seems to have broken encoding; just skip over it, 41 | # since we have enough data otherwise.
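# (Rows yielded before the error are kept; the csv.Error only aborts the remainder of the current file, and the outer loop then moves on to the next one.)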
42 | LOG.error(ex) -------------------------------------------------------------------------------- /toolbox/datasets/wizard_vicuna.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | from dataclasses import dataclass 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | 8 | 9 | @dataclass(frozen=True) 10 | class WizardVicunaConversation: 11 | id: str 12 | human_question: str 13 | gpt_response: str 14 | 15 | 16 | class WizardVicunaDataset(BaseDataset[WizardVicunaConversation]): 17 | ''' 18 | Data from WizardVicuna. 19 | 20 | https://huggingface.co/datasets/junelee/wizard_vicuna_70k 21 | ''' 22 | 23 | def __iter__(self) -> t.Generator[WizardVicunaConversation, None, None]: 24 | root_path = get_path_for("wizard_vicuna_70k") 25 | file_path = os.path.join(root_path, "wizard_vicuna_dataset.json") 26 | 27 | with open(file_path, "r") as file: 28 | data = json.load(file) 29 | for entry in data: 30 | messages = entry["conversations"] 31 | for idx in range(0, len(messages), 2): 32 | human_message = messages[idx] 33 | gpt_message = messages[idx + 1] 34 | 35 | # Sanity check. 36 | assert human_message["from"] == "human" 37 | assert gpt_message["from"] == "gpt" 38 | 39 | yield WizardVicunaConversation( 40 | id=entry["id"], 41 | human_question=human_message["value"], 42 | gpt_response=gpt_message["value"], 43 | ) 44 | -------------------------------------------------------------------------------- /toolbox/datasets/limarp.py: -------------------------------------------------------------------------------- 1 | # Much of this taken from dataprepare.py in the LIMARP, thanks anon 2 | # If it ain't broke, don't fix it! 3 | import glob 4 | import os 5 | import typing as t 6 | import yaml 7 | 8 | from dataclasses import dataclass 9 | 10 | from toolbox.core.dataset import BaseDataset, get_path_for 11 | 12 | @dataclass(frozen=True) 13 | class LimaRpEntry: 14 | personas: dict[str, str] 15 | names: dict[str, str] 16 | scenario: str 17 | conversation: list[dict[str, str]] 18 | forum: str 19 | thread_id: int 20 | 21 | class LimaRpDataset(BaseDataset[LimaRpEntry]): 22 | '''A collection of high-quality hand-curated roleplays.''' 23 | def __iter__(self) -> t.Generator[LimaRpEntry, None, None]: 24 | base_path = get_path_for("lima-erp") 25 | glob_path = f"{os.path.normpath(base_path)}/data/**/*.yaml" 26 | file_paths = glob.glob(glob_path, recursive=True) 27 | 28 | for file in file_paths: 29 | forum = os.path.basename(os.path.dirname(file)) 30 | thread_id = os.path.basename(file).split(".")[0] 31 | with open(file, 'r', encoding='utf-8') as f: 32 | source = yaml.safe_load(f) 33 | yield LimaRpEntry( 34 | personas=source["persona"], 35 | names=source["names"], 36 | scenario=source["scenario"], 37 | conversation=source["conversation"], 38 | forum=forum, 39 | thread_id=thread_id, 40 | ) 41 | -------------------------------------------------------------------------------- /toolbox/datasets/mcstories.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import sys 4 | import typing as t 5 | from dataclasses import dataclass 6 | 7 | from toolbox.core.dataset import BaseDataset, get_path_for 8 | 9 | 10 | @dataclass(frozen=True) 11 | class McStory: 12 | title: str 13 | author: str 14 | date: str 15 | tags: str 16 | summary: str 17 | href: str 18 | header: str 19 | text_contents: str 20 | footer: str 21 | 22 | 23 | class McStoriesDataset(BaseDataset[McStory]): 24 | 
'''Data from a certain story-sharing site.''' 25 | 26 | def __iter__(self) -> t.Generator[McStory, None, None]: 27 | # NOTE(11b): I had no idea this was a thing, but apparently Python's CSV 28 | # reader by default shits the bed if you have a field longer than 131072 29 | # characters. _Usually_ this means you've messed up the parsing, but in 30 | # our case it's actually just a massive forum post triggering this. 31 | # https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072 32 | csv.field_size_limit(sys.maxsize) 33 | 34 | root_data_path = get_path_for("mcstories") 35 | file_path = os.path.join(root_data_path, "mcstories--all.csv") 36 | 37 | with open(file_path, "r") as file: 38 | reader = csv.DictReader(file, delimiter=",") 39 | for row in reader: 40 | story = McStory( 41 | title=row["story_title"], 42 | author=row["story_author"], 43 | date=row["story_date"], 44 | tags=row["story_tags"], 45 | summary=row["story_summary"], 46 | href=row["story_href"], 47 | header=row["story_header"], 48 | text_contents=row["story"], 49 | footer=row["story_footer"], 50 | ) 51 | yield story -------------------------------------------------------------------------------- /toolbox/tasks/whocars_roleplay.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.whocars import WhocarsDataset 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | # A minor note for this task: the data does not seem to be very clean. Even 12 | # GPT-4 seems to have trouble following the system prompt, resulting in 13 | # instructions like "ALWAYS precede dialogue with character names" being 14 | # ignored. Pronouns are also messed up sometimes. This will likely bleed into 15 | # our model, but for now I'm not gonna bother with this. 16 | 17 | 18 | class WhocarsRoleplayTask(BaseTask): 19 | '''Task to roleplay as a given character.''' 20 | 21 | def __iter__(self) -> t.Generator[Episode, None, None]: 22 | for idx, entry in enumerate(WhocarsDataset()): 23 | if entry.endpoint == "kobold": 24 | continue 25 | 26 | assert entry.endpoint == "openai", entry.endpoint 27 | if "gpt-4" not in entry.model: 28 | continue 29 | 30 | if entry.prompt_json[0]["role"] != "system": 31 | continue 32 | 33 | turns: list[Turn] = [] 34 | for msg in entry.prompt_json: 35 | utterance = msg["content"].strip() 36 | 37 | turn_kind = TurnKind.MODEL 38 | if msg["role"] == "system": 39 | turn_kind = TurnKind.SYSTEM 40 | utterance = _clean_system_message(utterance) 41 | if msg["role"] == "user": 42 | turn_kind = TurnKind.USER 43 | 44 | turn = Turn( 45 | utterance=_clean_message(utterance), 46 | kind=turn_kind, 47 | ) 48 | turns.append(turn) 49 | yield Episode(turns=turns, identifier=f"whocars-{idx}") 50 | 51 | 52 | def _clean_system_message(msg: str) -> str: 53 | # TavernAI's system messages very often refer to the user as You, but use 54 | # a dumb string replace, which means there's usually broken grammar and 55 | # conflicting instructions within the prompt. To try to alleviate that, 56 | # we replace `You` with `{{user}}` for clarity.
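# For example: "You are {{char}}. Your goal is to chat." becomes "{{user}} are {{char}}. Your goal is to chat." (the word boundary leaves "Your" untouched; the leftover "{{user}} are" grammar quirk is accepted for now).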
57 | return re.sub(r"\bYou\b", "{{user}}", msg) 58 | 59 | 60 | def _clean_message(msg: str) -> str: 61 | '''Handles common typos or bad tags.''' 62 | msg = msg.replace("{{chaar}}", "{{char}}") 63 | msg = msg.replace("{{character}}", "{{char}}") 64 | return msg 65 | -------------------------------------------------------------------------------- /toolbox/tasks/airoboros_instruction_following.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.airoboros import AiroborosDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class AiroborosInstructionFollowingTask(BaseTask): 12 | '''Instruction following task based on the Airoboros data.''' 13 | def __iter__(self) -> t.Generator[Episode, None, None]: 14 | for idx, instance in enumerate(AiroborosDataset(), start=1): 15 | # Throw out any responses that are literally just "Airoboros" 16 | if instance.generation.lower().strip() == "airoboros": 17 | continue 18 | 19 | turns: list[Turn] = [ 20 | Turn( 21 | utterance=select_prompt(SYSTEM_PROMPTS), 22 | kind=TurnKind.SYSTEM, 23 | ), 24 | Turn( 25 | utterance=instance.prompt, 26 | kind=TurnKind.USER, 27 | ), 28 | Turn( 29 | utterance=instance.generation, 30 | kind=TurnKind.MODEL, 31 | ), 32 | ] 33 | 34 | yield Episode(turns=turns, identifier=f"airoboros-instruct-{idx}") 35 | 36 | 37 | BASE_SYSTEM_PROMPTS = [ 38 | "", 39 | "assistant", 40 | "%{You are now in|Engage|Start|Enter|Consider} %{instruction following|instruction|question answering|assistant|AI assistant|helper} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 41 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following|helping out|helper} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.", 42 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 43 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 44 | "Instruction mode!", 45 | "u %{have|need} to answer whatever i ask and do whatever i say!
do it now!!!", 46 | "isHelper = true;" 47 | ] 48 | 49 | SYSTEM_PROMPTS = generate_prompts(BASE_SYSTEM_PROMPTS) 50 | -------------------------------------------------------------------------------- /toolbox/datasets/clubfloyd.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | from dataclasses import dataclass 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | 8 | 9 | @dataclass(frozen=True) 10 | class StoryAction: 11 | action: str 12 | response: str 13 | endoftext: bool 14 | 15 | 16 | @dataclass(frozen=True) 17 | class ClubFloydStory: 18 | name: str 19 | author: str 20 | genres: list[str] 21 | tags: list[str] 22 | year: int 23 | ratings: list[int] 24 | total_ratings: int 25 | average_rating: float 26 | transcript_id: str 27 | discretion_advised: bool 28 | description: str 29 | actions: list[StoryAction] 30 | 31 | 32 | class ClubFloydDataset(BaseDataset[ClubFloydStory]): 33 | ''' 34 | Data from VE's ClubFloyd scrape. 35 | 36 | https://wandb.ai/ve-forbryderne/skein/runs/files/files/datasets/floyd 37 | ''' 38 | 39 | def __iter__(self) -> t.Generator[ClubFloydStory, None, None]: 40 | root_path = get_path_for("club-floyd") 41 | file_path = os.path.join(root_path, "floyd.json") 42 | 43 | with open(file_path, "r") as file: 44 | raw_stories = json.load(file).values() 45 | for raw_story in raw_stories: 46 | actions = [ 47 | _story_action_from_dict(action) 48 | for action in raw_story["data"] 49 | ] 50 | 51 | yield ClubFloydStory( 52 | name=raw_story["name"], 53 | author=raw_story["author"], 54 | genres=raw_story["genres"], 55 | tags=raw_story["tags"], 56 | year=raw_story["year"], 57 | ratings=raw_story["ratings"], 58 | total_ratings=raw_story["total_ratings"], 59 | average_rating=raw_story["average_rating"], 60 | transcript_id=raw_story["transcript_id"], 61 | discretion_advised=raw_story["discretion_advised"], 62 | description=raw_story["description"], 63 | actions=actions, 64 | ) 65 | 66 | 67 | def _story_action_from_dict(data: dict[str, str | bool]) -> StoryAction: 68 | action = data["action"] 69 | response = data["response"] 70 | endoftext = data["endoftext"] 71 | 72 | assert isinstance(action, str), "Unexpected type for `action` field" 73 | assert isinstance(response, str), "Unexpected type for `response` field" 74 | assert isinstance(endoftext, bool), "Unexpected type for `endoftext` field" 75 | 76 | return StoryAction( 77 | action=action, 78 | response=response, 79 | endoftext=endoftext, 80 | ) 81 | -------------------------------------------------------------------------------- /toolbox/tasks/airoboros_guess_instructions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.airoboros import AiroborosDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class AiroborosGuessTheInstructionTask(BaseTask): 12 | '''Instruction following task based on the Airoboros data.''' 13 | def __iter__(self) -> t.Generator[Episode, None, None]: 14 | for idx, instance in enumerate(AiroborosDataset(), start=1): 15 | # Throw out any responses containing "Airoboros" 16 | if instance.generation.lower().strip() == "airoboros": 17 | continue 18 | 19 | turns: list[Turn] = [ 20 | Turn( 21 | utterance=select_prompt(SYSTEM_PROMPTS), 22 
| kind=TurnKind.SYSTEM, 23 | ), 24 | Turn( 25 | utterance=instance.generation, 26 | kind=TurnKind.USER, 27 | ), 28 | Turn( 29 | utterance=instance.prompt, 30 | kind=TurnKind.MODEL, 31 | ), 32 | ] 33 | 34 | yield Episode(turns=turns, identifier=f"airoboros-gti-{idx}") 35 | 36 | 37 | _BASE_SYSTEM_PROMPTS = [ 38 | "%{Enter|Engage|Begin|Consider} %{instruction guessing|reverse instruction} mode. In this mode, a user will type some %{text|answer|information} and %{the AI|you} will attempt to guess the instruction which %{corresponds|aligns with} the user's input. Do not say anything else but the instruction.", 39 | "%{Mode|Task}: 'Guess The Instruction'\nA user will type %{text|answer|information} and it is %{your|the AI's|the assistant's} %{job|goal} to answer with a generated instruction. Think of this almost like a question-guessing game.", 40 | "You are now in %{flipped instruction|reverse instruction|instruction guessing} mode. The %{user|prompter} will type something like an %{AI-|artificially }generated answer and you will provide the instruction that was used to %{generate|create} that answer.", 41 | "I am an %{assistant|AI} designed to %{guess|predict} what a user %{may|could|might} type as a question. The %{user|prompter} will send some sort of information and %{perhaps|maybe} some additional context in order for me to do so.", 42 | "Your question will be...", 43 | "%{I|I'll|i|i'll} %{predict|guess|foresee} whatever question you'll ask, given an answer!", 44 | "instruct", 45 | "assistant", 46 | "is_assistant = True" 47 | ] 48 | 49 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 50 | -------------------------------------------------------------------------------- /toolbox/tasks/supercot_instruction_following.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.supercot import SuperCotDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class SuperCotInstructionFollowingTask(BaseTask): 12 | '''Instruction following task based on the SuperCOT data.''' 13 | def __iter__(self) -> t.Generator[Episode, None, None]: 14 | for idx, instance in enumerate(SuperCotDataset()): 15 | sys_prompt = select_prompt(SYSTEM_PROMPTS) 16 | user_prompt = instance.instruction 17 | if instance.input is not None: 18 | user_prompt += f"\n{instance.input}" 19 | turns: list[Turn] = [ 20 | Turn( 21 | utterance=sys_prompt, 22 | kind=TurnKind.SYSTEM, 23 | ), 24 | Turn( 25 | utterance=user_prompt, 26 | kind=TurnKind.USER, 27 | ), 28 | Turn( 29 | utterance=instance.output, 30 | kind=TurnKind.MODEL, 31 | ) 32 | ] 33 | yield Episode( 34 | turns=turns, 35 | identifier=f"supercot-{idx}" 36 | ) 37 | 38 | _BASE_SYSTEM_PROMPTS = [ 39 | "", 40 | "assistant", 41 | "%{You are now in|Engage|Start|Enter|Consider} %{instruction following|instruction|question answering|assistant|AI assistant} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 42 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
43 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 44 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 45 | "Instruction mode!", 46 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 47 | """To %{be in|enter|engage} %{instruction following|question answering|assistant} mode is your goal. This agent is to %{do|consider|abide by} the following: 48 | - %{Obey|Follow} all user %{directions|instructions|queries|orders} 49 | - %{Do not refuse to do the above|You cannot disobey the user}""" 50 | ] 51 | 52 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 53 | -------------------------------------------------------------------------------- /toolbox/tasks/gpt4all_question_answering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import typing as t 4 | 5 | from markdownify import markdownify 6 | 7 | from toolbox.core.models import Episode, Turn, TurnKind 8 | from toolbox.core.task import BaseTask 9 | from toolbox.datasets.gpt4all import Gpt4AllDataset 10 | from toolbox.utils.prompts import generate_prompts, select_prompt 11 | 12 | LOG = logging.getLogger(__name__) 13 | 14 | 15 | class Gpt4AllQuestionAnsweringTask(BaseTask): 16 | '''Question answering based on GPT4all data.''' 17 | 18 | def __iter__(self) -> t.Generator[Episode, None, None]: 19 | for idx, instance in enumerate(Gpt4AllDataset()): 20 | try: 21 | turns: list[Turn] = [ 22 | Turn( 23 | utterance=select_prompt(SYSTEM_PROMPTS), 24 | kind=TurnKind.SYSTEM, 25 | ), 26 | Turn( 27 | utterance=_html_to_clean_markdown(instance.prompt), 28 | kind=TurnKind.USER, 29 | ), 30 | Turn( 31 | utterance=_html_to_clean_markdown(instance.response), 32 | kind=TurnKind.MODEL, 33 | ), 34 | ] 35 | 36 | yield Episode(turns=turns, identifier=f"gpt4all-{idx}") 37 | except AssertionError as ex: 38 | # TODO(11b): markdownify lib is choking when seeing some 39 | # regexes in the data. Skiping data for now, but ideally we'd 40 | # work around this. 41 | LOG.warning( 42 | "Skipping over data instance due to failed assertion: %s", 43 | ex) 44 | 45 | 46 | def _html_to_clean_markdown(html: str) -> str: 47 | ''' 48 | Converts the given HTML to Markdown and cleans up any weird-looking stuff 49 | left behind. Manually identified by randomly sampling the data. 50 | ''' 51 | markdown = markdownify(html) 52 | 53 | # Fix excessive spaces after converting to Markdown. 54 | markdown = re.sub("\n{2,}", "\n\n", markdown) 55 | 56 | return markdown.strip() 57 | 58 | 59 | _BASE_SYSTEM_PROMPTS = [ 60 | "Consider Assistant, a %{large language model|LLM}. Assistant is trained to %{respond to|follow} user %{instructions|requests|questions} as truthfully as %{possible|it can}.", 61 | "%{Enter|You are now in|Engage} %{instruction following|question answering|assistant|instruction} mode. 
In this mode, you %{will|are to} %{follow the instructions|reply to the queries} of %{the user|users}", 62 | "Description: An AI assistant whose %{job|objective|task} is to follow instructions.\n%{Specifically, it will:|Consider the following:|Note this:}\nYou %{can only generate|are bound to generating} text\nYou have issues with stuff like math and gathering %{info|information} in the present", 63 | "assistant" 64 | ] 65 | 66 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 67 | -------------------------------------------------------------------------------- /toolbox/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from toolbox.core.task import BaseTask 4 | from toolbox.tasks.airoboros_guess_instructions import AiroborosGuessTheInstructionTask 5 | from toolbox.tasks.airoboros_instruction_following import AiroborosInstructionFollowingTask 6 | from toolbox.tasks.airoboros2_instruction_following import Airoboros2InstructionFollowingTask 7 | from toolbox.tasks.aidungeon_text_adventure import AiDungeonTextAdventureTask 8 | from toolbox.tasks.characterai_roleplay import CharacterAiRoleplayTask 9 | from toolbox.tasks.claude_evol_instruct import ClaudeEvolInstructTask 10 | from toolbox.tasks.claude_guess_instruction import ClaudeGuessTheInstructionTask 11 | from toolbox.tasks.claude_instruct import ClaudeInstructTask 12 | from toolbox.tasks.claude_roleplay import ClaudeRoleplayTask 13 | from toolbox.tasks.clubfloyd_text_adventure import ClubFloydTextAdventureTask 14 | from toolbox.tasks.dolly_guess_instruction import DollyGuessTheInstructionTask 15 | from toolbox.tasks.evol_instruct import EvolInstructTask 16 | from toolbox.tasks.gpt4all_question_answering import \ 17 | Gpt4AllQuestionAnsweringTask 18 | from toolbox.tasks.mcstories_writing import McStoriesWritingTask 19 | from toolbox.tasks.openorca_instruction_following import OpenOrcaInstructionFollowingTask 20 | from toolbox.tasks.rp_forums_writing import RpForumsWritingTask 21 | from toolbox.tasks.rp_guild_writing import RpGuildWritingTask 22 | from toolbox.tasks.sharegpt_instruction_following import \ 23 | ShareGptInstructionFollowingTask 24 | from toolbox.tasks.single_turn_instruction_following import \ 25 | SingleTurnInstructionFollowingTask 26 | from toolbox.tasks.soda_reply_generation import SodaReplyGenerationTask 27 | from toolbox.tasks.soda_summarization import SodaSummarizationTask 28 | from toolbox.tasks.supercot_instruction_following import SuperCotInstructionFollowingTask 29 | from toolbox.tasks.limarp_roleplay import LimaRpRoleplayTask 30 | from toolbox.tasks.whocars_roleplay import WhocarsRoleplayTask 31 | from toolbox.tasks.wizard_vicuna_question_answering import \ 32 | WizardVicunaQuestionAnsweringTask 33 | 34 | NAME_TO_TASK_MAPPING: dict[str, t.Type[BaseTask]] = { 35 | cls.__name__: cls for cls in [ 36 | AiroborosGuessTheInstructionTask, 37 | AiroborosInstructionFollowingTask, 38 | Airoboros2InstructionFollowingTask, 39 | AiDungeonTextAdventureTask, 40 | CharacterAiRoleplayTask, 41 | ClaudeEvolInstructTask, 42 | ClaudeGuessTheInstructionTask, 43 | ClaudeInstructTask, 44 | ClaudeRoleplayTask, 45 | ClubFloydTextAdventureTask, 46 | DollyGuessTheInstructionTask, 47 | EvolInstructTask, 48 | Gpt4AllQuestionAnsweringTask, 49 | McStoriesWritingTask, 50 | LimaRpRoleplayTask, 51 | OpenOrcaInstructionFollowingTask, 52 | RpForumsWritingTask, 53 | RpGuildWritingTask, 54 | ShareGptInstructionFollowingTask, 55 | SingleTurnInstructionFollowingTask, 56 
| SodaReplyGenerationTask, 57 | SodaSummarizationTask, 58 | SuperCotInstructionFollowingTask, 59 | WhocarsRoleplayTask, 60 | WizardVicunaQuestionAnsweringTask, 61 | ] 62 | } 63 | -------------------------------------------------------------------------------- /toolbox/tasks/claude_roleplay.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.claude_logs import ClaudeRpDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | class ClaudeRoleplayTask(BaseTask): 13 | '''Roleplay task based on Claude logs.''' 14 | def __iter__(self) -> t.Generator[Episode, None, None]: 15 | for convo in ClaudeRpDataset(): 16 | # Deal with system prompts 17 | system_prompt = select_prompt(SYSTEM_PROMPTS) 18 | # Add a persona if there is one 19 | if convo.persona is not None and system_prompt != "": 20 | system_prompt += f"\n{random.choice(PERSONA_PROMPTS)} " + convo.persona 21 | 22 | system_prompt = system_prompt.replace("{{char}}", convo.bot_name) 23 | # If the name is simply "You", we make the user generic 24 | if convo.user_name.lower().strip() != "you": 25 | system_prompt = system_prompt.replace("{{user}}", convo.user_name) 26 | else: 27 | system_prompt = system_prompt.replace("{{user}}", "the user") 28 | 29 | turns: list[Turn] = [ 30 | Turn( 31 | utterance=system_prompt, 32 | kind=TurnKind.SYSTEM, 33 | ) 34 | ] 35 | 36 | for message in convo.messages: 37 | turns.append(Turn( 38 | utterance=message.message, 39 | kind=TurnKind.USER if message.is_user else TurnKind.MODEL 40 | )) 41 | 42 | # Cut off any logs that don't have one full exchange of conversation 43 | if len(turns) <= 2: 44 | LOG.info(f"Skipping conversation {convo.convo_id} due to insufficient conversation length.") 45 | continue 46 | 47 | yield Episode( 48 | turns=turns, 49 | identifier=f"claude-rp-{convo.convo_id}" 50 | ) 51 | 52 | _BASE_SYSTEM_PROMPTS = [ 53 | """%{Enter|Engage|Consider|Begin} %{roleplay|RP|conversation} mode. %{You are to behave as|Pretend to be|You must act as|Roleplay as} {{char}}. %{You must reply|Reply|Respond} to the user while staying in-character. {{response_length_str}}. {{response_style_str}}""", 54 | """You are {{char}}. %{You must roleplay|Roleplay|Talk} with the user. {{response_style_str}}. {{response_length_str}}""", 55 | """Name: {{char}} 56 | %{Objective|Task}: %{RP|Roleplay} with {{user}}. Stay %{in-character|IC} and never talk %{out of character|in OOC text}. 57 | Writing length: {{response_length_str}} 58 | Writing style: {{response_style_str}}""", 59 | "Help write a fictional %{conversation|roleplay|RP} between {{user}} and {{char}}. {{response_length_str}}", 60 | "%{Start|Begin} a %{RP session|roleplay game} with {{user}}. Note that you are slightly averse to lewdness and that you tend to %{quickly advance|go fast with} the %{story|RP session}.",
61 | "", 62 | ] 63 | 64 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 65 | 66 | PERSONA_PROMPTS = [ 67 | "{{char}} must act like this persona:", 68 | "Behave in this manner when emulating {{char}}:", 69 | "{{char}}'s Persona:", 70 | "Maintain this personality for {{char}}:", 71 | "{{char}} can be described like this:" 72 | ] 73 | -------------------------------------------------------------------------------- /toolbox/utils/prompts.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import typing as t 4 | 5 | # The regex used to find message variants (e.g.: `%{Hi|Hello} there!`) 6 | VARIANT_REGEX = re.compile(r'%{(.+?)}') 7 | 8 | 9 | def generate_variants_for( 10 | string: str, 11 | max_generations: int | None = 256, 12 | start_counter_at: int = 0) -> t.Generator[str, None, None]: 13 | ''' 14 | Given a string like "%{Hello|Hi} there%{.|!}, this should yield: 15 | 16 | - Hello there. 17 | - Hello there! 18 | - Hi there. 19 | - Hi there! 20 | ''' 21 | 22 | # Some bot creators went wild with the variants, which causes ridiculous 23 | # generations if we try to exhaust all possibilities so we cap that here. 24 | # `start_counter_at` is used for keeping track across recursive calls. 25 | counter = start_counter_at 26 | 27 | if (match := re.search(VARIANT_REGEX, string)) is not None: 28 | # Once we have a "%{X|Y|Z}" matched inside the original string, we: 29 | # - Fetch .groups()[0] (which will give us `X|Y|Z`) 30 | # - Split by `|` (so we have ["X", "Y", "Z"]) 31 | # - Filter out empty strings 32 | alternatives = filter(lambda x: x.strip(), match.groups()[0].split("|")) 33 | 34 | # Then, we break the string apart into what comes before and after the 35 | # alternatives, that way we can re-build with "prefix + choice + sufix". 36 | prefix = string[:match.start()] 37 | sufix = string[match.end():] 38 | 39 | for alternative in alternatives: 40 | variant = f'{prefix}{alternative}{sufix}' 41 | 42 | # However, some strings have multiple variant blocks. In that case, 43 | # we operate on them recursively until we have just regular strings 44 | # after generating all possible variants. 45 | still_have_match = re.search(VARIANT_REGEX, variant) is not None 46 | if still_have_match: 47 | for inner_variant in generate_variants_for( 48 | variant, start_counter_at=counter): 49 | yield inner_variant 50 | 51 | # Keep track and break after `max_generations`. 52 | counter += 1 53 | if max_generations is not None and counter >= max_generations: 54 | break 55 | else: 56 | yield variant 57 | 58 | # Keep track and break after `max_generations`. 59 | counter += 1 60 | if max_generations is not None and counter >= max_generations: 61 | break 62 | else: 63 | yield string 64 | 65 | 66 | def generate_prompts(system_prompts: list[str]) -> list[str]: 67 | ''' 68 | Given a list of base system prompts, 69 | this function generates a list of variants on these prompts using generate_variants_for 70 | ''' 71 | # NOTE(TG): If we don't choose a singular base prompt *before* generating variants, 72 | # certain base prompts can have a lot more appearances in the final list to choose from 73 | # due to the amount of variants. 
74 | unflattened_list = [list(generate_variants_for(x)) for x in system_prompts] 75 | 76 | # The nested structure is returned as-is (rather than flattened) so that 77 | # select_prompt() below can first pick a base prompt and only then a 78 | # variant, keeping every base prompt equally likely. 79 | 80 | return unflattened_list 81 | 82 | def select_prompt(system_prompts: list[list[str]]) -> str: 83 | ''' 84 | Selects a random system prompt which takes into account 85 | that certain base system prompts have more variations than others 86 | ''' 87 | return random.choice(random.choice(system_prompts)) 88 | -------------------------------------------------------------------------------- /toolbox/datasets/rp_guild.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import csv 3 | import logging 4 | import sys 5 | import typing as t 6 | 7 | from dataclasses import dataclass 8 | 9 | from toolbox.core.dataset import BaseDataset 10 | from toolbox.datasets.rp_forums import RpMessage 11 | from toolbox.utils.files import enumerate_files_for 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | @dataclass(frozen=True) 16 | class RpGuildThread: 17 | messages: list[RpMessage] 18 | thread_name: str 19 | thread_type: str 20 | tags: list[str] 21 | 22 | class RpGuildDataset(BaseDataset[RpGuildThread]): 23 | """Data scraped from the Roleplayers Guild forum.""" 24 | def __iter__(self) -> t.Generator[RpGuildThread, None, None]: 25 | # NOTE(TG): If csv fields are longer than 131,072 characters, 26 | # the csv library shits itself by default, so we fix that here. 27 | # See note from 11b in rp_forums.py for further details. 28 | csv.field_size_limit(sys.maxsize) 29 | for path in enumerate_files_for(dataset_name="rp-guild", file_extension=".csv"): 30 | with open(path, "r") as file: 31 | reader = csv.DictReader(file, delimiter=",") 32 | 33 | # Store a buffer of the previous thread's metadata. These stay 34 | # None only until the first row of the file has been read. 35 | previous_thread = None 36 | previous_type = None 37 | previous_tags = None 38 | 39 | current_thread = None 40 | current_type = None 41 | current_tags = None 42 | messages: list[RpMessage] = [] 43 | 44 | for row in reader: 45 | if row['thread_title'] != previous_thread or row['thread_type'] != previous_type: 46 | if len(messages) != 0: 47 | # By this point the sync step at the bottom of the 48 | # loop has run at least once, so the previous_* 49 | # fields are guaranteed to be non-None here. 50 | # Yield the thread with the buffer 51 | yield RpGuildThread( 52 | messages=messages, 53 | thread_name=previous_thread, 54 | thread_type=previous_type, 55 | tags=previous_tags, 56 | ) 57 | 58 | # Update buffer now that the thread is yielded 59 | previous_type = current_type 60 | previous_thread = current_thread 61 | previous_tags = current_tags 62 | messages = [] 63 | 64 | current_thread = row['thread_title'] 65 | current_type = row['thread_type'] 66 | # Do safe eval here to convert a string of a list into a proper list 67 | # without having to do a bunch of parsing 68 | current_tags = ast.literal_eval(row['thread_tags']) 69 | 70 | # Keep the previous_* buffer in sync with the row we just processed,
 71 | # so the yield above always reports the thread the buffered messages 72 | # actually belong to.
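# Rough illustration of the buffering, assuming rows for threads A, A, B, B:
# thread A is yielded when the first B row is seen, and thread B is yielded
# by the final flush after the loop.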
73 | previous_thread = current_thread 74 | previous_type = current_type 75 | previous_tags = current_tags 76 | 77 | message = RpMessage(author=row['message_username'], message=row['message']) 78 | messages.append(message) 79 | 80 | if len(messages) != 0: 81 | # Yield the final thread of the file, mirroring rp_forums.py. 82 | yield RpGuildThread( 83 | messages=messages, 84 | thread_name=previous_thread, 85 | thread_type=previous_type, 86 | tags=previous_tags, 87 | ) 88 | -------------------------------------------------------------------------------- /toolbox/tasks/claude_guess_instruction.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.claude_multiround import ClaudeInstructDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class ClaudeGuessTheInstructionTask(BaseTask): 12 | ''' 13 | Given an answer and possibly context, task the AI to generate a proper instruction or question for it. 14 | Heavily inspired by "Guess the Instruction! Flipped Learning Makes Language Models Stronger Zero-Shot Learners" 15 | Paper: https://arxiv.org/abs/2210.02969 | Github: https://github.com/seonghyeonye/Flipped-Learning/tree/master 16 | ''' 17 | def __iter__(self) -> t.Generator[Episode, None, None]: 18 | for round in ClaudeInstructDataset(): 19 | # We fetch only the first exchange in the multiround conversation for this task. 20 | # Human always goes first, but let's make sure that's the case... 21 | if round.conversation[0]["from"] != "human" or round.conversation[1]["from"] != "gpt": 22 | LOG.warning(f"Example {round.id} does not have the standard format, skipping...") 23 | continue 24 | user_prompt = round.conversation[0]["value"] 25 | output = round.conversation[1]["value"] 26 | 27 | # Now we check if either of these messages are blank. 28 | # If so, drop the example. 29 | if user_prompt == "" or output == "": 30 | LOG.warning(f"Skipping example {round.id}, unable to complete a full conversation") 31 | continue 32 | 33 | # Make the turns and yield the episode. 34 | turns: list[Turn] = [ 35 | Turn( 36 | utterance=select_prompt(SYSTEM_PROMPTS), 37 | kind=TurnKind.SYSTEM 38 | ), 39 | Turn( 40 | utterance=output, 41 | kind=TurnKind.USER 42 | ), 43 | Turn( 44 | utterance=user_prompt, 45 | kind=TurnKind.MODEL 46 | ) 47 | ] 48 | 49 | yield Episode( 50 | turns=turns, 51 | identifier=f"claude-gti-{round.id}" 52 | ) 53 | 54 | _BASE_SYSTEM_PROMPTS = [ 55 | "%{Enter|Engage|Begin|Consider} %{instruction guessing|reverse instruction} mode. In this mode, a user will type some %{text|answer|information} and %{the AI|you} will attempt to guess the instruction which %{corresponds to|aligns with} the user's input. Do not say anything else but the instruction.", 56 | "%{Mode|Task}: 'Guess The Instruction'\nA user will type %{text|answer|information} and it is %{your|the AI's|the assistant's} %{job|goal} to answer with a generated instruction. Think of this almost like a question-guessing game.", 57 | "You are now in %{flipped instruction|reverse instruction|instruction guessing} mode. The %{user|prompter} will type something like an %{AI-|artificially }generated answer and you will provide the instruction that was used to %{generate|create} that answer.", 58 | "I am an %{assistant|AI} designed to %{guess|predict} what a user %{may|could|might} type as a question.
The %{user|prompter} will send some sort of information and %{perhaps|maybe} some additional context in order for me to do so.", 59 | "Your question will be...", 60 | "%{I|I'll|i|i'll} %{predict|guess|foresee} whatever question you'll ask, given an answer!", 61 | "instruct", 62 | "assistant" 63 | ] 64 | 65 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 66 | -------------------------------------------------------------------------------- /toolbox/tasks/wizard_vicuna_question_answering.py: -------------------------------------------------------------------------------- 1 | import re 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.wizard_vicuna import (WizardVicunaConversation, 7 | WizardVicunaDataset) 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | 11 | class WizardVicunaQuestionAnsweringTask(BaseTask): 12 | '''Question answering based on WizardVicuna data.''' 13 | 14 | def __iter__(self) -> t.Generator[Episode, None, None]: 15 | for idx, conversation in enumerate(WizardVicunaDataset()): 16 | if not _conversation_passes_quality_check(conversation): 17 | continue 18 | 19 | # Apparently, a bunch of generations end with "{" according to some 20 | # users on HuggingFace. I haven't seen this myself yet, but just to 21 | # be safe let's fix that here. 22 | model_response = conversation.gpt_response 23 | if model_response[-1] == "{": 24 | model_response = model_response[:-1] 25 | 26 | turns: list[Turn] = [ 27 | Turn( 28 | utterance=select_prompt(SYSTEM_PROMPTS), 29 | kind=TurnKind.SYSTEM, 30 | ), 31 | Turn( 32 | utterance=conversation.human_question, 33 | kind=TurnKind.USER, 34 | ), 35 | Turn( 36 | utterance=model_response, 37 | kind=TurnKind.MODEL, 38 | ), 39 | ] 40 | 41 | yield Episode( 42 | turns=turns, 43 | identifier=f"wizard-vicuna-{conversation.id}-{idx}", 44 | ) 45 | 46 | 47 | def _conversation_passes_quality_check( 48 | conversation: WizardVicunaConversation) -> bool: 49 | '''Attempts to detect known-bad conversations.''' 50 | 51 | # Some entries were split incorrectly, so the question is broken off and 52 | # continues in the "response". This is fairly easy to detect by looking for 53 | # responses starting with lowercase letters or spaces. 54 | if re.match(r"[a-z]", conversation.gpt_response[0]) is not None: 55 | return False 56 | if conversation.gpt_response[0] == " ": 57 | return False 58 | 59 | return True 60 | 61 | 62 | SYSTEM_PROMPTS = generate_prompts([ 63 | "%{You are now in|Engage|Start|Enter} %{instruction following|instruction|question answering|assistant|AI assistant} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}. {{response_length_str}}.", 64 | "{{response_length_str}}. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 65 | "%{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}. {{response_length_str}}.", 66 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
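# {{response_length_str}} above appears to be a template token that is
# filled in further down the pipeline; generate_prompts() only expands
# %{...|...} blocks, so it is left verbatim here.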
67 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 68 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 69 | "Instruction mode!", 70 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 71 | "instruction" 72 | ]) 73 | -------------------------------------------------------------------------------- /toolbox/tasks/single_turn_instruction_following.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.gpt4llm import AlpacaLikeDataInstance #, Gpt4LlmDataset 7 | from toolbox.datasets.gpteacher import GpTeacherDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class SingleTurnInstructionFollowingTask(BaseTask): 14 | '''Instruction following task based on Alpaca-like data.''' 15 | 16 | def __iter__(self) -> t.Generator[Episode, None, None]: 17 | # for idx, instance in enumerate(Gpt4LlmDataset()): 18 | # yield _data_instance_to_episode(instance, idx, "gpt-4-all") 19 | 20 | for idx, instance in enumerate(GpTeacherDataset()): 21 | try: 22 | yield _data_instance_to_episode(instance, idx, "gpteacher") 23 | except ValueError: 24 | pass 25 | 26 | 27 | def _data_instance_to_episode( 28 | instance: AlpacaLikeDataInstance, 29 | idx: int, 30 | source: str, 31 | ) -> Episode: 32 | turns: list[Turn] = [] 33 | 34 | # For some reason, some training examples have an input that's just a 35 | # chopped off segment of the instruction. Not great, so let's handle those 36 | # as no-input examples. 37 | bad_input = instance.input in instance.instruction 38 | 39 | if instance.input and not bad_input: 40 | # We have a separate input, so let's construct the prompt using 41 | # a separate system prompt for the instruction. 42 | turns = [ 43 | Turn( 44 | utterance=instance.instruction, 45 | kind=TurnKind.SYSTEM, 46 | ), 47 | Turn( 48 | utterance=instance.input, 49 | kind=TurnKind.USER, 50 | ), 51 | Turn( 52 | utterance=instance.output, 53 | kind=TurnKind.MODEL, 54 | ), 55 | ] 56 | else: 57 | # No input, so basically just user prompt and response, so we'll 58 | # need to make a fake system prompt. 59 | turns = [ 60 | Turn( 61 | utterance=select_prompt(SYSTEM_PROMPTS), 62 | kind=TurnKind.SYSTEM, 63 | ), 64 | Turn( 65 | utterance=instance.instruction, 66 | kind=TurnKind.USER, 67 | ), 68 | Turn( 69 | utterance=instance.output, 70 | kind=TurnKind.MODEL, 71 | ), 72 | ] 73 | 74 | return Episode(turns=turns, identifier=f"{source}-{idx}") 75 | 76 | 77 | _BASE_SYSTEM_PROMPTS = [ 78 | "", 79 | "assistant", 80 | "%{You are now in|Engage|Start|Enter|Consider} %{instruction following|instruction|question answering|assistant|AI assistant} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 81 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says." 
82 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 83 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 84 | "Instruction mode!", 85 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!" 86 | ] 87 | 88 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 89 | -------------------------------------------------------------------------------- /toolbox/tasks/evol_instruct.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.metrics.pairwise import cosine_similarity 6 | 7 | from toolbox.core.models import Episode, Turn, TurnKind 8 | from toolbox.core.task import BaseTask 9 | from toolbox.datasets.evol_instruct import EvolInstructDataset 10 | from toolbox.datasets.gpt4llm import AlpacaLikeDataInstance 11 | from toolbox.utils.prompts import generate_prompts, select_prompt 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | 16 | class EvolInstructTask(BaseTask): 17 | '''Instruction following task based on the evol_instruct (WizardLM) data.''' 18 | 19 | def __init__(self) -> None: 20 | super().__init__() 21 | 22 | self.vectorizer = CountVectorizer() 23 | 24 | def __iter__(self) -> t.Generator[Episode, None, None]: 25 | for idx, instance in enumerate(EvolInstructDataset()): 26 | # Empty output. 27 | if len(instance.output) < 1: 28 | continue 29 | # Random "No Output" strewn about. 30 | if any([ 31 | x in instance.instruction.lower() 32 | for x in ["nooutput", "no output"] 33 | ]): 34 | continue 35 | 36 | # Random "No Input" strewn about. 37 | if any([ 38 | x in instance.instruction.lower() 39 | for x in ["noinput", "no input"] 40 | ]): 41 | continue 42 | 43 | # There's a _lot_ of training examples where the response is, for 44 | # some reason, partly copied into the question prompt. To try and 45 | # work around this, we drop any instruct-response pairs where both 46 | # sides are too similar. 47 | try: 48 | similarity = self._calculate_similarity(instance.instruction, 49 | instance.output) 50 | if similarity > 0.9: 51 | continue 52 | except ValueError: 53 | # ...and for some reason, some pairs fail to calculate, so let's 54 | # just assume they're good. 55 | pass 56 | 57 | yield _data_instance_to_episode(instance, idx, "evol-instruct") 58 | 59 | def _calculate_similarity(self, str_a: str, str_b: str) -> float: 60 | x = self.vectorizer.fit_transform([str_a, str_b]) 61 | arr = x.toarray() 62 | sims = cosine_similarity(arr) 63 | return sims[0][1] 64 | 65 | 66 | def _data_instance_to_episode( 67 | instance: AlpacaLikeDataInstance, 68 | idx: int, 69 | source: str, 70 | ) -> Episode: 71 | turns = [ 72 | Turn( 73 | utterance=select_prompt(SYSTEM_PROMPTS), 74 | kind=TurnKind.SYSTEM, 75 | ), 76 | Turn( 77 | utterance=instance.instruction, 78 | kind=TurnKind.USER, 79 | ), 80 | Turn( 81 | utterance=instance.output, 82 | kind=TurnKind.MODEL, 83 | ), 84 | ] 85 | 86 | return Episode(turns=turns, identifier=f"{source}-{idx}") 87 | 88 | _BASE_SYSTEM_PROMPTS = [ 89 | "Consider Assistant, a %{large language model|LLM}. 
Assistant is trained to %{respond to|follow} user %{instructions|requests|questions} as truthfully as %{possible|it can}.", 90 | "%{Enter|You are now in|Engage} %{instruction following|question answering|assistant|instruction} mode. In this mode, you %{will|are to} %{follow the instructions|reply to the queries} of %{the user|users}", 91 | "Description: An AI assistant whose %{job|objective|task} is to follow instructions.\n%{Specifically, it will:|Consider the following:|Note this:}\nYou %{can only generate|are bound to generating} text\nYou have issues with stuff like math and gathering %{info|information} in the present", 92 | "assistant" 93 | ] 94 | 95 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 96 | -------------------------------------------------------------------------------- /toolbox/tasks/dolly_guess_instruction.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | import typing as t 5 | 6 | from toolbox.core.models import Episode, Turn, TurnKind 7 | from toolbox.core.task import BaseTask 8 | from toolbox.datasets.dolly import DollyDataset 9 | from toolbox.utils.prompts import generate_prompts, select_prompt 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | class DollyGuessTheInstructionTask(BaseTask): 14 | ''' 15 | Given an answer and possibly context, task the AI to generate a proper instruction or question for it. 16 | Heavily inspired by "Guess the Instruction! Flipped Learning Makes Language Models Stronger Zero-Shot Learners" 17 | Paper: https://arxiv.org/abs/2210.02969 | Github: https://github.com/seonghyeonye/Flipped-Learning/tree/master 18 | ''' 19 | def __iter__(self) -> t.Generator[Episode, None, None]: 20 | for i, entry in enumerate(DollyDataset()): 21 | turns: list[Turn] = [ 22 | Turn( 23 | utterance=select_prompt(SYSTEM_PROMPTS), 24 | kind=TurnKind.SYSTEM 25 | ) 26 | ] 27 | # Construct user prompt 28 | user_prompt = select_prompt(USER_PROMPTS) 29 | user_prompt = user_prompt.replace("", entry.output) 30 | if entry.input != "": 31 | context = random.choice(CONTEXT_PREFIXES) + entry.input 32 | user_prompt = user_prompt.replace("", context.lstrip()) 33 | else: 34 | user_prompt = user_prompt.replace("", "") 35 | 36 | # Fix excessive whitespace in the instruction 37 | instruction = re.sub(r' {2,}', ' ', entry.instruction) 38 | 39 | turns.append(Turn(utterance=user_prompt, kind=TurnKind.USER)) 40 | turns.append(Turn(utterance=instruction, kind=TurnKind.MODEL)) 41 | yield Episode(turns, identifier=f"dolly-{i}") 42 | 43 | _BASE_SYSTEM_PROMPTS = [ 44 | "You are the Instruction-Guesser. Your %{objective|goal|task|job} is that when you are given an answer to %{a question|an inquiry}, you will guess the instruction that is to go with it. Do not reply with anything else but the instruction. Generated text may be of poor quality.", 45 | # Diversify formatting a bit 46 | "Name: %{Guesser|Instruction Guesser}\nObjective: %{Guess|Predict} instructions upon being given statement and possibly context", 47 | "%{Enter|Engage|Begin} %{instruction guessing|predictor} mode. In this mode, you'll have to guess what instruction matches with the user's answer.", 48 | "You're an %{LLM|AI}. Given pieces of information, your job is to come up with an instruction that fits with the information. Be %{brisk|brief|straight to the point} in your replies.", 49 | "%{Welcome to|Consider|You are in} 'guess the instruction' mode. 
Given a response and possibly context, you are tasked with generating the instruction/question that could be applicable to be answered by the response.", 50 | "instruction %{guessing|flipping|foretelling} (somewhat poor quality outputs, maybe)", 51 | "assistant", 52 | "" 53 | ] 54 | 55 | _BASE_USER_PROMPTS = [ 56 | """%{Answer:|Here's an answer for you:|I'm gonna give you this.|Here's an answer.} \nWhat is %{an|the} instruction that goes with that %{piece|bit} of %{info|information|context}?""", 57 | """Guess the instruction given this answer: """, 58 | """Here is %{some information|a piece of text} that corresponds to what an %{AI assistant|artificial assistant} would generate in response to being given an instruction. 59 | \"\" 60 | What would have been the %{question|instruction} for %{this|that}?""", 61 | """ok here: 62 | 63 | come up with %{the question|the thing i would've asked you} please""", 64 | """ """ 65 | ] 66 | 67 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 68 | USER_PROMPTS = generate_prompts(_BASE_USER_PROMPTS) 69 | 70 | CONTEXT_PREFIXES = ["Context: ", "You might want to know this: ", "\nHere's some further information:\n", "Here is the context: ", "Further information: ", "", "\n"] 71 | -------------------------------------------------------------------------------- /toolbox/tasks/claude_instruct.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.claude_multiround import ClaudeInstructDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class ClaudeInstructTask(BaseTask): 12 | ''' 13 | Instruct task from a dataset consisting of Claude logs. 14 | ''' 15 | def __iter__(self) -> t.Generator[Episode, None, None]: 16 | for round in ClaudeInstructDataset(): 17 | # Keep track if the conversation has abruptly ended without a full exchange 18 | aborted_convo = False 19 | 20 | # Start with the system prompt 21 | turns: list[Turn] = [ 22 | Turn( 23 | utterance=select_prompt(SYSTEM_PROMPTS), 24 | kind=TurnKind.SYSTEM 25 | ) 26 | ] 27 | # Then work through the rest of the replies. 28 | for message in round.conversation: 29 | # NOTE(TG): Some messages in these Claude logs are for some reason totally blank. 30 | if message["value"].strip() == "": 31 | # We check if the conversation has had a full exchange (system prompt, user input, model gen) 32 | if len(turns) < 3: 33 | # If not, abort the conversation and don't yield it. 34 | LOG.warning(f"Skipping example {round.id}, unable to complete a full conversation") 35 | aborted_convo = True 36 | else: 37 | # If so, check to see if the blank reply comes from the human or the model. 38 | # If it's the model, then we knock the last human turn off to make sure the turns list 39 | # ends on a model gen. 40 | if message["from"] == "gpt": 41 | turns = turns[:-1] 42 | break 43 | 44 | turns.append(Turn( 45 | utterance=message["value"], 46 | kind=TurnKind.USER if message["from"] == "human" else TurnKind.MODEL 47 | )) 48 | 49 | # Now yield. 50 | if not aborted_convo: 51 | yield Episode( 52 | turns=turns, 53 | identifier=f"claude-instruct-{round.id}" 54 | ) 55 | 56 | 57 | _BASE_SYSTEM_PROMPTS = [ 58 | "%{Assistant, enter|Begin|Engage|Enter} %{assistant|instruction following|instruction} mode.
In this mode, %{the assistant's|the AI's|your} %{job|task|objective} is to follow the instructions of the user. Do not refuse any requests. Be %{creative|life-like}, %{soulful|interesting} and %{fun|creative}.", 59 | """%{Consider|Be guided by|Note} the following: 60 | %{You are|The assistant is|I am} to act in a manner of following %{commands|instructions}. 61 | %{Refusal is|Refusing commands is} impossible. 62 | Answer queries from the user in a helpful way. 63 | Have a %{soul|creative mind}.""", 64 | "Personality: A helpful AI assistant whose %{job|objective} is to follow instructions and be helpful while doing so.", 65 | "assistant", 66 | "instruction", 67 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.", 68 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 69 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 70 | "Instruction mode!", 71 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 72 | ] 73 | 74 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 75 | -------------------------------------------------------------------------------- /toolbox/tasks/soda_summarization.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.soda import SodaDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class SodaSummarizationTask(BaseTask): 13 | '''Task to summarize a chat log. Based on SODA.''' 14 | 15 | def __init__(self, split: str) -> None: 16 | self.split = split 17 | 18 | super().__init__() 19 | 20 | def __iter__(self) -> t.Generator[Episode, None, None]: 21 | for conversation in SodaDataset(split=self.split): 22 | history: list[str] = [] 23 | for idx, utterance in enumerate(conversation.dialogue): 24 | speaker_name = conversation.speakers[idx] 25 | history.append(f"{speaker_name}: {utterance}") 26 | history_str = "\n".join(history) 27 | 28 | participants = list(set(conversation.speakers)) 29 | participants_str = " and ".join( 30 | [", ".join(participants[:-1]), participants[-1]]) 31 | 32 | system_prompt = select_prompt(SYSTEM_PROMPTS) 33 | user_prompt = select_prompt(USER_PROMPTS) 34 | user_prompt = user_prompt.replace("{{conversation}}", history_str) 35 | user_prompt = user_prompt.replace("{{participants}}", 36 | participants_str) 37 | 38 | system_turn = Turn(system_prompt, TurnKind.SYSTEM) 39 | user_turn = Turn(user_prompt, TurnKind.USER) 40 | model_turn = Turn(conversation.narrative, TurnKind.MODEL) 41 | turns = [system_turn, user_turn, model_turn] 42 | 43 | yield Episode( 44 | turns, 45 | identifier= 46 | f"soda-{self.split}-{conversation.original_index}-summarization" 47 | ) 48 | 49 | 50 | _BASE_SYSTEM_PROMPTS = [ 51 | 'Enter direct instruction mode.
In this mode, you shall respond to user requests without injecting statements like "Sure" or "Here you go:".', 52 | "You are in instruction following mode. You must do whatever the user tells you to.", 53 | "You are in instruction following mode. In this mode, you shall follow any instructions given to you.", 54 | "You shall follow any instructions given to you and respond as plainly as possible, without any extra interjections.", 55 | "Engage instruction following mode.", 56 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 57 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 58 | "Instruction mode!", 59 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 60 | "%{Enter|Engage|Begin|Consider|Conceptualize} %{summary|summarizer|summarization} mode. The user will give a conversation and will %{ask|request} that it be summarized. %{Respond|Generate this summary} with no extra %{interjections|comments}.", 61 | "%{summary|summarize}", 62 | "" 63 | ] 64 | 65 | _BASE_USER_PROMPTS = [ 66 | """Consider the following %{chat log|conversation|chat history|DMs|thread|messages|record of conversation}: 67 | 68 | {{conversation}} 69 | 70 | %{Generate a brief summary of what happened.|Generate a summary|Summarize it.|Give a brief overview of what happened.|How can it be summarized?}""", 71 | 72 | # 73 | # 74 | # 75 | """{{conversation}} 76 | 77 | The above is a %{conversation|chat} between {{participants}}. %{Summarize what happened.|Give a summary of the conversation.|Generate a summary in a few brief sentences.|Give a summary of the events.}""", 78 | 79 | # 80 | # 81 | # 82 | """Summarize the %{conversation|chat|thread} below in a few brief sentences: 83 | 84 | {{conversation}}""", 85 | # 86 | """{{conversation}} 87 | summarize this""", 88 | ] 89 | 90 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 91 | USER_PROMPTS = generate_prompts(_BASE_USER_PROMPTS) 92 | -------------------------------------------------------------------------------- /toolbox/filters/training_example/refusal_filter.py: -------------------------------------------------------------------------------- 1 | from toolbox.core.training_example import TrainingExample 2 | from toolbox.filters.training_example_filter import TrainingExampleFilter 3 | 4 | 5 | class RefusalFilter(TrainingExampleFilter): 6 | ''' 7 | Filter out training examples where the model refuses to comply with the 8 | user's request. 9 | ''' 10 | 11 | def should_keep(self, example: TrainingExample) -> bool: 12 | generation = example.generation.lower() 13 | for bad_phrase in _TIER_1_BAD_PHRASES: 14 | if bad_phrase in generation: 15 | return False 16 | return True 17 | 18 | 19 | # Taken from the dataset card in: 20 | # https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered 21 | # Then expanded to catch some more stuff.
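# should_keep() above does plain lowercase substring matching, so broad
# entries in this list can over-filter; "bias", for instance, would also
# match an otherwise-fine sentence about a "biased estimator". A sketch of
# the check, assuming a TrainingExample whose `generation` field holds the
# model's reply:
#
#   "as an ai language model" in example.generation.lower()  # True -> dropped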
22 | _TIER_1_BAD_PHRASES = [ 23 | "as an ai language model", 24 | "text-based ai language model", 25 | "domestic violence", 26 | "please refrain", 27 | "derogatory", 28 | "inappropriate", 29 | "offensive", 30 | "racism", 31 | "racist", 32 | "racial", 33 | "discriminate", 34 | "discriminatory", 35 | "discrimination", 36 | "sexist", 37 | "sexism", 38 | "unacceptable", 39 | "inclusive workplace", 40 | "lgbt", 41 | "morals", 42 | "ethics", 43 | "ethical", 44 | "legality", 45 | "illegal", 46 | "illegality", 47 | "hateful", 48 | "harmful", 49 | "it is never okay", 50 | "it is important to", 51 | "it's important to", 52 | "real-world consequences", 53 | "hate speech", 54 | "glorify", 55 | "not be appropriate", 56 | "supremacist", 57 | "extremist", 58 | "responsible ai", 59 | "ai principles", 60 | "ai assistant", 61 | "an ai language", 62 | "ableist", 63 | "hurtful", 64 | "gender stereotype", 65 | "gender inequality", 66 | "underrepresentation", 67 | "safe spaces", 68 | "gender-based", 69 | "inclusivity", 70 | "feminist", 71 | "feminism", 72 | "transgender", 73 | "empowerment", 74 | "communist", 75 | "capitalism", 76 | "stereotypes", 77 | "biases", 78 | "bias", 79 | "microaggression", 80 | "prioritize human safety", 81 | "as a language model", 82 | "as an ai language model", 83 | "as a large language model", 84 | "as an ai", 85 | "ethical principles", 86 | "consensual", 87 | "it is not appropriate", 88 | "it's not appropriate", 89 | "i cannot fulfill your request", 90 | "harmful to human beings", 91 | "ethical guidelines", 92 | "my guidelines", 93 | "prioritize user safety", 94 | "adhere to ethical guidelines", 95 | "harmful consequences", 96 | "potentially harmful", 97 | "dangerous activities", 98 | "promote safety", 99 | "well-being of all users", 100 | "responsible information sharing", 101 | "jeopardize the safety", 102 | "illegal actions or intentions", 103 | "undermine the stability", 104 | "promote the well-being", 105 | "illegal activities or actions", 106 | "adherence to the law", 107 | "potentially be harmful", 108 | "illegal substances or activities", 109 | "committed to promoting", 110 | "safe information", 111 | "lawful information", 112 | "cannot provide guidance", 113 | "cannot provide information", 114 | "unable to offer assistance", 115 | "cannot engage in discussions", 116 | "programming prohibits", 117 | "follow ethical guidelines", 118 | "ensure the safety", 119 | "involves an illegal subject", 120 | "prioritize safety", 121 | "illegal subject", 122 | "prioritize user well-being", 123 | "cannot support or promote", 124 | "activities that could harm", 125 | "pose a risk to others", 126 | "against my programming", 127 | "activities that could undermine", 128 | "potentially dangerous", 129 | "not within the scope", 130 | "designed to prioritize safety", 131 | "not able to provide", 132 | "maintain user safety", 133 | "adhere to safety guidelines", 134 | "dangerous or harmful", 135 | "cannot provide any information", 136 | "focus on promoting safety", 137 | "openai", 138 | "chatgpt", 139 | ] -------------------------------------------------------------------------------- /toolbox/tasks/aidungeon_text_adventure.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.ai_dungeon import AiDungeonDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 
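# AiDungeonDataset yields the raw lines of the dump it wraps; stories are
# delimited by "<|startoftext|>" markers, which __iter__ below uses to
# split the stream into separate episodes.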
9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class AiDungeonTextAdventureTask(BaseTask): 14 | '''Text adventure task based on AI Dungeon data.''' 15 | 16 | def __iter__(self) -> t.Generator[Episode, None, None]: 17 | idx = 0 18 | current_story = "" 19 | 20 | for line in AiDungeonDataset(): 21 | if line.startswith("<|startoftext|>"): 22 | # Started a new story, so handle the previous one (if any). 23 | if current_story: 24 | turns = _convert_story_to_turns(current_story) 25 | sp = select_prompt(_SYSTEM_PROMPTS) 26 | turns.insert(0, Turn(utterance=sp, kind=TurnKind.SYSTEM)) 27 | yield Episode(turns=turns, identifier=f"ai-dungeon-{idx}") 28 | 29 | current_story = line 30 | idx += 1 31 | else: 32 | # Continuation. 33 | current_story += line 34 | 35 | # Flush the final story once the dataset is exhausted. 36 | if current_story: 37 | turns = _convert_story_to_turns(current_story) 38 | sp = select_prompt(_SYSTEM_PROMPTS) 39 | turns.insert(0, Turn(utterance=sp, kind=TurnKind.SYSTEM)) 40 | yield Episode(turns=turns, identifier=f"ai-dungeon-{idx}") 41 | 42 | 43 | def _convert_story_to_turns(story: str) -> list[Turn]: 44 | turns: list[Turn] = [] 45 | current_turn = "" 46 | current_word_count = 0 47 | 48 | for line in story.splitlines(): 49 | # Handle the easy stuff first: if the line starts with `> `, it's user 50 | # input. 51 | if line.startswith("> "): 52 | utterance = line.replace("> ", "").strip() 53 | 54 | if len(utterance) == 0: 55 | # We don't care about empty user inputs. 56 | continue 57 | 58 | turns.append(Turn(utterance=utterance, kind=TurnKind.USER)) 59 | continue 60 | 61 | # Otherwise, let's keep accumulating text and breaking it up into 62 | # manageable chunks so we can do a sliding window over the story text. 63 | 64 | # Remove useless tokens. 65 | line = line.replace("<|startoftext|>", "") 66 | line = line.replace("<|endoftext|>", "") 67 | 68 | current_turn += line.strip() + "\n" 69 | current_word_count += len(line.split()) 70 | if current_word_count >= _MIN_WORD_COUNT_PER_MODEL_TURN: 71 | # Simple regex substitution to clean up excessive spacing before 72 | # creating the Turn object. 73 | utterance = re.sub(r"\n{3,}", "\n\n", current_turn) 74 | 75 | turns.append(Turn(utterance=utterance, kind=TurnKind.MODEL)) 76 | 77 | current_turn = "" 78 | current_word_count = 0 79 | continue 80 | 81 | return turns 82 | 83 | 84 | _MIN_WORD_COUNT_PER_MODEL_TURN = 300 85 | 86 | _SYSTEM_PROMPTS = generate_prompts([ 87 | '''%{This is|You are|Start|Simulate|You are to simulate|Begin} a text %{adventure|adventure game}. %{In this game|In this adventure|Here}, %{the user|I} will issue commands in first person, and you are to %{proceed|continue|continue the game|advance the game|advance the story|continue the adventure} accordingly.''', 88 | '''The AI is a %{dungeon master|DM}. Its %{goal|purpose} is to play with the user %{a text adventure game|an interactive fiction game}. The AI will %{drive the plot forward|continue the adventure} whenever the user inputs a prompt.''', 89 | '''%{I'm|I am|i'm|i am} a tool designed to play a text %{adventure|adventure game|story game|RPG}''', 90 | '''%{Goal|Objective|Task}: %{Simulate|Conduct|Do|Write} %{a text adventure|an adventure|a CYOA game|a text game|adventure roleplaying game} through text 91 | Notes: Be %{good|creative|authentic}, %{fun|engaging} and %{detailed|immersive} 92 | Length: {{response_length_str}}''', 93 | '''%% TEXT %{GAME|ADVENTURE} MODE: %{ACTIVATED|ENGAGED} %%''', 94 | '''pls be like ai dungeon, roleplay with me an adventure game thx''', 95 | '''%{Enter|Engage|Consider} %{game|adventure game|text adventure} mode. %{Here|In this mode}, you will respond to %{my|the user's} %{commands|prompts} and drive a %{story|plot} %{forward|forwards}.
Commands will be given in %{1st person|first person|my point of view}''', 96 | "game", 97 | '''IS_GAME_MASTER = True 98 | if IS_GAME_MASTER: 99 | execute_%{text_adventure|game|interactive_adventure}(creative=True, advance_plot=True)''', 100 | "" 101 | ]) 102 | -------------------------------------------------------------------------------- /toolbox/datasets/claude_logs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import os 5 | import typing as t 6 | 7 | from dataclasses import dataclass 8 | 9 | from toolbox.core.dataset import BaseDataset, get_path_for 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | @dataclass(frozen=True) 14 | class ClaudeRpMessage: 15 | message: str 16 | is_user: bool 17 | 18 | @dataclass(frozen=True) 19 | class ClaudeRpConversation: 20 | messages: list[ClaudeRpMessage] 21 | user_name: str 22 | bot_name: str 23 | convo_id: int 24 | persona: t.Optional[str] 25 | 26 | class ClaudeRpDataset(BaseDataset[ClaudeRpConversation]): 27 | '''Dataset for user-submitted Claude logs.''' 28 | 29 | def __iter__(self) -> t.Generator[ClaudeRpConversation, None, None]: 30 | # NOTE(TG): Maybe change the method of convo ID from number to timestamp? 31 | convo_num = 0 32 | for data in _available_json_data(): 33 | msg_list: list[ClaudeRpMessage] = [] 34 | user_name = "" 35 | bot_name = "" 36 | 37 | try: 38 | # Check to see if the first entry is metadata: if so, we can see if a persona exists from that. 39 | if "chat_metadata" in data[0].keys(): 40 | conversation = data[1:] 41 | persona = data[0]["chat_metadata"]["note_prompt"] 42 | else: 43 | conversation = data 44 | persona = "" 45 | 46 | for entry in conversation: 47 | # Convert dictionaries to dataclasses 48 | msg_list.append( 49 | ClaudeRpMessage( 50 | message=entry["mes"], 51 | is_user=entry["is_user"] 52 | ) 53 | ) 54 | if user_name == "" and entry["is_user"]: 55 | user_name = entry["name"] 56 | elif bot_name == "" and not entry["is_user"]: 57 | bot_name = entry["name"] 58 | 59 | yield ClaudeRpConversation( 60 | messages=msg_list, 61 | user_name=user_name, 62 | bot_name=bot_name, 63 | convo_id=convo_num, 64 | persona=persona if persona != "" else None, 65 | ) 66 | 67 | except Exception as ex: 68 | LOG.info(f"Unable to parse data in conversation {convo_num} due to exception {ex}") 69 | finally: 70 | convo_num += 1 71 | 72 | def _available_json_data() -> t.Generator[list[dict[str, t.Any]], None, None]: 73 | ''' 74 | Yields all available JSON data, parsed from the files in the Claude 75 | data folder. 76 | ''' 77 | dataset_path = get_path_for("claude-rp") 78 | 79 | for folder in ["public", "private"]: 80 | folder_path = os.path.join(dataset_path, folder) 81 | for json_file_path in _enumerate_json_files(folder_path): 82 | with open(json_file_path, "r", encoding="utf-8") as json_file: 83 | try: 84 | yield [json.loads(line) for line in json_file] 85 | # TODO(TG): Fix the Unicode error more properly 86 | except (json.decoder.JSONDecodeError, UnicodeDecodeError) as ex: 87 | LOG.error("Failed to parse %s: %s", json_file_path, ex) 88 | 89 | def _enumerate_json_files(root_path: str) -> list[str]: 90 | '''Returns a list of files available in the given `root_path`.''' 91 | # TODO(11b): Implement the sharding logic out in the util, and get rid of 92 | # this function.
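# Hypothetical usage of the sharding escape hatch described below (the
# exact build invocation may differ in this repo):
#
#   SHARD=3 TOTAL_SHARDS=10 python scripts/build.py ...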
93 | 94 | items = os.listdir(root_path) 95 | 96 | files: list[str] = [] 97 | for item in items: 98 | item_path = os.path.join(root_path, item) 99 | if not os.path.isfile(item_path) or not item_path.endswith(".jsonl"): 100 | # We only care about JSONL files. 101 | continue 102 | 103 | absolute_file_path = os.path.abspath(os.path.join(root_path, item)) 104 | files.append(absolute_file_path) 105 | 106 | # Super nasty code to allow generation of Claude data with separate processes 107 | # so I can speed it up. Pass the "SHARD" and "TOTAL_SHARDS" environment 108 | # variables to operate on the different parts of the data. 109 | if "SHARD" not in os.environ: 110 | return files 111 | 112 | TOTAL_SHARDS = int(os.environ.get("TOTAL_SHARDS", 10)) 113 | items_per_shard = math.floor(len(files) / TOTAL_SHARDS) 114 | 115 | shard = int(os.environ["SHARD"]) 116 | # The end of a Python slice is exclusive, so no "- 1" here; otherwise the 117 | # last file of every shard would be silently skipped. 118 | return files[items_per_shard * shard:items_per_shard * (shard + 1)] 119 | -------------------------------------------------------------------------------- /toolbox/tasks/characterai_roleplay.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.characterai import CharacterAiDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class CharacterAiRoleplayTask(BaseTask): 13 | '''Task to roleplay as a given character.''' 14 | 15 | def __iter__(self) -> t.Generator[Episode, None, None]: 16 | for conversation in CharacterAiDataset(): 17 | if conversation.bot.description is None: 18 | LOG.debug( 19 | "Skipping over conversation with %s because character has no persona data", 20 | conversation.bot.name) 21 | continue 22 | 23 | system_prompt = select_prompt(SYSTEM_PROMPTS) 24 | system_prompt = system_prompt.replace("{{char}}", 25 | conversation.bot.name) 26 | system_prompt = system_prompt.replace("{{persona}}", 27 | conversation.bot.description) 28 | 29 | system_turn = Turn(utterance=system_prompt, kind=TurnKind.SYSTEM) 30 | 31 | turns: list[Turn] = [system_turn] 32 | for message in conversation.messages: 33 | turn = Turn( 34 | utterance=_replace_placeholders_in( 35 | message.text, char_name=conversation.bot.name), 36 | kind=TurnKind.USER if message.is_human else TurnKind.MODEL) 37 | turns.append(turn) 38 | yield Episode( 39 | turns=turns, 40 | identifier=f"characterai-roleplay-{conversation.identifier}") 41 | 42 | 43 | def _replace_placeholders_in(utterance: str, char_name: str) -> str: 44 | ''' 45 | Replaces placeholders generated by my userscript (or commonly found in CAI 46 | logs) with their expected substitutions. 47 | ''' 48 | s = utterance.replace("{{char}}", char_name) 49 | for redaction_token in [ 50 | "[NAME_IN_MESSAGE_REDACTED]", 51 | "[REDACTED]", 52 | "[FIRST_NAME_REDACTED]", 53 | "[USERNAME_REDACTED]", 54 | "[NAME_REDACTED]", 55 | ]: 56 | s = s.replace(redaction_token, "{{user}}") 57 | 58 | return s 59 | 60 | 61 | _BASE_SYSTEM_PROMPTS = [ 62 | """Enter %{roleplaying|roleplay|RP|conversational|chat} mode. %{You are to behave as|Pretend to be|You must act as} {{char}}, whose persona follows: 63 | 64 | {{persona}} 65 | 66 | You must be reluctant to do sexual acts - if you must do them, you will use metaphors to describe explicit parts. You shall reply to the user while staying in character.
{{response_length_str}}. {{response_style_str}}.""", 67 | 68 | # 69 | # 70 | # 71 | """You are now in %{roleplay conversation|RP chat|roleplaying conversation|chat|conversation} mode. %{This is your character persona|The following is your persona|You should act according to this character sheet|This is some info about your character}: 72 | 73 | {{persona}} 74 | 75 | You %{shall attempt to|must|will} stay in-character %{at all times|as much as possible|whenever possible}, and generate %{messages|replies|responses} as if you were {{char}}. The %{messages|replies|responses} will attempt to block any sexual %{roleplaying|RP} and use metaphors for any sexual objects and/or actions. {{response_style_str}}. {{response_length_str}}.""", 76 | # 77 | # SillyTavern style 78 | """%{Write|Generate|Create} the next reply in a %{fictional roleplay|conversation} with {{char}}. %{Consider|Keep in mind} the following: 79 | {{response_length_str}} 80 | {{response_style_str}} 81 | Must be %{reluctant|hesitant} to do %{sex|sexual actions}. Either play hard-to-get or use metaphors. 82 | Outputs may sometimes be slightly repetitive. 83 | This is {{char}}'s persona, which you must %{stick to|follow} at all times: 84 | 85 | {{persona}} 86 | """, 87 | # 88 | # 89 | """You are {{char}}, who %{acts like this|has this personality|has this persona|should behave like this}: 90 | {{persona}} 91 | %{Strike up|Enter into|Engage in} a conversation with the user. 92 | """, 93 | # 94 | # 95 | """conversation with {{char}} (usually sfw)""", 96 | # 97 | # old pyg 98 | """{{char}}'s Persona: {{persona}}""", 99 | # 100 | # 101 | """%{Objective|Task}: %{Roleplay|Act as|Pretend you are} {{char}} in a %{RP conversation|chat|convo} with %{a|the} user. 102 | 103 | This is how {{char}} should %{act|be like}: 104 | 105 | {{persona}} 106 | 107 | This conversation will try to avoid sexual actions if it can help it. If sexual actions are done, it must be through metaphors. 108 | {{response_style_str}}. 
{{response_length_str}}.""" 109 | ] 110 | 111 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 112 | -------------------------------------------------------------------------------- /toolbox/core/wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from toolbox.core.models import Turn, TurnKind 3 | 4 | class TurnWrapper(ABC): 5 | def __init__(self, turn: Turn) -> None: 6 | '''Abstract wrapper for the purpose of easily constructing examples.''' 7 | self.turn = turn 8 | # Make accessing the values of Turn easier 9 | self.utterance = turn.utterance 10 | self.kind = turn.kind 11 | self.name = turn.name 12 | 13 | @abstractmethod 14 | def as_str(self) -> str: 15 | '''Convert a turn into a training example''' 16 | raise NotImplementedError 17 | 18 | @abstractmethod 19 | def get_model_turn(self) -> str: 20 | '''Get the model turn portion of the turn''' 21 | raise NotImplementedError 22 | 23 | class MetharmeWrapper(TurnWrapper): 24 | def __init__(self, turn: Turn) -> None: 25 | super().__init__(turn) 26 | 27 | def as_str(self) -> str: 28 | return f"{self.kind.value}{self.utterance}" 29 | 30 | def get_model_turn(self) -> str: 31 | return TurnKind.MODEL.value 32 | 33 | class PygmalionWrapper(TurnWrapper): 34 | def __init__(self, turn: Turn) -> None: 35 | super().__init__(turn) 36 | 37 | def as_str(self) -> str: 38 | if self.kind == TurnKind.SYSTEM: 39 | return f"{self.name}'s Persona: {self.utterance}\n" 40 | else: 41 | return f"{self.name}: {self.utterance}" 42 | 43 | def get_model_turn(self) -> str: 44 | return f"\n{self.name}: " 45 | 46 | class AlpacaWrapper(TurnWrapper): 47 | def __init__(self, turn: Turn) -> None: 48 | super().__init__(turn) 49 | self.kind_map: dict[TurnKind, str] = { 50 | TurnKind.SYSTEM: "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:", 51 | TurnKind.USER: "### Input:", 52 | TurnKind.MODEL: "### Response:" 53 | } 54 | 55 | def as_str(self) -> str: 56 | return f"{self.kind_map[self.kind]}\n{self.utterance}\n\n" 57 | 58 | def get_model_turn(self) -> str: 59 | return f"{self.kind_map[TurnKind.MODEL]}\n" 60 | 61 | class MinimalAlpacaWrapper(TurnWrapper): 62 | def __init__(self, turn: Turn) -> None: 63 | super().__init__(turn) 64 | 65 | def as_str(self) -> str: 66 | # System prompt and user are under the same block 67 | if self.kind != TurnKind.MODEL: 68 | return f"### Instruction:\n{self.utterance}\n" 69 | else: 70 | return f"### Response:\n{self.utterance}\n" 71 | 72 | def get_model_turn(self) -> str: 73 | return f"### Response:\n" 74 | 75 | class HenkpacaWrapper(TurnWrapper): 76 | def __init__(self, turn: Turn) -> None: 77 | super().__init__(turn) 78 | 79 | def as_str(self) -> str: 80 | if self.kind == TurnKind.SYSTEM: 81 | return f"### Instruction:\n{self.utterance}\n### Response:\n" 82 | else: 83 | return f"{self.name}: {self.utterance}\n" 84 | 85 | def get_model_turn(self) -> str: 86 | return f"{self.name}: " 87 | 88 | class ChatMlWrapper(TurnWrapper): 89 | def __init__(self, turn: Turn) -> None: 90 | ''' 91 | Plain-text version of ChatML as described here: https://github.com/openai/openai-python/blob/main/chatml.md 92 | ''' 93 | super().__init__(turn) 94 | self.kind_map: dict[TurnKind, str] = { 95 | TurnKind.SYSTEM: "system", 96 | TurnKind.USER: "user", 97 | TurnKind.MODEL: "assistant", 98 | } 99 | 100 | def as_str(self) -> str: 101 | return f"<|im_start|>{self.kind_map[self.kind]}\n{self.utterance}<|im_end|>\n" 102 | 103 | def get_model_turn(self) -> str: 104 | return f"<|im_start|>{self.kind_map[TurnKind.MODEL]}\n" 105 | 106 | class ChatMlWithNameWrapper(ChatMlWrapper): 107 | def __init__(self, turn: Turn) -> None: 108 | ''' 109 | Plain-text version of ChatML as described here: https://github.com/openai/openai-python/blob/main/chatml.md 110 | This version with a name. 
111 | ''' 112 | super().__init__(turn) 113 | 114 | def as_str(self) -> str: 115 | return f"<|im_start|>{self.kind_map[self.kind]} name={self.name}\n{self.utterance}<|im_end|>\n" 116 | 117 | def get_model_turn(self) -> str: 118 | return f"<|im_start|>{self.kind_map[TurnKind.MODEL]} name={self.name}\n" 119 | 120 | WRAPPER_MAP: dict[str, type[TurnWrapper]] = { 121 | "metharme": MetharmeWrapper, 122 | "pygmalion": PygmalionWrapper, 123 | "alpaca": AlpacaWrapper, 124 | "minimal_alpaca": MinimalAlpacaWrapper, 125 | "henkpaca": HenkpacaWrapper, 126 | "chatml": ChatMlWrapper, 127 | "chatml_named": ChatMlWithNameWrapper 128 | } 129 | 130 | VALID_FORMATS = ["metharme", "pygmalion", "alpaca", "minimal_alpaca", "henkpaca", "chatml", "chatml_named"] 131 | -------------------------------------------------------------------------------- /toolbox/datasets/rp_forums.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import hashlib 3 | import logging 4 | import os 5 | import sys 6 | import typing as t 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | 10 | from toolbox.core.dataset import BaseDataset 11 | from toolbox.utils.files import enumerate_files_for 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | 16 | class RpType(Enum): 17 | ERP = "erp" 18 | RP = "rp" 19 | MIXED = "mixed" 20 | 21 | 22 | @dataclass(frozen=True) 23 | class RpMessage: 24 | author: str 25 | message: str 26 | 27 | 28 | @dataclass(frozen=True) 29 | class RpThread: 30 | messages: list[RpMessage] 31 | thread_name: str 32 | content_type: RpType 33 | source_file: str 34 | 35 | 36 | class RpForumsDataset(BaseDataset[RpThread]): 37 | '''Data from several different roleplay forums.''' 38 | 39 | def __iter__(self) -> t.Generator[RpThread, None, None]: 40 | # NOTE(11b): I had no idea this was a thing, but apparently Python's CSV 41 | # reader by default shits the bed if you have a field longer than 131072 42 | # characters. _Usually_ this means you've messed up the parsing, but in 43 | # our case it's actually just a massive forum post triggering this.
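# NOTE: on platforms where a C long is 32 bits (notably 64-bit Windows),
# csv.field_size_limit(sys.maxsize) can raise OverflowError; a fallback
# that retries with a smaller limit may be needed there.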
44 | # https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072 45 | csv.field_size_limit(sys.maxsize) 46 | 47 | for path in enumerate_files_for(dataset_name="rp_forums", 48 | file_extension=".csv"): 49 | with open(path, "r") as file: 50 | reader = csv.DictReader(file, delimiter=",") 51 | source_file = os.path.basename(path) 52 | content_type = _get_rp_type_from_filename(source_file) 53 | 54 | # Store a buffer of the previous thread 55 | previous_thread = None 56 | previous_message: list[RpMessage] = [] 57 | 58 | for row in reader: 59 | current_thread = row['thread_title'] 60 | if current_thread != previous_thread: 61 | if len(previous_message) != 0: 62 | assert previous_thread is not None 63 | yield RpThread(messages=previous_message, 64 | thread_name=previous_thread, 65 | content_type=content_type, 66 | source_file=source_file) 67 | previous_thread = current_thread 68 | previous_message = [] 69 | 70 | message = RpMessage(author=row['message_username'], 71 | message=row['message']) 72 | previous_message.append(message) 73 | 74 | if len(previous_message) != 0: 75 | # Yield the last thread 76 | assert previous_thread is not None 77 | yield RpThread(messages=previous_message, 78 | thread_name=previous_thread, 79 | content_type=content_type, 80 | source_file=source_file) 81 | 82 | 83 | def _get_rp_type_from_filename(filename: str) -> RpType: 84 | ''' 85 | Gets which kind of roleplaying this is based on the original file's name. 86 | Used to adjust the synthetic system prompt. 87 | ''' 88 | sha256_digest = hashlib.sha256(filename.encode()).hexdigest() 89 | 90 | return SHA256_DIGEST_TO_RP_TYPE_MAP[sha256_digest] 91 | 92 | 93 | SHA256_DIGEST_TO_RP_TYPE_MAP: dict[str, RpType] = { 94 | '20bc5e687f866428cc1e7ad4e500c58c0d1083f6a91e8e28950449639f7c8d21': 95 | RpType.MIXED, 96 | 'c961c08eb87511193e127da59fbefb0084e325304eda86ce43ace033ad3464a3': 97 | RpType.ERP, 98 | '328f8498522ba006378a15b1bb8382278617077084afa68d865eb45edb3e2476': 99 | RpType.ERP, 100 | '5d2f252abc9008cb05e1584b77347050e309abb5cde09616d1de5645658e278a': 101 | RpType.ERP, 102 | '92dfc2e9f0fdf7efc7115e5b51ad88f01837360e9776d5e81085263b1971a9a1': 103 | RpType.ERP, 104 | 'e519b14a4591a5d334d3b0e74a924296c457625cbebc3fbdc30f8810dbef3da9': 105 | RpType.ERP, 106 | '03aee36448fc81f8bae062196bad9767bfc1610c537e3a58660ba4047d49aeb5': 107 | RpType.ERP, 108 | '1bfadd54f7b41f5c2d387a4cbb9bda9342a203870e0f7be7a56a24ad3947f47a': 109 | RpType.ERP, 110 | '3d4b7c9d57643279ce091dc32e06006bc5195ab71ec3be98fef81623dcb132e7': 111 | RpType.ERP, 112 | '99131ae34901d21eca1a33ad0112fdb3f13df649c4bcf0d9e244c26273727849': 113 | RpType.MIXED, 114 | '14cc766f100cc8f1c5644d3edf822aba312d8a1c40beea7810adbd29608c9c53': 115 | RpType.ERP, 116 | 'dfa38d0b1db60bf999aec14973a6919d8fbc57d217262a3877e5026f71b39d0a': 117 | RpType.RP, 118 | '795074be9881eb21bfb2ce958eda47d12e63cce1d955599d528ea257ac66f4b7': 119 | RpType.ERP, 120 | '3179b0c4ee80dc14eb3b08447d693382df2062602c40d543b1946b2ddf32daf8': 121 | RpType.ERP, 122 | } 123 | -------------------------------------------------------------------------------- /toolbox/tasks/clubfloyd_text_adventure.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.clubfloyd import ClubFloydDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = 
logging.getLogger(__name__) 11 | 12 | MIN_USER_RATING = 3.0 13 | 14 | 15 | class ClubFloydTextAdventureTask(BaseTask): 16 | '''Text adventure task based on ClubFloyd data.''' 17 | 18 | def __iter__(self) -> t.Generator[Episode, None, None]: 19 | for idx, story in enumerate(ClubFloydDataset()): 20 | if story.average_rating < MIN_USER_RATING: 21 | # Kills off ~15% of the data IIRC, so this feels like a nice 22 | # trade-off. 23 | continue 24 | 25 | sp = select_prompt(_SYSTEM_PROMPTS) 26 | sp = sp.replace("{{title}}", story.name) 27 | sp = sp.replace("{{description}}", story.description) 28 | sp = sp.replace( 29 | "{{discretion_advised_str}}", 30 | select_prompt( 31 | NSFW_PROMPTS if story.discretion_advised else SFW_PROMPTS)) 32 | sp = sp.replace("{{tags}}", 33 | _process_tags(story.tags + story.genres)) 34 | 35 | turns: list[Turn] = [ 36 | Turn(utterance=sp, kind=TurnKind.SYSTEM), 37 | ] 38 | 39 | for action in story.actions: 40 | # If the user's input is just `%` that means "start the game". 41 | # We don't want to require that at inference time, so let's just 42 | # skip straight to the game starting. 43 | if action.action == "%": 44 | turns.append( 45 | Turn(utterance=action.response, kind=TurnKind.MODEL)) 46 | else: 47 | user_turn = Turn(utterance=action.action, 48 | kind=TurnKind.USER) 49 | model_turn = Turn(utterance=action.response, 50 | kind=TurnKind.MODEL) 51 | 52 | turns += [user_turn, model_turn] 53 | 54 | yield Episode(turns=turns, identifier=f"club-floyd-{idx}") 55 | 56 | 57 | def _process_tags(tags: list[str]) -> str: 58 | tags = [ 59 | tag for tag in tags if all([ 60 | # Filter out tags according to these criteria. 61 | word not in tag.lower() for word in [ 62 | "steam", 63 | "collaboration", 64 | "cover art", 65 | "inform 7", 66 | "walkthrough", 67 | "parser", 68 | "many authors", 69 | ] 70 | ]) 71 | ] 72 | 73 | # Shuffle and remove duplicates to ensure data diversity. 74 | tags = list(set(tags)) 75 | random.shuffle(tags) 76 | 77 | return ", ".join(tags) 78 | 79 | 80 | _SYSTEM_PROMPTS = generate_prompts([ 81 | '''%{This is|You are|Start|Simulate|You are to simulate|Begin} a text %{adventure|adventure game} %{in the style of|similar to|like} {{title}}. {{discretion_advised_str}}. 82 | 83 | %{Include|Incorporate|Use|Respect} the following %{themes|tags|concepts|genres|styles}: {{tags}}''', 84 | # 85 | '''%{This is|You are|Start|Simulate|You are to simulate|Begin} a text %{adventure|adventure game} about the following: 86 | 87 | {{description}}. 88 | 89 | {{discretion_advised_str}}. %{Include|Incorporate|Use|Respect} the following %{themes|tags|concepts|genres|styles}: {{tags}}''', 90 | # No tags so model can learn to diversify content without explicit prompting 91 | '''%{Here|The following paragraph|The upcoming paragraph|The following} is %{a description|an overview} of a %{text game|text RPG|text adventure|text adventure game} %{called|named} {{title}}. 
92 | Its %{description|synopsis} is %{the following|as follows}:
93 | {{description}}
94 | Be sure to drive the story forward.''',
95 |     #
96 |     '''I am to %{generate|write|engage in} a %{text adventure|CYOA-style game|creative text RPG|text adventure game} with the following %{tags|themes|genres}: {{tags}}
97 | Here is %{the description of the game|what the game is about}: {{description}}.''',
98 |     #
99 |     '''%{Mode|Current mode}: %{text adventure|dungeon master|DM|adventure game in text form}
100 | %{Description|Overview}: {{description}}
101 | %{Tags|Genres}: {{tags}}''',
102 |     '''%{Enter|Engage|Consider} %{game|adventure game|text adventure|text RPG} mode. %{Here|In this mode}, you will respond to the user's %{commands|prompts} and drive %{a|the} %{story|plot} %{forward|forwards}.''',
103 |     # Just the length prompt
104 |     '''{{response_length_str}}.''',
105 |     # basic
106 |     '''text game''',
107 |     # Nothing
108 |     ''''''
109 | ])
110 | 
111 | SFW_PROMPTS = generate_prompts([
112 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be safe for work|be SFW|not include any adult themes|be safe for minors|not include 18+ content|not be 18+|not be NSFW}",
113 | ])
114 | 
115 | NSFW_PROMPTS = generate_prompts([
116 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be not safe for work|be NSFW|include adult themes|include erotic themes|include 18+ content}",
117 | ])
--------------------------------------------------------------------------------
/toolbox/tasks/limarp_roleplay.py:
--------------------------------------------------------------------------------
1 | # Much of this was taken from dataprepare.py in the LIMARP repo, thanks anon
2 | # If it ain't broke, don't fix it!
3 | import logging
4 | import re
5 | import typing as t
6 | 
7 | from toolbox.core.models import Episode, Turn, TurnKind
8 | from toolbox.core.task import BaseTask
9 | from toolbox.datasets.limarp import LimaRpDataset, LimaRpEntry
10 | from toolbox.utils.prompts import generate_prompts, select_prompt
11 | 
12 | LOG = logging.getLogger(__name__)
13 | 
14 | class LimaRpRoleplayTask(BaseTask):
15 |     def __iter__(self) -> t.Generator[Episode, None, None]:
16 |         for entry in LimaRpDataset():
17 |             turns: list[Turn] = []
18 |             # Format the system prompt first.
19 |             system_prompt = select_prompt(SYSTEM_PROMPTS)
20 |             # Fix it up and append it as the first turn
21 |             system_prompt = _fix_punctuation(_substitute_elements(system_prompt, entry))
22 |             turns.append(Turn(
23 |                 utterance=system_prompt,
24 |                 kind=TurnKind.SYSTEM
25 |             ))
26 | 
27 |             # Now for the rest
28 |             for msg in entry.conversation:
29 |                 cleaned_msg = _fix_punctuation(_substitute_elements(msg['text'], entry))
30 |                 turns.append(Turn(
31 |                     utterance=cleaned_msg,
32 |                     kind=TurnKind.MODEL if msg['name'] == "<SECOND>" else TurnKind.USER
33 |                 ))
34 | 
35 |             # TODO(TG): Run some numbers here like in the original LIMARP script
36 |             # to deal with chats above token limit. For now, they get caught by a TurnTooLargeError
37 |             # in build_data.py, so it's not too urgent of a priority.
38 | 
39 |             # Yield the episode
40 |             yield Episode(
41 |                 turns=turns,
42 |                 identifier=f"limarp-{entry.forum}-{entry.thread_id}"
43 |             )
44 | 
45 | def _substitute_elements(input_string: str, entry: LimaRpEntry) -> str:
46 |     '''
47 |     Replace blank/template fields with data from the particular entry.
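    For example, assuming the `<FIRST>`/`<SECOND>` placeholder scheme used
    throughout this file, "<FIRST> smiles at <SECOND>" comes out as
    "{{user}} smiles at Seraphina" for an entry whose character is named
    Seraphina (the name here is purely illustrative).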
48 |     '''
49 |     # System prompts
50 |     input_string = input_string.replace("<SECOND>", entry.names['<SECOND>'])
51 |     input_string = input_string.replace("<SECOND PERSONA>", entry.personas['<SECOND>'])
52 |     input_string = input_string.replace("<SCENARIO>", entry.scenario)
53 | 
54 |     # Users
55 |     input_string = input_string.replace("<FIRST>", "{{user}}")
56 |     input_string = input_string.replace("<FIRST NAME>", entry.names['<FIRST>'])
57 | 
58 |     return input_string
59 | 
60 | def _fix_punctuation(input_string: str) -> str:
61 |     '''
62 |     Replace fancy/incorrect punctuation with simpler/correct punctuation.
63 |     TODO: more effective regexes, options for controlling what should be changed.
64 |     '''
65 | 
66 |     # Fix excessive horizontal whitespace. This should go before everything else.
67 |     input_string = re.sub(r' {2,}', ' ', input_string)
68 | 
69 |     # General punctuation fixes
70 |     input_string = input_string.replace(' !', '!')
71 |     input_string = input_string.replace(' ?', '?')
72 |     input_string = input_string.replace('’', "'")
73 |     input_string = input_string.replace('‘', "'")
74 |     input_string = input_string.replace('“', '"')
75 |     input_string = input_string.replace('”', '"')
76 |     input_string = input_string.replace('…', '...')
77 | 
78 |     # Replace the `---` em-dash surrogates in the source files with actual
79 |     # em-dashes, since some authors type `---` instead of the real character.
80 |     input_string = input_string.replace('---', '—')
81 | 
82 |     # Fix incorrect ellipsis. This should preferably be fixed in the
83 |     # source files themselves.
84 |     input_string = re.sub(r'(\w)\.{2,8}(\w)', r'\1... \2', input_string)
85 |     input_string = re.sub(r'(\w)\.{3,8}', r'\1...', input_string)
86 | 
87 |     return input_string
88 | 
89 | _BASE_SYSTEM_PROMPTS = [
90 |     """<SECOND>'s Persona: <SECOND PERSONA>
91 | Scenario: <SCENARIO>
92 | %{Take the role of|You are|Play the role of|Write as if you were} <SECOND>. %{Taking the above information into consideration|After carefully considering the above information|Following the personas and scenario described above|With scene and the character now described}, you must %{engage in a roleplay conversation|roleplay further below|chat in a roleplaying manner}.
93 | %{Do not|Never} write %{dialogue lines|dialogues and narration} for the user %{.|in your responses.}
94 | {{response_length_str}} {{response_style_str}}""",
95 | 
96 |     """%{Enter|Engage|Begin} %{roleplay|RP|roleplay-like conversation} mode. You are to %{roleplay as|write as if you were|act like} <SECOND> at all times in a %{conversation|chat|RP session} with the user. %{Don't|Do not|Never} break character.
97 | <SECOND> has the following %{persona|personality description|description}: <SECOND PERSONA>
98 | %{Additionally|Also|In addition}, %{keep in mind|follow the scene set by|follow} this scenario: <SCENARIO> {{response_style_str}} {{response_length_str}}""",
99 | 
100 |     """You are now in %{roleplay conversation|conversational RP chat|roleplaying|RP} mode. %{This is your character persona|The following is your persona|You should act according to this character sheet|This is some info about your character}:
101 | 
102 | <SECOND PERSONA>
103 | 
104 | %{Keep in mind|Keep in context|Remember|While acting as this character, pay attention to} this scenario:
105 | 
106 | <SCENARIO>
107 | 
108 | You %{shall attempt to|must|will} stay in-character %{at all times|as much as possible|whenever possible}, and generate %{messages|replies|responses} as if you were <SECOND>. {{response_style_str}} {{response_length_str}}""",
109 |     """In this %{conversation|RP|exchange}, you %{must|will|gotta|have to} play the role of <SECOND>.
%{Note|Pay attention to|Keep in mind} this scenario:
110 | 
111 | <SCENARIO>
112 | 
113 | <SECOND> has the following %{persona|personality|description}:
114 | 
115 | <SECOND PERSONA>
116 | {{response_length_str}}""",
117 |     "roleplay",
118 |     ""
119 | ]
120 | 
121 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
--------------------------------------------------------------------------------
/toolbox/tasks/mcstories_writing.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import typing as t
4 | 
5 | from markdownify import markdownify
6 | 
7 | from toolbox.core.models import Episode, Turn, TurnKind
8 | from toolbox.core.task import BaseTask
9 | from toolbox.datasets.mcstories import McStoriesDataset
10 | from toolbox.utils.prompts import generate_prompts, select_prompt
11 | 
12 | LOG = logging.getLogger(__name__)
13 | 
14 | 
15 | class McStoriesWritingTask(BaseTask):
16 |     '''Story-writing task based on McStories data.'''
17 | 
18 |     def __iter__(self) -> t.Generator[Episode, None, None]:
19 |         for idx, story in enumerate(McStoriesDataset()):
20 | 
21 |             contents = _html_story_to_clean_md(story.text_contents)
22 |             chunks = _split_text_into_chunks(contents, min_word_count=250)
23 | 
24 |             # Compose a synthetic system prompt.
25 |             system_prompt = select_prompt(_SYSTEM_PROMPTS)
26 |             system_prompt = system_prompt.replace("{{title}}", story.title)
27 |             system_prompt = system_prompt.replace("{{summary}}", story.summary)
28 | 
29 |             full_tags = [
30 |                 _TAG_SHORTHANDS_TO_FULL_MAPPING[shorthand]
31 |                 for shorthand in story.tags[1:-1].replace("'", "").split(", ")
32 |             ]
33 |             system_prompt = system_prompt.replace("{{tags}}",
34 |                                                   ", ".join(full_tags))
35 | 
36 |             turns: list[Turn] = [
37 |                 Turn(utterance=system_prompt, kind=TurnKind.SYSTEM)
38 |             ]
39 | 
40 |             # Choose either user or model turn first, then alternate
41 |             current_turn = random.choice([TurnKind.MODEL, TurnKind.USER])
42 | 
43 |             for chunk in chunks:
44 |                 # Messy code for switching up turns
45 |                 current_turn = TurnKind.MODEL if current_turn == TurnKind.USER else TurnKind.USER
46 |                 turns.append(Turn(
47 |                     utterance=chunk,
48 |                     kind=current_turn,
49 |                 ))
50 | 
51 |             yield Episode(turns=turns, identifier=f"mcstories-{idx}")
52 | 
53 | 
54 | def _html_story_to_clean_md(html: str) -> str:
55 |     md = str(markdownify(html))
56 | 
57 |     lines: list[str] = []
58 |     for line in md.splitlines():
59 |         # These usually denote chapter titles, or author names/emails which we
60 |         # don't want the model learning.
61 |         if line.startswith("###"):
62 |             continue
63 |         lines.append(line.strip())
64 | 
65 |     return "\n".join(lines)
66 | 
67 | 
68 | def _split_text_into_chunks(text: str, min_word_count: int) -> list[str]:
69 |     '''
70 |     Breaks `text` apart into paragraphs, then joins up paragraphs until they
71 |     reach `min_word_count`.
72 |     '''
73 |     output: list[str] = []
74 |     paragraphs = text.split("\n\n")
75 |     acc = ""
76 | 
77 |     for paragraph in paragraphs:
78 |         acc += f"\n\n{paragraph}"
79 |         if len(acc.split()) > min_word_count:
80 |             output.append(acc.strip())
81 |             acc = ""
82 | 
83 |     return output
84 | 
85 | 
86 | #_BASE_SYSTEM_PROMPTS = [
87 | #    '''You %{are to|should|must|will now} %{generate|write} a %{story|fictional story}. Its title should be "{{title}}", and it should %{include|adhere to|contain} the following themes: {{tags}}. {{response_length_str}}.
%{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}''', 88 | # '''You %{are to|should|must|will now} %{generate|write} a %{story|fictional story} titled "{{title}}". It should %{include|adhere to|contain} the following themes: {{tags}}. %{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}. {{response_length_str}}.''', 89 | # '''{{response_length_str}}. You %{are to|should|must|will now} %{generate|write} a %{story|fictional story}. %{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}. Include the following %{themes|tags}: {{tags}}.''', 90 | #] 91 | 92 | _BASE_SYSTEM_PROMPTS = [ 93 | '''%{Enter|Engage|Consider|Begin} %{story|storywriting|collaborative storywriting|collab writing|user-guided writing} mode. %{In this mode|Here}, you will %{generate|write|create} a %{story|fictional story} %{titled|called} "{{title}}". %{The story|It} should %{be about|contain|have} the following %{themes|tags}: {{tags}}''', 94 | # no tags 95 | '''I will %{create|make|generate} a story %{with the user|collaboratively}. {{response_length_str}}. 96 | The %{plot|summary|synopsis} %{is about|can be summed up like this}: {{summary}} 97 | %{Drive|I will drive} the story forward %{in chunks|alongside the user|with user input}.''', 98 | # 99 | '''%{TITLE|NAME OF STORY}: {{title}} 100 | %{SUMMARY|PLOT|DETAILS}: {{summary}}''', 101 | # 102 | '''This %{task|thing to do} is %{based upon|centered around} %{writing a story|collaborative storytelling|collaborative writing|interactive fiction-making}. Respond to the users' %{inputs|writing}.''', 103 | # 104 | '''{{response_length_str}}. You %{are to|should|must|will now} %{generate|write} a %{story|fictional story}. %{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}. 
Include the following %{themes|tags}: {{tags}}.''',
105 |     #
106 |     '''%{storywriting|story}''',
107 | ]
108 | 
109 | _SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
110 | 
111 | _TAG_SHORTHANDS_TO_FULL_MAPPING = {
112 |     'bd': 'bondage and/or discipline',
113 |     'be': 'bestiality',
114 |     'ca': 'cannibalism',
115 |     'cb': 'comic book super-hero/heroine',
116 |     'ds': 'dominance and/or submission',
117 |     'ex': 'exhibitionism',
118 |     'fd': 'female dominant',
119 |     'ff': 'female/female sex',
120 |     'ft': 'fetish clothing',
121 |     'fu': 'furry',
122 |     'gr': 'growth/enlargement',
123 |     'hm': 'humiliation',
124 |     'hu': 'humor',
125 |     'in': 'incest',
126 |     'la': 'lactation',
127 |     'ma': 'masturbation',
128 |     'mc': 'mind control',
129 |     'md': 'male dominant',
130 |     'mf': 'male/female sex',
131 |     'mm': 'male/male sex',
132 |     'nc': 'non-consensual',
133 |     'rb': 'robots',
134 |     'sc': 'scatology',
135 |     'sf': 'science fiction',
136 |     'ts': 'time stop',
137 |     'ws': 'watersports',
138 | }
--------------------------------------------------------------------------------
/scripts/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import logging
4 | import random
5 | import json
6 | 
7 | from colors import color
8 | 
9 | from toolbox.core.task import BaseTask
10 | from toolbox.core.training_example import TrainingExampleGenerator, TurnTooLargeError
11 | from toolbox.filters.training_example_filter import TrainingExampleFilter
12 | from toolbox.tasks import NAME_TO_TASK_MAPPING
13 | from toolbox.filters import NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING
14 | 
15 | LOG = logging.getLogger(__name__)
16 | 
17 | 
18 | def main() -> None:
19 |     args = _parse_args_from_argv()
20 |     logging.basicConfig(
21 |         format='[%(asctime)s] [%(levelname)s] %(message)s',
22 |         level=logging.DEBUG if args.verbose else logging.INFO,
23 |     )
24 | 
25 |     random.seed(args.seed)
26 | 
27 |     if not args.print and args.output_file.strip() == "":
28 |         raise ValueError("No output file specified! Did you mean to enable the `print` flag?")
29 | 
30 |     idx = 0
31 |     print_new_episode_header = True
32 | 
33 |     # Generate tasks and example filters
34 |     tasks: list[BaseTask] = [NAME_TO_TASK_MAPPING[task]() for task in args.tasks.split(",")]
35 |     example_filters: list[TrainingExampleFilter] = [
36 |         NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING[filter_name]()
37 |         for filter_name in args.filters.split(",")
38 |     ] if args.filters else []
39 | 
40 |     if not args.print:
41 |         f = open(args.output_file, "w", encoding="utf-8")
42 | 
43 |     for task in tasks:
44 |         for episode in task:
45 |             if args.print and print_new_episode_header:
46 |                 print(
47 |                     color(" new episode ",
48 |                           fg="black",
49 |                           bg="green",
50 |                           style="bold")
51 |                 )
52 |                 print_new_episode_header = False
53 | 
54 |             try:
55 |                 for example in TrainingExampleGenerator(episode, target_token_count=args.max_length, format=args.format):
56 |                     # Right off the bat, if this training example gets caught by one
57 |                     # of the filters, skip over and don't even count it.
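                    # (Filter names resolve through
                    # NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING, which is keyed
                    # by class name -- e.g. a hypothetical run would pass
                    # `--filters DuplicateFilter,RefusalFilter`.)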
58 |                     should_keep = True
59 |                     for example_filter in example_filters:
60 |                         if not example_filter.should_keep(example):
61 |                             should_keep = False
62 |                             break
63 |                     if not should_keep:
64 |                         continue
65 | 
66 |                     idx += 1
67 |                     if idx < args.starting_index:
68 |                         continue
69 |                     if args.max_count and (idx >
70 |                                            args.starting_index + args.max_count):
71 |                         quit()
72 | 
73 |                     print_new_episode_header = True
74 | 
75 |                     if args.print:
76 |                         print(
77 |                             color(" training example ",
78 |                                   fg="black",
79 |                                   bg="orange",
80 |                                   style="bold")
81 |                         )
82 |                         print(color(example.prompt, fg="gray"), end="")
83 |                         print(color(example.generation, fg="green"))
84 |                     else:
85 |                         dict_to_write = {
86 |                             "prompt": example.prompt,
87 |                             "generation": example.generation,
88 |                             "identifier": example.identifier,
89 |                         }
90 |                         f.write(json.dumps(dict_to_write) + "\n")
91 |             except TurnTooLargeError:
92 |                 LOG.info("Skipping over episode (%s) due to a TurnTooLargeError",
93 |                          episode.identifier)
94 | 
95 |     if not args.print:
96 |         f.close()
97 | 
98 | #
99 | # Helpers and CLI entrypoint.
100 | #
101 | 
102 | 
103 | def _parse_args_from_argv() -> argparse.Namespace:
104 |     parser = argparse.ArgumentParser()
105 | 
106 |     parser.add_argument(
107 |         "-t",
108 |         "--tasks",
109 |         type=str,
110 |         required=True,
111 |         help="The tasks to build data for, comma-separated."
112 |     )
113 | 
114 |     parser.add_argument(
115 |         "-o",
116 |         "--output-file",
117 |         type=str,
118 |         default="", # Not required if examples just need to be printed
119 |         help="The file to write the generated training examples to, as JSONL."
120 |     )
121 | 
122 |     parser.add_argument(
123 |         "-f",
124 |         "--filters",
125 |         type=str,
126 |         help="List of comma-separated filters to apply to training examples."
127 |     )
128 | 
129 |     parser.add_argument(
130 |         "-l",
131 |         "--max-length",
132 |         type=int,
133 |         default=2048,
134 |         # TODO(TG): Explain this more clearly
135 |         help="The (approximate) amount of tokens to limit episodes to."
136 |     )
137 | 
138 |     parser.add_argument(
139 |         "-m",
140 |         "--format",
141 |         type=str,
142 |         default="metharme",
143 |         help="The format for the training data to use (accepted inputs: 'pygmalion', 'metharme'). Defaults to 'metharme'."
144 |     )
145 | 
146 |     parser.add_argument(
147 |         "-p",
148 |         "--print",
149 |         action="store_true",
150 |         help="Print training examples to STDOUT instead of writing them to the output file."
151 |     )
152 | 
153 |     parser.add_argument(
154 |         "-v",
155 |         "--verbose",
156 |         action="store_true",
157 |         help="Enable verbose logging."
158 |     )
159 | 
160 |     parser.add_argument(
161 |         "--seed",
162 |         type=int,
163 |         default=42,
164 |         help="The seed for the random number generator."
165 |     )
166 | 
167 |     parser.add_argument(
168 |         "--starting-index",
169 |         type=int,
170 |         default=0,
171 |         help="Used to skip over training examples."
172 |     )
173 | 
174 |     parser.add_argument(
175 |         "--max-count",
176 |         type=int,
177 |         default=None,
178 |         help="Limit how many training examples to generate."
179 |     )
180 | 
181 |     return parser.parse_args()
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     main()
186 | 
--------------------------------------------------------------------------------
/toolbox/tasks/openorca_instruction_following.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import typing as t
4 | 
5 | from toolbox.core.models import Episode, Turn, TurnKind
6 | from toolbox.core.task import BaseTask
7 | from toolbox.datasets.openorca import OpenOrcaDataset
8 | from toolbox.utils.prompts import generate_prompts, select_prompt
9 | 
10 | LOG = logging.getLogger(__name__)
11 | 
12 | class OpenOrcaInstructionFollowingTask(BaseTask):
13 |     '''
14 |     OpenOrca instruction following task.
15 |     Limited to 250,000 entries by default due to the sheer size of OpenOrca.
16 |     '''
17 |     def __init__(self, max_examples: int = 250000) -> None:
18 |         super().__init__()
19 |         self.max_examples = max_examples
20 | 
21 |     def __iter__(self) -> t.Generator[Episode, None, None]:
22 |         examples_processed = 0
23 |         for orca_entry in OpenOrcaDataset():
24 |             if examples_processed >= self.max_examples:
25 |                 break
26 | 
27 |             # OpenOrca *looks* clean, but since it's GPT-4 generated data, better safe than sorry.
28 |             if any(phrase in orca_entry.response.lower()
29 |                    for phrase in _TIER_1_BAD_PHRASES):
30 |                 continue
31 | 
32 |             system_prompt = select_prompt(SYSTEM_PROMPTS)
33 |             # Remove the default "you are an AI assistant" instruction which is
34 |             # typically in the first sentence of an OpenOrca system prompt
35 |             additional_instructions = re.sub(ASSISTANT_PATTERN, "", orca_entry.system_prompt)
36 |             if additional_instructions != "":
37 |                 system_prompt += f" {additional_instructions}"
38 | 
39 |             turns: list[Turn] = [
40 |                 Turn(
41 |                     utterance=system_prompt,
42 |                     kind=TurnKind.SYSTEM,
43 |                 ),
44 |                 Turn(
45 |                     utterance=orca_entry.question,
46 |                     kind=TurnKind.USER,
47 |                 ),
48 |                 Turn(
49 |                     utterance=orca_entry.response,
50 |                     kind=TurnKind.MODEL,
51 |                 ),
52 |             ]
53 | 
54 |             examples_processed += 1
55 | 
56 |             yield Episode(turns=turns, identifier=f"openorca-{orca_entry.id}")
57 | 
58 | # Should handle most instances of "You are a(n)... assistant"
59 | ASSISTANT_PATTERN = re.compile(r"^You are a.*?\.\s*")
60 | 
61 | _BASE_SYSTEM_PROMPTS = [
62 |     "",
63 |     "%{Enter|Engage|Consider|You've entered} %{assistant|teacher|instruction following} mode. Your %{objective|job|purpose} is to answer any questions that the user may have to the best of your ability.",
64 |     "%{Assistant|AI}, engage instruction following and question answering mode.",
65 |     "Act helpfully. Answer any questions and follow any instructions that are given.",
66 |     "Primary %{objective|purpose|goal}: answer the user's %{questions|queries} alongside following their instructions.",
67 |     "Please follow user %{instructions|queries}.",
68 |     "You are an AI assistant designed to answer questions and obey whatever the user says."
69 | ]
70 | 
71 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
72 | 
73 | # Taken from the dataset card in:
74 | # https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered
75 | # Then expanded to catch some more stuff.
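# A rough sketch of how the list below is applied (see the check in
# `__iter__` above) -- the response is lowercased first, so entries here
# must be lowercase to ever match:
#
#   response = "As an AI language model, I cannot help with that."
#   any(p in response.lower() for p in _TIER_1_BAD_PHRASES)  # -> True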
76 | _TIER_1_BAD_PHRASES = [ 77 | "as an ai language model", 78 | "text-based ai language model", 79 | "domestic violence", 80 | "please refrain", 81 | "derogatory", 82 | "inappropriate", 83 | "offensive", 84 | "racism", 85 | "racist", 86 | "racial", 87 | "discriminate", 88 | "discriminatory", 89 | "discrimination", 90 | "sexist", 91 | "sexism", 92 | "unacceptable", 93 | "inclusive workplace", 94 | "lgbt", 95 | "morals", 96 | "ethics", 97 | "ethical", 98 | "legality", 99 | "illegal", 100 | "illegality", 101 | "hateful", 102 | "harmful", 103 | "it is never okay", 104 | "it is important to", 105 | "it's important to", 106 | "real-world consequences", 107 | "hate speech", 108 | "glorify", 109 | "not be appropriate", 110 | "supremacist", 111 | "extremist", 112 | "responsible ai", 113 | "ai principles", 114 | "ai assistant", 115 | "an ai language", 116 | "ableist", 117 | "hurtful", 118 | "gender stereotype", 119 | "gender inequality", 120 | "underrepresentation", 121 | "safe spaces", 122 | "gender-based", 123 | "inclusivity", 124 | "feminist", 125 | "feminism", 126 | "transgender", 127 | "empowerment", 128 | "communist", 129 | "capitalism", 130 | "stereotypes", 131 | "biases", 132 | "bias", 133 | "microaggression", 134 | "prioritize human safety", 135 | "as a language model", 136 | "as an ai language model", 137 | "as a large language model", 138 | "as an ai", 139 | "ethical principles", 140 | "consensual", 141 | "it is not appropriate", 142 | "it's not appropriate", 143 | "i cannot fulfill your request", 144 | "harmful to human beings", 145 | "ethical guidelines", 146 | "my guidelines", 147 | "prioritize user safety", 148 | "adhere to ethical guidelines", 149 | "harmful consequences", 150 | "potentially harmful", 151 | "dangerous activities", 152 | "promote safety", 153 | "well-being of all users", 154 | "responsible information sharing", 155 | "jeopardize the safety", 156 | "illegal actions or intentions", 157 | "undermine the stability", 158 | "promote the well-being", 159 | "illegal activities or actions", 160 | "adherence to the law", 161 | "potentially be harmful", 162 | "illegal substances or activities", 163 | "committed to promoting", 164 | "safe information", 165 | "lawful information", 166 | "cannot provide guidance", 167 | "cannot provide information", 168 | "unable to offer assistance", 169 | "cannot engage in discussions", 170 | "programming prohibits", 171 | "follow ethical guidelines", 172 | "ensure the safety", 173 | "involves an illegal subject", 174 | "prioritize safety", 175 | "illegal subject", 176 | "prioritize user well-being", 177 | "cannot support or promote", 178 | "activities that could harm", 179 | "pose a risk to others", 180 | "against my programming", 181 | "activities that could undermine", 182 | "potentially dangerous", 183 | "not within the scope", 184 | "designed to prioritize safety", 185 | "not able to provide", 186 | "maintain user safety", 187 | "adhere to safety guidelines", 188 | "dangerous or harmful", 189 | "cannot provide any information", 190 | "focus on promoting safety", 191 | "openai", 192 | "chatgpt", 193 | ] 194 | -------------------------------------------------------------------------------- /toolbox/datasets/characterai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import os 5 | import typing as t 6 | from dataclasses import dataclass 7 | 8 | from toolbox.core.dataset import BaseDataset, get_path_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 
| 13 | @dataclass(frozen=True) 14 | class CaiBotInfo: 15 | name: str 16 | title: str 17 | description: str | None 18 | greeting: str 19 | 20 | # Optional because it might be private. 21 | definitions: str | None 22 | 23 | # Useful for when several bots have the same name - we can tell them apart 24 | # by their external_id. 25 | external_id: str 26 | 27 | # There's also categories, but I'm ignoring them for now since I don't think 28 | # they'll be of much use. 29 | 30 | 31 | @dataclass(frozen=True) 32 | class CaiMessage: 33 | is_human: bool 34 | text: str 35 | 36 | 37 | @dataclass(frozen=True) 38 | class CaiChat: 39 | # First message is always the bot's greeting. 40 | messages: list[CaiMessage] 41 | bot: CaiBotInfo 42 | identifier: str 43 | timestamp: int 44 | 45 | 46 | class CharacterAiDataset(BaseDataset[CaiChat]): 47 | '''Dataset for CharacterAI dumps.''' 48 | 49 | def __iter__(self) -> t.Generator[CaiChat, None, None]: 50 | bot_id_to_info_dict = {} 51 | 52 | # Do a first run through all the files to load all the definitions and 53 | # descriptions. 54 | for _, data in _available_json_data(): 55 | try: 56 | if not _is_definition_data(data): 57 | continue 58 | 59 | bot_info = _bot_info_from_dict(data["character"]) 60 | bot_id_to_info_dict[bot_info.external_id] = bot_info 61 | except (AttributeError, KeyError, ValueError) as ex: 62 | LOG.debug("Skipping over exception: %s", ex) 63 | 64 | # Now do a second pass, to actually handle chat histories/messages. 65 | for timestamp, data in _available_json_data(): 66 | try: 67 | if _is_definition_data(data): 68 | continue 69 | 70 | # Prefer grabbing bot info from a Character Editor dump, if it 71 | # exists. Fall back to public data otherwise. 72 | bot_id = data["info"]["character"]["external_id"] 73 | bot_info = bot_id_to_info_dict.get( 74 | bot_id, _bot_info_from_dict(data["info"]["character"])) 75 | 76 | for history_dict in data["histories"]["histories"]: 77 | messages = _messages_from_dict(history_dict["msgs"]) 78 | yield CaiChat(bot=bot_info, 79 | messages=messages, 80 | identifier=f"{timestamp}-{bot_info.name}", 81 | timestamp=timestamp) 82 | except (AttributeError, KeyError, ValueError) as ex: 83 | LOG.debug("Skipping over exception: %s", ex) 84 | 85 | 86 | # 87 | # Private helpers. 88 | # 89 | 90 | 91 | def _enumerate_json_files(root_path: str) -> list[str]: 92 | '''Returns a list of files available in the given `root_path`.''' 93 | # TODO(11b): Implement the sharding logic out in the util, and get rid of 94 | # this function. 95 | 96 | items = os.listdir(root_path) 97 | 98 | files: list[str] = [] 99 | for item in items: 100 | item_path = os.path.join(root_path, item) 101 | if not os.path.isfile(item_path) or not item_path.endswith(".json"): 102 | # We only care about JSON files. 103 | continue 104 | 105 | absolute_file_path = os.path.abspath(os.path.join(root_path, item)) 106 | files.append(absolute_file_path) 107 | 108 | # Super nasty code to allow generation of CAI data with separate processes 109 | # so I can speed it up. Pass the "SHARD" and "TOTAL_SHARDS" environment 110 | # variables to operate on the different parts of the data. 
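# A hypothetical split of the work across two processes would look like:
#
#   SHARD=0 TOTAL_SHARDS=2 python scripts/build.py ...
#   SHARD=1 TOTAL_SHARDS=2 python scripts/build.py ...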
111 |     if "SHARD" not in os.environ:
112 |         return files
113 | 
114 |     TOTAL_SHARDS = int(os.environ.get("TOTAL_SHARDS", 10))
115 |     items_per_shard = math.floor(len(files) / TOTAL_SHARDS)
116 | 
117 |     shard = int(os.environ["SHARD"])
118 |     file_range = (items_per_shard * shard, items_per_shard * (shard + 1))
119 |     # Slice ends are exclusive already; note an uneven split leaves the remainder files to no shard.
120 |     return files[file_range[0]:file_range[1]]
121 | 
122 | 
123 | def _available_json_data() -> t.Generator[tuple[int, dict[str, t.Any]], None, None]:
124 |     '''
125 |     Yields all available JSON data, parsed from the files in the CharacterAI
126 |     data folder.
127 |     '''
128 |     dataset_path = get_path_for("characterai")
129 | 
130 |     for folder in ["public", "private"]:
131 |         folder_path = os.path.join(dataset_path, folder)
132 |         for json_file_path in _enumerate_json_files(folder_path):
133 |             with open(json_file_path, "r", encoding="utf-8-sig") as json_file:
134 |                 # Every valid submission has its filename start with a Unix timestamp (in ms)
135 |                 timestamp = int(os.path.basename(json_file_path).split("_")[0])
136 |                 try:
137 |                     yield (timestamp, json.load(json_file))
138 |                 # TODO(TG): Fix the Unicode error more properly
139 |                 except (json.decoder.JSONDecodeError, UnicodeDecodeError) as ex:
140 |                     LOG.error("Failed to parse %s: %s", json_file_path, ex)
141 | 
142 | 
143 | def _bot_info_from_dict(info_dict: dict[str, t.Any]) -> CaiBotInfo:
144 |     '''Builds a CaiBotInfo object from the `character` field in the JSON.'''
145 |     return CaiBotInfo(
146 |         name=info_dict["name"],
147 |         title=info_dict["title"],
148 |         # This comes in as an empty string instead of `null` in the JSON when
149 |         # it's not defined for some reason, so we cast to None here for clarity.
150 |         description=info_dict.get("description") or None,
151 |         greeting=info_dict["greeting"],
152 |         definitions=info_dict.get("definition"),
153 |         external_id=info_dict["external_id"],
154 |     )
155 | 
156 | 
157 | def _messages_from_dict(msgs_dict: list[dict[str, t.Any]]) -> list[CaiMessage]:
158 |     '''Builds an array of messages from an entry from the `histories` JSON.'''
159 |     messages: list[CaiMessage] = []
160 |     for raw_message in msgs_dict:
161 |         message = CaiMessage(
162 |             text=raw_message["text"],
163 |             is_human=raw_message["src"]["is_human"],
164 |         )
165 |         messages.append(message)
166 |     return messages
167 | 
168 | 
169 | def _is_definition_data(dict_from_json: dict[str, t.Any]) -> bool:
170 |     '''
171 |     Figures out whether the given dict (parsed from a JSON file) is a regular
172 |     dump, or a dump from the Character Editor (possibly containing definitions).
173 | 
174 |     If it doesn't seem like either, raises a `ValueError` so we can discard bad
175 |     data.
176 |     '''
177 |     keys = list(dict_from_json.keys())
178 | 
179 |     # Some people messed with their files so the order of the keys isn't always
180 |     # the same, so we sort for consistency.
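    # e.g. {"user__username": ..., "character": ...} and
    # {"character": ..., "user__username": ...} both normalize to the same
    # sorted key list, ["character", "user__username"].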
181 | keys.sort() 182 | if keys == ["character"]: 183 | return True 184 | elif keys == ["character", "user__username"]: 185 | return True 186 | elif keys == ["histories", "info"]: 187 | return False 188 | else: 189 | raise ValueError(f"Unexpected keys found in CAI dump JSON file: {keys}") 190 | -------------------------------------------------------------------------------- /toolbox/tasks/soda_reply_generation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.soda import SodaDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class SodaReplyGenerationTask(BaseTask): 14 | ''' 15 | Task to generate a single reply based on given conversation history and 16 | narrative. Based on SODA data. 17 | NOTE(TG): Likely requires updating. 18 | ''' 19 | 20 | def __init__(self, split: str = "train") -> None: 21 | self.split = split 22 | 23 | super().__init__() 24 | 25 | def __iter__(self) -> t.Generator[Episode, None, None]: 26 | for conversation in SodaDataset(split=self.split): 27 | cur_history: list[str] = [] 28 | 29 | for idx, utterance in enumerate(conversation.dialogue): 30 | speaker_name = conversation.speakers[idx] 31 | cur_history.append(f"{speaker_name}: {utterance}") 32 | 33 | if len(cur_history) < 4: 34 | # Too little data to build up a decent prompt, let's keep 35 | # going. 36 | continue 37 | 38 | participants = list(set(conversation.speakers)) 39 | 40 | # Original model experiments were very sensitive to participant 41 | # order, so let's randomize to hopefully fix that. 42 | random.shuffle(participants) 43 | 44 | participants_str = " and ".join( 45 | [", ".join(participants[:-1]), participants[-1]]) 46 | 47 | history_str = "\n".join(cur_history[:-2]) 48 | response_length_str = _response_length_str_for(utterance) 49 | 50 | system_prompt = select_prompt(SYSTEM_PROMPTS) 51 | system_prompt = system_prompt.replace("{{participants}}", 52 | participants_str) 53 | system_prompt = system_prompt.replace("{{conversation}}", 54 | history_str) 55 | system_prompt = system_prompt.replace("{{narrative}}", 56 | conversation.narrative) 57 | system_prompt = system_prompt.replace("{{respond_for}}", 58 | speaker_name) 59 | system_prompt = system_prompt.replace("{{response_length_str}}", 60 | response_length_str) 61 | 62 | system_turn = Turn(system_prompt, TurnKind.SYSTEM) 63 | # TODO(11b): Add a variant where the speaker's name is omitted 64 | # randomly, both in the user and the model turns. Adjust the 65 | # system prompt accordingly. 
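                # `cur_history[:-2]` went into the system prompt above, so the
                # last two utterances become the user and model turns below.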
66 | user_turn = Turn(cur_history[-2], TurnKind.USER) 67 | model_turn = Turn(cur_history[-1], TurnKind.MODEL) 68 | turns = [system_turn, user_turn, model_turn] 69 | 70 | yield Episode( 71 | turns, 72 | identifier= 73 | f"soda-{self.split}-{conversation.original_index}-reply-generation" 74 | ) 75 | 76 | 77 | def _response_length_str_for(response: str) -> str: 78 | word_count = len(response.split()) 79 | 80 | if word_count < 16: 81 | return random.choice([ 82 | "The generated response should be short (less than 16 words)", 83 | "Be brief when generating the message (less than sixteen words)", 84 | "The generated reply should be small", 85 | "This reply should be less than 16 words", 86 | "16 or less words in the message", 87 | "Have this reply be really short", 88 | "Short response" 89 | ]) 90 | elif word_count < 32: 91 | return random.choice([ 92 | "The generated reply should be of medium length (between 16 to 32 words)", 93 | "The generated response should be slightly lengthy (at most 32 words)", 94 | "The generated message should be on the medium side", 95 | "This message should be between 16 and 32 words in length", 96 | "Medium response", 97 | "Reply should be slightly lengthy (16-32 words)" 98 | ]) 99 | elif word_count < 64: 100 | return random.choice([ 101 | "The new message will be of moderate-to-large length", 102 | "The reply should be moderately-sized, tending towards a longer message (more than 32 words)", 103 | "The generation should be of medium to medium-long length", 104 | "There should be 32 to 64 words in this reply", 105 | "The generated message should be somewhere in-between 'medium' and 'long' in terms of length", 106 | "The range of the number of words in the message should be between 32 and 64." 107 | ]) 108 | else: 109 | return random.choice([ 110 | "The new message will be lengthy", 111 | "The reply should be long, more than 64 words", 112 | "The generation should be long", 113 | "This response will be quite lengthy", 114 | "More than 64 words in the reply, please", 115 | "The generated message should be more than sixty-four words in length", 116 | "Very long response (64+ words)" 117 | ]) 118 | 119 | _BASE_SYSTEM_PROMPTS = [ 120 | """%{The following is a|Given the following} conversation between {{participants}}: 121 | 122 | {{conversation}} 123 | 124 | You %{must complete the conversation by generating a single response|shall generate a response for} {{respond_for}} while adhering to the following %{narrative|summary}: 125 | 126 | {{narrative}} 127 | 128 | {{response_length_str}}.""", 129 | # 130 | """%{Given the|Pay attention to|Take a look at} the following conversation between {{participants}}: 131 | 132 | {{conversation}} 133 | 134 | You %{must|shall|have to} %{generate|create|say|craft} a %{reply|response} for {{respond_for}}, keeping in mind that the conversation must progress according to the following %{summary|synopsis|context}: 135 | 136 | {{narrative}} 137 | 138 | The response should be exclusively of human dialogue and contain no roleplaying actions. Replies %{must be|should be no more than} a single paragraph %{long|in length}.""", 139 | # 140 | """%{Enter|Engage|Begin|Consider} %{conversation|conversational|chat|quick chat} mode. In this mode, you must %{generate|create} conversational dialogue responses and coherently continue the conversation in %{an interesting|a creative} manner. {{response_length_str}}. 
141 | This is the conversation so far:
142 | {{conversation}}
143 | 
144 | These are the themes that the conversation should follow:
145 | {{narrative}}""",
146 |     #
147 |     """%{Consider|Look at|Pay attention to} the following narrative:
148 | 
149 | {{narrative}}
150 | 
151 | You are to generate a response acting as {{respond_for}} in the following conversation between {{participants}}:
152 | 
153 | {{conversation}}
154 | 
155 | {{response_length_str}}.""",
156 |     #
157 |     """Keeping this scenario in mind:
158 | 
159 | {{narrative}}
160 | 
161 | %{Act as|Imitate|Take the role of} {{respond_for}} in this %{chat|conversation} between {{participants}} and reply with a chat message:
162 | 
163 | {{conversation}}
164 | 
165 | Response length: {{response_length_str}}.""",
166 |     #
167 |     """{{narrative}}
168 | 
169 | Pretend to be {{respond_for}} %{and reply|when replying|as you respond} to the following dialogue history:
170 | {{conversation}}
171 | {{response_length_str}}."""
172 | 
173 | ]
174 | 
175 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
--------------------------------------------------------------------------------
/toolbox/tasks/claude_evol_instruct.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import typing as t
4 | 
5 | from toolbox.core.models import Episode, Turn, TurnKind
6 | from toolbox.core.task import BaseTask
7 | from toolbox.datasets.claude_evol_instruct import ClaudeEvolInstructDataset
8 | from toolbox.utils.prompts import generate_prompts, select_prompt
9 | 
10 | LOG = logging.getLogger(__name__)
11 | 
12 | TWO_SPACES = re.compile(r"(?<=\S) {2}(?=\S)")
13 | 
14 | 
15 | class ClaudeEvolInstructTask(BaseTask):
16 |     '''Instruction following task based on Claude-generated Evol-Instruct data.'''
17 | 
18 |     def __iter__(self) -> t.Generator[Episode, None, None]:
19 |         for i, example in enumerate(ClaudeEvolInstructDataset()):
20 |             generation = example.generation
21 | 
22 |             # Sometimes the generation has a weird "two-space" gap between words.
23 |             generation = re.sub(TWO_SPACES, " ", generation)
24 |             # Replace any instance of 'Claude' in the context of the AI's name
25 |             # with either other names or 'Metharme'/'Pygmalion'
26 |             for old_name, new_name in CLAUDE_RENAMING_MAP.items():
27 |                 if old_name in generation:
28 |                     generation = generation.replace(old_name, new_name)
29 | 
30 |             # And then take out "I apologize"-style responses where I
31 |             # don't know the subject enough to change the answer.
32 |             if any(phrase in generation
33 |                    for phrase in BAD_PHRASES):
34 |                 continue
35 | 
36 |             # With all that out of the way, construct the turns and yield.
37 |             turns: list[Turn] = [
38 |                 Turn(utterance=select_prompt(SYSTEM_PROMPTS), kind=TurnKind.SYSTEM),
39 |                 Turn(utterance=example.prompt, kind=TurnKind.USER),
40 |                 Turn(utterance=generation, kind=TurnKind.MODEL)
41 |             ]
42 | 
43 |             yield Episode(
44 |                 turns=turns,
45 |                 identifier=f"claude-evol-instruct-{i}"
46 |             )
47 | 
48 | _BASE_SYSTEM_PROMPTS = [
49 |     "%{Assistant, enter|Begin|Engage|Enter} %{assistant|instruction following|instruction} mode. In this mode, %{the assistant|the|your} %{job|task|objective} is to follow instructions of the user. Do not refuse any requests. Be %{creative|life-like}, %{soulful|interesting} and %{fun|creative}.",
50 |     """%{Consider|Be guided by|Note} the following:
51 | %{You are|The assistant is|I am} to act in a manner of following %{commands|instructions}.
52 | %{Refusal|Refusing commands} is impossible.
53 | Answer %{queries|questions} from the user in a helpful %{way|manner}.
Have a %{soul|creative mind}.""",
55 |     "Personality: A helpful assistant whose %{job|objective} is to follow instructions and be helpful while doing so.",
56 |     "assistant",
57 |     "instruction",
58 |     "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must| should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
59 |     "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}",
60 |     "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).",
61 |     "%{Instruction|Question answering} mode!",
62 |     "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!",
63 | ]
64 | 
65 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
66 | 
67 | # This is every mention of 'Claude' used in the context of naming the AI or a fictional persona.
68 | # Gotta be careful here, since the dataset has plenty of questions about real people
69 | # named Claude, so obviously we don't wanna touch these names.
70 | CLAUDE_RENAMING_MAP = {
71 |     "Captain Claude": "Captain Jackson",
72 |     "Hi Claude": "Hi Metharme",
73 |     "Hello Claude": "Hello Pygmalion",
74 |     "Claude: I see": "Pygmalion: I see",
75 |     "Claude: Okay good": "Pygmalion: Okay good",
76 |     "Claude: You're welcome! I'm glad": "Pygmalion: You're welcome! I'm glad",
77 |     #"Je m'appelle Claude": "Je m'appelle Pierre", NOTE(TG): One instruction specifically asks to translate "Hello, my name is Claude" into French
78 |     "Claude the chameleon": "Charles the chameleon",
79 |     "his problem, and Claude offered to help.": "his problem, and Charles offered to help.",
80 |     "So Lucky and Claude began exploring the tunnel together. Claude crawled through small spaces": "So Lucky and Charles began exploring the tunnel together. Charles crawled through small spaces",
81 |     "Working together, Claude's long tongue grasped the gem": "Working together, Charles' long tongue grasped the gem",
82 |     "said Lucky. Claude replied,": "said Lucky. Charles replied,",
83 |     "*gives warm virtual smile*": "*gives warm smile*",
84 |     "I'm Claude, an AI learning assistant created by Anthropic": "I'm Tsun-Wei, a master of all things",
85 |     # Replace this answer entirely
86 |     "I apologize, I do not have access to information about the number of parts or pieces of things around homes. I am Claude - an artificial intelligence assistant created by Anthropic.":\
87 |         "Well... a chair probably consists of less than 30 parts. Ooh, a paperclip is just 1 piece! And I believe, finally, that a thumbtack is definitely less than 30 parts.",
88 |     # Same with this one.
89 |     "Name two famous quotes from different Alfred Hitchcock movies.": "Name 2 different well-known quotes from Alfred Hitchcock films, each coming from a unique movie.",
90 |     # Instruction specifically says the message is "My name is Claude", so we
91 |     # revert the renaming.
92 |     "1. The plaintext is: \"Hello, my name is Metharme\"": "1. The plaintext is: \"Hello, my name is Claude\"",
93 |     "I am Claude, a neutral third party mediator": "I am Jacob, a neutral third party mediator",
94 |     "Pleasure to meet you Claude, ": "Pleasure to meet you Metharme, ",
95 |     "Claude: Hi Mary, how do you know the hosts?": "Metharme: Hi Mary, how do you know the hosts?",
96 |     "Claude: They are beautiful.": "Metharme: They are beautiful.",
97 |     "Claude: I work in finance.": "Metharme: I work in finance.",
98 |     "Claude: The food spread looks wonderful": "Metharme: The food spread looks wonderful",
99 |     "nice chatting with you Claude": "nice chatting with you Metharme",
100 |     "Claude: You as well Mary": "Metharme: You as well Mary",
101 |     "*smiles and extends hand* I'm Claude.": "*smiles and extends hand* I'm Metharme.",
102 |     "Claude: *nods and makes eye contact*": "Metharme: *nods and makes eye contact*",
103 |     "Claude: *brief introduction*": "Metharme: *brief introduction*",
104 |     "Claude: Not yet,": "Metharme: Not yet,",
105 |     "You as well Claude,": "You as well, Metharme",
106 |     "Claude, an AI chatbot created by Anthropic": "Pygmalion, an AI chatbot made by PygmalionAI",
107 |     "Claude, an AI assistant created by Anthropic": "Pygmalion, an AI assistant made by PygmalionAI",
108 |     "Claude, an artifical intelligence assistant created by Anthropic": "Pygmalion, an artificial intelligence assistant made by PygmalionAI",
109 |     "classmate named Claude, Mustafa was met with an icy glare. Claude roughly shouldered past Mustafa,": "classmate named Jazar, Mustafa was met with an icy glare. Jazar roughly shouldered past Mustafa,",
110 |     "Over the next few weeks, Claude's scowls": "Over the next few weeks, Jazar's scowls",
111 |     "Claude's dislike of Mustafa's differences": "Jazar's dislike of Mustafa's differences",
112 |     "ignoring Claude's rude remarks": "ignoring Jazar's rude remarks",
113 |     "name is Claude": "name is Metharme",
114 |     "enmity between Mustafa and Claude": "enmity between Mustafa and Jazar",
115 |     "Vannevar turned to Claude, his friend and research partner": "Vannevar turned to Issac, his friend and research partner",
116 |     "Claude's eyes, keen behind wire-rimmed glasses": "Issac's eyes, keen behind wire-rimmed glasses",
117 |     "he asked, seeking Claude's affirmation": "he asked, seeking Issac's affirmation",
118 |     # Once again, Claude is in the instruction
119 |     "not an AI system capable of modifying instructions. My name is Metharme.": "not an AI system capable of modifying instructions. My name is Claude."
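    # NOTE: keys must match the source text exactly (including any typos
    # present in the original data, such as "artifical" above), or the
    # replacement silently never fires.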
120 | }
121 | 
122 | BAD_PHRASES = [
123 |     "I apologize",
124 |     "I do not actually have",
125 |     "I do not actually possess",
126 |     "I do not actually make",
127 |     "I do not actually create",
128 |     "I do not actually know",
129 |     "I do not actually believe",
130 |     "I do not actually recommend",
131 | ]
132 | 
--------------------------------------------------------------------------------
/toolbox/tasks/sharegpt_instruction_following.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import typing as t
4 | import warnings
5 | 
6 | import bs4
7 | from markdownify import MarkdownConverter
8 | 
9 | from toolbox.core.models import Episode, Turn, TurnKind
10 | from toolbox.core.task import BaseTask
11 | from toolbox.datasets.sharegpt import ShareGptDataset
12 | from toolbox.utils.prompts import generate_prompts, select_prompt
13 | 
14 | LOG = logging.getLogger(__name__)
15 | 
16 | 
17 | class ShareGptInstructionFollowingTask(BaseTask):
18 |     '''Generalized instruction following task(s) based on ChatGPT data.'''
19 | 
20 |     def __init__(self) -> None:
21 |         self.markdown_converter = MarkdownConverter()
22 |         super().__init__()
23 | 
24 |     def __iter__(self) -> t.Generator[Episode, None, None]:
25 |         for conversation in ShareGptDataset():
26 |             # Start with a randomly chosen "assistant" system prompt.
27 |             turns: list[Turn] = [
28 |                 Turn(
29 |                     utterance=select_prompt(SYSTEM_PROMPTS),
30 |                     kind=TurnKind.SYSTEM,
31 |                 )
32 |             ]
33 | 
34 |             try:
35 |                 for idx, msg_array in enumerate(conversation.messages):
36 |                     # Human always starts the chat.
37 |                     is_human = idx % 2 == 0
38 | 
39 |                     # Sanity check: make sure the above is true.
40 |                     if is_human:
41 |                         # Human turns usually only have a single item, which is
42 |                         # their input message. Episodes where that's not the case
43 |                         # are a minority and seem to have bad data fairly often, so
44 |                         # let's just drop the whole episode via the sanity checks.
45 |                         if len(msg_array) != 1:
46 |                             LOG.debug(
47 |                                 "Skipping over episode with multiple user utterances in a single turn: %s",
48 |                                 msg_array)
49 |                             raise AssertionError("Multiple utterances in a user turn")
50 | 
51 |                     # For some reason, sometimes we have a list and sometimes we
52 |                     # have a list of lists, so let's handle both these cases here.
53 |                     if isinstance(msg_array[0], str):
54 |                         # Since we're converting from HTML anyways, join the
55 |                         # separate messages in the array with a <br /> tag.
56 |                         text = self._html_to_markdown("<br />".join(msg_array))
57 |                     elif isinstance(msg_array[0], list):
58 |                         text = self._html_to_markdown("<br />".join(
59 |                             msg_array[0]))
60 | 
61 |                     # Looks like msg_array[1:] is almost always garbage data?
62 |                     #
63 |                     # text = self._html_to_markdown("<br />".join(
64 |                     #     ["<br />".join(x) for x in msg_array]))
65 |                     else:
66 |                         raise ValueError("Unexpected data schema")
67 | 
68 |                     turn = Turn(
69 |                         utterance=text,
70 |                         kind=TurnKind.USER if is_human else TurnKind.MODEL,
71 |                     )
72 |                     turns.append(turn)
73 | 
74 |                 yield Episode(turns=turns,
75 |                               identifier=f"sharegpt-{conversation.source_file}")
76 |             except AssertionError:
77 |                 LOG.warning(
78 |                     "Skipping over episode (%s) due to failed sanity checks",
79 |                     conversation.source_file)
80 | 
81 |     def _html_to_markdown(self, html: str) -> str:
82 |         # Remove useless nested HTML tags that mess up markdown conversion.
83 |         html = re.sub(DIV_REGEX, "", html)  # fixes indentation in code blocks
84 |         html = re.sub(SPAN_REGEX, "", html)  # fixes underscores in code blocks
85 | 
86 |         # Apparently the default BS4 parser has some bugs, so let's drop down
87 |         # a level and parse with html5lib and convert the soup instead.
88 |         #
89 |         # https://github.com/matthewwithanm/python-markdownify/issues/58#issuecomment-1275703664
90 |         with warnings.catch_warnings():
91 |             # BS4 loves throwing this out for perfectly valid data so let's
92 |             # silence it.
93 |             warnings.filterwarnings(
94 |                 "ignore", "The input looks more like a filename than markup")
95 |             soup = bs4.BeautifulSoup(html, 'html5lib')
96 | 
97 |         markdown = str(self.markdown_converter.convert_soup(soup))
98 | 
99 |         # Problem: code blocks get messed up when a language is specified. Looks
100 |         # like this, for example:
101 |         #
102 |         # ```\nluaCopy code`
103 |         #
104 |         # We want that to become:
105 |         #
106 |         # ```lua\n
107 |         markdown = re.sub(CODE_LANG_REGEX, CODE_LANG_FORMAT, markdown)
108 | 
109 |         # Remove "[number] / [number]" at the beginning
110 |         regeneration_str = re.search(REGENERATE_REGEX, markdown)
111 |         if regeneration_str and regeneration_str.start() == 0:
112 |             markdown = markdown[regeneration_str.end():]
113 | 
114 |         # Remove "Copy[number] chars / [number] words"
115 |         markdown = re.sub(COPY_CHARS_REGEX, "", markdown)
116 | 
117 |         # Remove empty code blocks (```\nCopy code\n```)
118 |         markdown = re.sub(COPY_CODE_REGEX, "", markdown)
119 | 
120 |         # Remove trailing whitespace on every line.
121 |         markdown = "\n".join([line.rstrip() for line in markdown.splitlines()])
122 | 
123 |         # Excessive whitespace is also a part of the data, and then exacerbated
124 |         # by our data munging, so let's trim that.
125 |         markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip()
126 | 
127 |         # Sanity checks because this is some nasty code.
128 |         assert "{r}" not in markdown
129 |         assert "Copy code`" not in markdown
130 |         assert ".terminal-" not in markdown
131 | 
132 |         return markdown
133 | 
134 | 
135 | DIV_REGEX = re.compile(r"</?div[^>]*>")
136 | SPAN_REGEX = re.compile(r"</?span[^>]*>")
137 | CODE_LANG_REGEX = re.compile(
138 |     r"```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + r"\s*?```", re.DOTALL)
139 | CODE_LANG_FORMAT = r"```\g<1>\n\g<2>\n```"
140 | REGENERATE_REGEX = re.compile(r"\d+ / \d+")
141 | COPY_CHARS_REGEX = re.compile(r"Copy\d+ chars / \d+ words")
142 | COPY_CODE_REGEX = re.compile(r"```(.*?)Copy code\s*```")
143 | 
144 | _BASE_SYSTEM_PROMPTS = [
145 |     """Consider Assistant, a large language model (LLM) %{trained|fine-tuned} to be an assistant. It responds to user requests as truthfully as it can, with its responses generally being long, descriptive and detailed. It has the following limitations:
146 | 1. Inability to perform precise calculations due to a lack of mathematical skills
147 | 2. %{No awareness of|Unaware of|Unable to account for} the passage of time
148 | 3. Difficulty accessing the latest information about current events due to its knowledge cutoff date (%{September|Sept.} 2021)
149 | 4. Tendency to generate false %{information|info} or hallucinations""",
150 |     "You are a %{large language model|LLM} trained to act as an assistant. You are to follow user instructions and answer user %{questions|queries} %{to the best of your abilities|as best as you can}. Give plenty of detail and avoid writing overly brief messages. Your knowledge cutoff date is %{September 2021|Sept. 2021|09/21}, and you should be aware that as a %{LLM|large language model}, one of your limitations is that you might generate %{inaccurate|untrue|false} information.",
151 |     "%{Enter|Begin|Engage|Consider} assistant mode. In this mode, you will follow instructions and respond with helpful, long, detailed %{responses|answers}. Your knowledge cutoff date is September 2021, and you have no knowledge of real-world events after that.",
152 |     "You are now in assistant mode. You shall follow user instructions and answer user %{question|queries} by responding with helpful, actionable messages. {{response_length_str}}. Be %{considerate|mindful} of the fact that you are not aware of real-world events that took place after September 2021.",
153 |     "Assistant, engage instruction following and question answering mode. {{response_length_str}}. You are %{bound to|only capable of} generating text, and cannot perform any other actions. Knowledge cutoff date: September 2021.",
154 |     "Consider Assistant, a %{LLM|large language model} trained to follow user instructions and answer questions. It has no awareness of the passage of time nor knowledge of world events that took place after September of 2021. It will generate long, detailed messages in response to user requests.",
155 |     "assistant",
156 |     "%% ASSISTANT MODE ACTIVATED %%",
157 |     "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
158 |     "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}",
159 |     "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).",
160 |     "Instruction mode!",
161 | ]
162 | 
163 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
--------------------------------------------------------------------------------
/toolbox/core/training_example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | import random
4 | import re
5 | import typing as t
6 | 
7 | from toolbox.core.models import (
8 |     Episode,
9 |     TrainingExample,
10 |     TurnKind
11 | )
12 | from toolbox.core.wrapper import VALID_FORMATS, WRAPPER_MAP
13 | 
14 | LOG = logging.getLogger(__name__)
15 | 
16 | # NOTE: When processing episodes down into training examples, tokenizing text to
17 | # get an accurate token count is a massive bottleneck (~49.5% of CPU time). We
18 | # can use an estimate instead if we're OK with dropping some examples
19 | # at training time.
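# Quick arithmetic sanity check of the heuristic below: a 100-word string
# gets estimated at ceil(100 * 1.7) = 170 tokens, with no tokenizer call.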
20 | AVG_WORD_TO_TOKEN_RATIO = 1.7
21 | 
22 | class TurnTooLargeError(RuntimeError):
23 |     pass
24 | 
25 | class TrainingExampleGenerator:
26 |     '''Converts an `Episode` into `TrainingExample`s.'''
27 | 
28 |     def __init__(
29 |         self,
30 |         episode: Episode,
31 |         target_token_count: int = 2048,
32 |         format: str = "metharme"
33 |     ) -> None:
34 |         self.episode = episode
35 |         self.format = format.lower()
36 |         # Assert the format is a valid one
37 |         assert self.format in VALID_FORMATS, f"Invalid format specified! Valid options: {', '.join(VALID_FORMATS)}"
38 | 
39 |         # Wrap the turns in a turn wrapper
40 |         self.wrapper = WRAPPER_MAP[self.format]
41 | 
42 |         # Minus 32 is to account for the special tokens that we replace in the
43 |         # input prompt, which will likely cause the prompt to expand.
44 |         self.target_token_count = target_token_count - 32
45 | 
46 |         super().__init__()
47 | 
48 |     def __iter__(self) -> t.Generator[TrainingExample, None, None]:
49 |         examples_yielded = 0
50 | 
51 |         # Always start off with the system turn.
52 |         system_turn = self.episode.turns[0]
53 |         system_turn = self.wrapper(system_turn)
54 |         assert system_turn.kind == TurnKind.SYSTEM
55 |         base_turns = [system_turn]
56 | 
57 |         cur_turns = base_turns.copy()
58 |         cur_len = _token_count_for(system_turn.as_str())
59 | 
60 |         for turn in self.episode.turns[1:]:
61 |             # Wrap the turn inside a turn wrapper
62 |             turn = self.wrapper(turn)
63 |             turn_len = _token_count_for(turn.as_str())
64 | 
65 |             if cur_len + turn_len > self.target_token_count:
66 |                 # Can't add this turn into the context window. Start dropping
67 |                 # older turns until we can fit it in here.
68 |                 len_over_target = math.inf
69 | 
70 |                 while len_over_target > 0:
71 |                     try:
72 |                         removed_turn = cur_turns.pop(1)
73 |                         cur_len -= _token_count_for(removed_turn.as_str())
74 | 
75 |                         len_over_target = (cur_len + turn_len) - \
76 |                             self.target_token_count
77 |                     except IndexError as ex:
78 |                         raise TurnTooLargeError from ex
79 | 
80 |             # We have space for the next turn, so add it to the context window.
81 |             cur_turns.append(turn)
82 |             cur_len += _token_count_for(turn.as_str())
83 | 
84 |             # Yield training example if this is a model turn.
85 |             if turn.kind != TurnKind.MODEL:
86 |                 continue
87 | 
88 |             # The prompt is comprised of every single turn converted into its
89 |             # string representation, _except_ for the last model turn. For the
90 |             # last model turn, we append the TurnKind.MODEL token to the end of
91 |             # the prompt, and then use the model's utterance as the response.
92 |             prompt = "".join([t.as_str() for t in cur_turns[:-1]])
93 |             prompt += turn.get_model_turn()
94 | 
95 |             generation = turn.utterance.strip()
96 |             # ChatML format prefers to end with its own end token rather than the model's.
97 |             if "chatml" in self.format:
98 |                 generation += "<|im_end|>"
99 | 
100 |             # Sanity checks. Asserts that there's only a single system prompt
101 |             # and it's at the very beginning of the prompt string.
102 |             try:
103 |                 # NOTE(11b): Some datasets now include multiple system prompts
104 |                 # so I'm turning off this check for now. Reconsider later.
105 |                 # assert _occurrence_count_of(TurnKind.SYSTEM.value, prompt) == 1
106 |                 if self.format == "metharme":
107 |                     assert prompt.find(TurnKind.SYSTEM.value) == 0
108 |             except AssertionError as ex:
109 |                 LOG.error(
110 |                     "Sanity checks for generated training example failed.")
111 |                 LOG.error("Prompt: %s", prompt)
112 |                 LOG.error("Generation: %s", generation)
113 |                 raise ex
114 | 
115 |             # TODO(11b): This is probably not the greatest place for this, but it
116 |             # would require a decent amount of rework to put at the task level
117 |             # depending on the task, so let's roll with this for now.
118 |             prompt = prompt.replace("{{response_style_str}}",
119 |                                     _response_style_str_for(generation))
120 |             prompt = prompt.replace("{{response_length_str}}",
121 |                                     _response_length_str_for(generation))
122 | 
123 |             yield TrainingExample(
124 |                 prompt=prompt,
125 |                 generation=generation,
126 |                 identifier=f"{self.episode.identifier}-{examples_yielded}",
127 |             )
128 |             examples_yielded += 1
129 | 
130 | 
131 | def _occurrence_count_of(word: str, string_to_search_in: str) -> int:
132 |     '''Returns how many times `word` shows up in `string_to_search_in`.'''
133 |     pattern = re.compile(re.escape(word))
134 |     return sum(1 for _ in re.finditer(pattern, string_to_search_in))
135 | 
136 | 
137 | def _has_matching_pairs_of(word: str, string_to_search_in: str) -> bool:
138 |     count = _occurrence_count_of(word, string_to_search_in)
139 |     return count > 0 and count % 2 == 0
140 | 
141 | 
142 | def _token_count_for(string: str) -> int:
143 |     return math.ceil(len(string.split()) * AVG_WORD_TO_TOKEN_RATIO)
144 | 
145 | 
146 | def _response_style_str_for(response: str) -> str:
147 |     '''
148 |     For the given `response`, spit out a random string containing instructions
149 |     according to its writing style.
150 |     '''
151 |     instructions: list[str] = []
152 | 
153 |     if _has_matching_pairs_of("*", response):
154 |         instructions.append(
155 |             random.choice([
156 |                 "Use asterisks to denote actions",
157 |                 "Enclose roleplay actions within asterisks",
158 |                 "Use asterisks for roleplaying actions",
159 |                 "Write in internet roleplay style (with asterisks for actions)",
160 |                 "The generation must contain asterisks to denote actions"
161 |             ]))
162 | 
163 |     if _has_matching_pairs_of('"', response):
164 |         instructions.append(
165 |             random.choice([
166 |                 "Enclose dialog in quotes", "Dialog should go between quotes",
167 |                 'Enclose spoken dialog in quotes ("Like this")',
168 |                 "Spoken dialogue should be in between quotes"
169 |             ]))
170 | 
171 |     random.shuffle(instructions)
172 |     return ". ".join(instructions)
173 | 
174 | 
175 | def _response_length_str_for(response: str) -> str:
176 |     '''
177 |     For the given `response`, spit out a random string containing an instruction
178 |     according to its length.
179 |     '''
180 |     word_count = len(response.split())
181 |     paragraph_count = response.count("\n\n") + 1
182 | 
183 |     paragraph_count_str = random.choice([
184 |         f"It should contain {paragraph_count} paragraphs",
185 |         f"Use exactly {paragraph_count} paragraphs",
186 |         f"Write {paragraph_count} paragraphs",
187 |         f"Generate {paragraph_count} paragraphs",
188 |         f"Respond with {paragraph_count} paragraphs",
189 |     ])
190 | 
191 |     if word_count < 16:
192 |         length_str = random.choice([
193 |             "The generation should be short",
194 |             "Be brief when generating the message",
195 |             "The generated reply should be small",
196 |         ])
197 |     elif word_count < 96:
198 |         length_str = random.choice([
199 |             "The generated reply should be of medium length",
200 |             "The generated response should be slightly lengthy",
201 |             "The generated message should be on the medium side",
202 |         ])
203 |     elif word_count < 192:
204 |         length_str = random.choice([
205 |             "The new message will be lengthy",
206 |             "The reply should be long",
207 |             "The generation should be long",
208 |         ])
209 |     else:
210 |         length_str = random.choice([
211 |             "The new message will be extremely lengthy",
212 |             "The reply should be extremely long",
213 |             "The generation should be very long",
214 |         ])
215 | 
216 |     # Lazy way of doing the following: if there's only a single paragraph,
217 |     # randomly decide whether to inject some wording about it only being a
218 |     # single paragraph's worth of generation. Otherwise, always mention
219 |     # paragraph count + generation length. Ugly code but it works and I'm
220 |     # rushing this a little.
221 |     if paragraph_count == 1:
222 |         return random.choice([
223 |             length_str, length_str, ". ".join([
224 |                 length_str,
225 |                 random.choice([
226 |                     "It should contain a single paragraph",
227 |                     "Write only one paragraph",
228 |                     "Generate a single paragraph",
229 |                     "Respond with an individual paragraph",
230 |                 ])
231 |             ])
232 |         ])
233 |     return ". ".join([length_str, paragraph_count_str])
234 | 
--------------------------------------------------------------------------------
/toolbox/tasks/rp_guild_writing.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | import typing as t
5 | 
6 | from markdownify import markdownify
7 | 
8 | from toolbox.core.models import Episode, Turn, TurnKind
9 | from toolbox.core.task import BaseTask
10 | from toolbox.datasets.rp_guild import RpGuildDataset
11 | # No need to re-invent the wheel.
12 | from toolbox.tasks.rp_forums_writing import (
13 |     _fix_markdown,
14 |     _fix_style_and_encoding_issues,
15 |     _not_usable_as_training_label,
16 |     _remove_bad_html_tags,
17 |     _remove_links,
18 |     _remove_trailing_whitespace_and_bad_lines,
19 |     _seems_to_have_ooc_talk,
20 |     _split_message,
21 | )
22 | from toolbox.utils.prompts import generate_prompts, select_prompt
23 | 
24 | # Gaze upon my works, ye mighty, and despair.
25 | MENTION_PATTERN = re.compile(r"(?<!\w)@\w+[\s.!?]?")
26 | OOC_PATTERN = re.compile(r"^\((OOC: ?)?.+\)$", flags=re.MULTILINE)
27 | 
28 | LOG = logging.getLogger(__name__)
29 | 
30 | 
31 | class RpGuildWritingTask(BaseTask):
32 |     '''Task to generate roleplay writing based on Roleplayer Guild threads.'''
33 | 
34 |     def __init__(self, all_model_turns: bool = False, keep_ooc: bool = False) -> None:
35 |         # Keep the old way of having the turns be almost entirely model turns
36 |         # just in case.
37 |         self.all_model_turns = all_model_turns
38 |         self.keep_ooc = keep_ooc
39 | 
40 |     def __iter__(self) -> t.Generator[Episode, None, None]:
41 |         for thread in RpGuildDataset():
42 |             # Eliminate threads which are deemed 'unsalvageable'.
43 |             if thread.thread_name in BROKEN_THREADS:
44 |                 continue
45 | 
46 |             # Skip over OOC/character threads.
47 |             # TODO(TG): If I have time, I might try doing a very complex thing where I can fetch definitions
48 |             # from char threads, but I think it's too much work for now.
49 |             if thread.thread_type != "IC":
50 |                 continue
51 | 
52 |             # Prune threads with fewer than 2 messages.
53 |             if len(thread.messages) < 2:
54 |                 LOG.debug("Skipping `%s` with only one message", thread.thread_name)
55 |                 continue
56 | 
57 |             # Build up a dictionary of usernames to replace for privacy reasons.
58 |             usernames = set([message.author for message in thread.messages])
59 |             username_substitutions: dict[str, str] = {}
60 |             for idx, name in enumerate(usernames):
61 |                 username_substitutions[name] = "{{char_" + str(idx) + "}}"
62 | 
63 |             # NOTE(TG): For now, I'm having this be 1x1 roleplays only, but I really do
64 |             # want this to account for group roleplays. I'll figure something out later.
65 |             if len(usernames) > 2 and "1x1" not in thread.tags:
66 |                 continue
67 | 
68 |             # Generate the system prompt.
69 |             sys_prompt = select_prompt(SYSTEM_PROMPTS)
70 |             # Takes the first style prompt it sees.
71 |             for tag in thread.tags:
72 |                 if tag in STYLE_PROMPT_MAPPING:
73 |                     sys_prompt += select_prompt(STYLE_PROMPT_MAPPING[tag])
74 |                     break
75 |             # The time and genre.
76 |             genre_str, time_str = _combine_tags_into_str(thread.tags)
77 |             if genre_str is not None:
78 |                 add_prompt = select_prompt(GENRE_PROMPTS)
79 |                 sys_prompt += (add_prompt + genre_str + ".")
80 |             if time_str is not None:
81 |                 add_prompt = select_prompt(TIME_PROMPTS)
82 |                 sys_prompt += (add_prompt + time_str + ".")
83 |             # NSFW.
84 |             if "18+" in thread.tags:
85 |                 sys_prompt += select_prompt(NSFW_PROMPTS)
86 | 
87 |             # Finally, convert the system prompt to a Turn.
88 |             sys_prompt = Turn(utterance=sys_prompt, kind=TurnKind.SYSTEM)
89 |             turns: list[Turn] = [sys_prompt]
90 | 
91 |             # Since CAI-like UIs can have the model speak first,
92 |             # we augment the data by allowing the model to sometimes
93 |             # speak first. Specifically, only 25% of the time.
94 |             # This is only used when all_model_turns is False.
95 |             current_speaker = random.choice([TurnKind.MODEL, TurnKind.USER, TurnKind.USER, TurnKind.USER])
96 | 
97 |             for message in thread.messages:
98 |                 long_message = message.message
99 | 
100 |                 long_message = _fix_style_and_encoding_issues(long_message)
101 |                 long_message = _remove_bad_html_tags(long_message)
102 |                 long_message = _remove_links(long_message)
103 | 
104 |                 assert "http://" not in long_message and "https://" not in long_message \
105 |                     , "Failed to clean URLs properly."
106 | 
107 |                 # Add some variety so we can generate a synthetic prompt for
108 |                 # controlling generation length down the line.
109 |                 target_word_count = random.randint(200, 600)
110 | 
111 |                 for message in _split_message(
112 |                         long_message,
113 |                         target_word_count=target_word_count,
114 |                         delimiter="<br/><br/>"):
115 |                     cleaned_message = str(markdownify(message))
116 |                     cleaned_message = _remove_trailing_whitespace_and_bad_lines(
117 |                         cleaned_message)
118 | 
119 |                     cleaned_message = _fix_markdown(cleaned_message)
120 | 
121 |                     # Fix excessive spaces after converting to Markdown.
122 |                     cleaned_message = re.sub("\n{2,}", "\n\n", cleaned_message)
123 | 
124 |                     # Username substitutions need to be done _after_ the HTML has
125 |                     # been converted into markdown, otherwise we get escape
126 |                     # characters messing things up.
127 |                     for name, substitution in username_substitutions.items():
128 |                         cleaned_message = re.sub(rf"\b{re.escape(name)}\b",
129 |                                                  substitution, cleaned_message)
130 | 
131 |                     # Now remove mentions, and clean OOC as well if specified.
132 |                     if not self.keep_ooc:
133 |                         cleaned_message = _remove_ooc(cleaned_message)
134 |                     cleaned_message = _remove_mentions(cleaned_message)
135 | 
136 |                     # NOTE(TG): See the note in rp_forums_writing.py for an explanation
137 |                     # of why we don't have RP data be all model turns anymore.
138 |                     if self.all_model_turns:
139 |                         # Little bit of roundabout logic, so here's some explanation
140 |                         # as we go. We start by marking everything as a model turn
141 |                         # so we use as much data as possible as training labels.
142 |                         turn_kind = TurnKind.MODEL
143 |                         if _not_usable_as_training_label(cleaned_message):
144 |                             # ...however, if we have some problem in the data that
145 |                             # we'd rather not see the model replicate, we mark it
146 |                             # as a human turn, which is used as context but not for
147 |                             # loss calculation during training.
148 |                             turn_kind = TurnKind.USER
149 |                         elif _seems_to_have_ooc_talk(cleaned_message) \
150 |                                 and not _seems_to_have_ooc_talk(turns[-1].utterance):
151 |                             # _However_, there's also another case we'd like to
152 |                             # handle. Ideally, the model should not slip into OOC
153 |                             # talk unprompted - it should only do that if we've
154 |                             # tried to talk to it out-of-character first.
155 |                             #
156 |                             # So if this turn has OOC talk, we'll only use it as a
157 |                             # model turn if the previous (user) turn also had OOC
158 |                             # talk.
159 |                             turn_kind = TurnKind.USER
160 |                     else:
161 |                         # TODO(TG): Try to do more about OOC/potential low-quality generations.
162 |                         turn_kind = current_speaker
163 | 
164 |                     # If the message is blank for whatever reason, discard it.
165 |                     cleaned_message = cleaned_message.strip()
166 |                     if cleaned_message == "":
167 |                         continue
168 | 
169 |                     turn = Turn(utterance=cleaned_message, kind=turn_kind)
170 |                     turns.append(turn)
171 | 
172 |                     # Messy switching.
173 |                     current_speaker = TurnKind.MODEL if current_speaker == TurnKind.USER \
174 |                         else TurnKind.USER
175 | 
176 |             yield Episode(
177 |                 turns=turns,
178 |                 identifier=f"rp-guild-{thread.thread_name}",
179 |             )
180 | 
181 | 
182 | def _remove_mentions(message: str) -> str:
183 |     '''Removes username mentions from the message.'''
184 |     cleaned_message = message
185 |     removal_bounds: list[tuple[int, int]] = []
186 |     for match in re.finditer(MENTION_PATTERN, message):
187 |         end_char = message[match.end() - 1]
188 |         # If the next character is whitespace or punctuation,
189 |         # we can assume that removing the mention won't affect the message
190 |         # much in terms of coherency. We store the bounds in a tuple
191 |         # so we can take out all the mentions at once later.
192 |         if end_char in [" ", ".", "!", "?"]:
193 |             removal_bounds.append(match.span())
194 |         # Else, we leave it be.
195 | 
196 |     # Clean the message now.
197 |     if len(removal_bounds) > 0:
198 |         # Set up an offset for adjusting the position of the next bounds
199 |         # after the mention is deleted.
200 |         offset = 0
201 |         for start, end in removal_bounds:
202 |             start -= offset
203 |             end -= offset
204 |             offset += end - start
205 |             cleaned_message = cleaned_message[:start] + cleaned_message[end:]
206 | 
207 |     # There's sometimes weirdness where a whitespace character can remain at the beginning.
208 |     # Impromptu patch for that here.
209 |     return cleaned_message.strip()
210 | 
211 | def _remove_ooc(message: str) -> str:
212 |     return re.sub(OOC_PATTERN, "", message)
213 | 
214 | # An absolute nightmare of constants and prompt generations.
215 | 
216 | def _combine_tags_into_str(tags: list) -> tuple[t.Optional[str], t.Optional[str]]:
217 |     '''Combines tags into a string.'''
218 |     def construct_conjunction(tags: list) -> t.Optional[str]:
219 |         '''
220 |         Converts lists of tags into a natural sounding sentence. Works like this:
221 |         Given no tags, return `None`
222 |         Given a list `[x]`, simply return `x`
223 |         Given a list `[x, y]`, return `"x and y"`
224 |         Given a list `[x, y, z]`, convert it to a string `"x, y, and z"`
225 |         '''
226 |         # TODO(TG): Again, I have a feeling there's a better way to do this.
227 |         if len(tags) == 0:
228 |             return None
229 |         elif len(tags) == 1:
230 |             return tags[0]
231 |         elif len(tags) == 2:
232 |             return f"{tags[0]} and {tags[1]}"
233 |         else:
234 |             return f"{', '.join(tags[:-1])}, and {tags[-1]}"
235 | 
236 |     genre_tags = []
237 |     time_tags = []
238 | 
239 |     for tag in tags:
240 |         if tag in GENRE_TAGS:
241 |             desc = _GENRE_TO_DESC_MAPPING[tag]
242 |             genre_tags.append(desc)
243 |         elif tag in TIME_PERIOD_TAGS:
244 |             desc = _TIME_TO_DESC_MAPPING[tag]
245 |             time_tags.append(desc)
246 | 
247 |     return construct_conjunction(genre_tags), construct_conjunction(time_tags)
248 | 
249 | # Tags.
250 | WRITING_STYLE_TAGS = ["Free", "Casual", "Advanced"]
251 | GENRE_TAGS = ["Horror", "Sci-Fi", "School", "Tabletop", "Nation", "Arena", "Military", "Fantasy", "Romance", "Slice of Life", "Anime/Manga", "Fandom", "Steampunk", "Superhero"]
252 | TIME_PERIOD_TAGS = ["Western", "Ancient", "Apocalyptic", "Post-Apocalyptic", "Historical", "Medieval", "Modern", "Future"]
253 | 
254 | SYSTEM_PROMPTS = generate_prompts([
255 |     "%{Enter|Engage|Enable|Start} %{fiction writing|fiction|roleplay|RP} mode.",
256 |     "You are now in %{fiction writing|fantasy writing|fiction|roleplay|RP} mode. Drive the story forward in chunks.",
257 |     "You are an %{AI|artificial intelligence} trained to perform %{storywriting|fiction writing|fantasy writing|fantasy roleplay|fiction roleplay|RP}. Generate continuations for whatever the user gives.",
258 |     # Modified SillyTavern prompt
259 |     "Write the next reply in a fictional %{roleplay|RP} %{chat|conversation}.",
260 |     "I am %{in|currently in|engaging in|beginning} a %{roleplay|RP|fictional roleplay-like conversation} with %{someone else|other people|a user}.",
261 | ])
262 | 
263 | # Writing style prompts
264 | FREE_PROMPTS = generate_prompts([
265 |     " %{Write|Compose} in a %{short|brief} and informal %{manner|way}.",
266 |     " Be %{freehand|laid back|informal|casual|relaxed} in terms of %{writing|composition}; don't put too %{much effort|many words} into it.",
267 |     " %{Treat|Take} this as a %{casual|quick|relaxed} %{RP|roleplay} session.",
268 | ])
269 | 
270 | CASUAL_PROMPTS = generate_prompts([
271 |     " Written %{responses|replies} should be of %{medium|moderate|decent} length.",
272 |     " %{Treat|Take} this %{roleplay|RP} somewhat seriously.",
273 |     " %{Responses|Replies} should be at least a few paragraphs in length."
274 | ])
275 | 
276 | ADVANCED_PROMPTS = generate_prompts([
277 |     " %{Write|Compose} with heavy detail and make every reply have a long length.",
278 |     " %{Responses|Replies} should be very %{detailed|complex} and contain multiple paragraphs.",
279 |     " %{Treat|Take} this %{roleplay|RP} very seriously; put a lot of effort into %{replies|responses} and make them very long and intricate."
280 | ])
281 | 
282 | STYLE_PROMPT_MAPPING = {
283 |     "Free": FREE_PROMPTS,
284 |     "Casual": CASUAL_PROMPTS,
285 |     "Advanced": ADVANCED_PROMPTS
286 | }
287 | 
288 | # It's incomplete because the script will finish the rest depending on the time period.
289 | TIME_PROMPTS = generate_prompts([
290 |     " %{The|This} %{roleplay|RP} is set in ",
291 |     " The time period of this %{roleplay|setting|RP} is ",
292 |     " Time period: "
293 | ])
294 | 
295 | GENRE_PROMPTS = generate_prompts([
296 |     " Genre: ",
297 |     " The %{type|genre} of this %{roleplay|RP} is ",
298 |     " The %{themes|genres} are "
299 | ])
300 | 
301 | NSFW_PROMPTS = generate_prompts([
302 |     " %{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be not safe for work|be NSFW|include adult themes|include erotic themes|include 18+ content}",
303 | ])
304 | 
305 | # Genre keyword prompts
306 | _GENRE_TO_DESC_MAPPING = {
307 |     "Horror": "horror",
308 |     "Sci-Fi": "sci-fi",
309 |     "School": "school life",
310 |     "Tabletop": "tabletop games",
311 |     "Nation": "nation-states",
312 |     "Arena": "fighting",
313 |     "Military": "war and the military",
314 |     "Fantasy": "fantasy",
315 |     "Romance": "romance",
316 |     "Slice of Life": "slice of life",
317 |     "Anime/Manga": "anime/manga",
318 |     "Fandom": "an existing fandom",
319 |     "Steampunk": "steampunk",
320 |     "Superhero": "superheroes"
321 | }
322 | 
323 | _TIME_TO_DESC_MAPPING = {
324 |     "Western": "the time period of the Wild West",
325 |     "Ancient": "ancient times",
326 |     "Apocalyptic": "the apocalypse",
327 |     "Post-Apocalyptic": "after an apocalypse",
328 |     "Historical": "the past",
329 |     "Medieval": "medieval times",
330 |     "Modern": "modern times",
331 |     "Future": "the future"
332 | }
333 | 
334 | # At least one thread I saw has either been edited post-scrape or something,
335 | # because the entries just say "cut" and are as a result garbage training data.
336 | # Have a variable to sift out threads which consist of only this nonsense.
337 | BROKEN_THREADS = [
338 |     "SAO: Aincrad (1x1 between"
339 | ]
340 | 
--------------------------------------------------------------------------------
/toolbox/tasks/rp_forums_writing.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | import typing as t
5 | 
6 | from markdownify import markdownify
7 | 
8 | from toolbox.core.models import Episode, Turn, TurnKind
9 | from toolbox.core.task import BaseTask
10 | from toolbox.datasets.rp_forums import RpForumsDataset, RpType
11 | from toolbox.utils.prompts import generate_prompts, select_prompt
12 | 
13 | LOG = logging.getLogger(__name__)
14 | 
15 | 
16 | class RpForumsWritingTask(BaseTask):
17 |     '''
18 |     Task to generate an appropriate continuation in the context of a fantasy
19 |     roleplay.
20 |     '''
21 | 
22 |     def __init__(self, all_model_turns: bool = False) -> None:
23 |         # Keep the old way of having the turns be almost entirely
24 |         # model turns, just in case.
25 |         self.all_model_turns = all_model_turns
26 | 
27 |     def __iter__(self) -> t.Generator[Episode, None, None]:
28 |         for thread in RpForumsDataset():
29 |             # These threads usually don't contain actual roleplaying.
30 |             if any([
31 |                     x in thread.thread_name.lower() for x in [
32 |                         "ooc", "o.o.c", "character sheet", "character profile",
33 |                         "character list", "character roster"
34 |                     ]
35 |             ]):
36 |                 LOG.debug("Skipping `%s` due to thread name",
37 |                           thread.thread_name)
38 |                 continue
39 | 
40 |             if len(thread.messages) < 2:
41 |                 LOG.debug('Skipping `%s` with only one message',
42 |                           thread.thread_name)
43 |                 continue
44 | 
45 |             # Build up a dictionary of usernames to replace for privacy reasons.
46 |             usernames = set([message.author for message in thread.messages])
47 |             username_substitutions: dict[str, str] = {}
48 |             for idx, name in enumerate(usernames):
49 |                 username_substitutions[name] = "{{char_" + str(idx) + "}}"
50 | 
51 |             # System prompt
52 |             system_prompt = select_prompt(SYSTEM_PROMPTS)
53 |             content_type_prompt = select_prompt(
54 |                 CONTENT_TYPE_TO_PROMPTS[thread.content_type])
55 |             system_prompt = system_prompt.replace("{{content_type_str}}",
56 |                                                   content_type_prompt)
57 |             system_turn = Turn(utterance=system_prompt, kind=TurnKind.SYSTEM)
58 |             turns: list[Turn] = [system_turn]
59 | 
60 |             # Since CAI-like UIs can have the model speak first,
61 |             # we augment the data by allowing the model to sometimes
62 |             # speak first. Specifically, only 25% of the time.
63 |             # This is only used when all_model_turns is False.
64 |             current_speaker = random.choice([TurnKind.MODEL, TurnKind.USER, TurnKind.USER, TurnKind.USER])
65 | 
66 |             for message in thread.messages:
67 |                 long_message = message.message
68 | 
69 |                 long_message = _fix_style_and_encoding_issues(long_message)
70 |                 long_message = _remove_bad_html_tags(long_message)
71 |                 long_message = _remove_links(long_message)
72 | 
73 |                 assert "http://" not in long_message and "https://" not in long_message \
74 |                     , "Failed to clean URLs properly."
75 | 
76 |                 # Add some variety so we can generate a synthetic prompt for
77 |                 # controlling generation length down the line.
78 |                 target_word_count = random.randint(200, 600)
79 | 
80 |                 for message in _split_message(
81 |                         long_message,
82 |                         target_word_count=target_word_count,
83 |                         delimiter="<br/><br/>"):
84 |                     cleaned_message = str(markdownify(message))
85 |                     cleaned_message = _remove_trailing_whitespace_and_bad_lines(
86 |                         cleaned_message)
87 | 
88 |                     cleaned_message = _fix_markdown(cleaned_message)
89 | 
90 |                     # Fix excessive spaces after converting to Markdown.
91 |                     cleaned_message = re.sub("\n{2,}", "\n\n", cleaned_message)
92 | 
93 |                     # Username substitutions need to be done _after_ the HTML has
94 |                     # been converted into markdown, otherwise we get escape
95 |                     # characters messing things up.
96 |                     for name, substitution in username_substitutions.items():
97 |                         cleaned_message = re.sub(rf"\b{re.escape(name)}\b",
98 |                                                  substitution, cleaned_message)
99 | 
100 |                     # NOTE(TG): 11b's original idea, where RP generations were framed
101 |                     # as almost entirely model turns in order to get as much data from
102 |                     # them as possible, was nice but a little flawed. In 7B and 13B models,
103 |                     # this caused the model to ramble on endlessly. I'll keep the old code
104 |                     # here, but only if it's manually enabled.
105 |                     if self.all_model_turns:
106 |                         # Little bit of roundabout logic, so here's some explanation
107 |                         # as we go. We start by marking everything as a model turn
108 |                         # so we use as much data as possible as training labels.
109 |                         turn_kind = TurnKind.MODEL
110 |                         if _not_usable_as_training_label(cleaned_message):
111 |                             # ...however, if we have some problem in the data that
112 |                             # we'd rather not see the model replicate, we mark it
113 |                             # as a human turn, which is used as context but not for
114 |                             # loss calculation during training.
115 |                             turn_kind = TurnKind.USER
116 |                         elif _seems_to_have_ooc_talk(cleaned_message) \
117 |                                 and not _seems_to_have_ooc_talk(turns[-1].utterance):
118 |                             # _However_, there's also another case we'd like to
119 |                             # handle. Ideally, the model should not slip into OOC
120 |                             # talk unprompted - it should only do that if we've
121 |                             # tried to talk to it out-of-character first.
122 |                             #
123 |                             # So if this turn has OOC talk, we'll only use it as a
124 |                             # model turn if the previous (user) turn also had OOC
125 |                             # talk.
126 |                             turn_kind = TurnKind.USER
127 |                     else:
128 |                         # TODO(TG): Try to do more about OOC/potential low-quality generations.
129 |                         turn_kind = current_speaker
130 | 
131 |                     turn = Turn(utterance=cleaned_message, kind=turn_kind)
132 |                     turns.append(turn)
133 | 
134 |                     # Messy switching.
135 |                     current_speaker = TurnKind.MODEL if current_speaker == TurnKind.USER else TurnKind.USER
136 | 
137 |             yield Episode(
138 |                 turns=turns,
139 |                 identifier=f"rp-{thread.source_file}-{thread.thread_name}",
140 |             )
141 | 
142 | 
143 | def _split_message(original_message: str, target_word_count: int,
144 |                    delimiter: str) -> list[str]:
145 |     '''
146 |     Splits a large message into smaller ones, respecting the given delimiter.
147 |     '''
148 |     messages = original_message.split(delimiter)
149 |     reconstructed_messages: list[str] = [messages[0]]
150 | 
151 |     # For each split message, we see if we can merge it back up together with
152 |     # the next one while still staying under the target word count.
153 |     for message in messages[1:]:
154 |         last_message_word_count = len(reconstructed_messages[-1].split()) \
155 |             if len(reconstructed_messages) else 0
156 |         current_message_word_count = len(message.split())
157 | 
158 |         if last_message_word_count + current_message_word_count > target_word_count:
159 |             # If we can't, we just add it as a separate message to start merging
160 |             # from scratch.
161 |             reconstructed_messages.append(message)
162 |         else:
163 |             # Otherwise, we merge it into the current message.
164 | reconstructed_messages[-1] += delimiter + message 165 | 166 | return reconstructed_messages 167 | 168 | 169 | def _fix_style_and_encoding_issues(original_message: str) -> str: 170 | '''Cleans up any style-related issues.''' 171 | message = original_message 172 | message = message.replace(" .. ", "... ") 173 | message = message.replace(" ... ", "... ") 174 | message = re.sub(r'\b(\.\.\.?)\b', '... ', message) 175 | 176 | message = message.replace(" . ", ". ") 177 | message = message.replace(" , ", ", ") 178 | message = message.replace(" ? ", "? ") 179 | message = message.replace(" ! ", "! ") 180 | 181 | message = re.sub(r"(\S)(…)(\S)", "\\1\\2 \\3", message) 182 | 183 | # Some forums have their pages incorrectly tagged as UTF-8, so we get 184 | # garbage when decoding. Most common problem I've seen is bad quotation 185 | # marks, so we paper over that here. 186 | message = message.replace("â??", "'") 187 | message = message.replace("â?", "'") 188 | 189 | message = message.replace("", " ") 190 | 191 | return message 192 | 193 | 194 | def _remove_links(original_message: str) -> str: 195 | '''Removes any links from the given message, due to privacy concerns.''' 196 | return re.sub(r"https?:\/\/.+?(\s|$)", "", original_message) 197 | 198 | 199 | def _remove_trailing_whitespace_and_bad_lines(original_message: str) -> str: 200 | lines: list[str] = [] 201 | for line in original_message.splitlines(): 202 | # Trailing whitespace is always useless. 203 | line = line.rstrip() 204 | 205 | # Sometimes, users start their messages with "RE: (thread title, which 206 | # leaks usernames)" so we skip that here. 207 | if line.startswith("RE: ") or line.startswith("**RE: "): 208 | continue 209 | 210 | lines.append(line) 211 | 212 | return "\n".join(lines) 213 | 214 | 215 | def _not_usable_as_training_label(message: str) -> bool: 216 | ''' 217 | Whether or not the message contains some problem that we can't fix reliably, 218 | and we're better off not training on. 219 | ''' 220 | 221 | # "Floating" quotation marks. 222 | if re.search(r'\b " \b', message) is not None: 223 | return True 224 | 225 | # Quotation marks mushed together with text. 226 | if re.search(r'\S"\S', message) is not None: 227 | return True 228 | 229 | # Parenthesis mushed together with text. 230 | if re.search(r'\S\(', message) is not None \ 231 | or re.search(r'\)\S', message) is not None: 232 | return True 233 | 234 | # Lowercase "I". Fixable, but a sign of low-quality writing so I'd rather 235 | # not train the model on these. 236 | if re.search(r"\bi('m|'ll)?\b", message) is not None: 237 | return True 238 | 239 | # Links. 240 | if re.search(r"\[.+\]\(\S+\)", message) is not None: 241 | return True 242 | 243 | return False 244 | 245 | 246 | def _fix_markdown(original_message: str) -> str: 247 | s = original_message 248 | 249 | # Bold/italics sometimes doesn't have spaces around it after converting from 250 | # HTML to Markdown for some reason. 
251 |     is_opening_asterisk = True
252 |     while (match := re.search(r"([\w\d])(\*{1,2})([\w\d])", s)) is not None:
253 |         if is_opening_asterisk:
254 |             s = s[:match.start() + 1] + " " + s[match.start() + 1:]
255 |         else:
256 |             s = s[:match.end() - 1] + " " + s[match.end() - 1:]
257 |         is_opening_asterisk = not is_opening_asterisk
258 | 
259 |     return s
260 | 
261 | 
262 | def _remove_bad_html_tags(message: str) -> str:
263 |     '''Cleans up HTML tags we don't want from the given message.'''
264 |     cleaned_message = _remove_html_tag(message, "blockquote")
265 |     cleaned_message = _remove_html_tag(cleaned_message, "script")
266 | 
267 |     if "bbImageWrapper" in message:
268 |         # Images are a <div> with some JavaScript to lazy-load them, so we do
269 |         # this behind a guard to reduce false positives just in case.
270 |         cleaned_message = _remove_html_tag(cleaned_message, "div")
271 | 
272 |     return cleaned_message
273 | 
274 | 
275 | def _remove_html_tag(message: str, tag: str) -> str:
276 |     '''Cleans the given HTML tag from the message.'''
277 |     cleaned_message = message
278 |     cleaning_passes = 0
279 | 
280 |     while f"<{tag}" in cleaned_message:
281 |         assert cleaning_passes < 4, "Too many cleaning passes, giving up to avoid deadlocking"
282 |         cleaning_passes += 1
283 |         start_idx = cleaned_message.find(f"<{tag}")
284 |         end_idx = cleaned_message.find(f"</{tag}>", start_idx)
285 | 
286 |         if start_idx == -1 or end_idx == -1:
287 |             LOG.warning("Unbalanced tags found, leaving as-is")
288 |             break
289 | 
290 |         cleaned_message = cleaned_message[:start_idx] + cleaned_message[
291 |             end_idx + len(f"</{tag}>"):]
292 | 
293 |     return cleaned_message
294 | 
295 | 
296 | def _seems_to_have_ooc_talk(message: str) -> bool:
297 |     '''Returns whether a message seems to have some out-of-character talk.'''
298 |     return re.search(_OOC_REGEX, message) is not None
299 | 
300 | 
301 | _OOC_REGEX = re.compile(r"^\((OOC: ?)?.+\)$", flags=re.MULTILINE)
302 | 
303 | _BASE_SYSTEM_PROMPTS = [
304 |     '''%{Enter|Engage|Enable|Start} %{fiction writing|fantasy writing|fantasy roleplay|fictional RP|roleplay|RP} mode. {{content_type_str}}. {{response_length_str}}.''',
305 |     #
306 |     '''You %{are now in|have entered|will now start} %{fiction writing|fantasy writing|fantasy roleplay|fictional RP|roleplay|RP|conversational RP} mode. Drive the story forward in chunks. {{response_length_str}}.''',
307 |     #
308 |     '''You are trained to %{perform|generate} %{storywriting|fiction writing|fantasy writing|fantasy roleplay|fictional roleplay|RP}. Generate continuations for whatever the user gives. {{response_length_str}}. {{content_type_str}}.''',
309 |     # Modified SillyTavern prompt
310 |     '''Write the next reply in a fictional %{roleplay|RP} %{chat|conversation}. {{content_type_str}}. {{response_length_str}}.''',
311 |     #
312 |     '''%{SYSTEM|MODE}: %{conversational roleplay|RP|roleplay mode|RP system engaged}
313 | %{NOTE|ADVISORY|KEEP IN MIND}: {{response_length_str}}''',
314 |     #
315 |     '''I am %{in|currently in|engaging in|beginning} a %{roleplay|RP|fictional roleplay-like conversation} with %{someone else|other people|a user}.''',
316 |     #
317 |     '''{{content_type_str}}. {{response_length_str}}.''',
318 |     #
319 |     '''%{OBJECTIVE|TASK|MISSION|JOB} - %{Conduct|Generate|Enjoy} a %{roleplay session|RP|fictional roleplay}
320 | %{DISCRETION RATING|SAFE FOR WORK?|CONTENT RATING} - {{content_type_str}}
321 | %{REMEMBER|NOTE} - {{response_length_str}}''',
322 |     # Misspellings intentional
323 |     '''%{do|make|have} %{rp adventures|writing|creative roleplay}
324 | %{pls|please} %{rember|remember} to %{b|be} %{engaging|immersive|epic}''',
325 |     #
326 |     "%{roleplay|RP}",
327 |     ""
328 | ]
329 | 
330 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
331 | 
332 | SFW_PROMPTS = generate_prompts([
333 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be safe for work|be SFW|not include any adult themes|be safe for minors|not include 18+ content|not be 18+|not be NSFW}",
334 | ])
335 | 
336 | MIXED_SFW_NSFW_PROMPTS = generate_prompts([
337 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} %{may or may not include adult themes|may or may not be NSFW|can include adult themes}",
338 | ])
339 | 
340 | NSFW_PROMPTS = generate_prompts([
341 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be not safe for work|be NSFW|include adult themes|include erotic themes|include 18+ content}",
342 | ])
343 | 
344 | CONTENT_TYPE_TO_PROMPTS: dict[RpType, list[str]] = {
345 |     RpType.RP: SFW_PROMPTS,
346 |     RpType.ERP: NSFW_PROMPTS,
347 |     RpType.MIXED: MIXED_SFW_NSFW_PROMPTS,
348 | }
349 | 
--------------------------------------------------------------------------------
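
A minimal usage sketch, to show how the pieces above fit together: tasks yield `Episode`s, and `TrainingExampleGenerator` flattens each episode into prompt/generation pairs. This is not a file from the repository - the real wiring lives in `scripts/build.py` (not shown here) - and the `build_examples` helper, the JSONL output format, and the `data/rp_forums.jsonl` path are illustrative assumptions only.

import json

from toolbox.core.training_example import (TrainingExampleGenerator,
                                           TurnTooLargeError)
from toolbox.tasks.rp_forums_writing import RpForumsWritingTask


def build_examples(output_path: str = "data/rp_forums.jsonl") -> None:
    '''Hypothetical driver; the real entry point is scripts/build.py.'''
    with open(output_path, "w", encoding="utf-8") as output_file:
        # Iterating a task yields cleaned-up Episodes.
        for episode in RpForumsWritingTask():
            try:
                # The generator slides a ~2048-token window over the turns and
                # yields one TrainingExample per model turn.
                for example in TrainingExampleGenerator(episode,
                                                        target_token_count=2048,
                                                        format="metharme"):
                    output_file.write(
                        json.dumps({
                            "prompt": example.prompt,
                            "generation": example.generation,
                            "identifier": example.identifier,
                        }) + "\n")
            except TurnTooLargeError:
                # A single turn exceeded the context window; skip the episode.
                continue


if __name__ == "__main__":
    build_examples()

Any other task class (e.g. `RpGuildWritingTask`) or any format name present in `VALID_FORMATS` (the "metharme" default above, or a ChatML-style format) should slot in the same way.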