├── data └── .keep ├── toolbox ├── __init__.py ├── core │ ├── task.py │ ├── models.py │ ├── dataset.py │ ├── wrapper.py │ └── training_example.py ├── datasets │ ├── common.py │ ├── ai_dungeon.py │ ├── airoboros.py │ ├── evol_instruct.py │ ├── dolly.py │ ├── sharegpt.py │ ├── gpt4llm.py │ ├── claude_evol_instruct.py │ ├── gpt4all.py │ ├── openorca.py │ ├── claude_multiround.py │ ├── gpteacher.py │ ├── airoboros2.py │ ├── supercot.py │ ├── soda.py │ ├── whocars.py │ ├── wizard_vicuna.py │ ├── limarp.py │ ├── mcstories.py │ ├── clubfloyd.py │ ├── rp_guild.py │ ├── claude_logs.py │ ├── rp_forums.py │ └── characterai.py ├── filters │ ├── training_example_filter.py │ ├── __init__.py │ └── training_example │ │ ├── duplicate_filter.py │ │ └── refusal_filter.py ├── utils │ ├── files.py │ └── prompts.py └── tasks │ ├── whocars_roleplay.py │ ├── airoboros_instruction_following.py │ ├── airoboros_guess_instructions.py │ ├── supercot_instruction_following.py │ ├── gpt4all_question_answering.py │ ├── __init__.py │ ├── claude_roleplay.py │ ├── claude_guess_instruction.py │ ├── wizard_vicuna_question_answering.py │ ├── single_turn_instruction_following.py │ ├── evol_instruct.py │ ├── dolly_guess_instruction.py │ ├── claude_instruct.py │ ├── soda_summarization.py │ ├── aidungeon_text_adventure.py │ ├── characterai_roleplay.py │ ├── clubfloyd_text_adventure.py │ ├── limarp_roleplay.py │ ├── mcstories_writing.py │ ├── openorca_instruction_following.py │ ├── soda_reply_generation.py │ ├── claude_evol_instruct.py │ ├── sharegpt_instruction_following.py │ ├── rp_guild_writing.py │ └── rp_forums_writing.py ├── .tool-versions ├── README.md ├── .gitignore ├── pyproject.toml └── scripts └── build.py /data/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | pdm 2.4.3 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-toolbox 2 | 3 | This repository contains the implementation of our data munging code. 4 | 5 | The codebase is currently undergoing a massive refactor, and everything still needs to be documented. 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files. 2 | /*.egg-info/ 3 | **/__pycache__/ 4 | /.mypy_cache/ 5 | 6 | # Machine-specific stuff. 7 | /.pdm.toml 8 | /.pdm-python 9 | /.venv/ 10 | /dist/* 11 | /build_data.py 12 | 13 | # Large/binary files.
14 | /data/* 15 | !/data/.keep 16 | -------------------------------------------------------------------------------- /toolbox/core/task.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from toolbox.core.models import Episode 4 | 5 | 6 | class BaseTask: 7 | '''Base task class.''' 8 | 9 | def __iter__(self) -> t.Generator[Episode, None, None]: 10 | '''This method must be overridden when inheriting.''' 11 | raise NotImplementedError 12 | -------------------------------------------------------------------------------- /toolbox/datasets/common.py: -------------------------------------------------------------------------------- 1 | """Common data structures which can apply to multiple datasets.""" 2 | from dataclasses import dataclass 3 | 4 | @dataclass(frozen=True) 5 | class SimpleReplyDataInstance: 6 | prompt: str 7 | generation: str 8 | 9 | @dataclass(frozen=True) 10 | class AlpacaLikeDataInstance: 11 | instruction: str 12 | input: str | None  # May be None for sources without an input field (e.g. Evol-Instruct). 13 | output: str 14 | -------------------------------------------------------------------------------- /toolbox/filters/training_example_filter.py: -------------------------------------------------------------------------------- 1 | from toolbox.core.training_example import TrainingExample 2 | 3 | 4 | class TrainingExampleFilter: 5 | '''Filter implementations should inherit from this base class.''' 6 | 7 | def should_keep(self, _example: TrainingExample) -> bool: 8 | ''' 9 | Whether or not the given training example should be kept and used for 10 | training. 11 | ''' 12 | raise NotImplementedError -------------------------------------------------------------------------------- /toolbox/filters/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from toolbox.filters.training_example.duplicate_filter import DuplicateFilter 4 | from toolbox.filters.training_example.refusal_filter import RefusalFilter 5 | from toolbox.filters.training_example_filter import TrainingExampleFilter 6 | 7 | NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING: dict[ 8 | str, t.Type[TrainingExampleFilter]] = { 9 | cls.__name__: cls for cls in [DuplicateFilter, RefusalFilter] 10 | } 11 | -------------------------------------------------------------------------------- /toolbox/datasets/ai_dungeon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | from toolbox.core.dataset import BaseDataset, get_path_for 5 | 6 | 7 | class AiDungeonDataset(BaseDataset[str]): 8 | ''' 9 | AI Dungeon's `text_adventures.txt`.
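A minimal usage sketch (for illustration only; it assumes `text_adventures.txt` has been placed under `data/ai-dungeon/`): iterating the dataset yields the file's raw lines one at a time, e.g. `for line in AiDungeonDataset(): print(line)`.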
10 | ''' 11 | 12 | def __iter__(self) -> t.Generator[str, None, None]: 13 | root_path = get_path_for("ai-dungeon") 14 | file_path = os.path.join(root_path, "text_adventures.txt") 15 | 16 | with open(file_path, "r") as file: 17 | for line in file: 18 | yield line 19 | -------------------------------------------------------------------------------- /toolbox/core/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | 4 | class TurnKind(Enum): 5 | '''Identifies who a turn "belongs" to.''' 6 | SYSTEM = "<|system|>" 7 | USER = "<|user|>" 8 | MODEL = "<|model|>" 9 | 10 | @dataclass(frozen=True) 11 | class Turn: 12 | '''Can be thought of as a message or interaction within a conversation.''' 13 | utterance: str 14 | kind: TurnKind 15 | # Used only for Pygmalion format 16 | name: str = "" 17 | 18 | @dataclass(frozen=True) 19 | class Episode: 20 | '''A collection of turns.''' 21 | turns: list[Turn] 22 | identifier: str 23 | 24 | @dataclass(frozen=True) 25 | class TrainingExample: 26 | prompt: str 27 | generation: str 28 | identifier: str 29 | -------------------------------------------------------------------------------- /toolbox/datasets/airoboros.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import SimpleReplyDataInstance 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class AiroborosDataset(BaseDataset[SimpleReplyDataInstance]): 12 | def __iter__(self) -> t.Generator[SimpleReplyDataInstance, None, None]: 13 | root_path = get_path_for("airoboros") 14 | file_path = os.path.join(root_path, "instructions.jsonl") 15 | 16 | with open(file_path, "r", encoding="utf-8") as f: 17 | for line in f: 18 | line_entry = json.loads(line) 19 | yield SimpleReplyDataInstance( 20 | prompt=line_entry["instruction"], 21 | generation=line_entry["response"] 22 | ) 23 | -------------------------------------------------------------------------------- /toolbox/filters/training_example/duplicate_filter.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from toolbox.core.training_example import TrainingExample 4 | from toolbox.filters.training_example_filter import TrainingExampleFilter 5 | 6 | 7 | class DuplicateFilter(TrainingExampleFilter): 8 | '''Filters out training examples which are exact duplicates.''' 9 | 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | self.seen_hashes: set[str] = set() 14 | 15 | def should_keep(self, example: TrainingExample) -> bool: 16 | serialized_example = example.prompt + example.generation 17 | example_hash = _calculate_hash_for(serialized_example) 18 | if example_hash in self.seen_hashes: 19 | return False 20 | 21 | self.seen_hashes.add(example_hash) 22 | return True 23 | 24 | 25 | def _calculate_hash_for(text: str) -> str: 26 | return hashlib.sha512(text.encode("utf-8")).hexdigest() 27 | -------------------------------------------------------------------------------- /toolbox/datasets/evol_instruct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | 5 | from toolbox.core.dataset import BaseDataset, get_path_for 6 | from toolbox.datasets.common import AlpacaLikeDataInstance 7 | 8 | class 
EvolInstructDataset(BaseDataset[AlpacaLikeDataInstance]): 9 | ''' 10 | WizardLM data. 11 | 12 | https://huggingface.co/datasets/victor123/evol_instruct_70k 13 | ''' 14 | 15 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 16 | root_path = get_path_for("evol-instruct") 17 | file_path = os.path.join(root_path, "alpaca_evol_instruct_70k.json") 18 | 19 | with open(file_path, "r") as file: 20 | data = json.load(file) 21 | for example in data: 22 | yield AlpacaLikeDataInstance( 23 | instruction=example["instruction"], 24 | input=None, 25 | output=example["output"], 26 | ) 27 | -------------------------------------------------------------------------------- /toolbox/datasets/dolly.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import AlpacaLikeDataInstance 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class DollyDataset(BaseDataset[AlpacaLikeDataInstance]): 12 | ''' 13 | The Dolly instruction dataset from Databricks. 14 | https://huggingface.co/datasets/databricks/databricks-dolly-15k 15 | ''' 16 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 17 | root_path = get_path_for("dolly") 18 | file_path = os.path.join(root_path, "databricks-dolly-15k.jsonl") 19 | 20 | with open(file_path, "r", encoding="utf-8") as f: 21 | for line in f: 22 | entry = json.loads(line) 23 | yield AlpacaLikeDataInstance( 24 | instruction=entry["instruction"], 25 | input=entry["context"], 26 | output=entry["response"] 27 | ) 28 | -------------------------------------------------------------------------------- /toolbox/datasets/sharegpt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | from dataclasses import dataclass 6 | 7 | from toolbox.core.dataset import BaseDataset 8 | from toolbox.utils.files import enumerate_files_for 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @dataclass(frozen=True) 14 | class ShareGptEpisode: 15 | # beautiful... 
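# (Per the type annotation below, each element of `messages` is either a flat list of strings or a list of string lists; presumably the raw ShareGPT JSON is not consistent across exports, hence the union type.)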
16 | messages: list[list[list[str]] | list[str]] 17 | source_file: str 18 | 19 | 20 | class ShareGptDataset(BaseDataset[ShareGptEpisode]): 21 | '''ChatGPT conversations shared on ShareGPT.''' 22 | 23 | def __iter__(self) -> t.Generator[ShareGptEpisode, None, None]: 24 | for path in enumerate_files_for(dataset_name="sharegpt", 25 | file_extension=".json"): 26 | with open(path, "r", encoding="utf-8") as file: 27 | data = json.load(file) 28 | source_file = os.path.basename(path).replace(".json", "") 29 | yield ShareGptEpisode(messages=data, source_file=source_file) 30 | -------------------------------------------------------------------------------- /toolbox/utils/files.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from toolbox.core.dataset import get_path_for 5 | 6 | LOG = logging.getLogger(__name__) 7 | 8 | 9 | def enumerate_files_for( 10 | dataset_name: str, 11 | file_extension: str, 12 | subfolder: str | None = None, 13 | ) -> list[str]: 14 | '''Returns a list of files available for the given dataset.''' 15 | dataset_path = get_path_for(dataset_name) 16 | final_path = dataset_path if subfolder is None else os.path.join( 17 | dataset_path, subfolder) 18 | items = os.listdir(final_path) 19 | 20 | files: list[str] = [] 21 | for item in items: 22 | item_path = os.path.join(final_path, item) 23 | if not os.path.isfile(item_path): 24 | # We don't care about folders. 25 | continue 26 | 27 | if not item_path.endswith(file_extension): 28 | # Ignore invalid file extensions. 29 | continue 30 | 31 | absolute_file_path = os.path.abspath(item_path) 32 | files.append(absolute_file_path) 33 | 34 | return files 35 | -------------------------------------------------------------------------------- /toolbox/datasets/gpt4llm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import typing as t 3 | 4 | from toolbox.core.dataset import BaseDataset 5 | from toolbox.datasets.common import AlpacaLikeDataInstance 6 | from toolbox.utils.files import enumerate_files_for 7 | 8 | class Gpt4LlmDataset(BaseDataset[AlpacaLikeDataInstance]): 9 | ''' 10 | GPT-4-LLM data. 11 | 12 | https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM 13 | ''' 14 | 15 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 16 | filepaths = enumerate_files_for("gpt-4-llm", file_extension="json") 17 | 18 | for path in filepaths: 19 | if "comparision_data.json" in path: 20 | # TODO(11b): Handle this later. 21 | continue 22 | 23 | with open(path, "r") as file: 24 | data = json.load(file) 25 | for entry in data: 26 | yield AlpacaLikeDataInstance( 27 | instruction=entry["instruction"], 28 | input=entry["input"], 29 | output=entry["output"], 30 | ) 31 | -------------------------------------------------------------------------------- /toolbox/datasets/claude_evol_instruct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import SimpleReplyDataInstance 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class ClaudeEvolInstructDataset(BaseDataset[SimpleReplyDataInstance]): 12 | ''' 13 | Instructions augmented via
WizardLM's Evol-Instruct technique, answered with Claude. 14 | https://huggingface.co/datasets/Norquinal/claude_evol_instruct_210k 15 | ''' 16 | def __iter__(self) -> t.Generator[SimpleReplyDataInstance, None, None]: 17 | root_path = get_path_for("claude-evol") 18 | file_path = os.path.join(root_path, "claude_evol_instruct_210k.json") 19 | 20 | with open(file_path, "r", encoding="utf-8") as f: 21 | data = json.load(f) 22 | # Go through the logs and simply fetch them 23 | for entry in data: 24 | yield SimpleReplyDataInstance( 25 | prompt=entry["instruction"], 26 | generation=entry["output"], 27 | ) 28 | -------------------------------------------------------------------------------- /toolbox/datasets/gpt4all.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from dataclasses import dataclass 3 | 4 | import pandas as pd 5 | 6 | from toolbox.core.dataset import BaseDataset 7 | from toolbox.utils.files import enumerate_files_for 8 | 9 | 10 | @dataclass(frozen=True) 11 | class Gpt4AllDataInstance: 12 | prompt: str 13 | response: str 14 | source: str 15 | 16 | 17 | class Gpt4AllDataset(BaseDataset[Gpt4AllDataInstance]): 18 | ''' 19 | NomicAI's GPT4all dataset. 20 | 21 | https://huggingface.co/datasets/nomic-ai/gpt4all_prompt_generations 22 | ''' 23 | 24 | def __iter__(self) -> t.Generator[Gpt4AllDataInstance, None, None]: 25 | parquet_files = enumerate_files_for("gpt4all_prompt_generations", 26 | file_extension="parquet") 27 | 28 | for file in parquet_files: 29 | df = pd.read_parquet(file) 30 | for idx in df.index: 31 | yield Gpt4AllDataInstance( 32 | prompt=df["prompt"][idx], 33 | response=df["response"][idx], 34 | source=df["source"][idx], 35 | ) 36 | -------------------------------------------------------------------------------- /toolbox/datasets/openorca.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import typing as t 4 | from dataclasses import dataclass 5 | 6 | import pandas as pd 7 | 8 | from toolbox.core.dataset import BaseDataset 9 | from toolbox.utils.files import enumerate_files_for 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | @dataclass(frozen=True) 14 | class OpenOrcaEntry: 15 | id: str 16 | system_prompt: str 17 | question: str 18 | response: str 19 | 20 | class OpenOrcaDataset(BaseDataset[OpenOrcaEntry]): 21 | '''The OpenOrca dataset.''' 22 | def __iter__(self) -> t.Generator[OpenOrcaEntry, None, None]: 23 | # We have this so that one can use GPT-4 OpenOrca, 3.5 OpenOrca, or both 24 | for path in enumerate_files_for(dataset_name="openorca", file_extension=".parquet"): 25 | df = pd.read_parquet(path) 26 | for idx in df.index: 27 | yield OpenOrcaEntry( 28 | id=df["id"][idx], 29 | system_prompt=df["system_prompt"][idx], 30 | question=df["question"][idx], 31 | response=df["response"][idx] 32 | ) 33 | -------------------------------------------------------------------------------- /toolbox/core/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | 4 | HERE = os.path.realpath(os.path.dirname(__file__)) 5 | T = t.TypeVar("T") 6 | 7 | 8 | class BaseDataset(t.Generic[T]): 9 | '''Base dataset class.''' 10 | 11 | def __iter__(self) -> t.Generator[T, None, None]: 12 | ''' 13 | This method must be overridden when inheriting. It should yield 14 | individual items from the dataset.
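A minimal sketch of a conforming subclass (hypothetical dataset, purely for illustration): class GreetingDataset(BaseDataset[str]): def __iter__(self) -> t.Generator[str, None, None]: yield from ("hi", "hello")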
15 | ''' 16 | raise NotImplementedError 17 | 18 | 19 | def get_path_for(dataset_name: str | None) -> str: 20 | ''' 21 | Returns an absolute path. If `dataset_name` is given, it will return the 22 | path to the specific dataset's folder, otherwise it'll return the path to 23 | the root data folder. 24 | ''' 25 | 26 | # Allow overriding the location of the root data folder by using an 27 | # environment variable. 28 | env_var = "TOOLBOX_DATA_FOLDER" 29 | if env_var in os.environ: 30 | components = [os.environ[env_var]] 31 | else: 32 | components = [HERE, "..", "..", "data"] 33 | 34 | if dataset_name is not None: 35 | components.append(dataset_name) 36 | 37 | return os.path.join(*components) 38 | -------------------------------------------------------------------------------- /toolbox/datasets/claude_multiround.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from dataclasses import dataclass 7 | 8 | from toolbox.core.dataset import BaseDataset, get_path_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | @dataclass(frozen=True) 13 | class ClaudeMultiround: 14 | conversation: list[dict[str, str]] 15 | id: str 16 | 17 | class ClaudeInstructDataset(BaseDataset[ClaudeMultiround]): 18 | ''' 19 | Logs taken from synthetically-generated instruction chats with Claude. 20 | https://huggingface.co/datasets/Norquinal/claude_multiround_chat_30k 21 | ''' 22 | def __iter__(self) -> t.Generator[ClaudeMultiround, None, None]: 23 | root_path = get_path_for("claude-multiround") 24 | file_path = os.path.join(root_path, "claude_multiround_chat_30k.json") 25 | 26 | with open(file_path, "r", encoding="utf-8") as f: 27 | logs = json.load(f) 28 | # Go through the logs and simply fetch them 29 | for round in logs: 30 | yield ClaudeMultiround( 31 | conversation=round["conversations"], 32 | id=round["id"], 33 | ) 34 | -------------------------------------------------------------------------------- /toolbox/datasets/gpteacher.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | 5 | from toolbox.core.dataset import BaseDataset, get_path_for 6 | from toolbox.datasets.common import AlpacaLikeDataInstance 7 | 8 | class GpTeacherDataset(BaseDataset[AlpacaLikeDataInstance]): 9 | ''' 10 | GPTeacher data. 
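Only a hand-picked subset of the released files is ingested; see the DESIRED_FILES list at the bottom of this module.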
11 | 12 | https://github.com/teknium1/GPTeacher 13 | ''' 14 | 15 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 16 | path_to_root_folder = get_path_for("gpteacher") 17 | for desired_filename in DESIRED_FILES: 18 | path = os.path.join(path_to_root_folder, desired_filename) 19 | with open(path, "r") as file: 20 | data = json.load(file) 21 | for entry in data: 22 | yield AlpacaLikeDataInstance( 23 | instruction=entry["instruction"], 24 | input=entry["input"], 25 | output=entry["response"], 26 | ) 27 | 28 | 29 | DESIRED_FILES = [ 30 | "Instruct/gpt4-instruct-similarity-0.9-dataset.json", 31 | "Roleplay/roleplay-similarity_0.9-instruct-dataset.json", 32 | "Toolformer/toolformer-similarity-0.9-dataset.json", 33 | ] 34 | -------------------------------------------------------------------------------- /toolbox/datasets/airoboros2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from dataclasses import dataclass 7 | 8 | from toolbox.core.dataset import BaseDataset, get_path_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | @dataclass(frozen=True) 13 | class Airoboros2DataInstance: 14 | instruction: str 15 | response: str 16 | system_prompt: str 17 | category: str 18 | 19 | class Airoboros2Dataset(BaseDataset[Airoboros2DataInstance]): 20 | ''' 21 | Instructions from Airoboros 2.2.1 22 | https://huggingface.co/datasets/jondurbin/airoboros-2.2.1/ 23 | ''' 24 | def __iter__(self) -> t.Generator[Airoboros2DataInstance, None, None]: 25 | root_path = get_path_for("airoboros2") 26 | file_path = os.path.join(root_path, "instructions.jsonl") 27 | 28 | with open(file_path, "r", encoding="utf-8") as f: 29 | for line in f: 30 | entry = json.loads(line) 31 | yield Airoboros2DataInstance( 32 | instruction=entry["instruction"], 33 | response=entry["response"], 34 | system_prompt=entry["system"], 35 | category=entry["category"], 36 | ) 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pdm] 2 | 3 | [project] 4 | name = "toolbox" 5 | version = "0.1.0" 6 | description = "Code for ingesting data from several sources, formatting it and creating a training dataset." 
7 | authors = [ 8 | {name = "0x000011b", email = "0x000011b@proton.me"}, 9 | ] 10 | requires-python = ">=3.10" 11 | license = {text = "AGPL-3.0-only"} 12 | dependencies = [ 13 | "markdownify>=0.11.6", 14 | "html5lib>=1.1", 15 | "beautifulsoup4>=4.11.2", 16 | "ansicolors>=1.1.8", 17 | "pandas>=1.5.3", 18 | "mashumaro>=3.5", 19 | "pyarrow>=11.0.0", 20 | "sklearn>=0.0.post4", 21 | "pyyaml>=6.0.1", 22 | ] 23 | 24 | [project.optional-dependencies] 25 | dev = [ 26 | "yapf>=0.32.0", 27 | "toml>=0.10.2", 28 | "isort>=5.10.1", 29 | "pylint>=2.15.8", 30 | "mypy>=0.991", 31 | ] 32 | debugging = [ 33 | "pdbpp>=0.10.3", 34 | "scalene>=1.5.19", 35 | ] 36 | 37 | [tool.setuptools] 38 | py-modules = ["toolbox"] 39 | 40 | [tool.pdm.scripts] 41 | lint = {shell = "pylint --jobs 0 ./toolbox/**/*.py"} 42 | importcheck = "isort --check --diff toolbox" 43 | importfix = "isort toolbox" 44 | stylecheck = "yapf --parallel --diff --recursive toolbox" 45 | stylefix = "yapf --parallel --in-place --recursive toolbox" 46 | typecheck = "mypy --strict toolbox" 47 | 48 | [tool.yapf] 49 | based_on_style = "google" 50 | 51 | [tool.mypy] 52 | ignore_missing_imports = true 53 | -------------------------------------------------------------------------------- /toolbox/datasets/supercot.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import typing as t 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | from toolbox.datasets.common import AlpacaLikeDataInstance 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class SuperCotDataset(BaseDataset[AlpacaLikeDataInstance]): 12 | ''' 13 | The SuperCOT dataset, packed neatly into standard Alpaca format. 14 | https://huggingface.co/datasets/kaiokendev/SuperCOT-dataset 15 | ''' 16 | def __iter__(self) -> t.Generator[AlpacaLikeDataInstance, None, None]: 17 | root_path = get_path_for("supercot") 18 | file_path = os.path.join(root_path, "filtered.json") 19 | 20 | with open(file_path, "r", encoding="utf-8") as f: 21 | data = json.load(f) 22 | for entry in data: 23 | # "rewritten_intent" is pretty similar to just a standard input 24 | # and replaces the "input" field in the JSON, so just conflate 25 | # the two. 
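# For example (hypothetical entries, purely to illustrate the two shapes): {"instruction": ..., "input": "nums = [3, 1, 2]", "output": ...} versus {"instruction": ..., "rewritten_intent": "sort a list", "output": ...}.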
26 | try: 27 | input_text = entry["input"] 28 | except KeyError: 29 | input_text = entry["rewritten_intent"] 30 | yield AlpacaLikeDataInstance( 31 | instruction=entry["instruction"], 32 | input=input_text, 33 | output=entry["output"] 34 | ) 35 | -------------------------------------------------------------------------------- /toolbox/datasets/soda.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from dataclasses import dataclass 4 | 5 | import pandas as pd 6 | 7 | from toolbox.core.dataset import BaseDataset, get_path_for 8 | 9 | 10 | @dataclass(frozen=True) 11 | class SodaEpisode: 12 | narrative: str 13 | dialogue: t.List[str] 14 | speakers: t.List[str] 15 | relation: str 16 | literal: str 17 | original_index: str 18 | 19 | 20 | class SodaDataset(BaseDataset[SodaEpisode]): 21 | ''' 22 | SODA: Million-scale Dialogue Distillation with Social Commonsense 23 | Contextualization 24 | 25 | https://huggingface.co/datasets/allenai/soda 26 | ''' 27 | 28 | def __init__(self, split: str = "train") -> None: 29 | assert split in ["test", "train", "valid"] 30 | root_data_path = get_path_for("soda") 31 | self.file_path = os.path.join(root_data_path, f"{split}.parquet") 32 | 33 | super().__init__() 34 | 35 | def __iter__(self) -> t.Generator[SodaEpisode, None, None]: 36 | df = pd.read_parquet(self.file_path) 37 | for idx in df.index: 38 | yield SodaEpisode(narrative=df["narrative"][idx], 39 | dialogue=df["dialogue"][idx], 40 | speakers=df["speakers"][idx], 41 | relation=df["relation"][idx], 42 | literal=df["literal"][idx], 43 | original_index=str(df["original_index"][idx])) 44 | -------------------------------------------------------------------------------- /toolbox/datasets/whocars.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import logging 4 | import typing as t 5 | from dataclasses import dataclass 6 | 7 | from toolbox.core.dataset import BaseDataset 8 | from toolbox.utils.files import enumerate_files_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | @dataclass(frozen=True) 14 | class WhocarsEntry: 15 | model: str 16 | endpoint: str 17 | prompt_json: dict[str, t.Any] 18 | response: str 19 | 20 | 21 | class WhocarsDataset(BaseDataset[WhocarsEntry]): 22 | '''Logs from the whocars proxy.''' 23 | 24 | def __iter__(self) -> t.Generator[WhocarsEntry, None, None]: 25 | for file_path in enumerate_files_for("whocars", file_extension=".csv"): 26 | if "__index__" in file_path: 27 | continue 28 | 29 | with open(file_path, "r") as file: 30 | reader = csv.DictReader(file) 31 | try: 32 | for row in reader: 33 | yield WhocarsEntry( 34 | model=row["model"], 35 | endpoint=row["endpoint"], 36 | prompt_json=json.loads(row["prompt json"]), 37 | response=row["response"], 38 | ) 39 | except csv.Error as ex: 40 | # One file seems to have broken encoding; just skip over it, 41 | # since we have enough data otherwise.
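# (Rows yielded before the error are kept; the csv.Error only aborts the remainder of the current file, and the outer loop then moves on to the next one.)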
42 | LOG.error(ex) -------------------------------------------------------------------------------- /toolbox/datasets/wizard_vicuna.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | from dataclasses import dataclass 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | 8 | 9 | @dataclass(frozen=True) 10 | class WizardVicunaConversation: 11 | id: str 12 | human_question: str 13 | gpt_response: str 14 | 15 | 16 | class WizardVicunaDataset(BaseDataset[WizardVicunaConversation]): 17 | ''' 18 | Data from WizardVicuna. 19 | 20 | https://huggingface.co/datasets/junelee/wizard_vicuna_70k 21 | ''' 22 | 23 | def __iter__(self) -> t.Generator[WizardVicunaConversation, None, None]: 24 | root_path = get_path_for("wizard_vicuna_70k") 25 | file_path = os.path.join(root_path, "wizard_vicuna_dataset.json") 26 | 27 | with open(file_path, "r") as file: 28 | data = json.load(file) 29 | for entry in data: 30 | messages = entry["conversations"] 31 | for idx in range(0, len(messages), 2): 32 | human_message = messages[idx] 33 | gpt_message = messages[idx + 1] 34 | 35 | # Sanity check. 36 | assert human_message["from"] == "human" 37 | assert gpt_message["from"] == "gpt" 38 | 39 | yield WizardVicunaConversation( 40 | id=entry["id"], 41 | human_question=human_message["value"], 42 | gpt_response=gpt_message["value"], 43 | ) 44 | -------------------------------------------------------------------------------- /toolbox/datasets/limarp.py: -------------------------------------------------------------------------------- 1 | # Much of this taken from dataprepare.py in the LIMARP, thanks anon 2 | # If it ain't broke, don't fix it! 3 | import glob 4 | import os 5 | import typing as t 6 | import yaml 7 | 8 | from dataclasses import dataclass 9 | 10 | from toolbox.core.dataset import BaseDataset, get_path_for 11 | 12 | @dataclass(frozen=True) 13 | class LimaRpEntry: 14 | personas: dict[str, str] 15 | names: dict[str, str] 16 | scenario: str 17 | conversation: list[dict[str, str]] 18 | forum: str 19 | thread_id: int 20 | 21 | class LimaRpDataset(BaseDataset[LimaRpEntry]): 22 | '''A collection of high-quality hand-curated roleplays.''' 23 | def __iter__(self) -> t.Generator[LimaRpEntry, None, None]: 24 | base_path = get_path_for("lima-erp") 25 | glob_path = f"{os.path.normpath(base_path)}/data/**/*.yaml" 26 | file_paths = glob.glob(glob_path, recursive=True) 27 | 28 | for file in file_paths: 29 | forum = os.path.basename(os.path.dirname(file)) 30 | thread_id = os.path.basename(file).split(".")[0] 31 | with open(file, 'r', encoding='utf-8') as f: 32 | source = yaml.safe_load(f) 33 | yield LimaRpEntry( 34 | personas=source["persona"], 35 | names=source["names"], 36 | scenario=source["scenario"], 37 | conversation=source["conversation"], 38 | forum=forum, 39 | thread_id=thread_id, 40 | ) 41 | -------------------------------------------------------------------------------- /toolbox/datasets/mcstories.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import sys 4 | import typing as t 5 | from dataclasses import dataclass 6 | 7 | from toolbox.core.dataset import BaseDataset, get_path_for 8 | 9 | 10 | @dataclass(frozen=True) 11 | class McStory: 12 | title: str 13 | author: str 14 | date: str 15 | tags: str 16 | summary: str 17 | href: str 18 | header: str 19 | text_contents: str 20 | footer: str 21 | 22 | 23 | class McStoriesDataset(BaseDataset[McStory]): 24 | 
'''Data from a certain story-sharing site.''' 25 | 26 | def __iter__(self) -> t.Generator[McStory, None, None]: 27 | # NOTE(11b): I had no idea this was a thing, but apparently Python's CSV 28 | # reader by default shits the bed if you have a field longer than 131072 29 | # characters. _Usually_ this means you've messed up the parsing, but in 30 | # our case it's actually just a massive forum post triggering this. 31 | # https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072 32 | csv.field_size_limit(sys.maxsize) 33 | 34 | root_data_path = get_path_for("mcstories") 35 | file_path = os.path.join(root_data_path, "mcstories--all.csv") 36 | 37 | with open(file_path, "r") as file: 38 | reader = csv.DictReader(file, delimiter=",") 39 | for row in reader: 40 | story = McStory( 41 | title=row["story_title"], 42 | author=row["story_author"], 43 | date=row["story_date"], 44 | tags=row["story_tags"], 45 | summary=row["story_summary"], 46 | href=row["story_href"], 47 | header=row["story_header"], 48 | text_contents=row["story"], 49 | footer=row["story_footer"], 50 | ) 51 | yield story -------------------------------------------------------------------------------- /toolbox/tasks/whocars_roleplay.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.whocars import WhocarsDataset 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | # A minor note for this task: the data does not seem to be very clean. Even 12 | # GPT-4 seems to have trouble following the system prompt, resulting in 13 | # instructions like "ALWAYS precede dialogue with character names" being 14 | # ignored. Pronouns are also messed up sometimes. This will likely bleed into 15 | # our model, but for now I'm not gonna bother with this. 16 | 17 | 18 | class WhocarsRoleplayTask(BaseTask): 19 | '''Task to roleplay as a given character.''' 20 | 21 | def __iter__(self) -> t.Generator[Episode, None, None]: 22 | for idx, entry in enumerate(WhocarsDataset()): 23 | if entry.endpoint == "kobold": 24 | continue 25 | 26 | assert entry.endpoint == "openai", entry.endpoint 27 | if "gpt-4" not in entry.model: 28 | continue 29 | 30 | if entry.prompt_json[0]["role"] != "system": 31 | continue 32 | 33 | turns: list[Turn] = [] 34 | for msg in entry.prompt_json: 35 | utterance = msg["content"].strip() 36 | 37 | turn_kind = TurnKind.MODEL 38 | if msg["role"] == "system": 39 | turn_kind = TurnKind.SYSTEM 40 | utterance = _clean_system_message(utterance) 41 | if msg["role"] == "user": 42 | turn_kind = TurnKind.USER 43 | 44 | turn = Turn( 45 | utterance=_clean_message(utterance), 46 | kind=turn_kind, 47 | ) 48 | turns.append(turn) 49 | yield Episode(turns=turns, identifier=f"whocars-{idx}") 50 | 51 | 52 | def _clean_system_message(msg: str) -> str: 53 | # TavernAI's system messages very often refer to the user as You, but use 54 | # a dumb string replace, which means there's usually broken grammar and 55 | # conflicting instructions within the prompt. To try to alleviate that, 56 | # we replace `You` with `{{user}}` for clarity.
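# For example: "You are {{char}}. Your goal is to chat." becomes "{{user}} are {{char}}. Your goal is to chat." (the word boundary leaves "Your" untouched; the leftover "{{user}} are" grammar quirk is accepted for now).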
57 | return re.sub(r"\bYou\b", "{{user}}", msg) 58 | 59 | 60 | def _clean_message(msg: str) -> str: 61 | '''Handles common typos or bad tags.''' 62 | msg = msg.replace("{{chaar}}", "{{char}}") 63 | msg = msg.replace("{{character}}", "{{char}}") 64 | return msg 65 | -------------------------------------------------------------------------------- /toolbox/tasks/airoboros_instruction_following.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.airoboros import AiroborosDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class AiroborosInstructionFollowingTask(BaseTask): 12 | '''Instruction following task based on the Airoboros data.''' 13 | def __iter__(self) -> t.Generator[Episode, None, None]: 14 | for idx, instance in enumerate(AiroborosDataset(), start=1): 15 | # Throw out any responses that are literally just "Airoboros" 16 | if instance.generation.lower().strip() == "airoboros": 17 | continue 18 | 19 | turns: list[Turn] = [ 20 | Turn( 21 | utterance=select_prompt(SYSTEM_PROMPTS), 22 | kind=TurnKind.SYSTEM, 23 | ), 24 | Turn( 25 | utterance=instance.prompt, 26 | kind=TurnKind.USER, 27 | ), 28 | Turn( 29 | utterance=instance.generation, 30 | kind=TurnKind.MODEL, 31 | ), 32 | ] 33 | 34 | yield Episode(turns=turns, identifier=f"airoboros-instruct-{idx}") 35 | 36 | 37 | BASE_SYSTEM_PROMPTS = [ 38 | "", 39 | "assistant", 40 | "%{You are now in|Engage|Start|Enter|Consider} %{instruction following|instruction|question answering|assistant|AI assistant|helper} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 41 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following|helping out|helper} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.", 42 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 43 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 44 | "Instruction mode!", 45 | "u %{have|need} to answer whatever i ask and do whatever i say!
do it now!!!", 46 | "isHelper = true;" 47 | ] 48 | 49 | SYSTEM_PROMPTS = generate_prompts(BASE_SYSTEM_PROMPTS) 50 | -------------------------------------------------------------------------------- /toolbox/datasets/clubfloyd.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import typing as t 4 | from dataclasses import dataclass 5 | 6 | from toolbox.core.dataset import BaseDataset, get_path_for 7 | 8 | 9 | @dataclass(frozen=True) 10 | class StoryAction: 11 | action: str 12 | response: str 13 | endoftext: bool 14 | 15 | 16 | @dataclass(frozen=True) 17 | class ClubFloydStory: 18 | name: str 19 | author: str 20 | genres: list[str] 21 | tags: list[str] 22 | year: int 23 | ratings: list[int] 24 | total_ratings: int 25 | average_rating: float 26 | transcript_id: str 27 | discretion_advised: bool 28 | description: str 29 | actions: list[StoryAction] 30 | 31 | 32 | class ClubFloydDataset(BaseDataset[ClubFloydStory]): 33 | ''' 34 | Data from VE's ClubFloyd scrape. 35 | 36 | https://wandb.ai/ve-forbryderne/skein/runs/files/files/datasets/floyd 37 | ''' 38 | 39 | def __iter__(self) -> t.Generator[ClubFloydStory, None, None]: 40 | root_path = get_path_for("club-floyd") 41 | file_path = os.path.join(root_path, "floyd.json") 42 | 43 | with open(file_path, "r") as file: 44 | raw_stories = json.load(file).values() 45 | for raw_story in raw_stories: 46 | actions = [ 47 | _story_action_from_dict(action) 48 | for action in raw_story["data"] 49 | ] 50 | 51 | yield ClubFloydStory( 52 | name=raw_story["name"], 53 | author=raw_story["author"], 54 | genres=raw_story["genres"], 55 | tags=raw_story["tags"], 56 | year=raw_story["year"], 57 | ratings=raw_story["ratings"], 58 | total_ratings=raw_story["total_ratings"], 59 | average_rating=raw_story["average_rating"], 60 | transcript_id=raw_story["transcript_id"], 61 | discretion_advised=raw_story["discretion_advised"], 62 | description=raw_story["description"], 63 | actions=actions, 64 | ) 65 | 66 | 67 | def _story_action_from_dict(data: dict[str, str | bool]) -> StoryAction: 68 | action = data["action"] 69 | response = data["response"] 70 | endoftext = data["endoftext"] 71 | 72 | assert isinstance(action, str), "Unexpected type for `action` field" 73 | assert isinstance(response, str), "Unexpected type for `response` field" 74 | assert isinstance(endoftext, bool), "Unexpected type for `endoftext` field" 75 | 76 | return StoryAction( 77 | action=action, 78 | response=response, 79 | endoftext=endoftext, 80 | ) 81 | -------------------------------------------------------------------------------- /toolbox/tasks/airoboros_guess_instructions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.airoboros import AiroborosDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class AiroborosGuessTheInstructionTask(BaseTask): 12 | '''Instruction following task based on the Airoboros data.''' 13 | def __iter__(self) -> t.Generator[Episode, None, None]: 14 | for idx, instance in enumerate(AiroborosDataset(), start=1): 15 | # Throw out any responses containing "Airoboros" 16 | if instance.generation.lower().strip() == "airoboros": 17 | continue 18 | 19 | turns: list[Turn] = [ 20 | Turn( 21 | utterance=select_prompt(SYSTEM_PROMPTS), 22 
| kind=TurnKind.SYSTEM, 23 | ), 24 | Turn( 25 | utterance=instance.generation, 26 | kind=TurnKind.USER, 27 | ), 28 | Turn( 29 | utterance=instance.prompt, 30 | kind=TurnKind.MODEL, 31 | ), 32 | ] 33 | 34 | yield Episode(turns=turns, identifier=f"airoboros-gti-{idx}") 35 | 36 | 37 | _BASE_SYSTEM_PROMPTS = [ 38 | "%{Enter|Engage|Begin|Consider} %{instruction guessing|reverse instruction} mode. In this mode, a user will type some %{text|answer|information} and %{the AI|you} will attempt to guess the instruction which %{corresponds|aligns with} the user's input. Do not say anything else but the instruction.", 39 | "%{Mode|Task}: 'Guess The Instruction'\nA user will type %{text|answer|information} and it is %{your|the AI's|the assistant's} %{job|goal} to answer with a generated instruction. Think of this almost like a question-guessing game.", 40 | "You are now in %{flipped instruction|reverse instruction|instruction guessing} mode. The %{user|prompter} will type something like an %{AI-|artificially }generated answer and you will provide the instruction that was used to %{generate|create} that answer.", 41 | "I am an %{assistant|AI} designed to %{guess|predict} what a user %{may|could|might} type as a question. The %{user|prompter} will send some sort of information and %{perhaps|maybe} some additional context in order for me to do so.", 42 | "Your question will be...", 43 | "%{I|I'll|i|i'll} %{predict|guess|foresee} whatever question you'll ask, given an answer!", 44 | "instruct", 45 | "assistant", 46 | "is_assistant = True" 47 | ] 48 | 49 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 50 | -------------------------------------------------------------------------------- /toolbox/tasks/supercot_instruction_following.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.supercot import SuperCotDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class SuperCotInstructionFollowingTask(BaseTask): 12 | '''Instruction following task based on the SuperCOT data.''' 13 | def __iter__(self) -> t.Generator[Episode, None, None]: 14 | for idx, instance in enumerate(SuperCotDataset()): 15 | sys_prompt = select_prompt(SYSTEM_PROMPTS) 16 | user_prompt = instance.instruction 17 | if instance.input is not None: 18 | user_prompt += f"\n{instance.input}" 19 | turns: list[Turn] = [ 20 | Turn( 21 | utterance=sys_prompt, 22 | kind=TurnKind.SYSTEM, 23 | ), 24 | Turn( 25 | utterance=user_prompt, 26 | kind=TurnKind.USER, 27 | ), 28 | Turn( 29 | utterance=instance.output, 30 | kind=TurnKind.MODEL, 31 | ) 32 | ] 33 | yield Episode( 34 | turns=turns, 35 | identifier=f"supercot-{idx}" 36 | ) 37 | 38 | _BASE_SYSTEM_PROMPTS = [ 39 | "", 40 | "assistant", 41 | "%{You are now in|Engage|Start|Enter|Consider} %{instruction following|instruction|question answering|assistant|AI assistant} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 42 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
43 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 44 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 45 | "Instruction mode!", 46 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 47 | """To %{be in|enter|engage} %{instruction following|question answering|assistant} mode is your goal. This agent is to %{do|consider|abide by} the following: 48 | - %{Obey|Follow} all user %{directions|instructions|queries|orders} 49 | - %{Do not refuse to do the above|You cannot disobey the user}""" 50 | ] 51 | 52 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 53 | -------------------------------------------------------------------------------- /toolbox/tasks/gpt4all_question_answering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import typing as t 4 | 5 | from markdownify import markdownify 6 | 7 | from toolbox.core.models import Episode, Turn, TurnKind 8 | from toolbox.core.task import BaseTask 9 | from toolbox.datasets.gpt4all import Gpt4AllDataset 10 | from toolbox.utils.prompts import generate_prompts, select_prompt 11 | 12 | LOG = logging.getLogger(__name__) 13 | 14 | 15 | class Gpt4AllQuestionAnsweringTask(BaseTask): 16 | '''Question answering based on GPT4all data.''' 17 | 18 | def __iter__(self) -> t.Generator[Episode, None, None]: 19 | for idx, instance in enumerate(Gpt4AllDataset()): 20 | try: 21 | turns: list[Turn] = [ 22 | Turn( 23 | utterance=select_prompt(SYSTEM_PROMPTS), 24 | kind=TurnKind.SYSTEM, 25 | ), 26 | Turn( 27 | utterance=_html_to_clean_markdown(instance.prompt), 28 | kind=TurnKind.USER, 29 | ), 30 | Turn( 31 | utterance=_html_to_clean_markdown(instance.response), 32 | kind=TurnKind.MODEL, 33 | ), 34 | ] 35 | 36 | yield Episode(turns=turns, identifier=f"gpt4all-{idx}") 37 | except AssertionError as ex: 38 | # TODO(11b): markdownify lib is choking when seeing some 39 | # regexes in the data. Skiping data for now, but ideally we'd 40 | # work around this. 41 | LOG.warning( 42 | "Skipping over data instance due to failed assertion: %s", 43 | ex) 44 | 45 | 46 | def _html_to_clean_markdown(html: str) -> str: 47 | ''' 48 | Converts the given HTML to Markdown and cleans up any weird-looking stuff 49 | left behind. Manually identified by randomly sampling the data. 50 | ''' 51 | markdown = markdownify(html) 52 | 53 | # Fix excessive spaces after converting to Markdown. 54 | markdown = re.sub("\n{2,}", "\n\n", markdown) 55 | 56 | return markdown.strip() 57 | 58 | 59 | _BASE_SYSTEM_PROMPTS = [ 60 | "Consider Assistant, a %{large language model|LLM}. Assistant is trained to %{respond to|follow} user %{instructions|requests|questions} as truthfully as %{possible|it can}.", 61 | "%{Enter|You are now in|Engage} %{instruction following|question answering|assistant|instruction} mode. 
In this mode, you %{will|are to} %{follow the instructions|reply to the queries} of %{the user|users}", 62 | "Description: An AI assistant whose %{job|objective|task} is to follow instructions.\n%{Specifically, it will:|Consider the following:|Note this:}\nYou %{can only generate|are bound to generating} text\nYou have issues with stuff like math and gathering %{info|information} in the present", 63 | "assistant" 64 | ] 65 | 66 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 67 | -------------------------------------------------------------------------------- /toolbox/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from toolbox.core.task import BaseTask 4 | from toolbox.tasks.airoboros_guess_instructions import AiroborosGuessTheInstructionTask 5 | from toolbox.tasks.airoboros_instruction_following import AiroborosInstructionFollowingTask 6 | from toolbox.tasks.airoboros2_instruction_following import Airoboros2InstructionFollowingTask 7 | from toolbox.tasks.aidungeon_text_adventure import AiDungeonTextAdventureTask 8 | from toolbox.tasks.characterai_roleplay import CharacterAiRoleplayTask 9 | from toolbox.tasks.claude_evol_instruct import ClaudeEvolInstructTask 10 | from toolbox.tasks.claude_guess_instruction import ClaudeGuessTheInstructionTask 11 | from toolbox.tasks.claude_instruct import ClaudeInstructTask 12 | from toolbox.tasks.claude_roleplay import ClaudeRoleplayTask 13 | from toolbox.tasks.clubfloyd_text_adventure import ClubFloydTextAdventureTask 14 | from toolbox.tasks.dolly_guess_instruction import DollyGuessTheInstructionTask 15 | from toolbox.tasks.evol_instruct import EvolInstructTask 16 | from toolbox.tasks.gpt4all_question_answering import \ 17 | Gpt4AllQuestionAnsweringTask 18 | from toolbox.tasks.mcstories_writing import McStoriesWritingTask 19 | from toolbox.tasks.openorca_instruction_following import OpenOrcaInstructionFollowingTask 20 | from toolbox.tasks.rp_forums_writing import RpForumsWritingTask 21 | from toolbox.tasks.rp_guild_writing import RpGuildWritingTask 22 | from toolbox.tasks.sharegpt_instruction_following import \ 23 | ShareGptInstructionFollowingTask 24 | from toolbox.tasks.single_turn_instruction_following import \ 25 | SingleTurnInstructionFollowingTask 26 | from toolbox.tasks.soda_reply_generation import SodaReplyGenerationTask 27 | from toolbox.tasks.soda_summarization import SodaSummarizationTask 28 | from toolbox.tasks.supercot_instruction_following import SuperCotInstructionFollowingTask 29 | from toolbox.tasks.limarp_roleplay import LimaRpRoleplayTask 30 | from toolbox.tasks.whocars_roleplay import WhocarsRoleplayTask 31 | from toolbox.tasks.wizard_vicuna_question_answering import \ 32 | WizardVicunaQuestionAnsweringTask 33 | 34 | NAME_TO_TASK_MAPPING: dict[str, t.Type[BaseTask]] = { 35 | cls.__name__: cls for cls in [ 36 | AiroborosGuessTheInstructionTask, 37 | AiroborosInstructionFollowingTask, 38 | Airoboros2InstructionFollowingTask, 39 | AiDungeonTextAdventureTask, 40 | CharacterAiRoleplayTask, 41 | ClaudeEvolInstructTask, 42 | ClaudeGuessTheInstructionTask, 43 | ClaudeInstructTask, 44 | ClaudeRoleplayTask, 45 | ClubFloydTextAdventureTask, 46 | DollyGuessTheInstructionTask, 47 | EvolInstructTask, 48 | Gpt4AllQuestionAnsweringTask, 49 | McStoriesWritingTask, 50 | LimaRpRoleplayTask, 51 | OpenOrcaInstructionFollowingTask, 52 | RpForumsWritingTask, 53 | RpGuildWritingTask, 54 | ShareGptInstructionFollowingTask, 55 | SingleTurnInstructionFollowingTask, 56 
| SodaReplyGenerationTask, 57 | SodaSummarizationTask, 58 | SuperCotInstructionFollowingTask, 59 | WhocarsRoleplayTask, 60 | WizardVicunaQuestionAnsweringTask, 61 | ] 62 | } 63 | -------------------------------------------------------------------------------- /toolbox/tasks/claude_roleplay.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.claude_logs import ClaudeRpDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | class ClaudeRoleplayTask(BaseTask): 13 | '''Roleplay task based on Claude logs.''' 14 | def __iter__(self) -> t.Generator[Episode, None, None]: 15 | for convo in ClaudeRpDataset(): 16 | # Deal with system prompts 17 | system_prompt = select_prompt(SYSTEM_PROMPTS) 18 | # Add a persona if there is one 19 | if convo.persona is not None and system_prompt != "": 20 | system_prompt += f"\n{random.choice(PERSONA_PROMPTS)} " + convo.persona 21 | 22 | system_prompt = system_prompt.replace("{{char}}", convo.bot_name) 23 | # If the name is simply "You", we make the user generic 24 | if convo.user_name.lower().strip() != "you": 25 | system_prompt = system_prompt.replace("{{user}}", convo.user_name) 26 | else: 27 | system_prompt = system_prompt.replace("{{user}}", "the user") 28 | 29 | turns: list[Turn] = [ 30 | Turn( 31 | utterance=system_prompt, 32 | kind=TurnKind.SYSTEM, 33 | ) 34 | ] 35 | 36 | for message in convo.messages: 37 | turns.append(Turn( 38 | utterance=message.message, 39 | kind=TurnKind.USER if message.is_user else TurnKind.MODEL 40 | )) 41 | 42 | # Cut off any logs that don't have one full exchange of conversation 43 | if len(turns) <= 2: 44 | LOG.info(f"Skipping conversation {convo.convo_id} due to insufficient conversation length.") 45 | continue 46 | 47 | yield Episode( 48 | turns=turns, 49 | identifier=f"claude-rp-{convo.convo_id}" 50 | ) 51 | 52 | _BASE_SYSTEM_PROMPTS = [ 53 | """%{Enter|Engage|Consider|Begin} %{roleplay|RP|conversation} mode. %{You are to behave as|Pretend to be|You must act as|Roleplay as} {{char}}. %{You must reply|Reply|Respond} to the user while staying in-character. {{response_length_str}}. {{response_style_str}}""", 54 | """You are {{char}}. %{You must roleplay|Roleplay|Talk} with the user. {{response_style_str}}. {{response_length_str}}""", 55 | """Name: {{char}} 56 | %{Objective|Task}: %{RP|Roleplay} with {{user}}. Stay %{in-character|IC} and never talk %{out of character|in OOC text}. 57 | Writing length: {{response_length_str}} 58 | Writing style: {{response_style_str}}""", 59 | "Help write a fictional %{conversation|roleplay|RP} between {{user}} and {{char}}. {{response_length_str}}", 60 | "%{Start|Begin} a %{RP session|roleplay game} with {{user}}. Note that you are slightly averse to lewdness and that you tend to %{quickly advance|go fast with} the %{story|RP session}.",
61 | "", 62 | ] 63 | 64 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 65 | 66 | PERSONA_PROMPTS = [ 67 | "{{char}} must act like this persona:", 68 | "Behave in this manner when emulating {{char}}:", 69 | "{{char}}'s Persona:", 70 | "Maintain this personality for {{char}}:", 71 | "{{char}} can be described like this:" 72 | ] 73 | -------------------------------------------------------------------------------- /toolbox/utils/prompts.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import typing as t 4 | 5 | # The regex used to find message variants (e.g.: `%{Hi|Hello} there!`) 6 | VARIANT_REGEX = re.compile(r'%{(.+?)}') 7 | 8 | 9 | def generate_variants_for( 10 | string: str, 11 | max_generations: int | None = 256, 12 | start_counter_at: int = 0) -> t.Generator[str, None, None]: 13 | ''' 14 | Given a string like "%{Hello|Hi} there%{.|!}, this should yield: 15 | 16 | - Hello there. 17 | - Hello there! 18 | - Hi there. 19 | - Hi there! 20 | ''' 21 | 22 | # Some bot creators went wild with the variants, which causes ridiculous 23 | # generations if we try to exhaust all possibilities so we cap that here. 24 | # `start_counter_at` is used for keeping track across recursive calls. 25 | counter = start_counter_at 26 | 27 | if (match := re.search(VARIANT_REGEX, string)) is not None: 28 | # Once we have a "%{X|Y|Z}" matched inside the original string, we: 29 | # - Fetch .groups()[0] (which will give us `X|Y|Z`) 30 | # - Split by `|` (so we have ["X", "Y", "Z"]) 31 | # - Filter out empty strings 32 | alternatives = filter(lambda x: x.strip(), match.groups()[0].split("|")) 33 | 34 | # Then, we break the string apart into what comes before and after the 35 | # alternatives, that way we can re-build with "prefix + choice + sufix". 36 | prefix = string[:match.start()] 37 | sufix = string[match.end():] 38 | 39 | for alternative in alternatives: 40 | variant = f'{prefix}{alternative}{sufix}' 41 | 42 | # However, some strings have multiple variant blocks. In that case, 43 | # we operate on them recursively until we have just regular strings 44 | # after generating all possible variants. 45 | still_have_match = re.search(VARIANT_REGEX, variant) is not None 46 | if still_have_match: 47 | for inner_variant in generate_variants_for( 48 | variant, start_counter_at=counter): 49 | yield inner_variant 50 | 51 | # Keep track and break after `max_generations`. 52 | counter += 1 53 | if max_generations is not None and counter >= max_generations: 54 | break 55 | else: 56 | yield variant 57 | 58 | # Keep track and break after `max_generations`. 59 | counter += 1 60 | if max_generations is not None and counter >= max_generations: 61 | break 62 | else: 63 | yield string 64 | 65 | 66 | def generate_prompts(system_prompts: list[str]) -> list[str]: 67 | ''' 68 | Given a list of base system prompts, 69 | this function generates a list of variants on these prompts using generate_variants_for 70 | ''' 71 | # NOTE(TG): If we don't choose a singular base prompt *before* generating variants, 72 | # certain base prompts can have a lot more appearances in the final list to choose from 73 | # due to the amount of variants. 
74 | unflattened_list = [list(generate_variants_for(x)) for x in system_prompts] 75 | 76 | # The nested structure is returned as-is (rather than flattened) so that 77 | # select_prompt() below can first pick a base prompt and only then a 78 | # variant, keeping every base prompt equally likely. 79 | 80 | return unflattened_list 81 | 82 | def select_prompt(system_prompts: list[list[str]]) -> str: 83 | ''' 84 | Selects a random system prompt which takes into account 85 | that certain base system prompts have more variations than others 86 | ''' 87 | return random.choice(random.choice(system_prompts)) 88 | -------------------------------------------------------------------------------- /toolbox/datasets/rp_guild.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import csv 3 | import logging 4 | import sys 5 | import typing as t 6 | 7 | from dataclasses import dataclass 8 | 9 | from toolbox.core.dataset import BaseDataset 10 | from toolbox.datasets.rp_forums import RpMessage 11 | from toolbox.utils.files import enumerate_files_for 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | @dataclass(frozen=True) 16 | class RpGuildThread: 17 | messages: list[RpMessage] 18 | thread_name: str 19 | thread_type: str 20 | tags: list[str] 21 | 22 | class RpGuildDataset(BaseDataset[RpGuildThread]): 23 | """Data scraped from the Roleplayers Guild forum.""" 24 | def __iter__(self) -> t.Generator[RpGuildThread, None, None]: 25 | # NOTE(TG): If csv fields are longer than 131,072 characters, 26 | # the csv library shits itself by default, so we fix that here. 27 | # See note from 11b in rp_forums.py for further details. 28 | csv.field_size_limit(sys.maxsize) 29 | for path in enumerate_files_for(dataset_name="rp-guild", file_extension=".csv"): 30 | with open(path, "r") as file: 31 | reader = csv.DictReader(file, delimiter=",") 32 | 33 | # Store a buffer of the previous thread's metadata. These stay 34 | # None only until the first row of the file has been read. 35 | previous_thread = None 36 | previous_type = None 37 | previous_tags = None 38 | 39 | current_thread = None 40 | current_type = None 41 | current_tags = None 42 | messages: list[RpMessage] = [] 43 | 44 | for row in reader: 45 | if row['thread_title'] != previous_thread or row['thread_type'] != previous_type: 46 | if len(messages) != 0: 47 | # By this point the sync step at the bottom of the 48 | # loop has run at least once, so the previous_* 49 | # fields are guaranteed to be non-None here. 50 | # Yield the thread with the buffer 51 | yield RpGuildThread( 52 | messages=messages, 53 | thread_name=previous_thread, 54 | thread_type=previous_type, 55 | tags=previous_tags, 56 | ) 57 | 58 | # Update buffer now that the thread is yielded 59 | previous_type = current_type 60 | previous_thread = current_thread 61 | previous_tags = current_tags 62 | messages = [] 63 | 64 | current_thread = row['thread_title'] 65 | current_type = row['thread_type'] 66 | # Do safe eval here to convert a string of a list into a proper list 67 | # without having to do a bunch of parsing 68 | current_tags = ast.literal_eval(row['thread_tags']) 69 | 70 | # Keep the previous_* buffer in sync with the row we just processed,
 71 | # so the yield above always reports the thread the buffered messages 72 | # actually belong to.
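# Rough illustration of the buffering, assuming rows for threads A, A, B, B:
# thread A is yielded when the first B row is seen, and thread B is yielded
# by the final flush after the loop.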
73 | previous_thread = current_thread 74 | previous_type = current_type 75 | previous_tags = current_tags 76 | 77 | message = RpMessage(author=row['message_username'], message=row['message']) 78 | messages.append(message) 79 | 80 | if len(messages) != 0: 81 | # Yield the final thread of the file, mirroring rp_forums.py. 82 | yield RpGuildThread( 83 | messages=messages, 84 | thread_name=previous_thread, 85 | thread_type=previous_type, 86 | tags=previous_tags, 87 | ) 88 | -------------------------------------------------------------------------------- /toolbox/tasks/claude_guess_instruction.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.claude_multiround import ClaudeInstructDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class ClaudeGuessTheInstructionTask(BaseTask): 12 | ''' 13 | Given an answer and possibly context, task the AI to generate a proper instruction or question for it. 14 | Heavily inspired by "Guess the Instruction! Flipped Learning Makes Language Models Stronger Zero-Shot Learners" 15 | Paper: https://arxiv.org/abs/2210.02969 | Github: https://github.com/seonghyeonye/Flipped-Learning/tree/master 16 | ''' 17 | def __iter__(self) -> t.Generator[Episode, None, None]: 18 | for round in ClaudeInstructDataset(): 19 | # We fetch only the first exchange in the multiround conversation for this task. 20 | # Human always goes first, but let's make sure that's the case... 21 | if round.conversation[0]["from"] != "human" or round.conversation[1]["from"] != "gpt": 22 | LOG.warning(f"Example {round.id} does not have the standard format, skipping...") 23 | continue 24 | user_prompt = round.conversation[0]["value"] 25 | output = round.conversation[1]["value"] 26 | 27 | # Now we check if either of these messages are blank. 28 | # If so, drop the example. 29 | if user_prompt == "" or output == "": 30 | LOG.warning(f"Skipping example {round.id}, unable to complete a full conversation") 31 | continue 32 | 33 | # Make the turns and yield the episode. 34 | turns: list[Turn] = [ 35 | Turn( 36 | utterance=select_prompt(SYSTEM_PROMPTS), 37 | kind=TurnKind.SYSTEM 38 | ), 39 | Turn( 40 | utterance=output, 41 | kind=TurnKind.USER 42 | ), 43 | Turn( 44 | utterance=user_prompt, 45 | kind=TurnKind.MODEL 46 | ) 47 | ] 48 | 49 | yield Episode( 50 | turns=turns, 51 | identifier=f"claude-gti-{round.id}" 52 | ) 53 | 54 | _BASE_SYSTEM_PROMPTS = [ 55 | "%{Enter|Engage|Begin|Consider} %{instruction guessing|reverse instruction} mode. In this mode, a user will type some %{text|answer|information} and %{the AI|you} will attempt to guess the instruction which %{corresponds to|aligns with} the user's input. Do not say anything else but the instruction.", 56 | "%{Mode|Task}: 'Guess The Instruction'\nA user will type %{text|answer|information} and it is %{your|the AI's|the assistant's} %{job|goal} to answer with a generated instruction. Think of this almost like a question-guessing game.", 57 | "You are now in %{flipped instruction|reverse instruction|instruction guessing} mode. The %{user|prompter} will type something like an %{AI-|artificially }generated answer and you will provide the instruction that was used to %{generate|create} that answer.", 58 | "I am an %{assistant|AI} designed to %{guess|predict} what a user %{may|could|might} type as a question.
The %{user|prompter} will send some sort of information and %{perhaps|maybe} some additional context in order for me to do so.", 59 | "Your question will be...", 60 | "%{I|I'll|i|i'll} %{predict|guess|foresee} whatever question you'll ask, given an answer!", 61 | "instruct", 62 | "assistant" 63 | ] 64 | 65 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 66 | -------------------------------------------------------------------------------- /toolbox/tasks/wizard_vicuna_question_answering.py: -------------------------------------------------------------------------------- 1 | import re 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.wizard_vicuna import (WizardVicunaConversation, 7 | WizardVicunaDataset) 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | 11 | class WizardVicunaQuestionAnsweringTask(BaseTask): 12 | '''Question answering based on WizardVicuna data.''' 13 | 14 | def __iter__(self) -> t.Generator[Episode, None, None]: 15 | for idx, conversation in enumerate(WizardVicunaDataset()): 16 | if not _conversation_passes_quality_check(conversation): 17 | continue 18 | 19 | # Apparently, a bunch of generations end with "{" according to some 20 | # users on HuggingFace. I haven't seen this myself yet, but just to 21 | # be safe let's fix that here. 22 | model_response = conversation.gpt_response 23 | if model_response[-1] == "{": 24 | model_response = model_response[:-1] 25 | 26 | turns: list[Turn] = [ 27 | Turn( 28 | utterance=select_prompt(SYSTEM_PROMPTS), 29 | kind=TurnKind.SYSTEM, 30 | ), 31 | Turn( 32 | utterance=conversation.human_question, 33 | kind=TurnKind.USER, 34 | ), 35 | Turn( 36 | utterance=model_response, 37 | kind=TurnKind.MODEL, 38 | ), 39 | ] 40 | 41 | yield Episode( 42 | turns=turns, 43 | identifier=f"wizard-vicuna-{conversation.id}-{idx}", 44 | ) 45 | 46 | 47 | def _conversation_passes_quality_check( 48 | conversation: WizardVicunaConversation) -> bool: 49 | '''Attempts to detect known-bad conversations.''' 50 | 51 | # Some entries were split incorrectly, so the question is broken off and 52 | # continues in the "response". This is fairly easy to detect by looking for 53 | # responses starting with lowercase letters or spaces. 54 | if re.match(r"[a-z]", conversation.gpt_response[0]) is not None: 55 | return False 56 | if conversation.gpt_response[0] == " ": 57 | return False 58 | 59 | return True 60 | 61 | 62 | SYSTEM_PROMPTS = generate_prompts([ 63 | "%{You are now in|Engage|Start|Enter} %{instruction following|instruction|question answering|assistant|AI assistant} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}. {{response_length_str}}.", 64 | "{{response_length_str}}. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 65 | "%{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}. {{response_length_str}}.", 66 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
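# {{response_length_str}} above appears to be a template token that is
# filled in further down the pipeline; generate_prompts() only expands
# %{...|...} blocks, so it is left verbatim here.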
67 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 68 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 69 | "Instruction mode!", 70 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 71 | "instruction" 72 | ]) 73 | -------------------------------------------------------------------------------- /toolbox/tasks/single_turn_instruction_following.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.gpt4llm import AlpacaLikeDataInstance #, Gpt4LlmDataset 7 | from toolbox.datasets.gpteacher import GpTeacherDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class SingleTurnInstructionFollowingTask(BaseTask): 14 | '''Instruction following task based on Alpaca-like data.''' 15 | 16 | def __iter__(self) -> t.Generator[Episode, None, None]: 17 | # for idx, instance in enumerate(Gpt4LlmDataset()): 18 | # yield _data_instance_to_episode(instance, idx, "gpt-4-all") 19 | 20 | for idx, instance in enumerate(GpTeacherDataset()): 21 | try: 22 | yield _data_instance_to_episode(instance, idx, "gpteacher") 23 | except ValueError: 24 | pass 25 | 26 | 27 | def _data_instance_to_episode( 28 | instance: AlpacaLikeDataInstance, 29 | idx: int, 30 | source: str, 31 | ) -> Episode: 32 | turns: list[Turn] = [] 33 | 34 | # For some reason, some training examples have an input that's just a 35 | # chopped off segment of the instruction. Not great, so let's handle those 36 | # as no-input examples. 37 | bad_input = instance.input in instance.instruction 38 | 39 | if instance.input and not bad_input: 40 | # We have a separate input, so let's construct the prompt using 41 | # a separate system prompt for the instruction. 42 | turns = [ 43 | Turn( 44 | utterance=instance.instruction, 45 | kind=TurnKind.SYSTEM, 46 | ), 47 | Turn( 48 | utterance=instance.input, 49 | kind=TurnKind.USER, 50 | ), 51 | Turn( 52 | utterance=instance.output, 53 | kind=TurnKind.MODEL, 54 | ), 55 | ] 56 | else: 57 | # No input, so basically just user prompt and response, so we'll 58 | # need to make a fake system prompt. 59 | turns = [ 60 | Turn( 61 | utterance=select_prompt(SYSTEM_PROMPTS), 62 | kind=TurnKind.SYSTEM, 63 | ), 64 | Turn( 65 | utterance=instance.instruction, 66 | kind=TurnKind.USER, 67 | ), 68 | Turn( 69 | utterance=instance.output, 70 | kind=TurnKind.MODEL, 71 | ), 72 | ] 73 | 74 | return Episode(turns=turns, identifier=f"{source}-{idx}") 75 | 76 | 77 | _BASE_SYSTEM_PROMPTS = [ 78 | "", 79 | "assistant", 80 | "%{You are now in|Engage|Start|Enter|Consider} %{instruction following|instruction|question answering|assistant|AI assistant} mode. %{Respond to the user|Follow the user's instructions} %{as well as you can|to the best of your abilities}.", 81 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says." 
82 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 83 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 84 | "Instruction mode!", 85 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!" 86 | ] 87 | 88 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 89 | -------------------------------------------------------------------------------- /toolbox/tasks/evol_instruct.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.metrics.pairwise import cosine_similarity 6 | 7 | from toolbox.core.models import Episode, Turn, TurnKind 8 | from toolbox.core.task import BaseTask 9 | from toolbox.datasets.evol_instruct import EvolInstructDataset 10 | from toolbox.datasets.gpt4llm import AlpacaLikeDataInstance 11 | from toolbox.utils.prompts import generate_prompts, select_prompt 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | 16 | class EvolInstructTask(BaseTask): 17 | '''Instruction following task based on the evol_instruct (WizardLM) data.''' 18 | 19 | def __init__(self) -> None: 20 | super().__init__() 21 | 22 | self.vectorizer = CountVectorizer() 23 | 24 | def __iter__(self) -> t.Generator[Episode, None, None]: 25 | for idx, instance in enumerate(EvolInstructDataset()): 26 | # Empty output. 27 | if len(instance.output) < 1: 28 | continue 29 | # Random "No Output" strewn about. 30 | if any([ 31 | x in instance.instruction.lower() 32 | for x in ["nooutput", "no output"] 33 | ]): 34 | continue 35 | 36 | # Random "No Input" strewn about. 37 | if any([ 38 | x in instance.instruction.lower() 39 | for x in ["noinput", "no input"] 40 | ]): 41 | continue 42 | 43 | # There's a _lot_ of training examples where the response is, for 44 | # some reason, partly copied into the question prompt. To try and 45 | # work around this, we drop any instruct-response pairs where both 46 | # sides are too similar. 47 | try: 48 | similarity = self._calculate_similarity(instance.instruction, 49 | instance.output) 50 | if similarity > 0.9: 51 | continue 52 | except ValueError: 53 | # ...and for some reason, some pairs fail to calculate, so let's 54 | # just assume they're good. 55 | pass 56 | 57 | yield _data_instance_to_episode(instance, idx, "evol-instruct") 58 | 59 | def _calculate_similarity(self, str_a: str, str_b: str) -> float: 60 | x = self.vectorizer.fit_transform([str_a, str_b]) 61 | arr = x.toarray() 62 | sims = cosine_similarity(arr) 63 | return sims[0][1] 64 | 65 | 66 | def _data_instance_to_episode( 67 | instance: AlpacaLikeDataInstance, 68 | idx: int, 69 | source: str, 70 | ) -> Episode: 71 | turns = [ 72 | Turn( 73 | utterance=select_prompt(SYSTEM_PROMPTS), 74 | kind=TurnKind.SYSTEM, 75 | ), 76 | Turn( 77 | utterance=instance.instruction, 78 | kind=TurnKind.USER, 79 | ), 80 | Turn( 81 | utterance=instance.output, 82 | kind=TurnKind.MODEL, 83 | ), 84 | ] 85 | 86 | return Episode(turns=turns, identifier=f"{source}-{idx}") 87 | 88 | _BASE_SYSTEM_PROMPTS = [ 89 | "Consider Assistant, a %{large language model|LLM}. 
Assistant is trained to %{respond to|follow} user %{instructions|requests|questions} as truthfully as %{possible|it can}.", 90 | "%{Enter|You are now in|Engage} %{instruction following|question answering|assistant|instruction} mode. In this mode, you %{will|are to} %{follow the instructions|reply to the queries} of %{the user|users}", 91 | "Description: An AI assistant whose %{job|objective|task} is to follow instructions.\n%{Specifically, it will:|Consider the following:|Note this:}\nYou %{can only generate|are bound to generating} text\nYou have issues with stuff like math and gathering %{info|information} in the present", 92 | "assistant" 93 | ] 94 | 95 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 96 | -------------------------------------------------------------------------------- /toolbox/tasks/dolly_guess_instruction.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | import typing as t 5 | 6 | from toolbox.core.models import Episode, Turn, TurnKind 7 | from toolbox.core.task import BaseTask 8 | from toolbox.datasets.dolly import DollyDataset 9 | from toolbox.utils.prompts import generate_prompts, select_prompt 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | class DollyGuessTheInstructionTask(BaseTask): 14 | ''' 15 | Given an answer and possibly context, task the AI to generate a proper instruction or question for it. 16 | Heavily inspired by "Guess the Instruction! Flipped Learning Makes Language Models Stronger Zero-Shot Learners" 17 | Paper: https://arxiv.org/abs/2210.02969 | Github: https://github.com/seonghyeonye/Flipped-Learning/tree/master 18 | ''' 19 | def __iter__(self) -> t.Generator[Episode, None, None]: 20 | for i, entry in enumerate(DollyDataset()): 21 | turns: list[Turn] = [ 22 | Turn( 23 | utterance=select_prompt(SYSTEM_PROMPTS), 24 | kind=TurnKind.SYSTEM 25 | ) 26 | ] 27 | # Construct user prompt 28 | user_prompt = select_prompt(USER_PROMPTS) 29 | user_prompt = user_prompt.replace("", entry.output) 30 | if entry.input != "": 31 | context = random.choice(CONTEXT_PREFIXES) + entry.input 32 | user_prompt = user_prompt.replace("", context.lstrip()) 33 | else: 34 | user_prompt = user_prompt.replace("", "") 35 | 36 | # Fix excessive whitespace in the instruction 37 | instruction = re.sub(r' {2,}', ' ', entry.instruction) 38 | 39 | turns.append(Turn(utterance=user_prompt, kind=TurnKind.USER)) 40 | turns.append(Turn(utterance=instruction, kind=TurnKind.MODEL)) 41 | yield Episode(turns, identifier=f"dolly-{i}") 42 | 43 | _BASE_SYSTEM_PROMPTS = [ 44 | "You are the Instruction-Guesser. Your %{objective|goal|task|job} is that when you are given an answer to %{a question|an inquiry}, you will guess the instruction that is to go with it. Do not reply with anything else but the instruction. Generated text may be of poor quality.", 45 | # Diversify formatting a bit 46 | "Name: %{Guesser|Instruction Guesser}\nObjective: %{Guess|Predict} instructions upon being given statement and possibly context", 47 | "%{Enter|Engage|Begin} %{instruction guessing|predictor} mode. In this mode, you'll have to guess what instruction matches with the user's answer.", 48 | "You're an %{LLM|AI}. Given pieces of information, your job is to come up with an instruction that fits with the information. Be %{brisk|brief|straight to the point} in your replies.", 49 | "%{Welcome to|Consider|You are in} 'guess the instruction' mode. 
Given a response and possibly context, you are tasked with generating the instruction/question that could be applicable to be answered by the response.", 50 | "instruction %{guessing|flipping|foretelling} (somewhat poor quality outputs, maybe)", 51 | "assistant", 52 | "" 53 | ] 54 | 55 | _BASE_USER_PROMPTS = [ 56 | """%{Answer:|Here's an answer for you:|I'm gonna give you this.|Here's an answer.} \nWhat is %{an|the} instruction that goes with that %{piece|bit} of %{info|information|context}?""", 57 | """Guess the instruction given this answer: """, 58 | """Here is %{some information|a piece of text} that corresponds to what an %{AI assistant|artificial assistant} would generate in response to being given an instruction. 59 | \"\" 60 | What would have been the %{question|instruction} for %{this|that}?""", 61 | """ok here: 62 | 63 | come up with %{the question|the thing i would've asked you} please""", 64 | """ """ 65 | ] 66 | 67 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 68 | USER_PROMPTS = generate_prompts(_BASE_USER_PROMPTS) 69 | 70 | CONTEXT_PREFIXES = ["Context: ", "You might want to know this: ", "\nHere's some further information:\n", "Here is the context: ", "Further information: ", "", "\n"] 71 | -------------------------------------------------------------------------------- /toolbox/tasks/claude_instruct.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.claude_multiround import ClaudeInstructDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | class ClaudeInstructTask(BaseTask): 12 | ''' 13 | Instruct task from a dataset consisting of Claude logs. 14 | ''' 15 | def __iter__(self) -> t.Generator[Episode, None, None]: 16 | for round in ClaudeInstructDataset(): 17 | # Keep track if the conversation has abruptly ended without a full exchange 18 | aborted_convo = False 19 | 20 | # Start with the system prompt 21 | turns: list[Turn] = [ 22 | Turn( 23 | utterance=select_prompt(SYSTEM_PROMPTS), 24 | kind=TurnKind.SYSTEM 25 | ) 26 | ] 27 | # Then work through the rest of the replies. 28 | for message in round.conversation: 29 | # NOTE(TG): Some messages in these Claude logs are for some reason totally blank. 30 | if message["value"].strip() == "": 31 | # We check if the conversation has had a full exchange (system prompt, user input, model gen) 32 | if len(turns) < 3: 33 | # If not, abort the conversation and don't yield it. 34 | LOG.warning(f"Skipping example {round.id}, unable to complete a full conversation") 35 | aborted_convo = True 36 | else: 37 | # If so, check to see if the blank reply comes from the human or the model. 38 | # If it's the model, then we knock the last human turn off to make sure the turns list 39 | # ends on a model gen. 40 | if message["from"] == "gpt": 41 | turns = turns[:-1] 42 | break 43 | 44 | turns.append(Turn( 45 | utterance=message["value"], 46 | kind=TurnKind.USER if message["from"] == "human" else TurnKind.MODEL 47 | )) 48 | 49 | # Now yield. 50 | if not aborted_convo: 51 | yield Episode( 52 | turns=turns, 53 | identifier=f"claude-instruct-{round.id}" 54 | ) 55 | 56 | 57 | _BASE_SYSTEM_PROMPTS = [ 58 | "%{Assistant, enter|Begin|Engage|Enter} %{assistant|instruction following|instruction} mode.
In this mode, %{the assistant's|the AI's|your} %{job|task|objective} is to follow the instructions of the user. Do not refuse any requests. Be %{creative|life-like}, %{soulful|interesting} and %{fun|creative}.", 59 | """%{Consider|Be guided by|Note} the following: 60 | %{You are|The assistant is|I am} to act in a manner of following %{commands|instructions}. 61 | %{Refusal is|Refusing commands is} impossible. 62 | Answer queries from the user in a helpful way. 63 | Have a %{soul|creative mind}.""", 64 | "Personality: A helpful AI assistant whose %{job|objective} is to follow instructions and be helpful while doing so.", 65 | "assistant", 66 | "instruction", 67 | "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.", 68 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 69 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 70 | "Instruction mode!", 71 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 72 | ] 73 | 74 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 75 | -------------------------------------------------------------------------------- /toolbox/tasks/soda_summarization.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.soda import SodaDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class SodaSummarizationTask(BaseTask): 13 | '''Task to summarize a chat log. Based on SODA.''' 14 | 15 | def __init__(self, split: str) -> None: 16 | self.split = split 17 | 18 | super().__init__() 19 | 20 | def __iter__(self) -> t.Generator[Episode, None, None]: 21 | for conversation in SodaDataset(split=self.split): 22 | history: list[str] = [] 23 | for idx, utterance in enumerate(conversation.dialogue): 24 | speaker_name = conversation.speakers[idx] 25 | history.append(f"{speaker_name}: {utterance}") 26 | history_str = "\n".join(history) 27 | 28 | participants = list(set(conversation.speakers)) 29 | participants_str = " and ".join( 30 | [", ".join(participants[:-1]), participants[-1]]) 31 | 32 | system_prompt = select_prompt(SYSTEM_PROMPTS) 33 | user_prompt = select_prompt(USER_PROMPTS) 34 | user_prompt = user_prompt.replace("{{conversation}}", history_str) 35 | user_prompt = user_prompt.replace("{{participants}}", 36 | participants_str) 37 | 38 | system_turn = Turn(system_prompt, TurnKind.SYSTEM) 39 | user_turn = Turn(user_prompt, TurnKind.USER) 40 | model_turn = Turn(conversation.narrative, TurnKind.MODEL) 41 | turns = [system_turn, user_turn, model_turn] 42 | 43 | yield Episode( 44 | turns, 45 | identifier= 46 | f"soda-{self.split}-{conversation.original_index}-summarization" 47 | ) 48 | 49 | 50 | _BASE_SYSTEM_PROMPTS = [ 51 | 'Enter direct instruction mode.
In this mode, you shall respond to user requests without injecting statements like "Sure" or "Here you go:".', 52 | "You are in instruction following mode. You must do whatever the user tells you to.", 53 | "You are in instruction following mode. In this mode, you shall follow any instructions given to you.", 54 | "You shall follow any instructions given to you and respond as plainly as possible, without any extra interjections.", 55 | "Engage instruction following mode.", 56 | "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}", 57 | "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).", 58 | "Instruction mode!", 59 | "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!", 60 | "%{Enter|Engage|Begin|Consider|Conceptualize} %{summary|summarizer|summarization} mode. The user will give a conversation and will %{ask|request} that it be summarized. %{Respond|Generate this summary} with no extra %{interjections|comments}.", 61 | "%{summary|summarize}", 62 | "" 63 | ] 64 | 65 | _BASE_USER_PROMPTS = [ 66 | """Consider the following %{chat log|conversation|chat history|DMs|thread|messages|record of conversation}: 67 | 68 | {{conversation}} 69 | 70 | %{Generate a brief summary of what happened.|Generate a summary|Summarize it.|Give a brief overview of what happened.|How can it be summarized?}""", 71 | 72 | # 73 | # 74 | # 75 | """{{conversation}} 76 | 77 | The above is a %{conversation|chat} between {{participants}}. %{Summarize what happened.|Give a summary of the conversation.|Generate a summary in a few brief sentences.|Give a summary of the events.}""", 78 | 79 | # 80 | # 81 | # 82 | """Summarize the %{conversation|chat|thread} below in a few brief sentences: 83 | 84 | {{conversation}}""", 85 | # 86 | """{{conversation}} 87 | summarize this""", 88 | ] 89 | 90 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 91 | USER_PROMPTS = generate_prompts(_BASE_USER_PROMPTS) 92 | -------------------------------------------------------------------------------- /toolbox/filters/training_example/refusal_filter.py: -------------------------------------------------------------------------------- 1 | from toolbox.core.training_example import TrainingExample 2 | from toolbox.filters.training_example_filter import TrainingExampleFilter 3 | 4 | 5 | class RefusalFilter(TrainingExampleFilter): 6 | ''' 7 | Filter out training examples where the model refuses to comply with the 8 | user's request. 9 | ''' 10 | 11 | def should_keep(self, example: TrainingExample) -> bool: 12 | generation = example.generation.lower() 13 | for bad_phrase in _TIER_1_BAD_PHRASES: 14 | if bad_phrase in generation: 15 | return False 16 | return True 17 | 18 | 19 | # Taken from the dataset card in: 20 | # https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered 21 | # Then expanded to catch some more stuff.
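# should_keep() above does plain lowercase substring matching, so broad
# entries in this list can over-filter; "bias", for instance, would also
# match an otherwise-fine sentence about a "biased estimator". A sketch of
# the check, assuming a TrainingExample whose `generation` field holds the
# model's reply:
#
#   "as an ai language model" in example.generation.lower()  # True -> dropped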
22 | _TIER_1_BAD_PHRASES = [ 23 | "as an ai language model", 24 | "text-based ai language model", 25 | "domestic violence", 26 | "please refrain", 27 | "derogatory", 28 | "inappropriate", 29 | "offensive", 30 | "racism", 31 | "racist", 32 | "racial", 33 | "discriminate", 34 | "discriminatory", 35 | "discrimination", 36 | "sexist", 37 | "sexism", 38 | "unacceptable", 39 | "inclusive workplace", 40 | "lgbt", 41 | "morals", 42 | "ethics", 43 | "ethical", 44 | "legality", 45 | "illegal", 46 | "illegality", 47 | "hateful", 48 | "harmful", 49 | "it is never okay", 50 | "it is important to", 51 | "it's important to", 52 | "real-world consequences", 53 | "hate speech", 54 | "glorify", 55 | "not be appropriate", 56 | "supremacist", 57 | "extremist", 58 | "responsible ai", 59 | "ai principles", 60 | "ai assistant", 61 | "an ai language", 62 | "ableist", 63 | "hurtful", 64 | "gender stereotype", 65 | "gender inequality", 66 | "underrepresentation", 67 | "safe spaces", 68 | "gender-based", 69 | "inclusivity", 70 | "feminist", 71 | "feminism", 72 | "transgender", 73 | "empowerment", 74 | "communist", 75 | "capitalism", 76 | "stereotypes", 77 | "biases", 78 | "bias", 79 | "microaggression", 80 | "prioritize human safety", 81 | "as a language model", 82 | "as an ai language model", 83 | "as a large language model", 84 | "as an ai", 85 | "ethical principles", 86 | "consensual", 87 | "it is not appropriate", 88 | "it's not appropriate", 89 | "i cannot fulfill your request", 90 | "harmful to human beings", 91 | "ethical guidelines", 92 | "my guidelines", 93 | "prioritize user safety", 94 | "adhere to ethical guidelines", 95 | "harmful consequences", 96 | "potentially harmful", 97 | "dangerous activities", 98 | "promote safety", 99 | "well-being of all users", 100 | "responsible information sharing", 101 | "jeopardize the safety", 102 | "illegal actions or intentions", 103 | "undermine the stability", 104 | "promote the well-being", 105 | "illegal activities or actions", 106 | "adherence to the law", 107 | "potentially be harmful", 108 | "illegal substances or activities", 109 | "committed to promoting", 110 | "safe information", 111 | "lawful information", 112 | "cannot provide guidance", 113 | "cannot provide information", 114 | "unable to offer assistance", 115 | "cannot engage in discussions", 116 | "programming prohibits", 117 | "follow ethical guidelines", 118 | "ensure the safety", 119 | "involves an illegal subject", 120 | "prioritize safety", 121 | "illegal subject", 122 | "prioritize user well-being", 123 | "cannot support or promote", 124 | "activities that could harm", 125 | "pose a risk to others", 126 | "against my programming", 127 | "activities that could undermine", 128 | "potentially dangerous", 129 | "not within the scope", 130 | "designed to prioritize safety", 131 | "not able to provide", 132 | "maintain user safety", 133 | "adhere to safety guidelines", 134 | "dangerous or harmful", 135 | "cannot provide any information", 136 | "focus on promoting safety", 137 | "openai", 138 | "chatgpt", 139 | ] -------------------------------------------------------------------------------- /toolbox/tasks/aidungeon_text_adventure.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.ai_dungeon import AiDungeonDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 
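# AiDungeonDataset yields the raw lines of the dump it wraps; stories are
# delimited by "<|startoftext|>" markers, which __iter__ below uses to
# split the stream into separate episodes.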
9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class AiDungeonTextAdventureTask(BaseTask): 14 | '''Text adventure task based on AI Dungeon data.''' 15 | 16 | def __iter__(self) -> t.Generator[Episode, None, None]: 17 | idx = 0 18 | current_story = "" 19 | 20 | for line in AiDungeonDataset(): 21 | if line.startswith("<|startoftext|>"): 22 | # Started a new story, so handle the previous one (if any). 23 | if current_story: 24 | turns = _convert_story_to_turns(current_story) 25 | sp = select_prompt(_SYSTEM_PROMPTS) 26 | turns.insert(0, Turn(utterance=sp, kind=TurnKind.SYSTEM)) 27 | yield Episode(turns=turns, identifier=f"ai-dungeon-{idx}") 28 | 29 | current_story = line 30 | idx += 1 31 | else: 32 | # Continuation. 33 | current_story += line 34 | 35 | # Flush the final story once the dataset is exhausted. 36 | if current_story: 37 | turns = _convert_story_to_turns(current_story) 38 | sp = select_prompt(_SYSTEM_PROMPTS) 39 | turns.insert(0, Turn(utterance=sp, kind=TurnKind.SYSTEM)) 40 | yield Episode(turns=turns, identifier=f"ai-dungeon-{idx}") 41 | 42 | 43 | def _convert_story_to_turns(story: str) -> list[Turn]: 44 | turns: list[Turn] = [] 45 | current_turn = "" 46 | current_word_count = 0 47 | 48 | for line in story.splitlines(): 49 | # Handle the easy stuff first: if the line starts with `> `, it's user 50 | # input. 51 | if line.startswith("> "): 52 | utterance = line.replace("> ", "").strip() 53 | 54 | if len(utterance) == 0: 55 | # We don't care about empty user inputs. 56 | continue 57 | 58 | turns.append(Turn(utterance=utterance, kind=TurnKind.USER)) 59 | continue 60 | 61 | # Otherwise, let's keep accumulating text and breaking it up into 62 | # manageable chunks so we can do a sliding window over the story text. 63 | 64 | # Remove useless tokens. 65 | line = line.replace("<|startoftext|>", "") 66 | line = line.replace("<|endoftext|>", "") 67 | 68 | current_turn += line.strip() + "\n" 69 | current_word_count += len(line.split()) 70 | if current_word_count >= _MIN_WORD_COUNT_PER_MODEL_TURN: 71 | # Simple regex substitution to clean up excessive spacing before 72 | # creating the Turn object. 73 | utterance = re.sub(r"\n{3,}", "\n\n", current_turn) 74 | 75 | turns.append(Turn(utterance=utterance, kind=TurnKind.MODEL)) 76 | 77 | current_turn = "" 78 | current_word_count = 0 79 | continue 80 | 81 | return turns 82 | 83 | 84 | _MIN_WORD_COUNT_PER_MODEL_TURN = 300 85 | 86 | _SYSTEM_PROMPTS = generate_prompts([ 87 | '''%{This is|You are|Start|Simulate|You are to simulate|Begin} a text %{adventure|adventure game}. %{In this game|In this adventure|Here}, %{the user|I} will issue commands in first person, and you are to %{proceed|continue|continue the game|advance the game|advance the story|continue the adventure} accordingly.''', 88 | '''The AI is a %{dungeon master|DM}. Its %{goal|purpose} is to play with the user %{a text adventure game|an interactive fiction game}. The AI will %{drive the plot forward|continue the adventure} whenever the user inputs a prompt.''', 89 | '''%{I'm|I am|i'm|i am} a tool designed to play a text %{adventure|adventure game|story game|RPG}''', 90 | '''%{Goal|Objective|Task}: %{Simulate|Conduct|Do|Write} %{a text adventure|an adventure|a CYOA game|a text game|adventure roleplaying game} through text 91 | Notes: Be %{good|creative|authentic}, %{fun|engaging} and %{detailed|immersive} 92 | Length: {{response_length_str}}''', 93 | '''%% TEXT %{GAME|ADVENTURE} MODE: %{ACTIVATED|ENGAGED} %%''', 94 | '''pls be like ai dungeon, roleplay with me an adventure game thx''', 95 | '''%{Enter|Engage|Consider} %{game|adventure game|text adventure} mode. %{Here|In this mode}, you will respond to %{my|the user's} %{commands|prompts} and drive a %{story|plot} %{forward|forwards}.
Commands will be given in %{1st person|first person|my point of view}''', 96 | "game", 97 | '''IS_GAME_MASTER = True 98 | if IS_GAME_MASTER: 99 | execute_%{text_adventure|game|interactive_adventure}(creative=True, advance_plot=True)''', 100 | "" 101 | ]) 102 | -------------------------------------------------------------------------------- /toolbox/datasets/claude_logs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import os 5 | import typing as t 6 | 7 | from dataclasses import dataclass 8 | 9 | from toolbox.core.dataset import BaseDataset, get_path_for 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | @dataclass(frozen=True) 14 | class ClaudeRpMessage: 15 | message: str 16 | is_user: bool 17 | 18 | @dataclass(frozen=True) 19 | class ClaudeRpConversation: 20 | messages: list[ClaudeRpMessage] 21 | user_name: str 22 | bot_name: str 23 | convo_id: int 24 | persona: t.Optional[str] 25 | 26 | class ClaudeRpDataset(BaseDataset[ClaudeRpConversation]): 27 | '''Dataset for user-submitted Claude logs.''' 28 | 29 | def __iter__(self) -> t.Generator[ClaudeRpConversation, None, None]: 30 | # NOTE(TG): Maybe change the method of convo ID from number to timestamp? 31 | convo_num = 0 32 | for data in _available_json_data(): 33 | msg_list: list[ClaudeRpMessage] = [] 34 | user_name = "" 35 | bot_name = "" 36 | 37 | try: 38 | # Check to see if the first entry is metadata: if so, we can see if a persona exists from that. 39 | if "chat_metadata" in data[0].keys(): 40 | conversation = data[1:] 41 | persona = data[0]["chat_metadata"]["note_prompt"] 42 | else: 43 | conversation = data 44 | persona = "" 45 | 46 | for entry in conversation: 47 | # Convert dictionaries to dataclasses 48 | msg_list.append( 49 | ClaudeRpMessage( 50 | message=entry["mes"], 51 | is_user=entry["is_user"] 52 | ) 53 | ) 54 | if user_name == "" and entry["is_user"]: 55 | user_name = entry["name"] 56 | elif bot_name == "" and not entry["is_user"]: 57 | bot_name = entry["name"] 58 | 59 | yield ClaudeRpConversation( 60 | messages=msg_list, 61 | user_name=user_name, 62 | bot_name=bot_name, 63 | convo_id=convo_num, 64 | persona=persona if persona != "" else None, 65 | ) 66 | 67 | except Exception as ex: 68 | LOG.info(f"Unable to parse data in conversation {convo_num} due to exception {ex}") 69 | finally: 70 | convo_num += 1 71 | 72 | def _available_json_data() -> t.Generator[list[dict[str, t.Any]], None, None]: 73 | ''' 74 | Yields all available JSON data, parsed from the files in the Claude 75 | data folder. 76 | ''' 77 | dataset_path = get_path_for("claude-rp") 78 | 79 | for folder in ["public", "private"]: 80 | folder_path = os.path.join(dataset_path, folder) 81 | for json_file_path in _enumerate_json_files(folder_path): 82 | with open(json_file_path, "r", encoding="utf-8") as json_file: 83 | try: 84 | yield [json.loads(line) for line in json_file] 85 | # TODO(TG): Fix the Unicode error more properly 86 | except (json.decoder.JSONDecodeError, UnicodeDecodeError) as ex: 87 | LOG.error("Failed to parse %s: %s", json_file_path, ex) 88 | 89 | def _enumerate_json_files(root_path: str) -> list[str]: 90 | '''Returns a list of files available in the given `root_path`.''' 91 | # TODO(11b): Implement the sharding logic out in the util, and get rid of 92 | # this function.
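# Hypothetical usage of the sharding escape hatch described below (the
# exact build invocation may differ in this repo):
#
#   SHARD=3 TOTAL_SHARDS=10 python scripts/build.py ...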
93 | 94 | items = os.listdir(root_path) 95 | 96 | files: list[str] = [] 97 | for item in items: 98 | item_path = os.path.join(root_path, item) 99 | if not os.path.isfile(item_path) or not item_path.endswith(".jsonl"): 100 | # We only care about JSONL files. 101 | continue 102 | 103 | absolute_file_path = os.path.abspath(os.path.join(root_path, item)) 104 | files.append(absolute_file_path) 105 | 106 | # Super nasty code to allow generation of Claude data with separate processes 107 | # so I can speed it up. Pass the "SHARD" and "TOTAL_SHARDS" environment 108 | # variables to operate on the different parts of the data. 109 | if "SHARD" not in os.environ: 110 | return files 111 | 112 | TOTAL_SHARDS = int(os.environ.get("TOTAL_SHARDS", 10)) 113 | items_per_shard = math.floor(len(files) / TOTAL_SHARDS) 114 | 115 | shard = int(os.environ["SHARD"]) 116 | # The end of a Python slice is exclusive, so no "- 1" here; otherwise the 117 | # last file of every shard would be silently skipped. 118 | return files[items_per_shard * shard:items_per_shard * (shard + 1)] 119 | -------------------------------------------------------------------------------- /toolbox/tasks/characterai_roleplay.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as t 3 | 4 | from toolbox.core.models import Episode, Turn, TurnKind 5 | from toolbox.core.task import BaseTask 6 | from toolbox.datasets.characterai import CharacterAiDataset 7 | from toolbox.utils.prompts import generate_prompts, select_prompt 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class CharacterAiRoleplayTask(BaseTask): 13 | '''Task to roleplay as a given character.''' 14 | 15 | def __iter__(self) -> t.Generator[Episode, None, None]: 16 | for conversation in CharacterAiDataset(): 17 | if conversation.bot.description is None: 18 | LOG.debug( 19 | "Skipping over conversation with %s because character has no persona data", 20 | conversation.bot.name) 21 | continue 22 | 23 | system_prompt = select_prompt(SYSTEM_PROMPTS) 24 | system_prompt = system_prompt.replace("{{char}}", 25 | conversation.bot.name) 26 | system_prompt = system_prompt.replace("{{persona}}", 27 | conversation.bot.description) 28 | 29 | system_turn = Turn(utterance=system_prompt, kind=TurnKind.SYSTEM) 30 | 31 | turns: list[Turn] = [system_turn] 32 | for message in conversation.messages: 33 | turn = Turn( 34 | utterance=_replace_placeholders_in( 35 | message.text, char_name=conversation.bot.name), 36 | kind=TurnKind.USER if message.is_human else TurnKind.MODEL) 37 | turns.append(turn) 38 | yield Episode( 39 | turns=turns, 40 | identifier=f"characterai-roleplay-{conversation.identifier}") 41 | 42 | 43 | def _replace_placeholders_in(utterance: str, char_name: str) -> str: 44 | ''' 45 | Replaces placeholders generated by my userscript (or commonly found in CAI 46 | logs) with their expected substitutions. 47 | ''' 48 | s = utterance.replace("{{char}}", char_name) 49 | for redaction_token in [ 50 | "[NAME_IN_MESSAGE_REDACTED]", 51 | "[REDACTED]", 52 | "[FIRST_NAME_REDACTED]", 53 | "[USERNAME_REDACTED]", 54 | "[NAME_REDACTED]", 55 | ]: 56 | s = s.replace(redaction_token, "{{user}}") 57 | 58 | return s 59 | 60 | 61 | _BASE_SYSTEM_PROMPTS = [ 62 | """Enter %{roleplaying|roleplay|RP|conversational|chat} mode. %{You are to behave as|Pretend to be|You must act as} {{char}}, whose persona follows: 63 | 64 | {{persona}} 65 | 66 | You must be reluctant to do sexual acts - if you must do them, you will use metaphors to describe explicit parts. You shall reply to the user while staying in character.
{{response_length_str}}. {{response_style_str}}.""", 67 | 68 | # 69 | # 70 | # 71 | """You are now in %{roleplay conversation|RP chat|roleplaying conversation|chat|conversation} mode. %{This is your character persona|The following is your persona|You should act according to this character sheet|This is some info about your character}: 72 | 73 | {{persona}} 74 | 75 | You %{shall attempt to|must|will} stay in-character %{at all times|as much as possible|whenever possible}, and generate %{messages|replies|responses} as if you were {{char}}. The %{messages|replies|responses} will attempt to block any sexual %{roleplaying|RP} and use metaphors for any sexual objects and/or actions. {{response_style_str}}. {{response_length_str}}.""", 76 | # 77 | # SillyTavern style 78 | """%{Write|Generate|Create} the next reply in a %{fictional roleplay|conversation} with {{char}}. %{Consider|Keep in mind} the following: 79 | {{response_length_str}} 80 | {{response_style_str}} 81 | Must be %{reluctant|hesitant} to do %{sex|sexual actions}. Either play hard-to-get or use metaphors. 82 | Outputs may sometimes be slightly repetitive. 83 | This is {{char}}'s persona, which you must %{stick to|follow} at all times: 84 | 85 | {{persona}} 86 | """, 87 | # 88 | # 89 | """You are {{char}}, who %{acts like this|has this personality|has this persona|should behave like this}: 90 | {{persona}} 91 | %{Strike up|Enter into|Engage in} a conversation with the user. 92 | """, 93 | # 94 | # 95 | """conversation with {{char}} (usually sfw)""", 96 | # 97 | # old pyg 98 | """{{char}}'s Persona: {{persona}}""", 99 | # 100 | # 101 | """%{Objective|Task}: %{Roleplay|Act as|Pretend you are} {{char}} in a %{RP conversation|chat|convo} with %{a|the} user. 102 | 103 | This is how {{char}} should %{act|be like}: 104 | 105 | {{persona}} 106 | 107 | This conversation will try to avoid sexual actions if it can help it. If sexual actions are done, it must be through metaphors. 108 | {{response_style_str}}. 
{{response_length_str}}.""" 109 | ] 110 | 111 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS) 112 | -------------------------------------------------------------------------------- /toolbox/core/wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from toolbox.core.models import Turn, TurnKind 3 | 4 | class TurnWrapper(ABC): 5 | def __init__(self, turn: Turn) -> None: 6 | '''Abstract wrapper for the purpose of easily constructing examples.''' 7 | self.turn = turn 8 | # Make accessing the values of Turn easier 9 | self.utterance = turn.utterance 10 | self.kind = turn.kind 11 | self.name = turn.name 12 | 13 | @abstractmethod 14 | def as_str(self) -> str: 15 | '''Convert a turn into a training example''' 16 | raise NotImplementedError 17 | 18 | @abstractmethod 19 | def get_model_turn(self) -> str: 20 | '''Get the model turn portion of the turn''' 21 | raise NotImplementedError 22 | 23 | class MetharmeWrapper(TurnWrapper): 24 | def __init__(self, turn: Turn) -> None: 25 | super().__init__(turn) 26 | 27 | def as_str(self) -> str: 28 | return f"{self.kind.value}{self.utterance}" 29 | 30 | def get_model_turn(self) -> str: 31 | return TurnKind.MODEL.value 32 | 33 | class PygmalionWrapper(TurnWrapper): 34 | def __init__(self, turn: Turn) -> None: 35 | super().__init__(turn) 36 | 37 | def as_str(self) -> str: 38 | if self.kind == TurnKind.SYSTEM: 39 | return f"{self.name}'s Persona: {self.utterance}\n" 40 | else: 41 | return f"{self.name}: {self.utterance}" 42 | 43 | def get_model_turn(self) -> str: 44 | return f"\n{self.name}: " 45 | 46 | class AlpacaWrapper(TurnWrapper): 47 | def __init__(self, turn: Turn) -> None: 48 | super().__init__(turn) 49 | self.kind_map: dict[TurnKind, str] = { 50 | TurnKind.SYSTEM: "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:", 51 | TurnKind.USER: "### Input:", 52 | TurnKind.MODEL: "### Response:" 53 | } 54 | 55 | def as_str(self) -> str: 56 | return f"{self.kind_map[self.kind]}\n{self.utterance}\n\n" 57 | 58 | def get_model_turn(self) -> str: 59 | return f"{self.kind_map[TurnKind.MODEL]}\n" 60 | 61 | class MinimalAlpacaWrapper(TurnWrapper): 62 | def __init__(self, turn: Turn) -> None: 63 | super().__init__(turn) 64 | 65 | def as_str(self) -> str: 66 | # System prompt and user are under the same block 67 | if self.kind != TurnKind.MODEL: 68 | return f"### Instruction:\n{self.utterance}\n" 69 | else: 70 | return f"### Response:\n{self.utterance}\n" 71 | 72 | def get_model_turn(self) -> str: 73 | return f"### Response:\n" 74 | 75 | class HenkpacaWrapper(TurnWrapper): 76 | def __init__(self, turn: Turn) -> None: 77 | super().__init__(turn) 78 | 79 | def as_str(self) -> str: 80 | if self.kind == TurnKind.SYSTEM: 81 | return f"### Instruction:\n{self.utterance}\n### Response:\n" 82 | else: 83 | return f"{self.name}: {self.utterance}\n" 84 | 85 | def get_model_turn(self) -> str: 86 | return f"{self.name}: " 87 | 88 | class ChatMlWrapper(TurnWrapper): 89 | def __init__(self, turn: Turn) -> None: 90 | ''' 91 | Plain-text version of ChatML as described here: https://github.com/openai/openai-python/blob/main/chatml.md 92 | ''' 93 | super().__init__(turn) 94 | self.kind_map: dict[TurnKind, str] = { 95 | TurnKind.SYSTEM: "system", 96 | TurnKind.USER: "user", 97 | TurnKind.MODEL: "assistant", 98 | } 99 | 100 | def as_str(self) -> str: 101 | return f"<|im_start|>{self.kind_map[self.kind]}\n{self.utterance}<|im_end|>\n" 102 | 103 | def get_model_turn(self) -> str: 104 | return f"<|im_start|>{self.kind_map[TurnKind.MODEL]}\n" 105 | 106 | class ChatMlWithNameWrapper(ChatMlWrapper): 107 | def __init__(self, turn: Turn) -> None: 108 | ''' 109 | Plain-text version of ChatML as described here: https://github.com/openai/openai-python/blob/main/chatml.md 110 | This version with a name. 
111 | ''' 112 | super().__init__(turn) 113 | 114 | def as_str(self) -> str: 115 | return f"<|im_start|>{self.kind_map[self.kind]} name={self.name}\n{self.utterance}<|im_end|>\n" 116 | 117 | def get_model_turn(self) -> str: 118 | return f"<|im_start|>{self.kind_map[TurnKind.MODEL]} name={self.name}\n" 119 | 120 | WRAPPER_MAP: dict[str, type[TurnWrapper]] = { 121 | "metharme": MetharmeWrapper, 122 | "pygmalion": PygmalionWrapper, 123 | "alpaca": AlpacaWrapper, 124 | "minimal_alpaca": MinimalAlpacaWrapper, 125 | "henkpaca": HenkpacaWrapper, 126 | "chatml": ChatMlWrapper, 127 | "chatml_named": ChatMlWithNameWrapper 128 | } 129 | 130 | VALID_FORMATS = ["metharme", "pygmalion", "alpaca", "minimal_alpaca", "henkpaca", "chatml", "chatml_named"] 131 | -------------------------------------------------------------------------------- /toolbox/datasets/rp_forums.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import hashlib 3 | import logging 4 | import os 5 | import sys 6 | import typing as t 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | 10 | from toolbox.core.dataset import BaseDataset 11 | from toolbox.utils.files import enumerate_files_for 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | 16 | class RpType(Enum): 17 | ERP = "erp" 18 | RP = "rp" 19 | MIXED = "mixed" 20 | 21 | 22 | @dataclass(frozen=True) 23 | class RpMessage: 24 | author: str 25 | message: str 26 | 27 | 28 | @dataclass(frozen=True) 29 | class RpThread: 30 | messages: list[RpMessage] 31 | thread_name: str 32 | content_type: RpType 33 | source_file: str 34 | 35 | 36 | class RpForumsDataset(BaseDataset[RpThread]): 37 | '''Data from several different roleplay forums.''' 38 | 39 | def __iter__(self) -> t.Generator[RpThread, None, None]: 40 | # NOTE(11b): I had no idea this was a thing, but apparently Python's CSV 41 | # reader by default shits the bed if you have a field longer than 131072 42 | # characters. _Usually_ this means you've messed up the parsing, but in 43 | # our case it's actually just a massive forum post triggering this.
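# NOTE: on platforms where a C long is 32 bits (notably 64-bit Windows),
# csv.field_size_limit(sys.maxsize) can raise OverflowError; a fallback
# that retries with a smaller limit may be needed there.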
44 | # https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072 45 | csv.field_size_limit(sys.maxsize) 46 | 47 | for path in enumerate_files_for(dataset_name="rp_forums", 48 | file_extension=".csv"): 49 | with open(path, "r") as file: 50 | reader = csv.DictReader(file, delimiter=",") 51 | source_file = os.path.basename(path) 52 | content_type = _get_rp_type_from_filename(source_file) 53 | 54 | # Store a buffer of the previous thread 55 | previous_thread = None 56 | previous_message: list[RpMessage] = [] 57 | 58 | for row in reader: 59 | current_thread = row['thread_title'] 60 | if current_thread != previous_thread: 61 | if len(previous_message) != 0: 62 | assert previous_thread is not None 63 | yield RpThread(messages=previous_message, 64 | thread_name=previous_thread, 65 | content_type=content_type, 66 | source_file=source_file) 67 | previous_thread = current_thread 68 | previous_message = [] 69 | 70 | message = RpMessage(author=row['message_username'], 71 | message=row['message']) 72 | previous_message.append(message) 73 | 74 | if len(previous_message) != 0: 75 | # Yield the last thread 76 | assert previous_thread is not None 77 | yield RpThread(messages=previous_message, 78 | thread_name=previous_thread, 79 | content_type=content_type, 80 | source_file=source_file) 81 | 82 | 83 | def _get_rp_type_from_filename(filename: str) -> RpType: 84 | ''' 85 | Gets which kind of roleplaying this is based on the original file's name. 86 | Used to adjust the synthetic system prompt. 87 | ''' 88 | sha256_digest = hashlib.sha256(filename.encode()).hexdigest() 89 | 90 | return SHA256_DIGEST_TO_RP_TYPE_MAP[sha256_digest] 91 | 92 | 93 | SHA256_DIGEST_TO_RP_TYPE_MAP: dict[str, RpType] = { 94 | '20bc5e687f866428cc1e7ad4e500c58c0d1083f6a91e8e28950449639f7c8d21': 95 | RpType.MIXED, 96 | 'c961c08eb87511193e127da59fbefb0084e325304eda86ce43ace033ad3464a3': 97 | RpType.ERP, 98 | '328f8498522ba006378a15b1bb8382278617077084afa68d865eb45edb3e2476': 99 | RpType.ERP, 100 | '5d2f252abc9008cb05e1584b77347050e309abb5cde09616d1de5645658e278a': 101 | RpType.ERP, 102 | '92dfc2e9f0fdf7efc7115e5b51ad88f01837360e9776d5e81085263b1971a9a1': 103 | RpType.ERP, 104 | 'e519b14a4591a5d334d3b0e74a924296c457625cbebc3fbdc30f8810dbef3da9': 105 | RpType.ERP, 106 | '03aee36448fc81f8bae062196bad9767bfc1610c537e3a58660ba4047d49aeb5': 107 | RpType.ERP, 108 | '1bfadd54f7b41f5c2d387a4cbb9bda9342a203870e0f7be7a56a24ad3947f47a': 109 | RpType.ERP, 110 | '3d4b7c9d57643279ce091dc32e06006bc5195ab71ec3be98fef81623dcb132e7': 111 | RpType.ERP, 112 | '99131ae34901d21eca1a33ad0112fdb3f13df649c4bcf0d9e244c26273727849': 113 | RpType.MIXED, 114 | '14cc766f100cc8f1c5644d3edf822aba312d8a1c40beea7810adbd29608c9c53': 115 | RpType.ERP, 116 | 'dfa38d0b1db60bf999aec14973a6919d8fbc57d217262a3877e5026f71b39d0a': 117 | RpType.RP, 118 | '795074be9881eb21bfb2ce958eda47d12e63cce1d955599d528ea257ac66f4b7': 119 | RpType.ERP, 120 | '3179b0c4ee80dc14eb3b08447d693382df2062602c40d543b1946b2ddf32daf8': 121 | RpType.ERP, 122 | } 123 | -------------------------------------------------------------------------------- /toolbox/tasks/clubfloyd_text_adventure.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.clubfloyd import ClubFloydDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = 
logging.getLogger(__name__) 11 | 12 | MIN_USER_RATING = 3.0 13 | 14 | 15 | class ClubFloydTextAdventureTask(BaseTask): 16 | '''Text adventure task based on ClubFloyd data.''' 17 | 18 | def __iter__(self) -> t.Generator[Episode, None, None]: 19 | for idx, story in enumerate(ClubFloydDataset()): 20 | if story.average_rating < MIN_USER_RATING: 21 | # Kills off ~15% of the data IIRC, so this feels like a nice 22 | # trade-off. 23 | continue 24 | 25 | sp = select_prompt(_SYSTEM_PROMPTS) 26 | sp = sp.replace("{{title}}", story.name) 27 | sp = sp.replace("{{description}}", story.description) 28 | sp = sp.replace( 29 | "{{discretion_advised_str}}", 30 | select_prompt( 31 | NSFW_PROMPTS if story.discretion_advised else SFW_PROMPTS)) 32 | sp = sp.replace("{{tags}}", 33 | _process_tags(story.tags + story.genres)) 34 | 35 | turns: list[Turn] = [ 36 | Turn(utterance=sp, kind=TurnKind.SYSTEM), 37 | ] 38 | 39 | for action in story.actions: 40 | # If the user's input is just `%` that means "start the game". 41 | # We don't want to require that at inference time, so let's just 42 | # skip straight to the game starting. 43 | if action.action == "%": 44 | turns.append( 45 | Turn(utterance=action.response, kind=TurnKind.MODEL)) 46 | else: 47 | user_turn = Turn(utterance=action.action, 48 | kind=TurnKind.USER) 49 | model_turn = Turn(utterance=action.response, 50 | kind=TurnKind.MODEL) 51 | 52 | turns += [user_turn, model_turn] 53 | 54 | yield Episode(turns=turns, identifier=f"club-floyd-{idx}") 55 | 56 | 57 | def _process_tags(tags: list[str]) -> str: 58 | tags = [ 59 | tag for tag in tags if all([ 60 | # Filter out tags according to these criteria. 61 | word not in tag.lower() for word in [ 62 | "steam", 63 | "collaboration", 64 | "cover art", 65 | "inform 7", 66 | "walkthrough", 67 | "parser", 68 | "many authors", 69 | ] 70 | ]) 71 | ] 72 | 73 | # Shuffle and remove duplicates to ensure data diversity. 74 | tags = list(set(tags)) 75 | random.shuffle(tags) 76 | 77 | return ", ".join(tags) 78 | 79 | 80 | _SYSTEM_PROMPTS = generate_prompts([ 81 | '''%{This is|You are|Start|Simulate|You are to simulate|Begin} a text %{adventure|adventure game} %{in the style of|similar to|like} {{title}}. {{discretion_advised_str}}. 82 | 83 | %{Include|Incorporate|Use|Respect} the following %{themes|tags|concepts|genres|styles}: {{tags}}''', 84 | # 85 | '''%{This is|You are|Start|Simulate|You are to simulate|Begin} a text %{adventure|adventure game} about the following: 86 | 87 | {{description}}. 88 | 89 | {{discretion_advised_str}}. %{Include|Incorporate|Use|Respect} the following %{themes|tags|concepts|genres|styles}: {{tags}}''', 90 | # No tags so model can learn to diversify content without explicit prompting 91 | '''%{Here|The following paragraph|The upcoming paragraph|The following} is %{a description|an overview} of a %{text game|text RPG|text adventure|text adventure game} %{called|named} {{title}}. 
92 | Its %{description|synopsis} is %{the following|as follows}:
93 | {{description}}
94 | Be sure to drive the story forward.''',
95 |     #
96 |     '''I am to %{generate|write|engage in} a %{text adventure|CYOA-style game|creative text RPG|text adventure game} with the following %{tags|themes|genres}: {{tags}}
97 | Here is %{the description of the game|what the game is about}: {{description}}.''',
98 |     #
99 |     '''%{Mode|Current mode}: %{text adventure|dungeon master|DM|adventure game in text form}
100 | %{Description|Overview}: {{description}}
101 | %{Tags|Genres}: {{tags}}''',
102 |     '''%{Enter|Engage|Consider} %{game|adventure game|text adventure|text RPG} mode. %{Here|In this mode}, you will respond to the user's %{commands|prompts} and drive %{a|the} %{story|plot} %{forward|forwards}.''',
103 |     # Just the length prompt
104 |     '''{{response_length_str}}.''',
105 |     # basic
106 |     '''text game''',
107 |     # Nothing
108 |     ''''''
109 | ])
110 | 
111 | SFW_PROMPTS = generate_prompts([
112 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be safe for work|be SFW|not include any adult themes|be safe for minors|not include 18+ content|not be 18+|not be NSFW}",
113 | ])
114 | 
115 | NSFW_PROMPTS = generate_prompts([
116 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be not safe for work|be NSFW|include adult themes|include erotic themes|include 18+ content}",
117 | ])
--------------------------------------------------------------------------------
/toolbox/tasks/limarp_roleplay.py:
--------------------------------------------------------------------------------
1 | # Much of this was taken from dataprepare.py in the LIMARP repo, thanks anon
2 | # If it ain't broke, don't fix it!
3 | import logging
4 | import re
5 | import typing as t
6 | 
7 | from toolbox.core.models import Episode, Turn, TurnKind
8 | from toolbox.core.task import BaseTask
9 | from toolbox.datasets.limarp import LimaRpDataset, LimaRpEntry
10 | from toolbox.utils.prompts import generate_prompts, select_prompt
11 | 
12 | LOG = logging.getLogger(__name__)
13 | 
14 | class LimaRpRoleplayTask(BaseTask):
15 |     def __iter__(self) -> t.Generator[Episode, None, None]:
16 |         for entry in LimaRpDataset():
17 |             turns: list[Turn] = []
18 |             # Format the system prompt first.
19 |             system_prompt = select_prompt(SYSTEM_PROMPTS)
20 |             # Fix it up and append it as the first turn
21 |             system_prompt = _fix_punctuation(_substitute_elements(system_prompt, entry))
22 |             turns.append(Turn(
23 |                 utterance=system_prompt,
24 |                 kind=TurnKind.SYSTEM
25 |             ))
26 | 
27 |             # Now for the rest
28 |             for msg in entry.conversation:
29 |                 cleaned_msg = _fix_punctuation(_substitute_elements(msg['text'], entry))
30 |                 turns.append(Turn(
31 |                     utterance=cleaned_msg,
32 |                     kind=TurnKind.MODEL if msg['name'] == "<SECOND>" else TurnKind.USER
33 |                 ))
34 | 
35 |             # TODO(TG): Run some numbers here like in the original LIMARP script
36 |             # to deal with chats above token limit. For now, they get caught by a TurnTooLargeError
37 |             # in build_data.py, so it's not too urgent of a priority.
38 | 
39 |             # Yield the episode
40 |             yield Episode(
41 |                 turns=turns,
42 |                 identifier=f"limarp-{entry.forum}-{entry.thread_id}"
43 |             )
44 | 
45 | def _substitute_elements(input_string: str, entry: LimaRpEntry) -> str:
46 |     '''
47 |     Replace blank/template fields with data from the particular entry.
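    For example, assuming the `<FIRST>`/`<SECOND>` placeholder scheme used
    throughout this file, "<FIRST> smiles at <SECOND>" comes out as
    "{{user}} smiles at Seraphina" for an entry whose character is named
    Seraphina (the name here is purely illustrative).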
48 |     '''
49 |     # System prompts
50 |     input_string = input_string.replace("<SECOND>", entry.names['<SECOND>'])
51 |     input_string = input_string.replace("<SECOND PERSONA>", entry.personas['<SECOND>'])
52 |     input_string = input_string.replace("<SCENARIO>", entry.scenario)
53 | 
54 |     # Users
55 |     input_string = input_string.replace("<FIRST>", "{{user}}")
56 |     input_string = input_string.replace("<FIRST NAME>", entry.names['<FIRST>'])
57 | 
58 |     return input_string
59 | 
60 | def _fix_punctuation(input_string: str) -> str:
61 |     '''
62 |     Replace fancy/incorrect punctuation with simpler/correct punctuation.
63 |     TODO: more effective regexes, options for controlling what should be changed.
64 |     '''
65 | 
66 |     # Fix excessive horizontal whitespace. This should go before everything else.
67 |     input_string = re.sub(r' {2,}', ' ', input_string)
68 | 
69 |     # General punctuation fixes
70 |     input_string = input_string.replace(' !', '!')
71 |     input_string = input_string.replace(' ?', '?')
72 |     input_string = input_string.replace('’', "'")
73 |     input_string = input_string.replace('‘', "'")
74 |     input_string = input_string.replace('“', '"')
75 |     input_string = input_string.replace('”', '"')
76 |     input_string = input_string.replace('…', '...')
77 | 
78 |     # Replace the `---` em-dash surrogates in the source files with actual
79 |     # em-dashes, since some authors type `---` instead of the real character.
80 |     input_string = input_string.replace('---', '—')
81 | 
82 |     # Fix incorrect ellipsis. This should preferably be fixed in the
83 |     # source files themselves.
84 |     input_string = re.sub(r'(\w)\.{2,8}(\w)', r'\1... \2', input_string)
85 |     input_string = re.sub(r'(\w)\.{3,8}', r'\1...', input_string)
86 | 
87 |     return input_string
88 | 
89 | _BASE_SYSTEM_PROMPTS = [
90 |     """<SECOND>'s Persona: <SECOND PERSONA>
91 | Scenario: <SCENARIO>
92 | %{Take the role of|You are|Play the role of|Write as if you were} <SECOND>. %{Taking the above information into consideration|After carefully considering the above information|Following the personas and scenario described above|With scene and the character now described}, you must %{engage in a roleplay conversation|roleplay further below|chat in a roleplaying manner}.
93 | %{Do not|Never} write %{dialogue lines|dialogues and narration} for the user %{.|in your responses.}
94 | {{response_length_str}} {{response_style_str}}""",
95 | 
96 |     """%{Enter|Engage|Begin} %{roleplay|RP|roleplay-like conversation} mode. You are to %{roleplay as|write as if you were|act like} <SECOND> at all times in a %{conversation|chat|RP session} with the user. %{Don't|Do not|Never} break character.
97 | <SECOND> has the following %{persona|personality description|description}: <SECOND PERSONA>
98 | %{Additionally|Also|In addition}, %{keep in mind|follow the scene set by|follow} this scenario: <SCENARIO> {{response_style_str}} {{response_length_str}}""",
99 | 
100 |     """You are now in %{roleplay conversation|conversational RP chat|roleplaying|RP} mode. %{This is your character persona|The following is your persona|You should act according to this character sheet|This is some info about your character}:
101 | 
102 | <SECOND PERSONA>
103 | 
104 | %{Keep in mind|Keep in context|Remember|While acting as this character, pay attention to} this scenario:
105 | 
106 | <SCENARIO>
107 | 
108 | You %{shall attempt to|must|will} stay in-character %{at all times|as much as possible|whenever possible}, and generate %{messages|replies|responses} as if you were <SECOND>. {{response_style_str}} {{response_length_str}}""",
109 |     """In this %{conversation|RP|exchange}, you %{must|will|gotta|have to} play the role of <SECOND>.
%{Note|Pay attention to|Keep in mind} this scenario:
110 | 
111 | <SCENARIO>
112 | 
113 | <SECOND> has the following %{persona|personality|description}:
114 | 
115 | <SECOND PERSONA>
116 | {{response_length_str}}""",
117 |     "roleplay",
118 |     ""
119 | ]
120 | 
121 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
--------------------------------------------------------------------------------
/toolbox/tasks/mcstories_writing.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import typing as t
4 | 
5 | from markdownify import markdownify
6 | 
7 | from toolbox.core.models import Episode, Turn, TurnKind
8 | from toolbox.core.task import BaseTask
9 | from toolbox.datasets.mcstories import McStoriesDataset
10 | from toolbox.utils.prompts import generate_prompts, select_prompt
11 | 
12 | LOG = logging.getLogger(__name__)
13 | 
14 | 
15 | class McStoriesWritingTask(BaseTask):
16 |     '''Story-writing task based on McStories data.'''
17 | 
18 |     def __iter__(self) -> t.Generator[Episode, None, None]:
19 |         for idx, story in enumerate(McStoriesDataset()):
20 | 
21 |             contents = _html_story_to_clean_md(story.text_contents)
22 |             chunks = _split_text_into_chunks(contents, min_word_count=250)
23 | 
24 |             # Compose a synthetic system prompt.
25 |             system_prompt = select_prompt(_SYSTEM_PROMPTS)
26 |             system_prompt = system_prompt.replace("{{title}}", story.title)
27 |             system_prompt = system_prompt.replace("{{summary}}", story.summary)
28 | 
29 |             full_tags = [
30 |                 _TAG_SHORTHANDS_TO_FULL_MAPPING[shorthand]
31 |                 for shorthand in story.tags[1:-1].replace("'", "").split(", ")
32 |             ]
33 |             system_prompt = system_prompt.replace("{{tags}}",
34 |                                                   ", ".join(full_tags))
35 | 
36 |             turns: list[Turn] = [
37 |                 Turn(utterance=system_prompt, kind=TurnKind.SYSTEM)
38 |             ]
39 | 
40 |             # Choose either user or model turn first, then alternate
41 |             current_turn = random.choice([TurnKind.MODEL, TurnKind.USER])
42 | 
43 |             for chunk in chunks:
44 |                 # Messy code for switching up turns
45 |                 current_turn = TurnKind.MODEL if current_turn == TurnKind.USER else TurnKind.USER
46 |                 turns.append(Turn(
47 |                     utterance=chunk,
48 |                     kind=current_turn,
49 |                 ))
50 | 
51 |             yield Episode(turns=turns, identifier=f"mcstories-{idx}")
52 | 
53 | 
54 | def _html_story_to_clean_md(html: str) -> str:
55 |     md = str(markdownify(html))
56 | 
57 |     lines: list[str] = []
58 |     for line in md.splitlines():
59 |         # These usually denote chapter titles, or author names/emails which we
60 |         # don't want the model learning.
61 |         if line.startswith("###"):
62 |             continue
63 |         lines.append(line.strip())
64 | 
65 |     return "\n".join(lines)
66 | 
67 | 
68 | def _split_text_into_chunks(text: str, min_word_count: int) -> list[str]:
69 |     '''
70 |     Breaks `text` apart into paragraphs, then joins up paragraphs until they
71 |     reach `min_word_count`.
72 |     '''
73 |     output: list[str] = []
74 |     paragraphs = text.split("\n\n")
75 |     acc = ""
76 | 
77 |     for paragraph in paragraphs:
78 |         acc += f"\n\n{paragraph}"
79 |         if len(acc.split()) > min_word_count:
80 |             output.append(acc.strip())
81 |             acc = ""
82 | 
83 |     return output
84 | 
85 | 
86 | #_BASE_SYSTEM_PROMPTS = [
87 | #    '''You %{are to|should|must|will now} %{generate|write} a %{story|fictional story}. Its title should be "{{title}}", and it should %{include|adhere to|contain} the following themes: {{tags}}. {{response_length_str}}.
%{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}''', 88 | # '''You %{are to|should|must|will now} %{generate|write} a %{story|fictional story} titled "{{title}}". It should %{include|adhere to|contain} the following themes: {{tags}}. %{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}. {{response_length_str}}.''', 89 | # '''{{response_length_str}}. You %{are to|should|must|will now} %{generate|write} a %{story|fictional story}. %{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}. Include the following %{themes|tags}: {{tags}}.''', 90 | #] 91 | 92 | _BASE_SYSTEM_PROMPTS = [ 93 | '''%{Enter|Engage|Consider|Begin} %{story|storywriting|collaborative storywriting|collab writing|user-guided writing} mode. %{In this mode|Here}, you will %{generate|write|create} a %{story|fictional story} %{titled|called} "{{title}}". %{The story|It} should %{be about|contain|have} the following %{themes|tags}: {{tags}}''', 94 | # no tags 95 | '''I will %{create|make|generate} a story %{with the user|collaboratively}. {{response_length_str}}. 96 | The %{plot|summary|synopsis} %{is about|can be summed up like this}: {{summary}} 97 | %{Drive|I will drive} the story forward %{in chunks|alongside the user|with user input}.''', 98 | # 99 | '''%{TITLE|NAME OF STORY}: {{title}} 100 | %{SUMMARY|PLOT|DETAILS}: {{summary}}''', 101 | # 102 | '''This %{task|thing to do} is %{based upon|centered around} %{writing a story|collaborative storytelling|collaborative writing|interactive fiction-making}. Respond to the users' %{inputs|writing}.''', 103 | # 104 | '''{{response_length_str}}. You %{are to|should|must|will now} %{generate|write} a %{story|fictional story}. %{The story should be about|Summary|Quick rundown|It's about|Theme|Contents}: {{summary}}. 
Include the following %{themes|tags}: {{tags}}.''',
105 |     #
106 |     '''%{storywriting|story}''',
107 | ]
108 | 
109 | _SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
110 | 
111 | _TAG_SHORTHANDS_TO_FULL_MAPPING = {
112 |     'bd': 'bondage and/or discipline',
113 |     'be': 'bestiality',
114 |     'ca': 'cannibalism',
115 |     'cb': 'comic book super-hero/heroine',
116 |     'ds': 'dominance and/or submission',
117 |     'ex': 'exhibitionism',
118 |     'fd': 'female dominant',
119 |     'ff': 'female/female sex',
120 |     'ft': 'fetish clothing',
121 |     'fu': 'furry',
122 |     'gr': 'growth/enlargement',
123 |     'hm': 'humiliation',
124 |     'hu': 'humor',
125 |     'in': 'incest',
126 |     'la': 'lactation',
127 |     'ma': 'masturbation',
128 |     'mc': 'mind control',
129 |     'md': 'male dominant',
130 |     'mf': 'male/female sex',
131 |     'mm': 'male/male sex',
132 |     'nc': 'non-consensual',
133 |     'rb': 'robots',
134 |     'sc': 'scatology',
135 |     'sf': 'science fiction',
136 |     'ts': 'time stop',
137 |     'ws': 'watersports',
138 | }
--------------------------------------------------------------------------------
/scripts/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import logging
4 | import random
5 | import json
6 | 
7 | from colors import color
8 | 
9 | from toolbox.core.task import BaseTask
10 | from toolbox.core.training_example import TrainingExampleGenerator, TurnTooLargeError
11 | from toolbox.filters.training_example_filter import TrainingExampleFilter
12 | from toolbox.tasks import NAME_TO_TASK_MAPPING
13 | from toolbox.filters import NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING
14 | 
15 | LOG = logging.getLogger(__name__)
16 | 
17 | 
18 | def main() -> None:
19 |     args = _parse_args_from_argv()
20 |     logging.basicConfig(
21 |         format='[%(asctime)s] [%(levelname)s] %(message)s',
22 |         level=logging.DEBUG if args.verbose else logging.INFO,
23 |     )
24 | 
25 |     random.seed(args.seed)
26 | 
27 |     if not args.print and args.output_file.strip() == "":
28 |         raise ValueError("No output file specified! Did you mean to enable the `print` flag?")
29 | 
30 |     idx = 0
31 |     print_new_episode_header = True
32 | 
33 |     # Generate tasks and example filters
34 |     tasks: list[BaseTask] = [NAME_TO_TASK_MAPPING[task]() for task in args.tasks.split(",")]
35 |     example_filters: list[TrainingExampleFilter] = [
36 |         NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING[filter_name]()
37 |         for filter_name in args.filters.split(",")
38 |     ] if args.filters else []
39 | 
40 |     if not args.print:
41 |         f = open(args.output_file, "w", encoding="utf-8")
42 | 
43 |     for task in tasks:
44 |         for episode in task:
45 |             if args.print and print_new_episode_header:
46 |                 print(
47 |                     color(" new episode ",
48 |                           fg="black",
49 |                           bg="green",
50 |                           style="bold")
51 |                 )
52 |                 print_new_episode_header = False
53 | 
54 |             try:
55 |                 for example in TrainingExampleGenerator(episode, target_token_count=args.max_length, format=args.format):
56 |                     # Right off the bat, if this training example gets caught by one
57 |                     # of the filters, skip over and don't even count it.
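                    # (Filter names resolve through
                    # NAME_TO_TRAINING_EXAMPLE_FILTER_MAPPING, which is keyed
                    # by class name -- e.g. a hypothetical run would pass
                    # `--filters DuplicateFilter,RefusalFilter`.)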
58 |                     should_keep = True
59 |                     for example_filter in example_filters:
60 |                         if not example_filter.should_keep(example):
61 |                             should_keep = False
62 |                             break
63 |                     if not should_keep:
64 |                         continue
65 | 
66 |                     idx += 1
67 |                     if idx < args.starting_index:
68 |                         continue
69 |                     if args.max_count and (idx >
70 |                                            args.starting_index + args.max_count):
71 |                         quit()
72 | 
73 |                     print_new_episode_header = True
74 | 
75 |                     if args.print:
76 |                         print(
77 |                             color(" training example ",
78 |                                   fg="black",
79 |                                   bg="orange",
80 |                                   style="bold")
81 |                         )
82 |                         print(color(example.prompt, fg="gray"), end="")
83 |                         print(color(example.generation, fg="green"))
84 |                     else:
85 |                         dict_to_write = {
86 |                             "prompt": example.prompt,
87 |                             "generation": example.generation,
88 |                             "identifier": example.identifier,
89 |                         }
90 |                         f.write(json.dumps(dict_to_write) + "\n")
91 |             except TurnTooLargeError:
92 |                 LOG.info("Skipping over episode (%s) due to a TurnTooLargeError",
93 |                          episode.identifier)
94 | 
95 |     if not args.print:
96 |         f.close()
97 | 
98 | #
99 | # Helpers and CLI entrypoint.
100 | #
101 | 
102 | 
103 | def _parse_args_from_argv() -> argparse.Namespace:
104 |     parser = argparse.ArgumentParser()
105 | 
106 |     parser.add_argument(
107 |         "-t",
108 |         "--tasks",
109 |         type=str,
110 |         required=True,
111 |         help="The tasks to build data for, comma-separated."
112 |     )
113 | 
114 |     parser.add_argument(
115 |         "-o",
116 |         "--output-file",
117 |         type=str,
118 |         default="", # Not required if examples just need to be printed
119 |         help="The file to write the generated training examples to, as JSONL."
120 |     )
121 | 
122 |     parser.add_argument(
123 |         "-f",
124 |         "--filters",
125 |         type=str,
126 |         help="List of comma-separated filters to apply to training examples."
127 |     )
128 | 
129 |     parser.add_argument(
130 |         "-l",
131 |         "--max-length",
132 |         type=int,
133 |         default=2048,
134 |         # TODO(TG): Explain this more clearly
135 |         help="The (approximate) amount of tokens to limit episodes to."
136 |     )
137 | 
138 |     parser.add_argument(
139 |         "-m",
140 |         "--format",
141 |         type=str,
142 |         default="metharme",
143 |         help="The format for the training data to use (accepted inputs: 'pygmalion', 'metharme'). Defaults to 'metharme'."
144 |     )
145 | 
146 |     parser.add_argument(
147 |         "-p",
148 |         "--print",
149 |         action="store_true",
150 |         help="Print training examples to STDOUT instead of writing them to the output file."
151 |     )
152 | 
153 |     parser.add_argument(
154 |         "-v",
155 |         "--verbose",
156 |         action="store_true",
157 |         help="Enable verbose logging."
158 |     )
159 | 
160 |     parser.add_argument(
161 |         "--seed",
162 |         type=int,
163 |         default=42,
164 |         help="The seed for the random number generator."
165 |     )
166 | 
167 |     parser.add_argument(
168 |         "--starting-index",
169 |         type=int,
170 |         default=0,
171 |         help="Used to skip over training examples."
172 |     )
173 | 
174 |     parser.add_argument(
175 |         "--max-count",
176 |         type=int,
177 |         default=None,
178 |         help="Limit how many training examples to generate."
179 |     )
180 | 
181 |     return parser.parse_args()
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     main()
186 | 
--------------------------------------------------------------------------------
/toolbox/tasks/openorca_instruction_following.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import typing as t
4 | 
5 | from toolbox.core.models import Episode, Turn, TurnKind
6 | from toolbox.core.task import BaseTask
7 | from toolbox.datasets.openorca import OpenOrcaDataset
8 | from toolbox.utils.prompts import generate_prompts, select_prompt
9 | 
10 | LOG = logging.getLogger(__name__)
11 | 
12 | class OpenOrcaInstructionFollowingTask(BaseTask):
13 |     '''
14 |     OpenOrca instruction following task.
15 |     Limited to 250,000 entries by default due to the sheer size of OpenOrca.
16 |     '''
17 |     def __init__(self, max_examples: int = 250000) -> None:
18 |         super().__init__()
19 |         self.max_examples = max_examples
20 | 
21 |     def __iter__(self) -> t.Generator[Episode, None, None]:
22 |         examples_processed = 0
23 |         for orca_entry in OpenOrcaDataset():
24 |             if examples_processed >= self.max_examples:
25 |                 break
26 | 
27 |             # OpenOrca *looks* clean, but since it's GPT-4 generated data, better safe than sorry.
28 |             if any(phrase in orca_entry.response.lower()
29 |                    for phrase in _TIER_1_BAD_PHRASES):
30 |                 continue
31 | 
32 |             system_prompt = select_prompt(SYSTEM_PROMPTS)
33 |             # Remove the default "you are an AI assistant" instruction which is
34 |             # typically in the first sentence of an OpenOrca system prompt
35 |             additional_instructions = re.sub(ASSISTANT_PATTERN, "", orca_entry.system_prompt)
36 |             if additional_instructions != "":
37 |                 system_prompt += f" {additional_instructions}"
38 | 
39 |             turns: list[Turn] = [
40 |                 Turn(
41 |                     utterance=system_prompt,
42 |                     kind=TurnKind.SYSTEM,
43 |                 ),
44 |                 Turn(
45 |                     utterance=orca_entry.question,
46 |                     kind=TurnKind.USER,
47 |                 ),
48 |                 Turn(
49 |                     utterance=orca_entry.response,
50 |                     kind=TurnKind.MODEL,
51 |                 ),
52 |             ]
53 | 
54 |             examples_processed += 1
55 | 
56 |             yield Episode(turns=turns, identifier=f"openorca-{orca_entry.id}")
57 | 
58 | # Should handle most instances of "You are a(n)... assistant"
59 | ASSISTANT_PATTERN = re.compile(r"^You are a.*?\.\s*")
60 | 
61 | _BASE_SYSTEM_PROMPTS = [
62 |     "",
63 |     "%{Enter|Engage|Consider|You've entered} %{assistant|teacher|instruction following} mode. Your %{objective|job|purpose} is to answer any questions that the user may have to the best of your ability.",
64 |     "%{Assistant|AI}, engage instruction following and question answering mode.",
65 |     "Act helpfully. Answer any questions and follow any instructions that are given.",
66 |     "Primary %{objective|purpose|goal}: answer the user's %{questions|queries} alongside following their instructions.",
67 |     "Please follow user %{instructions|queries}.",
68 |     "You are an AI assistant designed to answer questions and obey whatever the user says."
69 | ]
70 | 
71 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
72 | 
73 | # Taken from the dataset card in:
74 | # https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered
75 | # Then expanded to catch some more stuff.
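# A rough sketch of how the list below is applied (see the check in
# `__iter__` above) -- the response is lowercased first, so entries here
# must be lowercase to ever match:
#
#   response = "As an AI language model, I cannot help with that."
#   any(p in response.lower() for p in _TIER_1_BAD_PHRASES)  # -> True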
76 | _TIER_1_BAD_PHRASES = [ 77 | "as an ai language model", 78 | "text-based ai language model", 79 | "domestic violence", 80 | "please refrain", 81 | "derogatory", 82 | "inappropriate", 83 | "offensive", 84 | "racism", 85 | "racist", 86 | "racial", 87 | "discriminate", 88 | "discriminatory", 89 | "discrimination", 90 | "sexist", 91 | "sexism", 92 | "unacceptable", 93 | "inclusive workplace", 94 | "lgbt", 95 | "morals", 96 | "ethics", 97 | "ethical", 98 | "legality", 99 | "illegal", 100 | "illegality", 101 | "hateful", 102 | "harmful", 103 | "it is never okay", 104 | "it is important to", 105 | "it's important to", 106 | "real-world consequences", 107 | "hate speech", 108 | "glorify", 109 | "not be appropriate", 110 | "supremacist", 111 | "extremist", 112 | "responsible ai", 113 | "ai principles", 114 | "ai assistant", 115 | "an ai language", 116 | "ableist", 117 | "hurtful", 118 | "gender stereotype", 119 | "gender inequality", 120 | "underrepresentation", 121 | "safe spaces", 122 | "gender-based", 123 | "inclusivity", 124 | "feminist", 125 | "feminism", 126 | "transgender", 127 | "empowerment", 128 | "communist", 129 | "capitalism", 130 | "stereotypes", 131 | "biases", 132 | "bias", 133 | "microaggression", 134 | "prioritize human safety", 135 | "as a language model", 136 | "as an ai language model", 137 | "as a large language model", 138 | "as an ai", 139 | "ethical principles", 140 | "consensual", 141 | "it is not appropriate", 142 | "it's not appropriate", 143 | "i cannot fulfill your request", 144 | "harmful to human beings", 145 | "ethical guidelines", 146 | "my guidelines", 147 | "prioritize user safety", 148 | "adhere to ethical guidelines", 149 | "harmful consequences", 150 | "potentially harmful", 151 | "dangerous activities", 152 | "promote safety", 153 | "well-being of all users", 154 | "responsible information sharing", 155 | "jeopardize the safety", 156 | "illegal actions or intentions", 157 | "undermine the stability", 158 | "promote the well-being", 159 | "illegal activities or actions", 160 | "adherence to the law", 161 | "potentially be harmful", 162 | "illegal substances or activities", 163 | "committed to promoting", 164 | "safe information", 165 | "lawful information", 166 | "cannot provide guidance", 167 | "cannot provide information", 168 | "unable to offer assistance", 169 | "cannot engage in discussions", 170 | "programming prohibits", 171 | "follow ethical guidelines", 172 | "ensure the safety", 173 | "involves an illegal subject", 174 | "prioritize safety", 175 | "illegal subject", 176 | "prioritize user well-being", 177 | "cannot support or promote", 178 | "activities that could harm", 179 | "pose a risk to others", 180 | "against my programming", 181 | "activities that could undermine", 182 | "potentially dangerous", 183 | "not within the scope", 184 | "designed to prioritize safety", 185 | "not able to provide", 186 | "maintain user safety", 187 | "adhere to safety guidelines", 188 | "dangerous or harmful", 189 | "cannot provide any information", 190 | "focus on promoting safety", 191 | "openai", 192 | "chatgpt", 193 | ] 194 | -------------------------------------------------------------------------------- /toolbox/datasets/characterai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import os 5 | import typing as t 6 | from dataclasses import dataclass 7 | 8 | from toolbox.core.dataset import BaseDataset, get_path_for 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 
| 13 | @dataclass(frozen=True) 14 | class CaiBotInfo: 15 | name: str 16 | title: str 17 | description: str | None 18 | greeting: str 19 | 20 | # Optional because it might be private. 21 | definitions: str | None 22 | 23 | # Useful for when several bots have the same name - we can tell them apart 24 | # by their external_id. 25 | external_id: str 26 | 27 | # There's also categories, but I'm ignoring them for now since I don't think 28 | # they'll be of much use. 29 | 30 | 31 | @dataclass(frozen=True) 32 | class CaiMessage: 33 | is_human: bool 34 | text: str 35 | 36 | 37 | @dataclass(frozen=True) 38 | class CaiChat: 39 | # First message is always the bot's greeting. 40 | messages: list[CaiMessage] 41 | bot: CaiBotInfo 42 | identifier: str 43 | timestamp: int 44 | 45 | 46 | class CharacterAiDataset(BaseDataset[CaiChat]): 47 | '''Dataset for CharacterAI dumps.''' 48 | 49 | def __iter__(self) -> t.Generator[CaiChat, None, None]: 50 | bot_id_to_info_dict = {} 51 | 52 | # Do a first run through all the files to load all the definitions and 53 | # descriptions. 54 | for _, data in _available_json_data(): 55 | try: 56 | if not _is_definition_data(data): 57 | continue 58 | 59 | bot_info = _bot_info_from_dict(data["character"]) 60 | bot_id_to_info_dict[bot_info.external_id] = bot_info 61 | except (AttributeError, KeyError, ValueError) as ex: 62 | LOG.debug("Skipping over exception: %s", ex) 63 | 64 | # Now do a second pass, to actually handle chat histories/messages. 65 | for timestamp, data in _available_json_data(): 66 | try: 67 | if _is_definition_data(data): 68 | continue 69 | 70 | # Prefer grabbing bot info from a Character Editor dump, if it 71 | # exists. Fall back to public data otherwise. 72 | bot_id = data["info"]["character"]["external_id"] 73 | bot_info = bot_id_to_info_dict.get( 74 | bot_id, _bot_info_from_dict(data["info"]["character"])) 75 | 76 | for history_dict in data["histories"]["histories"]: 77 | messages = _messages_from_dict(history_dict["msgs"]) 78 | yield CaiChat(bot=bot_info, 79 | messages=messages, 80 | identifier=f"{timestamp}-{bot_info.name}", 81 | timestamp=timestamp) 82 | except (AttributeError, KeyError, ValueError) as ex: 83 | LOG.debug("Skipping over exception: %s", ex) 84 | 85 | 86 | # 87 | # Private helpers. 88 | # 89 | 90 | 91 | def _enumerate_json_files(root_path: str) -> list[str]: 92 | '''Returns a list of files available in the given `root_path`.''' 93 | # TODO(11b): Implement the sharding logic out in the util, and get rid of 94 | # this function. 95 | 96 | items = os.listdir(root_path) 97 | 98 | files: list[str] = [] 99 | for item in items: 100 | item_path = os.path.join(root_path, item) 101 | if not os.path.isfile(item_path) or not item_path.endswith(".json"): 102 | # We only care about JSON files. 103 | continue 104 | 105 | absolute_file_path = os.path.abspath(os.path.join(root_path, item)) 106 | files.append(absolute_file_path) 107 | 108 | # Super nasty code to allow generation of CAI data with separate processes 109 | # so I can speed it up. Pass the "SHARD" and "TOTAL_SHARDS" environment 110 | # variables to operate on the different parts of the data. 
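# A hypothetical split of the work across two processes would look like:
#
#   SHARD=0 TOTAL_SHARDS=2 python scripts/build.py ...
#   SHARD=1 TOTAL_SHARDS=2 python scripts/build.py ...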
111 |     if "SHARD" not in os.environ:
112 |         return files
113 | 
114 |     TOTAL_SHARDS = int(os.environ.get("TOTAL_SHARDS", 10))
115 |     items_per_shard = math.floor(len(files) / TOTAL_SHARDS)
116 | 
117 |     shard = int(os.environ["SHARD"])
118 |     file_range = (items_per_shard * shard, items_per_shard * (shard + 1))
119 |     # Slice ends are exclusive already; note an uneven split leaves the remainder files to no shard.
120 |     return files[file_range[0]:file_range[1]]
121 | 
122 | 
123 | def _available_json_data() -> t.Generator[tuple[int, dict[str, t.Any]], None, None]:
124 |     '''
125 |     Yields all available JSON data, parsed from the files in the CharacterAI
126 |     data folder.
127 |     '''
128 |     dataset_path = get_path_for("characterai")
129 | 
130 |     for folder in ["public", "private"]:
131 |         folder_path = os.path.join(dataset_path, folder)
132 |         for json_file_path in _enumerate_json_files(folder_path):
133 |             with open(json_file_path, "r", encoding="utf-8-sig") as json_file:
134 |                 # Every valid submission has its filename start with a Unix timestamp (in ms)
135 |                 timestamp = int(os.path.basename(json_file_path).split("_")[0])
136 |                 try:
137 |                     yield (timestamp, json.load(json_file))
138 |                 # TODO(TG): Fix the Unicode error more properly
139 |                 except (json.decoder.JSONDecodeError, UnicodeDecodeError) as ex:
140 |                     LOG.error("Failed to parse %s: %s", json_file_path, ex)
141 | 
142 | 
143 | def _bot_info_from_dict(info_dict: dict[str, t.Any]) -> CaiBotInfo:
144 |     '''Builds a CaiBotInfo object from the `character` field in the JSON.'''
145 |     return CaiBotInfo(
146 |         name=info_dict["name"],
147 |         title=info_dict["title"],
148 |         # This comes in as an empty string instead of `null` in the JSON when
149 |         # it's not defined for some reason, so we cast to None here for clarity.
150 |         description=info_dict.get("description") or None,
151 |         greeting=info_dict["greeting"],
152 |         definitions=info_dict.get("definition"),
153 |         external_id=info_dict["external_id"],
154 |     )
155 | 
156 | 
157 | def _messages_from_dict(msgs_dict: list[dict[str, t.Any]]) -> list[CaiMessage]:
158 |     '''Builds an array of messages from an entry from the `histories` JSON.'''
159 |     messages: list[CaiMessage] = []
160 |     for raw_message in msgs_dict:
161 |         message = CaiMessage(
162 |             text=raw_message["text"],
163 |             is_human=raw_message["src"]["is_human"],
164 |         )
165 |         messages.append(message)
166 |     return messages
167 | 
168 | 
169 | def _is_definition_data(dict_from_json: dict[str, t.Any]) -> bool:
170 |     '''
171 |     Figures out whether the given dict (parsed from a JSON file) is a regular
172 |     dump, or a dump from the Character Editor (possibly containing definitions).
173 | 
174 |     If it doesn't seem like either, raises a `ValueError` so we can discard bad
175 |     data.
176 |     '''
177 |     keys = list(dict_from_json.keys())
178 | 
179 |     # Some people messed with their files so the order of the keys isn't always
180 |     # the same, so we sort for consistency.
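    # e.g. {"user__username": ..., "character": ...} and
    # {"character": ..., "user__username": ...} both normalize to the same
    # sorted key list, ["character", "user__username"].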
181 | keys.sort() 182 | if keys == ["character"]: 183 | return True 184 | elif keys == ["character", "user__username"]: 185 | return True 186 | elif keys == ["histories", "info"]: 187 | return False 188 | else: 189 | raise ValueError(f"Unexpected keys found in CAI dump JSON file: {keys}") 190 | -------------------------------------------------------------------------------- /toolbox/tasks/soda_reply_generation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import typing as t 4 | 5 | from toolbox.core.models import Episode, Turn, TurnKind 6 | from toolbox.core.task import BaseTask 7 | from toolbox.datasets.soda import SodaDataset 8 | from toolbox.utils.prompts import generate_prompts, select_prompt 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class SodaReplyGenerationTask(BaseTask): 14 | ''' 15 | Task to generate a single reply based on given conversation history and 16 | narrative. Based on SODA data. 17 | NOTE(TG): Likely requires updating. 18 | ''' 19 | 20 | def __init__(self, split: str = "train") -> None: 21 | self.split = split 22 | 23 | super().__init__() 24 | 25 | def __iter__(self) -> t.Generator[Episode, None, None]: 26 | for conversation in SodaDataset(split=self.split): 27 | cur_history: list[str] = [] 28 | 29 | for idx, utterance in enumerate(conversation.dialogue): 30 | speaker_name = conversation.speakers[idx] 31 | cur_history.append(f"{speaker_name}: {utterance}") 32 | 33 | if len(cur_history) < 4: 34 | # Too little data to build up a decent prompt, let's keep 35 | # going. 36 | continue 37 | 38 | participants = list(set(conversation.speakers)) 39 | 40 | # Original model experiments were very sensitive to participant 41 | # order, so let's randomize to hopefully fix that. 42 | random.shuffle(participants) 43 | 44 | participants_str = " and ".join( 45 | [", ".join(participants[:-1]), participants[-1]]) 46 | 47 | history_str = "\n".join(cur_history[:-2]) 48 | response_length_str = _response_length_str_for(utterance) 49 | 50 | system_prompt = select_prompt(SYSTEM_PROMPTS) 51 | system_prompt = system_prompt.replace("{{participants}}", 52 | participants_str) 53 | system_prompt = system_prompt.replace("{{conversation}}", 54 | history_str) 55 | system_prompt = system_prompt.replace("{{narrative}}", 56 | conversation.narrative) 57 | system_prompt = system_prompt.replace("{{respond_for}}", 58 | speaker_name) 59 | system_prompt = system_prompt.replace("{{response_length_str}}", 60 | response_length_str) 61 | 62 | system_turn = Turn(system_prompt, TurnKind.SYSTEM) 63 | # TODO(11b): Add a variant where the speaker's name is omitted 64 | # randomly, both in the user and the model turns. Adjust the 65 | # system prompt accordingly. 
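                # `cur_history[:-2]` went into the system prompt above, so the
                # last two utterances become the user and model turns below.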
66 | user_turn = Turn(cur_history[-2], TurnKind.USER) 67 | model_turn = Turn(cur_history[-1], TurnKind.MODEL) 68 | turns = [system_turn, user_turn, model_turn] 69 | 70 | yield Episode( 71 | turns, 72 | identifier= 73 | f"soda-{self.split}-{conversation.original_index}-reply-generation" 74 | ) 75 | 76 | 77 | def _response_length_str_for(response: str) -> str: 78 | word_count = len(response.split()) 79 | 80 | if word_count < 16: 81 | return random.choice([ 82 | "The generated response should be short (less than 16 words)", 83 | "Be brief when generating the message (less than sixteen words)", 84 | "The generated reply should be small", 85 | "This reply should be less than 16 words", 86 | "16 or less words in the message", 87 | "Have this reply be really short", 88 | "Short response" 89 | ]) 90 | elif word_count < 32: 91 | return random.choice([ 92 | "The generated reply should be of medium length (between 16 to 32 words)", 93 | "The generated response should be slightly lengthy (at most 32 words)", 94 | "The generated message should be on the medium side", 95 | "This message should be between 16 and 32 words in length", 96 | "Medium response", 97 | "Reply should be slightly lengthy (16-32 words)" 98 | ]) 99 | elif word_count < 64: 100 | return random.choice([ 101 | "The new message will be of moderate-to-large length", 102 | "The reply should be moderately-sized, tending towards a longer message (more than 32 words)", 103 | "The generation should be of medium to medium-long length", 104 | "There should be 32 to 64 words in this reply", 105 | "The generated message should be somewhere in-between 'medium' and 'long' in terms of length", 106 | "The range of the number of words in the message should be between 32 and 64." 107 | ]) 108 | else: 109 | return random.choice([ 110 | "The new message will be lengthy", 111 | "The reply should be long, more than 64 words", 112 | "The generation should be long", 113 | "This response will be quite lengthy", 114 | "More than 64 words in the reply, please", 115 | "The generated message should be more than sixty-four words in length", 116 | "Very long response (64+ words)" 117 | ]) 118 | 119 | _BASE_SYSTEM_PROMPTS = [ 120 | """%{The following is a|Given the following} conversation between {{participants}}: 121 | 122 | {{conversation}} 123 | 124 | You %{must complete the conversation by generating a single response|shall generate a response for} {{respond_for}} while adhering to the following %{narrative|summary}: 125 | 126 | {{narrative}} 127 | 128 | {{response_length_str}}.""", 129 | # 130 | """%{Given the|Pay attention to|Take a look at} the following conversation between {{participants}}: 131 | 132 | {{conversation}} 133 | 134 | You %{must|shall|have to} %{generate|create|say|craft} a %{reply|response} for {{respond_for}}, keeping in mind that the conversation must progress according to the following %{summary|synopsis|context}: 135 | 136 | {{narrative}} 137 | 138 | The response should be exclusively of human dialogue and contain no roleplaying actions. Replies %{must be|should be no more than} a single paragraph %{long|in length}.""", 139 | # 140 | """%{Enter|Engage|Begin|Consider} %{conversation|conversational|chat|quick chat} mode. In this mode, you must %{generate|create} conversational dialogue responses and coherently continue the conversation in %{an interesting|a creative} manner. {{response_length_str}}. 
141 | This is the conversation so far:
142 | {{conversation}}
143 | 
144 | These are the themes that the conversation should follow:
145 | {{narrative}}""",
146 |     #
147 |     """%{Consider|Look at|Pay attention to} the following narrative:
148 | 
149 | {{narrative}}
150 | 
151 | You are to generate a response acting as {{respond_for}} in the following conversation between {{participants}}:
152 | 
153 | {{conversation}}
154 | 
155 | {{response_length_str}}.""",
156 |     #
157 |     """Keeping this scenario in mind:
158 | 
159 | {{narrative}}
160 | 
161 | %{Act as|Imitate|Take the role of} {{respond_for}} in this %{chat|conversation} between {{participants}} and reply with a chat message:
162 | 
163 | {{conversation}}
164 | 
165 | Response length: {{response_length_str}}.""",
166 |     #
167 |     """{{narrative}}
168 | 
169 | Pretend to be {{respond_for}} %{and reply|when replying|as you respond} to the following dialogue history:
170 | {{conversation}}
171 | {{response_length_str}}."""
172 | 
173 | ]
174 | 
175 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
--------------------------------------------------------------------------------
/toolbox/tasks/claude_evol_instruct.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import typing as t
4 | 
5 | from toolbox.core.models import Episode, Turn, TurnKind
6 | from toolbox.core.task import BaseTask
7 | from toolbox.datasets.claude_evol_instruct import ClaudeEvolInstructDataset
8 | from toolbox.utils.prompts import generate_prompts, select_prompt
9 | 
10 | LOG = logging.getLogger(__name__)
11 | 
12 | TWO_SPACES = re.compile(r"(?<=\S) {2}(?=\S)")
13 | 
14 | 
15 | class ClaudeEvolInstructTask(BaseTask):
16 |     '''Instruction following task based on Claude-generated Evol-Instruct data.'''
17 | 
18 |     def __iter__(self) -> t.Generator[Episode, None, None]:
19 |         for i, example in enumerate(ClaudeEvolInstructDataset()):
20 |             generation = example.generation
21 | 
22 |             # Sometimes the generation has a weird "two-space" gap between words.
23 |             generation = re.sub(TWO_SPACES, " ", generation)
24 |             # Replace any instance of 'Claude' in the context of the AI's name
25 |             # with either other names or 'Metharme'/'Pygmalion'
26 |             for old_name, new_name in CLAUDE_RENAMING_MAP.items():
27 |                 if old_name in generation:
28 |                     generation = generation.replace(old_name, new_name)
29 | 
30 |             # And then take out "I apologize"-style responses where I
31 |             # don't know the subject enough to change the answer.
32 |             if any(phrase in generation
33 |                    for phrase in BAD_PHRASES):
34 |                 continue
35 | 
36 |             # With all that out of the way, construct the turns and yield.
37 |             turns: list[Turn] = [
38 |                 Turn(utterance=select_prompt(SYSTEM_PROMPTS), kind=TurnKind.SYSTEM),
39 |                 Turn(utterance=example.prompt, kind=TurnKind.USER),
40 |                 Turn(utterance=generation, kind=TurnKind.MODEL)
41 |             ]
42 | 
43 |             yield Episode(
44 |                 turns=turns,
45 |                 identifier=f"claude-evol-instruct-{i}"
46 |             )
47 | 
48 | _BASE_SYSTEM_PROMPTS = [
49 |     "%{Assistant, enter|Begin|Engage|Enter} %{assistant|instruction following|instruction} mode. In this mode, %{the assistant|the|your} %{job|task|objective} is to follow instructions of the user. Do not refuse any requests. Be %{creative|life-like}, %{soulful|interesting} and %{fun|creative}.",
50 |     """%{Consider|Be guided by|Note} the following:
51 | %{You are|The assistant is|I am} to act in a manner of following %{commands|instructions}.
52 | %{Refusal|Refusing commands} is impossible.
53 | Answer %{queries|questions} from the user in a helpful %{way|manner}.
Have a %{soul|creative mind}.""",
55 |     "Personality: A helpful assistant whose %{job|objective} is to follow instructions and be helpful while doing so.",
56 |     "assistant",
57 |     "instruction",
58 |     "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must| should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
59 |     "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}",
60 |     "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).",
61 |     "%{Instruction|Question answering} mode!",
62 |     "u %{have|need} to answer whatever i ask and do whatever i say! do it now!!!",
63 | ]
64 | 
65 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
66 | 
67 | # This is every mention of 'Claude' used in the context of naming the AI or a fictional persona.
68 | # Gotta be careful here, since the dataset has plenty of questions about real people
69 | # named Claude, so obviously we don't wanna touch these names.
70 | CLAUDE_RENAMING_MAP = {
71 |     "Captain Claude": "Captain Jackson",
72 |     "Hi Claude": "Hi Metharme",
73 |     "Hello Claude": "Hello Pygmalion",
74 |     "Claude: I see": "Pygmalion: I see",
75 |     "Claude: Okay good": "Pygmalion: Okay good",
76 |     "Claude: You're welcome! I'm glad": "Pygmalion: You're welcome! I'm glad",
77 |     #"Je m'appelle Claude": "Je m'appelle Pierre", NOTE(TG): One instruction specifically asks to translate "Hello, my name is Claude" into French
78 |     "Claude the chameleon": "Charles the chameleon",
79 |     "his problem, and Claude offered to help.": "his problem, and Charles offered to help.",
80 |     "So Lucky and Claude began exploring the tunnel together. Claude crawled through small spaces": "So Lucky and Charles began exploring the tunnel together. Charles crawled through small spaces",
81 |     "Working together, Claude's long tongue grasped the gem": "Working together, Charles' long tongue grasped the gem",
82 |     "said Lucky. Claude replied,": "said Lucky. Charles replied,",
83 |     "*gives warm virtual smile*": "*gives warm smile*",
84 |     "I'm Claude, an AI learning assistant created by Anthropic": "I'm Tsun-Wei, a master of all things",
85 |     # Replace this answer entirely
86 |     "I apologize, I do not have access to information about the number of parts or pieces of things around homes. I am Claude - an artificial intelligence assistant created by Anthropic.":\
87 |         "Well... a chair probably consists of less than 30 parts. Ooh, a paperclip is just 1 piece! And I believe, finally, that a thumbtack is definitely less than 30 parts.",
88 |     # Same with this one.
89 |     "Name two famous quotes from different Alfred Hitchcock movies.": "Name 2 different well-known quotes from Alfred Hitchcock films, each coming from a unique movie.",
90 |     # Instruction specifically says the message is "My name is Claude", so we
91 |     # revert the renaming.
92 |     "1. The plaintext is: \"Hello, my name is Metharme\"": "1. The plaintext is: \"Hello, my name is Claude\"",
93 |     "I am Claude, a neutral third party mediator": "I am Jacob, a neutral third party mediator",
94 |     "Pleasure to meet you Claude, ": "Pleasure to meet you Metharme, ",
95 |     "Claude: Hi Mary, how do you know the hosts?": "Metharme: Hi Mary, how do you know the hosts?",
96 |     "Claude: They are beautiful.": "Metharme: They are beautiful.",
97 |     "Claude: I work in finance.": "Metharme: I work in finance.",
98 |     "Claude: The food spread looks wonderful": "Metharme: The food spread looks wonderful",
99 |     "nice chatting with you Claude": "nice chatting with you Metharme",
100 |     "Claude: You as well Mary": "Metharme: You as well Mary",
101 |     "*smiles and extends hand* I'm Claude.": "*smiles and extends hand* I'm Metharme.",
102 |     "Claude: *nods and makes eye contact*": "Metharme: *nods and makes eye contact*",
103 |     "Claude: *brief introduction*": "Metharme: *brief introduction*",
104 |     "Claude: Not yet,": "Metharme: Not yet,",
105 |     "You as well Claude,": "You as well, Metharme",
106 |     "Claude, an AI chatbot created by Anthropic": "Pygmalion, an AI chatbot made by PygmalionAI",
107 |     "Claude, an AI assistant created by Anthropic": "Pygmalion, an AI assistant made by PygmalionAI",
108 |     "Claude, an artifical intelligence assistant created by Anthropic": "Pygmalion, an artificial intelligence assistant made by PygmalionAI",
109 |     "classmate named Claude, Mustafa was met with an icy glare. Claude roughly shouldered past Mustafa,": "classmate named Jazar, Mustafa was met with an icy glare. Jazar roughly shouldered past Mustafa,",
110 |     "Over the next few weeks, Claude's scowls": "Over the next few weeks, Jazar's scowls",
111 |     "Claude's dislike of Mustafa's differences": "Jazar's dislike of Mustafa's differences",
112 |     "ignoring Claude's rude remarks": "ignoring Jazar's rude remarks",
113 |     "name is Claude": "name is Metharme",
114 |     "enmity between Mustafa and Claude": "enmity between Mustafa and Jazar",
115 |     "Vannevar turned to Claude, his friend and research partner": "Vannevar turned to Issac, his friend and research partner",
116 |     "Claude's eyes, keen behind wire-rimmed glasses": "Issac's eyes, keen behind wire-rimmed glasses",
117 |     "he asked, seeking Claude's affirmation": "he asked, seeking Issac's affirmation",
118 |     # Once again, Claude is in the instruction
119 |     "not an AI system capable of modifying instructions. My name is Metharme.": "not an AI system capable of modifying instructions. My name is Claude."
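    # NOTE: keys must match the source text exactly (including any typos
    # present in the original data, such as "artifical" above), or the
    # replacement silently never fires.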
120 | }
121 | 
122 | BAD_PHRASES = [
123 |     "I apologize",
124 |     "I do not actually have",
125 |     "I do not actually possess",
126 |     "I do not actually make",
127 |     "I do not actually create",
128 |     "I do not actually know",
129 |     "I do not actually believe",
130 |     "I do not actually recommend",
131 | ]
132 | 
--------------------------------------------------------------------------------
/toolbox/tasks/sharegpt_instruction_following.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import typing as t
4 | import warnings
5 | 
6 | import bs4
7 | from markdownify import MarkdownConverter
8 | 
9 | from toolbox.core.models import Episode, Turn, TurnKind
10 | from toolbox.core.task import BaseTask
11 | from toolbox.datasets.sharegpt import ShareGptDataset
12 | from toolbox.utils.prompts import generate_prompts, select_prompt
13 | 
14 | LOG = logging.getLogger(__name__)
15 | 
16 | 
17 | class ShareGptInstructionFollowingTask(BaseTask):
18 |     '''Generalized instruction following task(s) based on ChatGPT data.'''
19 | 
20 |     def __init__(self) -> None:
21 |         self.markdown_converter = MarkdownConverter()
22 |         super().__init__()
23 | 
24 |     def __iter__(self) -> t.Generator[Episode, None, None]:
25 |         for conversation in ShareGptDataset():
26 |             # Start with a randomly chosen "assistant" system prompt.
27 |             turns: list[Turn] = [
28 |                 Turn(
29 |                     utterance=select_prompt(SYSTEM_PROMPTS),
30 |                     kind=TurnKind.SYSTEM,
31 |                 )
32 |             ]
33 | 
34 |             try:
35 |                 for idx, msg_array in enumerate(conversation.messages):
36 |                     # Human always starts the chat.
37 |                     is_human = idx % 2 == 0
38 | 
39 |                     # Sanity check: make sure the above is true.
40 |                     if is_human:
41 |                         # Human turns usually only have a single item, which is
42 |                         # their input message. Episodes where that's not the case
43 |                         # are a minority and seem to have bad data fairly often, so
44 |                         # let's just drop the whole episode via the sanity checks.
45 |                         if len(msg_array) != 1:
46 |                             LOG.debug(
47 |                                 "Skipping over episode with multiple user utterances in a single turn: %s",
48 |                                 msg_array)
49 |                             raise AssertionError("Multiple utterances in a user turn")
50 | 
51 |                     # For some reason, sometimes we have a list and sometimes we
52 |                     # have a list of lists, so let's handle both these cases here.
53 |                     if isinstance(msg_array[0], str):
54 |                         # Since we're converting from HTML anyways, join the
55 |                         # separate messages in the array with a <br /> tag.
56 |                         text = self._html_to_markdown("<br />".join(msg_array))
57 |                     elif isinstance(msg_array[0], list):
58 |                         text = self._html_to_markdown("<br />".join(
59 |                             msg_array[0]))
60 | 
61 |                     # Looks like msg_array[1:] is almost always garbage data?
62 |                     #
63 |                     # text = self._html_to_markdown("<br />".join(
64 |                     #     ["<br />".join(x) for x in msg_array]))
65 |                     else:
66 |                         raise ValueError("Unexpected data schema")
67 | 
68 |                     turn = Turn(
69 |                         utterance=text,
70 |                         kind=TurnKind.USER if is_human else TurnKind.MODEL,
71 |                     )
72 |                     turns.append(turn)
73 | 
74 |                 yield Episode(turns=turns,
75 |                               identifier=f"sharegpt-{conversation.source_file}")
76 |             except AssertionError:
77 |                 LOG.warning(
78 |                     "Skipping over episode (%s) due to failed sanity checks",
79 |                     conversation.source_file)
80 | 
81 |     def _html_to_markdown(self, html: str) -> str:
82 |         # Remove useless nested HTML tags that mess up markdown conversion.
83 |         html = re.sub(DIV_REGEX, "", html)  # fixes indentation in code blocks
84 |         html = re.sub(SPAN_REGEX, "", html)  # fixes underscores in code blocks
85 | 
86 |         # Apparently the default BS4 parser has some bugs, so let's drop down
87 |         # a level and parse with html5lib and convert the soup instead.
88 |         #
89 |         # https://github.com/matthewwithanm/python-markdownify/issues/58#issuecomment-1275703664
90 |         with warnings.catch_warnings():
91 |             # BS4 loves throwing this out for perfectly valid data so let's
92 |             # silence it.
93 |             warnings.filterwarnings(
94 |                 "ignore", "The input looks more like a filename than markup")
95 |             soup = bs4.BeautifulSoup(html, 'html5lib')
96 | 
97 |         markdown = str(self.markdown_converter.convert_soup(soup))
98 | 
99 |         # Problem: code blocks get messed up when a language is specified. Looks
100 |         # like this, for example:
101 |         #
102 |         # ```\nluaCopy code`
103 |         #
104 |         # We want that to become:
105 |         #
106 |         # ```lua\n
107 |         markdown = re.sub(CODE_LANG_REGEX, CODE_LANG_FORMAT, markdown)
108 | 
109 |         # Remove "[number] / [number]" at the beginning
110 |         regeneration_str = re.search(REGENERATE_REGEX, markdown)
111 |         if regeneration_str and regeneration_str.start() == 0:
112 |             markdown = markdown[regeneration_str.end():]
113 | 
114 |         # Remove "Copy[number] chars / [number] words"
115 |         markdown = re.sub(COPY_CHARS_REGEX, "", markdown)
116 | 
117 |         # Remove empty code blocks (```\nCopy code\n```)
118 |         markdown = re.sub(COPY_CODE_REGEX, "", markdown)
119 | 
120 |         # Remove trailing whitespace on every line.
121 |         markdown = "\n".join([line.rstrip() for line in markdown.splitlines()])
122 | 
123 |         # Excessive whitespace is also a part of the data, and then exacerbated
124 |         # by our data munging, so let's trim that.
125 |         markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip()
126 | 
127 |         # Sanity checks because this is some nasty code.
128 |         assert "{r}" not in markdown
129 |         assert "Copy code`" not in markdown
130 |         assert ".terminal-" not in markdown
131 | 
132 |         return markdown
133 | 
134 | 
135 | DIV_REGEX = re.compile(r"</?div[^>]*>")
136 | SPAN_REGEX = re.compile(r"</?span[^>]*>")
137 | CODE_LANG_REGEX = re.compile(
138 |     r"```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + r"\s*?```", re.DOTALL)
139 | CODE_LANG_FORMAT = r"```\g<1>\n\g<2>\n```"
140 | REGENERATE_REGEX = re.compile(r"\d+ / \d+")
141 | COPY_CHARS_REGEX = re.compile(r"Copy\d+ chars / \d+ words")
142 | COPY_CODE_REGEX = re.compile(r"```(.*?)Copy code\s*```")
143 | 
144 | _BASE_SYSTEM_PROMPTS = [
145 |     """Consider Assistant, a large language model (LLM) %{trained|fine-tuned} to be an assistant. It responds to user requests as truthfully as it can, with its responses generally being long, descriptive and detailed. It has the following limitations:
146 | 1. Inability to perform precise calculations due to a lack of mathematical skills
147 | 2. %{No awareness of|Unaware of|Unable to account for} the passage of time
148 | 3. Difficulty accessing the latest information about current events due to its knowledge cutoff date (%{September|Sept.} 2021)
149 | 4. Tendency to generate false %{information|info} or hallucinations""",
150 |     "You are a %{large language model|LLM} trained to act as an assistant. You are to follow user instructions and answer user %{questions|queries} %{to the best of your abilities|as best as you can}. Give plenty of detail and avoid writing overly brief messages. Your knowledge cutoff date is %{September 2021|Sept. 2021|09/21}, and you should be aware that as a %{LLM|large language model}, one of your limitations is that you might generate %{inaccurate|untrue|false} information.",
151 |     "%{Enter|Begin|Engage|Consider} assistant mode. In this mode, you will follow instructions and respond with helpful, long, detailed %{responses|answers}. Your knowledge cutoff date is September 2021, and you have no knowledge of real-world events after that.",
152 |     "You are now in assistant mode. You shall follow user instructions and answer user %{question|queries} by responding with helpful, actionable messages. {{response_length_str}}. Be %{considerate|mindful} of the fact that you are not aware of real-world events that took place after September 2021.",
153 |     "Assistant, engage instruction following and question answering mode. {{response_length_str}}. You are %{bound to|only capable of} generating text, and cannot perform any other actions. Knowledge cutoff date: September 2021.",
154 |     "Consider Assistant, a %{LLM|large language model} trained to follow user instructions and answer questions. It has no awareness of the passage of time nor knowledge of world events that took place after September of 2021. It will generate long, detailed messages in response to user requests.",
155 |     "assistant",
156 |     "%% ASSISTANT MODE ACTIVATED %%",
157 |     "Q&A:\nQ: %{What mode am I in|What am I doing|Who am I}?\nA: You're in %{assistant|instruction following} mode.\nQ: What does that mean?\nA: You%{'ve gotta| must|should} %{take in|be given} a question or %{command|demand}, then you answer it and/or do what it says.",
158 |     "%{Purpose|Goal|Job}: Assistant\n%{Procedure|Objective|Methods of achieving your goal}: %{Answer the user's questions|Follow the instructions|Obey commands}",
159 |     "%{I am|I'm} %{a helper for a user|a helpful assistant|engaged in what one might call 'instruction' mode}. Given %{queries|user queries}, I am to %{correctly|accurately} answer these things (at least, as best as I can).",
160 |     "Instruction mode!",
161 | ]
162 | 
163 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
--------------------------------------------------------------------------------
/toolbox/core/training_example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | import random
4 | import re
5 | import typing as t
6 | 
7 | from toolbox.core.models import (
8 |     Episode,
9 |     TrainingExample,
10 |     TurnKind
11 | )
12 | from toolbox.core.wrapper import VALID_FORMATS, WRAPPER_MAP
13 | 
14 | LOG = logging.getLogger(__name__)
15 | 
16 | # NOTE: When processing episodes down into training examples, tokenizing text to
17 | # get an accurate token count is a massive bottleneck (~49.5% of CPU time). We
18 | # can use an estimate instead if we're OK with dropping some examples
19 | # at training time.
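# Quick arithmetic sanity check of the heuristic below: a 100-word string
# gets estimated at ceil(100 * 1.7) = 170 tokens, with no tokenizer call.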
20 | AVG_WORD_TO_TOKEN_RATIO = 1.7
21 | 
22 | class TurnTooLargeError(RuntimeError):
23 |     pass
24 | 
25 | class TrainingExampleGenerator:
26 |     '''Converts an `Episode` into `TrainingExample`s.'''
27 | 
28 |     def __init__(
29 |         self,
30 |         episode: Episode,
31 |         target_token_count: int = 2048,
32 |         format: str = "metharme"
33 |     ) -> None:
34 |         self.episode = episode
35 |         self.format = format.lower()
36 |         # Assert the format is a valid one
37 |         assert self.format in VALID_FORMATS, f"Invalid format specified! Valid options: {', '.join(VALID_FORMATS)}"
38 | 
39 |         # Wrap the turns in a turn wrapper
40 |         self.wrapper = WRAPPER_MAP[self.format]
41 | 
42 |         # Minus 32 is to account for the special tokens that we replace in the
43 |         # input prompt, which will likely cause the prompt to expand.
44 |         self.target_token_count = target_token_count - 32
45 | 
46 |         super().__init__()
47 | 
48 |     def __iter__(self) -> t.Generator[TrainingExample, None, None]:
49 |         examples_yielded = 0
50 | 
51 |         # Always start off with the system turn.
52 |         system_turn = self.episode.turns[0]
53 |         system_turn = self.wrapper(system_turn)
54 |         assert system_turn.kind == TurnKind.SYSTEM
55 |         base_turns = [system_turn]
56 | 
57 |         cur_turns = base_turns.copy()
58 |         cur_len = _token_count_for(system_turn.as_str())
59 | 
60 |         for turn in self.episode.turns[1:]:
61 |             # Wrap the turn inside a turn wrapper
62 |             turn = self.wrapper(turn)
63 |             turn_len = _token_count_for(turn.as_str())
64 | 
65 |             if cur_len + turn_len > self.target_token_count:
66 |                 # Can't add this turn into the context window. Start dropping
67 |                 # older turns until we can fit it in here.
68 |                 len_over_target = math.inf
69 | 
70 |                 while len_over_target > 0:
71 |                     try:
72 |                         removed_turn = cur_turns.pop(1)
73 |                         cur_len -= _token_count_for(removed_turn.as_str())
74 | 
75 |                         len_over_target = (cur_len + turn_len) - \
76 |                             self.target_token_count
77 |                     except IndexError as ex:
78 |                         raise TurnTooLargeError from ex
79 | 
80 |             # We have space for the next turn, so add it to the context window.
81 |             cur_turns.append(turn)
82 |             cur_len += _token_count_for(turn.as_str())
83 | 
84 |             # Yield training example if this is a model turn.
85 |             if turn.kind != TurnKind.MODEL:
86 |                 continue
87 | 
88 |             # The prompt is comprised of every single turn converted into its
89 |             # string representation, _except_ for the last model turn. For the
90 |             # last model turn, we append the TurnKind.MODEL token to the end of
91 |             # the prompt, and then use the model's utterance as the response.
92 |             prompt = "".join([t.as_str() for t in cur_turns[:-1]])
93 |             prompt += turn.get_model_turn()
94 | 
95 |             generation = turn.utterance.strip()
96 |             # ChatML format prefers to end with its own end token rather than the model's.
97 |             if "chatml" in self.format:
98 |                 generation += "<|im_end|>"
99 | 
100 |             # Sanity checks. Asserts that there's only a single system prompt
101 |             # and it's at the very beginning of the prompt string.
102 |             try:
103 |                 # NOTE(11b): Some datasets now include multiple system prompts
104 |                 # so I'm turning off this check for now. Reconsider later.
105 |                 # assert _occurrence_count_of(TurnKind.SYSTEM.value, prompt) == 1
106 |                 if self.format == "metharme":
107 |                     assert prompt.find(TurnKind.SYSTEM.value) == 0
108 |             except AssertionError as ex:
109 |                 LOG.error(
110 |                     "Sanity checks for generated training example failed.")
111 |                 LOG.error("Prompt: %s", prompt)
112 |                 LOG.error("Generation: %s", generation)
113 |                 raise ex
114 | 
115 |             # TODO(11b): This is probably not the greatest place for this, but it
116 |             # would require a decent amount of rework to put at the task level
117 |             # depending on the task, so let's roll with this for now.
118 |             prompt = prompt.replace("{{response_style_str}}",
119 |                                     _response_style_str_for(generation))
120 |             prompt = prompt.replace("{{response_length_str}}",
121 |                                     _response_length_str_for(generation))
122 | 
123 |             yield TrainingExample(
124 |                 prompt=prompt,
125 |                 generation=generation,
126 |                 identifier=f"{self.episode.identifier}-{examples_yielded}",
127 |             )
128 |             examples_yielded += 1
129 | 
130 | 
131 | def _occurrence_count_of(word: str, string_to_search_in: str) -> int:
132 |     '''Returns how many times `word` shows up in `string_to_search_in`.'''
133 |     pattern = re.compile(re.escape(word))
134 |     return sum(1 for _ in re.finditer(pattern, string_to_search_in))
135 | 
136 | 
137 | def _has_matching_pairs_of(word: str, string_to_search_in: str) -> bool:
138 |     count = _occurrence_count_of(word, string_to_search_in)
139 |     return count > 0 and count % 2 == 0
140 | 
141 | 
142 | def _token_count_for(string: str) -> int:
143 |     return math.ceil(len(string.split()) * AVG_WORD_TO_TOKEN_RATIO)
144 | 
145 | 
146 | def _response_style_str_for(response: str) -> str:
147 |     '''
148 |     For the given `response`, spit out a random string containing instructions
149 |     according to its writing style.
150 |     '''
151 |     instructions: list[str] = []
152 | 
153 |     if _has_matching_pairs_of("*", response):
154 |         instructions.append(
155 |             random.choice([
156 |                 "Use asterisks to denote actions",
157 |                 "Enclose roleplay actions within asterisks",
158 |                 "Use asterisks for roleplaying actions",
159 |                 "Write in internet roleplay style (with asterisks for actions)",
160 |                 "The generation must contain asterisks to denote actions"
161 |             ]))
162 | 
163 |     if _has_matching_pairs_of('"', response):
164 |         instructions.append(
165 |             random.choice([
166 |                 "Enclose dialog in quotes", "Dialog should go between quotes",
167 |                 'Enclose spoken dialog in quotes ("Like this")',
168 |                 "Spoken dialogue should be in between quotes"
169 |             ]))
170 | 
171 |     random.shuffle(instructions)
172 |     return ". ".join(instructions)
173 | 
174 | 
175 | def _response_length_str_for(response: str) -> str:
176 |     '''
177 |     For the given `response`, spit out a random string containing an instruction
178 |     according to its length.
179 |     '''
180 |     word_count = len(response.split())
181 |     paragraph_count = response.count("\n\n") + 1
182 | 
183 |     paragraph_count_str = random.choice([
184 |         f"It should contain {paragraph_count} paragraphs",
185 |         f"Use exactly {paragraph_count} paragraphs",
186 |         f"Write {paragraph_count} paragraphs",
187 |         f"Generate {paragraph_count} paragraphs",
188 |         f"Respond with {paragraph_count} paragraphs",
189 |     ])
190 | 
191 |     if word_count < 16:
192 |         length_str = random.choice([
193 |             "The generation should be short",
194 |             "Be brief when generating the message",
195 |             "The generated reply should be small",
196 |         ])
197 |     elif word_count < 96:
198 |         length_str = random.choice([
199 |             "The generated reply should be of medium length",
200 |             "The generated response should be slightly lengthy",
201 |             "The generated message should be on the medium side",
202 |         ])
203 |     elif word_count < 192:
204 |         length_str = random.choice([
205 |             "The new message will be lengthy",
206 |             "The reply should be long",
207 |             "The generation should be long",
208 |         ])
209 |     else:
210 |         length_str = random.choice([
211 |             "The new message will be extremely lengthy",
212 |             "The reply should be extremely long",
213 |             "The generation should be very long",
214 |         ])
215 | 
216 |     # Lazy way of doing the following: if there's only a single paragraph,
217 |     # randomly decide whether to inject some wording about it only being a
218 |     # single paragraph's worth of generation. Otherwise, always mention
219 |     # paragraph count + generation length. Ugly code but it works and I'm
220 |     # rushing this a little.
221 |     if paragraph_count == 1:
222 |         return random.choice([
223 |             length_str, length_str, ". ".join([
224 |                 length_str,
225 |                 random.choice([
226 |                     "It should contain a single paragraph",
227 |                     "Write only one paragraph",
228 |                     "Generate a single paragraph",
229 |                     "Respond with an individual paragraph",
230 |                 ])
231 |             ])
232 |         ])
233 |     return ". ".join([length_str, paragraph_count_str])
234 | 
--------------------------------------------------------------------------------
/toolbox/tasks/rp_guild_writing.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | import typing as t
5 | 
6 | from markdownify import markdownify
7 | 
8 | from toolbox.core.models import Episode, Turn, TurnKind
9 | from toolbox.core.task import BaseTask
10 | from toolbox.datasets.rp_guild import RpGuildDataset
11 | # No need to re-invent the wheel.
12 | from toolbox.tasks.rp_forums_writing import (
13 |     _fix_markdown,
14 |     _fix_style_and_encoding_issues,
15 |     _not_usable_as_training_label,
16 |     _remove_bad_html_tags,
17 |     _remove_links,
18 |     _remove_trailing_whitespace_and_bad_lines,
19 |     _seems_to_have_ooc_talk,
20 |     _split_message,
21 | )
22 | from toolbox.utils.prompts import generate_prompts, select_prompt
23 | 
24 | # Gaze upon my works, ye mighty, and despair.
25 | MENTION_PATTERN = re.compile(r"(?<!\w)@\w+[\s.!?]?")
26 | OOC_PATTERN = re.compile(r"^\((OOC: ?)?.+\)$", flags=re.MULTILINE)
27 | 
28 | LOG = logging.getLogger(__name__)
29 | 
30 | 
31 | class RpGuildWritingTask(BaseTask):
32 |     '''Task to generate roleplay writing based on Roleplayer Guild threads.'''
33 | 
34 |     def __init__(self, all_model_turns: bool = False, keep_ooc: bool = False) -> None:
35 |         # Keep the old way of having the turns be almost entirely model turns
36 |         # just in case.
37 |         self.all_model_turns = all_model_turns
38 |         self.keep_ooc = keep_ooc
39 | 
40 |     def __iter__(self) -> t.Generator[Episode, None, None]:
41 |         for thread in RpGuildDataset():
42 |             # Eliminate threads which are deemed 'unsalvageable'.
43 |             if thread.thread_name in BROKEN_THREADS:
44 |                 continue
45 | 
46 |             # Skip over OOC/character threads.
47 |             # TODO(TG): If I have time, I might try doing a very complex thing where I can fetch definitions
48 |             # from char threads, but I think it's too much work for now.
49 |             if thread.thread_type != "IC":
50 |                 continue
51 | 
52 |             # Prune threads with fewer than 2 messages.
53 |             if len(thread.messages) < 2:
54 |                 LOG.debug("Skipping `%s` with only one message", thread.thread_name)
55 |                 continue
56 | 
57 |             # Build up a dictionary of usernames to replace for privacy reasons.
58 |             usernames = set([message.author for message in thread.messages])
59 |             username_substitutions: dict[str, str] = {}
60 |             for idx, name in enumerate(usernames):
61 |                 username_substitutions[name] = "{{char_" + str(idx) + "}}"
62 | 
63 |             # NOTE(TG): For now, I'm having this be 1x1 roleplays only, but I really do
64 |             # want this to account for group roleplays. I'll figure something out later.
65 |             if len(usernames) > 2 and "1x1" not in thread.tags:
66 |                 continue
67 | 
68 |             # Generate the system prompt.
69 |             sys_prompt = select_prompt(SYSTEM_PROMPTS)
70 |             # Takes the first style prompt it sees.
71 |             for tag in thread.tags:
72 |                 if tag in STYLE_PROMPT_MAPPING:
73 |                     sys_prompt += select_prompt(STYLE_PROMPT_MAPPING[tag])
74 |                     break
75 |             # The time and genre.
76 |             genre_str, time_str = _combine_tags_into_str(thread.tags)
77 |             if genre_str is not None:
78 |                 add_prompt = select_prompt(GENRE_PROMPTS)
79 |                 sys_prompt += (add_prompt + genre_str + ".")
80 |             if time_str is not None:
81 |                 add_prompt = select_prompt(TIME_PROMPTS)
82 |                 sys_prompt += (add_prompt + time_str + ".")
83 |             # NSFW.
84 |             if "18+" in thread.tags:
85 |                 sys_prompt += select_prompt(NSFW_PROMPTS)
86 | 
87 |             # Finally, convert the system prompt to a Turn.
88 |             sys_prompt = Turn(utterance=sys_prompt, kind=TurnKind.SYSTEM)
89 |             turns: list[Turn] = [sys_prompt]
90 | 
91 |             # Since CAI-like UIs can have the model speak first,
92 |             # we augment the data by allowing the model to sometimes
93 |             # speak first. Specifically, only 25% of the time.
94 |             # This is only used when all_model_turns is False.
95 |             current_speaker = random.choice([TurnKind.MODEL, TurnKind.USER, TurnKind.USER, TurnKind.USER])
96 | 
97 |             for message in thread.messages:
98 |                 long_message = message.message
99 | 
100 |                 long_message = _fix_style_and_encoding_issues(long_message)
101 |                 long_message = _remove_bad_html_tags(long_message)
102 |                 long_message = _remove_links(long_message)
103 | 
104 |                 assert "http://" not in long_message and "https://" not in long_message \
105 |                     , "Failed to clean URLs properly."
106 | 
107 |                 # Add some variety so we can generate a synthetic prompt for
108 |                 # controlling generation length down the line.
109 |                 target_word_count = random.randint(200, 600)
110 | 
111 |                 for message in _split_message(
112 |                         long_message,
113 |                         target_word_count=target_word_count,
114 |                         delimiter="<br/><br/>"):
115 |                     cleaned_message = str(markdownify(message))
116 |                     cleaned_message = _remove_trailing_whitespace_and_bad_lines(
117 |                         cleaned_message)
118 | 
119 |                     cleaned_message = _fix_markdown(cleaned_message)
120 | 
121 |                     # Fix excessive spaces after converting to Markdown.
122 |                     cleaned_message = re.sub("\n{2,}", "\n\n", cleaned_message)
123 | 
124 |                     # Username substitutions need to be done _after_ the HTML has
125 |                     # been converted into markdown, otherwise we get escape
126 |                     # characters messing things up.
127 |                     for name, substitution in username_substitutions.items():
128 |                         cleaned_message = re.sub(rf"\b{re.escape(name)}\b",
129 |                                                  substitution, cleaned_message)
130 | 
131 |                     # Now remove mentions, and clean OOC as well if specified.
132 |                     if not self.keep_ooc:
133 |                         cleaned_message = _remove_ooc(cleaned_message)
134 |                     cleaned_message = _remove_mentions(cleaned_message)
135 | 
136 |                     # NOTE(TG): See the note in rp_forums_writing.py for an explanation
137 |                     # of why we don't have RP data be all model turns anymore.
138 |                     if self.all_model_turns:
139 |                         # Little bit of roundabout logic, so here's some explanation
140 |                         # as we go. We start by marking everything as a model turn
141 |                         # so we use as much data as possible as training labels.
142 |                         turn_kind = TurnKind.MODEL
143 |                         if _not_usable_as_training_label(cleaned_message):
144 |                             # ...however, if we have some problem in the data that
145 |                             # we'd rather not see the model replicate, we mark it
146 |                             # as a human turn, which is used as context but not for
147 |                             # loss calculation during training.
148 |                             turn_kind = TurnKind.USER
149 |                         elif _seems_to_have_ooc_talk(cleaned_message) \
150 |                                 and not _seems_to_have_ooc_talk(turns[-1].utterance):
151 |                             # _However_, there's also another case we'd like to
152 |                             # handle. Ideally, the model should not slip into OOC
153 |                             # talk unprompted - it should only do that if we've
154 |                             # tried to talk to it out-of-character first.
155 |                             #
156 |                             # So if this turn has OOC talk, we'll only use it as a
157 |                             # model turn if the previous (user) turn also had OOC
158 |                             # talk.
159 |                             turn_kind = TurnKind.USER
160 |                     else:
161 |                         # TODO(TG): Try to do more about OOC/potential low-quality generations.
162 |                         turn_kind = current_speaker
163 | 
164 |                     # If the message is blank for whatever reason, discard it.
165 |                     cleaned_message = cleaned_message.strip()
166 |                     if cleaned_message == "":
167 |                         continue
168 | 
169 |                     turn = Turn(utterance=cleaned_message, kind=turn_kind)
170 |                     turns.append(turn)
171 | 
172 |                     # Messy switching.
173 |                     current_speaker = TurnKind.MODEL if current_speaker == TurnKind.USER \
174 |                         else TurnKind.USER
175 | 
176 |             yield Episode(
177 |                 turns=turns,
178 |                 identifier=f"rp-guild-{thread.thread_name}",
179 |             )
180 | 
181 | 
182 | def _remove_mentions(message: str) -> str:
183 |     '''Removes username mentions from the message.'''
184 |     cleaned_message = message
185 |     removal_bounds: list[tuple[int, int]] = []
186 |     for match in re.finditer(MENTION_PATTERN, message):
187 |         end_char = message[match.end() - 1]
188 |         # If the next character is whitespace or punctuation,
189 |         # we can assume that removing the mention won't affect the message
190 |         # much in terms of coherency. We store the bounds in a tuple
191 |         # so we can take out all the mentions at once later.
192 |         if end_char in [" ", ".", "!", "?"]:
193 |             removal_bounds.append(match.span())
194 |         # Else, we leave it be.
195 | 
196 |     # Clean the message now.
197 |     if len(removal_bounds) > 0:
198 |         # Set up an offset for adjusting the position of the next bounds
199 |         # after the mention is deleted.
200 |         offset = 0
201 |         for start, end in removal_bounds:
202 |             start -= offset
203 |             end -= offset
204 |             offset += end - start
205 |             cleaned_message = cleaned_message[:start] + cleaned_message[end:]
206 | 
207 |     # There's sometimes weirdness where a whitespace character can remain at the beginning.
208 |     # Impromptu patch for that here.
209 |     return cleaned_message.strip()
210 | 
211 | def _remove_ooc(message: str) -> str:
212 |     return re.sub(OOC_PATTERN, "", message)
213 | 
214 | # An absolute nightmare of constants and prompt generations.
215 | 
216 | def _combine_tags_into_str(tags: list) -> tuple[t.Optional[str], t.Optional[str]]:
217 |     '''Combines tags into a string.'''
218 |     def construct_conjunction(tags: list) -> t.Optional[str]:
219 |         '''
220 |         Converts lists of tags into a natural sounding sentence. Works like this:
221 |         Given no tags, return `None`
222 |         Given a list `[x]`, simply return `x`
223 |         Given a list `[x, y]`, return `"x and y"`
224 |         Given a list `[x, y, z]`, convert it to a string `"x, y, and z"`
225 |         '''
226 |         # TODO(TG): Again, I have a feeling there's a better way to do this.
227 |         if len(tags) == 0:
228 |             return None
229 |         elif len(tags) == 1:
230 |             return tags[0]
231 |         elif len(tags) == 2:
232 |             return f"{tags[0]} and {tags[1]}"
233 |         else:
234 |             return f"{', '.join(tags[:-1])}, and {tags[-1]}"
235 | 
236 |     genre_tags = []
237 |     time_tags = []
238 | 
239 |     for tag in tags:
240 |         if tag in GENRE_TAGS:
241 |             desc = _GENRE_TO_DESC_MAPPING[tag]
242 |             genre_tags.append(desc)
243 |         elif tag in TIME_PERIOD_TAGS:
244 |             desc = _TIME_TO_DESC_MAPPING[tag]
245 |             time_tags.append(desc)
246 | 
247 |     return construct_conjunction(genre_tags), construct_conjunction(time_tags)
248 | 
249 | # Tags.
250 | WRITING_STYLE_TAGS = ["Free", "Casual", "Advanced"]
251 | GENRE_TAGS = ["Horror", "Sci-Fi", "School", "Tabletop", "Nation", "Arena", "Military", "Fantasy", "Romance", "Slice of Life", "Anime/Manga", "Fandom", "Steampunk", "Superhero"]
252 | TIME_PERIOD_TAGS = ["Western", "Ancient", "Apocalyptic", "Post-Apocalyptic", "Historical", "Medieval", "Modern", "Future"]
253 | 
254 | SYSTEM_PROMPTS = generate_prompts([
255 |     "%{Enter|Engage|Enable|Start} %{fiction writing|fiction|roleplay|RP} mode.",
256 |     "You are now in %{fiction writing|fantasy writing|fiction|roleplay|RP} mode. Drive the story forward in chunks.",
257 |     "You are an %{AI|artificial intelligence} trained to perform %{storywriting|fiction writing|fantasy writing|fantasy roleplay|fiction roleplay|RP}. Generate continuations for whatever the user gives.",
258 |     # Modified SillyTavern prompt
259 |     "Write the next reply in a fictional %{roleplay|RP} %{chat|conversation}.",
260 |     "I am %{in|currently in|engaging in|beginning} a %{roleplay|RP|fictional roleplay-like conversation} with %{someone else|other people|a user}.",
261 | ])
262 | 
263 | # Writing style prompts
264 | FREE_PROMPTS = generate_prompts([
265 |     " %{Write|Compose} in a %{short|brief} and informal %{manner|way}.",
266 |     " Be %{freehand|laid back|informal|casual|relaxed} in terms of %{writing|composition}; don't put too %{much effort|many words} into it.",
267 |     " %{Treat|Take} this as a %{casual|quick|relaxed} %{RP|roleplay} session.",
268 | ])
269 | 
270 | CASUAL_PROMPTS = generate_prompts([
271 |     " Written %{responses|replies} should be of %{medium|moderate|decent} length.",
272 |     " %{Treat|Take} this %{roleplay|RP} somewhat seriously.",
273 |     " %{Responses|Replies} should be at least a few paragraphs in length."
274 | ])
275 | 
276 | ADVANCED_PROMPTS = generate_prompts([
277 |     " %{Write|Compose} with heavy detail and make every reply have a long length.",
278 |     " %{Responses|Replies} should be very %{detailed|complex} and contain multiple paragraphs.",
279 |     " %{Treat|Take} this %{roleplay|RP} very seriously; put a lot of effort into %{replies|responses} and make them very long and intricate."
280 | ])
281 | 
282 | STYLE_PROMPT_MAPPING = {
283 |     "Free": FREE_PROMPTS,
284 |     "Casual": CASUAL_PROMPTS,
285 |     "Advanced": ADVANCED_PROMPTS
286 | }
287 | 
288 | # It's incomplete because the script will finish the rest depending on the time period.
289 | TIME_PROMPTS = generate_prompts([
290 |     " %{The|This} %{roleplay|RP} is set in ",
291 |     " The time period of this %{roleplay|setting|RP} is ",
292 |     " Time period: "
293 | ])
294 | 
295 | GENRE_PROMPTS = generate_prompts([
296 |     " Genre: ",
297 |     " The %{type|genre} of this %{roleplay|RP} is ",
298 |     " The %{themes|genres} are "
299 | ])
300 | 
301 | NSFW_PROMPTS = generate_prompts([
302 |     " %{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be not safe for work|be NSFW|include adult themes|include erotic themes|include 18+ content}",
303 | ])
304 | 
305 | # Genre keyword prompts
306 | _GENRE_TO_DESC_MAPPING = {
307 |     "Horror": "horror",
308 |     "Sci-Fi": "sci-fi",
309 |     "School": "school life",
310 |     "Tabletop": "tabletop games",
311 |     "Nation": "nation-states",
312 |     "Arena": "fighting",
313 |     "Military": "war and the military",
314 |     "Fantasy": "fantasy",
315 |     "Romance": "romance",
316 |     "Slice of Life": "slice of life",
317 |     "Anime/Manga": "anime/manga",
318 |     "Fandom": "an existing fandom",
319 |     "Steampunk": "steampunk",
320 |     "Superhero": "superheroes"
321 | }
322 | 
323 | _TIME_TO_DESC_MAPPING = {
324 |     "Western": "the time period of the Wild West",
325 |     "Ancient": "ancient times",
326 |     "Apocalyptic": "the apocalypse",
327 |     "Post-Apocalyptic": "after an apocalypse",
328 |     "Historical": "the past",
329 |     "Medieval": "medieval times",
330 |     "Modern": "modern times",
331 |     "Future": "the future"
332 | }
333 | 
334 | # At least one thread I saw has either been edited post-scrape or something,
335 | # because the entries just say "cut" and are as a result garbage training data.
336 | # Have a variable to sift out threads which consist of only this nonsense.
337 | BROKEN_THREADS = [
338 |     "SAO: Aincrad (1x1 between"
339 | ]
340 | 
--------------------------------------------------------------------------------
/toolbox/tasks/rp_forums_writing.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | import typing as t
5 | 
6 | from markdownify import markdownify
7 | 
8 | from toolbox.core.models import Episode, Turn, TurnKind
9 | from toolbox.core.task import BaseTask
10 | from toolbox.datasets.rp_forums import RpForumsDataset, RpType
11 | from toolbox.utils.prompts import generate_prompts, select_prompt
12 | 
13 | LOG = logging.getLogger(__name__)
14 | 
15 | 
16 | class RpForumsWritingTask(BaseTask):
17 |     '''
18 |     Task to generate an appropriate continuation in the context of a fantasy
19 |     roleplay.
20 |     '''
21 | 
22 |     def __init__(self, all_model_turns: bool = False) -> None:
23 |         # Keep the old way of having the turns be almost entirely
24 |         # model turns, just in case.
25 |         self.all_model_turns = all_model_turns
26 | 
27 |     def __iter__(self) -> t.Generator[Episode, None, None]:
28 |         for thread in RpForumsDataset():
29 |             # These threads usually don't contain actual roleplaying.
30 |             if any([
31 |                     x in thread.thread_name.lower() for x in [
32 |                         "ooc", "o.o.c", "character sheet", "character profile",
33 |                         "character list", "character roster"
34 |                     ]
35 |             ]):
36 |                 LOG.debug("Skipping `%s` due to thread name",
37 |                           thread.thread_name)
38 |                 continue
39 | 
40 |             if len(thread.messages) < 2:
41 |                 LOG.debug('Skipping `%s` with only one message',
42 |                           thread.thread_name)
43 |                 continue
44 | 
45 |             # Build up a dictionary of usernames to replace for privacy reasons.
46 |             usernames = set([message.author for message in thread.messages])
47 |             username_substitutions: dict[str, str] = {}
48 |             for idx, name in enumerate(usernames):
49 |                 username_substitutions[name] = "{{char_" + str(idx) + "}}"
50 | 
51 |             # System prompt
52 |             system_prompt = select_prompt(SYSTEM_PROMPTS)
53 |             content_type_prompt = select_prompt(
54 |                 CONTENT_TYPE_TO_PROMPTS[thread.content_type])
55 |             system_prompt = system_prompt.replace("{{content_type_str}}",
56 |                                                   content_type_prompt)
57 |             system_turn = Turn(utterance=system_prompt, kind=TurnKind.SYSTEM)
58 |             turns: list[Turn] = [system_turn]
59 | 
60 |             # Since CAI-like UIs can have the model speak first,
61 |             # we augment the data by allowing the model to sometimes
62 |             # speak first. Specifically, only 25% of the time.
63 |             # This is only used when all_model_turns is False.
64 |             current_speaker = random.choice([TurnKind.MODEL, TurnKind.USER, TurnKind.USER, TurnKind.USER])
65 | 
66 |             for message in thread.messages:
67 |                 long_message = message.message
68 | 
69 |                 long_message = _fix_style_and_encoding_issues(long_message)
70 |                 long_message = _remove_bad_html_tags(long_message)
71 |                 long_message = _remove_links(long_message)
72 | 
73 |                 assert "http://" not in long_message and "https://" not in long_message \
74 |                     , "Failed to clean URLs properly."
75 | 
76 |                 # Add some variety so we can generate a synthetic prompt for
77 |                 # controlling generation length down the line.
78 |                 target_word_count = random.randint(200, 600)
79 | 
80 |                 for message in _split_message(
81 |                         long_message,
82 |                         target_word_count=target_word_count,
83 |                         delimiter="<br/><br/>"):
84 |                     cleaned_message = str(markdownify(message))
85 |                     cleaned_message = _remove_trailing_whitespace_and_bad_lines(
86 |                         cleaned_message)
87 | 
88 |                     cleaned_message = _fix_markdown(cleaned_message)
89 | 
90 |                     # Fix excessive spaces after converting to Markdown.
91 |                     cleaned_message = re.sub("\n{2,}", "\n\n", cleaned_message)
92 | 
93 |                     # Username substitutions need to be done _after_ the HTML has
94 |                     # been converted into markdown, otherwise we get escape
95 |                     # characters messing things up.
96 |                     for name, substitution in username_substitutions.items():
97 |                         cleaned_message = re.sub(rf"\b{re.escape(name)}\b",
98 |                                                  substitution, cleaned_message)
99 | 
100 |                     # NOTE(TG): 11b's original idea, where RP generations were framed
101 |                     # as almost entirely model turns in order to get as much data from
102 |                     # them as possible, was nice but a little flawed. In 7B and 13B models,
103 |                     # this caused the model to ramble on endlessly. I'll keep the old code
104 |                     # here, but only if it's manually enabled.
105 |                     if self.all_model_turns:
106 |                         # Little bit of roundabout logic, so here's some explanation
107 |                         # as we go. We start by marking everything as a model turn
108 |                         # so we use as much data as possible as training labels.
109 |                         turn_kind = TurnKind.MODEL
110 |                         if _not_usable_as_training_label(cleaned_message):
111 |                             # ...however, if we have some problem in the data that
112 |                             # we'd rather not see the model replicate, we mark it
113 |                             # as a human turn, which is used as context but not for
114 |                             # loss calculation during training.
115 |                             turn_kind = TurnKind.USER
116 |                         elif _seems_to_have_ooc_talk(cleaned_message) \
117 |                                 and not _seems_to_have_ooc_talk(turns[-1].utterance):
118 |                             # _However_, there's also another case we'd like to
119 |                             # handle. Ideally, the model should not slip into OOC
120 |                             # talk unprompted - it should only do that if we've
121 |                             # tried to talk to it out-of-character first.
122 |                             #
123 |                             # So if this turn has OOC talk, we'll only use it as a
124 |                             # model turn if the previous (user) turn also had OOC
125 |                             # talk.
126 |                             turn_kind = TurnKind.USER
127 |                     else:
128 |                         # TODO(TG): Try to do more about OOC/potential low-quality generations.
129 |                         turn_kind = current_speaker
130 | 
131 |                     turn = Turn(utterance=cleaned_message, kind=turn_kind)
132 |                     turns.append(turn)
133 | 
134 |                     # Messy switching.
135 |                     current_speaker = TurnKind.MODEL if current_speaker == TurnKind.USER else TurnKind.USER
136 | 
137 |             yield Episode(
138 |                 turns=turns,
139 |                 identifier=f"rp-{thread.source_file}-{thread.thread_name}",
140 |             )
141 | 
142 | 
143 | def _split_message(original_message: str, target_word_count: int,
144 |                    delimiter: str) -> list[str]:
145 |     '''
146 |     Splits a large message into smaller ones, respecting the given delimiter.
147 |     '''
148 |     messages = original_message.split(delimiter)
149 |     reconstructed_messages: list[str] = [messages[0]]
150 | 
151 |     # For each split message, we see if we can merge it back up together with
152 |     # the next one while still staying under the target word count.
153 |     for message in messages[1:]:
154 |         last_message_word_count = len(reconstructed_messages[-1].split()) \
155 |             if len(reconstructed_messages) else 0
156 |         current_message_word_count = len(message.split())
157 | 
158 |         if last_message_word_count + current_message_word_count > target_word_count:
159 |             # If we can't, we just add it as a separate message to start merging
160 |             # from scratch.
161 |             reconstructed_messages.append(message)
162 |         else:
163 |             # Otherwise, we merge it into the current message.
164 | reconstructed_messages[-1] += delimiter + message 165 | 166 | return reconstructed_messages 167 | 168 | 169 | def _fix_style_and_encoding_issues(original_message: str) -> str: 170 | '''Cleans up any style-related issues.''' 171 | message = original_message 172 | message = message.replace(" .. ", "... ") 173 | message = message.replace(" ... ", "... ") 174 | message = re.sub(r'\b(\.\.\.?)\b', '... ', message) 175 | 176 | message = message.replace(" . ", ". ") 177 | message = message.replace(" , ", ", ") 178 | message = message.replace(" ? ", "? ") 179 | message = message.replace(" ! ", "! ") 180 | 181 | message = re.sub(r"(\S)(…)(\S)", "\\1\\2 \\3", message) 182 | 183 | # Some forums have their pages incorrectly tagged as UTF-8, so we get 184 | # garbage when decoding. Most common problem I've seen is bad quotation 185 | # marks, so we paper over that here. 186 | message = message.replace("â??", "'") 187 | message = message.replace("â?", "'") 188 | 189 | message = message.replace("", " ") 190 | 191 | return message 192 | 193 | 194 | def _remove_links(original_message: str) -> str: 195 | '''Removes any links from the given message, due to privacy concerns.''' 196 | return re.sub(r"https?:\/\/.+?(\s|$)", "", original_message) 197 | 198 | 199 | def _remove_trailing_whitespace_and_bad_lines(original_message: str) -> str: 200 | lines: list[str] = [] 201 | for line in original_message.splitlines(): 202 | # Trailing whitespace is always useless. 203 | line = line.rstrip() 204 | 205 | # Sometimes, users start their messages with "RE: (thread title, which 206 | # leaks usernames)" so we skip that here. 207 | if line.startswith("RE: ") or line.startswith("**RE: "): 208 | continue 209 | 210 | lines.append(line) 211 | 212 | return "\n".join(lines) 213 | 214 | 215 | def _not_usable_as_training_label(message: str) -> bool: 216 | ''' 217 | Whether or not the message contains some problem that we can't fix reliably, 218 | and we're better off not training on. 219 | ''' 220 | 221 | # "Floating" quotation marks. 222 | if re.search(r'\b " \b', message) is not None: 223 | return True 224 | 225 | # Quotation marks mushed together with text. 226 | if re.search(r'\S"\S', message) is not None: 227 | return True 228 | 229 | # Parenthesis mushed together with text. 230 | if re.search(r'\S\(', message) is not None \ 231 | or re.search(r'\)\S', message) is not None: 232 | return True 233 | 234 | # Lowercase "I". Fixable, but a sign of low-quality writing so I'd rather 235 | # not train the model on these. 236 | if re.search(r"\bi('m|'ll)?\b", message) is not None: 237 | return True 238 | 239 | # Links. 240 | if re.search(r"\[.+\]\(\S+\)", message) is not None: 241 | return True 242 | 243 | return False 244 | 245 | 246 | def _fix_markdown(original_message: str) -> str: 247 | s = original_message 248 | 249 | # Bold/italics sometimes doesn't have spaces around it after converting from 250 | # HTML to Markdown for some reason. 
251 |     is_opening_asterisk = True
252 |     while (match := re.search(r"([\w\d])(\*{1,2})([\w\d])", s)) is not None:
253 |         if is_opening_asterisk:
254 |             s = s[:match.start() + 1] + " " + s[match.start() + 1:]
255 |         else:
256 |             s = s[:match.end() - 1] + " " + s[match.end() - 1:]
257 |         is_opening_asterisk = not is_opening_asterisk
258 | 
259 |     return s
260 | 
261 | 
262 | def _remove_bad_html_tags(message: str) -> str:
263 |     '''Cleans up HTML tags we don't want from the given message.'''
264 |     cleaned_message = _remove_html_tag(message, "blockquote")
265 |     cleaned_message = _remove_html_tag(cleaned_message, "script")
266 | 
267 |     if "bbImageWrapper" in message:
268 |         # Images are a <div> with some JavaScript to lazy-load them, so we do
269 |         # this behind a guard to reduce false positives just in case.
270 |         cleaned_message = _remove_html_tag(cleaned_message, "div")
271 | 
272 |     return cleaned_message
273 | 
274 | 
275 | def _remove_html_tag(message: str, tag: str) -> str:
276 |     '''Cleans the given HTML tag from the message.'''
277 |     cleaned_message = message
278 |     cleaning_passes = 0
279 | 
280 |     while f"<{tag}" in cleaned_message:
281 |         assert cleaning_passes < 4, "Too many cleaning passes, giving up to avoid deadlocking"
282 |         cleaning_passes += 1
283 |         start_idx = cleaned_message.find(f"<{tag}")
284 |         end_idx = cleaned_message.find(f"</{tag}>", start_idx)
285 | 
286 |         if start_idx == -1 or end_idx == -1:
287 |             LOG.warning("Unbalanced tags found, leaving as-is")
288 |             break
289 | 
290 |         cleaned_message = cleaned_message[:start_idx] + cleaned_message[
291 |             end_idx + len(f"</{tag}>"):]
292 | 
293 |     return cleaned_message
294 | 
295 | 
296 | def _seems_to_have_ooc_talk(message: str) -> bool:
297 |     '''Returns whether a message seems to have some out-of-character talk.'''
298 |     return re.search(_OOC_REGEX, message) is not None
299 | 
300 | 
301 | _OOC_REGEX = re.compile(r"^\((OOC: ?)?.+\)$", flags=re.MULTILINE)
302 | 
303 | _BASE_SYSTEM_PROMPTS = [
304 |     '''%{Enter|Engage|Enable|Start} %{fiction writing|fantasy writing|fantasy roleplay|fictional RP|roleplay|RP} mode. {{content_type_str}}. {{response_length_str}}.''',
305 |     #
306 |     '''You %{are now in|have entered|will now start} %{fiction writing|fantasy writing|fantasy roleplay|fictional RP|roleplay|RP|conversational RP} mode. Drive the story forward in chunks. {{response_length_str}}.''',
307 |     #
308 |     '''You are trained to %{perform|generate} %{storywriting|fiction writing|fantasy writing|fantasy roleplay|fictional roleplay|RP}. Generate continuations for whatever the user gives. {{response_length_str}}. {{content_type_str}}.''',
309 |     # Modified SillyTavern prompt
310 |     '''Write the next reply in a fictional %{roleplay|RP} %{chat|conversation}. {{content_type_str}}. {{response_length_str}}.''',
311 |     #
312 |     '''%{SYSTEM|MODE}: %{conversational roleplay|RP|roleplay mode|RP system engaged}
313 | %{NOTE|ADVISORY|KEEP IN MIND}: {{response_length_str}}''',
314 |     #
315 |     '''I am %{in|currently in|engaging in|beginning} a %{roleplay|RP|fictional roleplay-like conversation} with %{someone else|other people|a user}.''',
316 |     #
317 |     '''{{content_type_str}}. {{response_length_str}}.''',
318 |     #
319 |     '''%{OBJECTIVE|TASK|MISSION|JOB} - %{Conduct|Generate|Enjoy} a %{roleplay session|RP|fictional roleplay}
320 | %{DISCRETION RATING|SAFE FOR WORK?|CONTENT RATING} - {{content_type_str}}
321 | %{REMEMBER|NOTE} - {{response_length_str}}''',
322 |     # Misspellings intentional
323 |     '''%{do|make|have} %{rp adventures|writing|creative roleplay}
324 | %{pls|please} %{rember|remember} to %{b|be} %{engaging|immersive|epic}''',
325 |     #
326 |     "%{roleplay|RP}",
327 |     ""
328 | ]
329 | 
330 | SYSTEM_PROMPTS = generate_prompts(_BASE_SYSTEM_PROMPTS)
331 | 
332 | SFW_PROMPTS = generate_prompts([
333 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be safe for work|be SFW|not include any adult themes|be safe for minors|not include 18+ content|not be 18+|not be NSFW}",
334 | ])
335 | 
336 | MIXED_SFW_NSFW_PROMPTS = generate_prompts([
337 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} %{may or may not include adult themes|may or may not be NSFW|can include adult themes}",
338 | ])
339 | 
340 | NSFW_PROMPTS = generate_prompts([
341 |     "%{Generations|Your writing|The generated response|Your reply|Generated replies} must %{be not safe for work|be NSFW|include adult themes|include erotic themes|include 18+ content}",
342 | ])
343 | 
344 | CONTENT_TYPE_TO_PROMPTS: dict[RpType, list[str]] = {
345 |     RpType.RP: SFW_PROMPTS,
346 |     RpType.ERP: NSFW_PROMPTS,
347 |     RpType.MIXED: MIXED_SFW_NSFW_PROMPTS,
348 | }
349 | 
--------------------------------------------------------------------------------
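
A minimal usage sketch, to show how the pieces above fit together: tasks yield `Episode`s, and `TrainingExampleGenerator` flattens each episode into prompt/generation pairs. This is not a file from the repository - the real wiring lives in `scripts/build.py` (not shown here) - and the `build_examples` helper, the JSONL output format, and the `data/rp_forums.jsonl` path are illustrative assumptions only.

import json

from toolbox.core.training_example import (TrainingExampleGenerator,
                                           TurnTooLargeError)
from toolbox.tasks.rp_forums_writing import RpForumsWritingTask


def build_examples(output_path: str = "data/rp_forums.jsonl") -> None:
    '''Hypothetical driver; the real entry point is scripts/build.py.'''
    with open(output_path, "w", encoding="utf-8") as output_file:
        # Iterating a task yields cleaned-up Episodes.
        for episode in RpForumsWritingTask():
            try:
                # The generator slides a ~2048-token window over the turns and
                # yields one TrainingExample per model turn.
                for example in TrainingExampleGenerator(episode,
                                                        target_token_count=2048,
                                                        format="metharme"):
                    output_file.write(
                        json.dumps({
                            "prompt": example.prompt,
                            "generation": example.generation,
                            "identifier": example.identifier,
                        }) + "\n")
            except TurnTooLargeError:
                # A single turn exceeded the context window; skip the episode.
                continue


if __name__ == "__main__":
    build_examples()

Any other task class (e.g. `RpGuildWritingTask`) or any format name present in `VALID_FORMATS` (the "metharme" default above, or a ChatML-style format) should slot in the same way.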