├── .gitattributes
├── assets
    └── workflow.png
├── bonito
    ├── __init__.py
    ├── model.py
    └── abstract.py
├── tests
    └── test_model.py
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── tutorials
    ├── README.md
    └── Bonito_Tutorial_with_A100.ipynb
├── setup.py
├── LICENSE
├── .gitignore
└── README.md


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-detectable=false
2 | 


--------------------------------------------------------------------------------
/assets/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BatsResearch/bonito/HEAD/assets/workflow.png


--------------------------------------------------------------------------------
/bonito/__init__.py:
--------------------------------------------------------------------------------
1 | from .abstract import AbstractBonito
2 | from .model import Bonito
3 | 


--------------------------------------------------------------------------------
/tests/test_model.py:
--------------------------------------------------------------------------------
1 | def test_model_import():
2 |     """
3 |     Test that the Bonito class can be imported.
4 |     """
5 |     from bonito import Bonito
6 | 
7 |     assert Bonito is not None
8 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | default_language_version:
 2 |   python: python3
 3 | 
 4 | repos:
 5 |   - repo: https://github.com/psf/black
 6 |     rev: 22.8.0
 7 |     hooks:
 8 |       - id: black
 9 |       - id: black-jupyter
10 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | If you notice any bugs, feel free to open an issue. If you want to contribute, please open a pull request.
 4 | 
 5 | ## Formatting
 6 | The repo uses [black](https://github.com/psf/black) and [isort](https://github.com/PyCQA/isort) for code formatting.
 7 | Please set up a pre-commit hook to format the code before each commit.
 8 | It helps to minimize the diffs and avoid formatting commits.
 9 | 
10 | Run the following to install the hooks using [pre-commit](https://pre-commit.com/).
11 | 
12 | ```bash
13 | pip install pre-commit
14 | pre-commit install
15 | ```
16 | 


--------------------------------------------------------------------------------
/tutorials/README.md:
--------------------------------------------------------------------------------
 1 | # Tutorials
 2 | 
 3 | Tutorials for Bonito will be located in this directory.
 4 | 
 5 |  - `Quantized_Bonito_Tutorial.ipynb`
 6 |     - This tutorial demonstrates how to run a quantized version of Bonito using the `transformers` package without `vllm`.
 7 |       This can run in a [Google Colab T4 instance](https://colab.research.google.com/drive/12OCh4OYo1vr9ZvwIWK4JwZT7rkMrYrx2?usp=sharing).
 8 | 
 9 |  - `Bonito_Tutorial_with_A100.ipynb`
10 |     - This tutorial demonstrates how to run Bonito on NVIDIA A100 GPU.
11 |       This can run in a [Google Colab A100 instance](https://colab.research.google.com/drive/1XuDRVKpUUqdjrqg2-P2FIqkdAQBnqoNL?usp=sharing).
12 |       Google Colab A100 instances are available to users with a Google Colab Pro subscription.
13 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from setuptools import setup, find_packages
 3 | 
 4 | requirements = [
 5 |     "transformers",
 6 |     "datasets",
 7 |     "vllm",
 8 | ]
 9 | 
10 | this_directory = os.path.abspath(os.path.dirname(__file__))
11 | with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
12 |     long_description = f.read()
13 | 
14 | setup(
15 |     name="bonito-llm",
16 |     version="0.1.0",
17 |     author="Nihal Nayak, Yiyang Nan, Avi Trost, and Stephen Bach",
18 |     author_email="nnayak2@cs.brown.edu",
19 |     license="BSD-3-Clause",
20 |     url="https://github.com/BatsResearch/bonito",
21 |     python_requires=">=3.9",
22 |     install_requires=requirements,
23 |     classifiers=[
24 |         "License :: OSI Approved :: BSD License",
25 |         "Programming Language :: Python :: 3.9",
26 |         "Programming Language :: Python :: 3.10",
27 |         "Programming Language :: Python :: 3.11",
28 |         "Programming Language :: Python :: 3.12",
29 |     ],
30 |     description=(
31 |         "A lightweight library for generating synthetic instruction tuning "
32 |         "datasets for your data without GPT."
33 |     ),
34 |     long_description=long_description,
35 |     long_description_content_type="text/markdown",
36 |     packages=find_packages(),
37 | )
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2024, Brown University
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/bonito/model.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset
 2 | from vllm import LLM, SamplingParams
 3 | from .abstract import AbstractBonito
 4 | 
 5 | 
 6 | class Bonito(LLM, AbstractBonito):
 7 |     def generate_tasks(
 8 |         self,
 9 |         text_dataset: Dataset,
10 |         context_col: str,
11 |         task_type: str,
12 |         sampling_params: SamplingParams,
13 |         **kwargs,
14 |     ):
15 |         """
16 |         Generates tasks using the Bonito model.
17 | 
18 |         This method takes a text dataset, a context column name,
19 |         a task type, and sampling parameters, and generates tasks
20 |         using the Bonito model. It processes the input dataset,
21 |         generates outputs, collects multiple generations into
22 |         one dataset object, and filters out the examples that
23 |         cannot be parsed.
24 | 
25 |         Args:
26 |             text_dataset (Dataset): The dataset that provides the text
27 |                 for the tasks.
28 |             context_col (str): The name of the column in the dataset
29 |                 that provides the context for the tasks.
30 |             task_type (str): The type of the tasks. This can be a
31 |                 short form or a full form.
32 |             sampling_params (SamplingParams): The parameters for
33 |                 sampling.
34 |             **kwargs: Additional keyword arguments.
35 | 
36 |         Returns:
37 |             Dataset: The synthetic dataset with the generated tasks.
38 |         """
39 |         processed_dataset = self._prepare_bonito_input(
40 |             text_dataset, task_type, context_col, **kwargs
41 |         )
42 |         outputs = self.generate(processed_dataset["input"], sampling_params)
43 | 
44 |         # collect multiple generations into one dataset object
45 |         examples = []
46 |         for i, example in enumerate(text_dataset.to_list()):
47 |             for output in outputs[i].outputs:
48 |                 examples.append(
49 |                     {"context": example[context_col], "prediction": output.text.strip()}
50 |                 )
51 | 
52 |         synthetic_dataset = Dataset.from_list(examples)
53 | 
54 |         # filter out the examples that cannot be parsed
55 |         synthetic_dataset = self._postprocess_dataset(
56 |             synthetic_dataset, context_col="context", **kwargs
57 |         )
58 | 
59 |         return synthetic_dataset
60 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bonito
 2 | 
 3 | Bonito is an open-source model for conditional task generation: the task of converting unannotated text into task-specific training datasets for instruction tuning. This repo is a lightweight library for Bonito to easily create synthetic datasets built on top of the Hugging Face `transformers` and `vllm` libraries.
 4 | 
 5 | - Paper: [Learning to Generate Instruction Tuning Datasets for
 6 | Zero-Shot Task Adaptation](https://arxiv.org/abs/2402.18334)
 7 | - Model: [bonito-v1](https://huggingface.co/BatsResearch/bonito-v1)
 8 | - Demo: [Bonito on Spaces](https://huggingface.co/spaces/nihalnayak/bonito)
 9 | - Dataset: [ctga-v1](https://huggingface.co/datasets/BatsResearch/ctga-v1)
10 | - Code: To reproduce experiments in our paper, see [nayak-aclfindings24-code](https://github.com/BatsResearch/nayak-aclfindings24-code).
11 | 
12 | ![Bonito](https://nihalnayak.github.io/assets/img/workflow.png)
13 | 
14 | ## News
15 | - 🐠 February 2025: Uploaded `bonito-llm` to PyPI.
16 | - 🐡 August 2024: Released [new Bonito model](https://huggingface.co/BatsResearch/Llama-3.1-8B-bonito-v1) with Meta Llama 3.1 as the base model.
17 | - 🐟 June 2024: Bonito is accepted to ACL Findings 2024.
18 | 
19 | ## Installation
20 | Create an environment and install the package using the following command:
21 | ```bash
22 | pip3 install bonito-llm
23 | ```
24 | 
25 | ## Basic Usage
26 | To generate a synthetic instruction tuning dataset using Bonito, you can use the following code:
27 | ```python
28 | from bonito import Bonito
29 | from vllm import SamplingParams
30 | from datasets import load_dataset
31 | 
32 | # Initialize the Bonito model
33 | bonito = Bonito("BatsResearch/bonito-v1")
34 | 
35 | # load dataset with unannotated text
36 | unannotated_text = load_dataset(
37 |     "BatsResearch/bonito-experiment",
38 |     "unannotated_contract_nli"
39 | )["train"].select(range(10))
40 | 
41 | # Generate synthetic instruction tuning dataset
42 | sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.5, n=1)
43 | synthetic_dataset = bonito.generate_tasks(
44 |     unannotated_text,
45 |     context_col="input",
46 |     task_type="nli",
47 |     sampling_params=sampling_params
48 | )
49 | ```
50 | 
51 | ## Supported Task Types
52 | Here we include the supported task types [full name (short form)]: `extractive question answering` (`exqa`), `multiple-choice question answering` (`mcqa`), `question generation` (`qg`), `question answering without choices` (`qa`), `yes-no question answering` (`ynqa`), `coreference resolution` (`coref`), `paraphrase generation` (`paraphrase`), `paraphrase identification` (`paraphrase_id`), `sentence completion` (`sent_comp`), `sentiment` (`sentiment`), `summarization` (`summarization`), `text generation` (`text_gen`), `topic classification` (`topic_class`), `word sense disambiguation` (`wsd`), `textual entailment` (`te`), `natural language inference` (`nli`)
53 | 
54 | You can use either the full name or the short form to specify the `task_type` in `generate_tasks`.
55 | 
56 | ## Tutorial
57 | We have created a tutorial [here](https://colab.research.google.com/drive/12OCh4OYo1vr9ZvwIWK4JwZT7rkMrYrx2?usp=sharing) for how to use a quantized version of the model in a Google Colab T4 instance. The quantized version was graciously contributed by user [alexandreteles](https://github.com/alexandreteles).
58 | We have an additional tutorial to try out the Bonito model on A100 GPU on Google Colab [here](https://colab.research.google.com/drive/1XuDRVKpUUqdjrqg2-P2FIqkdAQBnqoNL?usp=sharing).
59 | 
60 | 
61 | ## Citation
62 | If you use Bonito in your research, please cite the following paper:
63 | ```
64 | @inproceedings{bonito:aclfindings24,
65 |   title = {Learning to Generate Instruction Tuning Datasets for Zero-Shot Task Adaptation},
66 |   author = {Nayak, Nihal V. and Nan, Yiyang and Trost, Avi and Bach, Stephen H.},
67 |   booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
68 |   year = {2024}}
69 | ```
70 | 


--------------------------------------------------------------------------------
/bonito/abstract.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, Optional, Union
  2 | from datasets import Dataset
  3 | 
  4 | # Maps each short-form task alias to the full task-type name used in the prompt.
  5 | SHORTFORM_TO_FULL_TASK_TYPES = {
  6 |     "exqa": "extractive question answering",
  7 |     "mcqa": "multiple-choice question answering",
  8 |     "qg": "question generation",
  9 |     "qa": "question answering without choices",
 10 |     "ynqa": "yes-no question answering",
 11 |     "coref": "coreference resolution",
 12 |     "paraphrase": "paraphrase generation",
 13 |     "paraphrase_id": "paraphrase identification",
 14 |     "sent_comp": "sentence completion",
 15 |     "sentiment": "sentiment",
 16 |     "summarization": "summarization",
 17 |     "text_gen": "text generation",
 18 |     "topic_class": "topic classification",
 19 |     "wsd": "word sense disambiguation",
 20 |     "te": "textual entailment",
 21 |     "nli": "natural language inference",
 22 | }
 23 | 
 24 | 
 25 | class AbstractBonito:
 26 |     def _prepare_bonito_input(
 27 |         self, context_dataset: Dataset, task_type: str, context_col: str, **kwargs
 28 |     ) -> Dataset:
 29 |         """
 30 |         Prepares the input for the Bonito model.
 31 | 
 32 |         This method takes a context dataset, a task type, and a context
 33 |         column name, and prepares the dataset for the Bonito model.
 34 |         If the task type is not recognized, it raises a ValueError.
 35 | 
 36 |         Args:
 37 |             context_dataset (Dataset): The dataset that provides the
 38 |                 context for the task.
 39 |             task_type (str): The type of the task. This can be a
 40 |                 short form or a full form. If the task type is not
 41 |                 recognized, a ValueError is raised.
 42 |             context_col (str): The name of the column in the dataset
 43 |                 that provides the context for the task.
 44 |             **kwargs: Additional keyword arguments.
 45 | 
 46 |         Returns:
 47 |             Dataset: The prepared dataset for the Bonito model.
 48 |         """
 49 |         # get the task type name
 50 |         if task_type in SHORTFORM_TO_FULL_TASK_TYPES.values():
 51 |             full_task_type = task_type
 52 |         elif task_type in SHORTFORM_TO_FULL_TASK_TYPES:
 53 |             full_task_type = SHORTFORM_TO_FULL_TASK_TYPES[task_type]
 54 |         else:
 55 |             raise ValueError(f"Task type {task_type} not recognized")
 56 | 
 57 |         def process(example):
 58 |             input_text = "<|tasktype|>\n" + full_task_type.strip()
 59 |             input_text += (
 60 |                 "\n<|context|>\n" + example[context_col].strip() + "\n<|task|>\n"
 61 |             )
 62 |             return {
 63 |                 "input": input_text,
 64 |             }
 65 | 
 66 |         return context_dataset.map(
 67 |             process,
 68 |             remove_columns=context_dataset.column_names,
 69 |             num_proc=kwargs.get("num_proc", 1),
 70 |         )
 71 | 
 72 |     def _postprocess_dataset(
 73 |         self, synthetic_dataset: Dataset, context_col: str, **kwargs
 74 |     ) -> Dataset:
 75 |         """
 76 |         Post-processes the synthetic dataset.
 77 | 
 78 |         This method takes a synthetic dataset and a context column
 79 |         name, and post-processes the dataset. It filters out
 80 |         examples where the prediction does not contain exactly two
 81 |         parts separated by "<|pipe|>", and then maps each example to a
 82 |         new format where the context is inserted into the first part of
 83 |         the prediction and the second part of the prediction is used as
 84 |         the output.
 85 | 
 86 |         Args:
 87 |             synthetic_dataset (Dataset): The synthetic dataset to be
 88 |                 post-processed.
 89 |             context_col (str): The name of the column in the dataset
 90 |                 that provides the context for the tasks.
 91 |             **kwargs: Additional keyword arguments.
 92 | 
 93 |         Returns:
 94 |             Dataset: The post-processed synthetic dataset.
 95 |         """
 96 |         synthetic_dataset = synthetic_dataset.filter(
 97 |             lambda example: len(example["prediction"].split("<|pipe|>")) == 2
 98 |         )
 99 | 
100 |         def process(example):
101 |             pair = example["prediction"].split("<|pipe|>")
102 |             context = example[context_col].strip()
103 |             return {
104 |                 "input": pair[0].strip().replace("{{context}}", context),
105 |                 "output": pair[1].strip().replace("{{context}}", context),
106 |             }
107 | 
108 |         synthetic_dataset = synthetic_dataset.map(
109 |             process,
110 |             remove_columns=synthetic_dataset.column_names,
111 |             num_proc=kwargs.get("num_proc", 1),
112 |         )
113 | 
114 |         return synthetic_dataset
115 | 


--------------------------------------------------------------------------------
/tutorials/Bonito_Tutorial_with_A100.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "id": "-K1cD9V8SDIG"
  7 |    },
  8 |    "source": [
  9 |     "# Bonito Tutorial with A100\n",
 10 |     "This tutorial runs Bonito on A100 GPUs to generate synthetic instruction tuning datasets.\n",
 11 |     "To use Bonito with A100 GPUs, you will need to purchase compute units from Google. The price starts from $9.99 for 100 compute units. See [pricing](https://colab.research.google.com/signup) for more details.\n",
 12 |     "\n",
 13 |     " If you are looking to run Bonito (for free) on the T4 GPUs, check our [quantized Bonito tutorial](https://colab.research.google.com/drive/1tfAqUsFaLWLyzhnd1smLMGcDXSzOwp9r?usp=sharing).\n",
 14 |     "\n"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {
 20 |     "id": "Gyh5HAFxQlaH"
 21 |    },
 22 |    "source": [
 23 |     "## Setup\n",
 24 |     "First we clone into the repo and install the dependencies. This will take several minutes."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {
 31 |     "id": "-lqD8IrM8Vo0"
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "!git clone https://github.com/BatsResearch/bonito.git\n",
 36 |     "!pip install -U bonito/"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {
 42 |     "id": "xWYY7FYfQyAD"
 43 |    },
 44 |    "source": [
 45 |     "## Load the Bonito Model\n",
 46 |     "Loads the weights from Huggingface into the Bonito class."
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {
 53 |     "id": "s5k0He_jiJeo"
 54 |    },
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "from bonito import Bonito\n",
 58 |     "\n",
 59 |     "bonito = Bonito(\"BatsResearch/bonito-v1\")"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {
 65 |     "id": "86OvwN74RcS8"
 66 |    },
 67 |    "source": [
 68 |     "## Synthetic Data Generation\n",
 69 |     "Here we first show how to use the Bonito model with a single unannotated text and then show how to generate an instruction tuning dataset from a small unannotated dataset.\n"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {
 75 |     "id": "FEAqk24gpoVO"
 76 |    },
 77 |    "source": [
 78 |     "### Single example"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 1,
 84 |    "metadata": {
 85 |     "colab": {
 86 |      "base_uri": "https://localhost:8080/"
 87 |     },
 88 |     "id": "cwlNfTKLCUDp",
 89 |     "outputId": "0933640c-35f8-4204-8433-df57abd9827a"
 90 |    },
 91 |    "outputs": [
 92 |     {
 93 |      "name": "stdout",
 94 |      "output_type": "stream",
 95 |      "text": [
 96 |       "('1. “Confidential Information”, whenever used in this Agreement, shall mean '\n",
 97 |       " 'any data, document, specification and other information or material, that is '\n",
 98 |       " 'delivered or disclosed by UNHCR to the Recipient in any form whatsoever, '\n",
 99 |       " 'whether orally, visually in writing or otherwise (including computerized '\n",
100 |       " 'form), and that, at the time of disclosure to the Recipient, is designated '\n",
101 |       " 'as confidential.')\n"
102 |      ]
103 |     }
104 |    ],
105 |    "source": [
106 |     "from pprint import pprint\n",
107 |     "\n",
108 |     "unannotated_paragraph = \"\"\"1. “Confidential Information”, whenever used in this Agreement, shall mean any data, document, specification and other information or material, that is delivered or disclosed by UNHCR to the Recipient in any form whatsoever, whether orally, visually in writing or otherwise (including computerized form), and that, at the time of disclosure to the Recipient, is designated as confidential.\"\"\"\n",
109 |     "pprint(unannotated_paragraph)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {
115 |     "id": "u_xYp60oCjVz"
116 |    },
117 |    "source": [
118 |     "Now generate a synthetic instruction pair for the unannotated paragraph."
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {
125 |     "id": "k4lreUPb0LUX"
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "from datasets import Dataset\n",
130 |     "from vllm import SamplingParams\n",
131 |     "from transformers import set_seed\n",
132 |     "\n",
133 |     "set_seed(2)\n",
134 |     "\n",
135 |     "\n",
136 |     "def convert_to_dataset(text):\n",
137 |     "    dataset = Dataset.from_list([{\"input\": text}])\n",
138 |     "    return dataset\n",
139 |     "\n",
140 |     "\n",
141 |     "sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.5, n=1)\n",
142 |     "synthetic_dataset = bonito.generate_tasks(\n",
143 |     "    convert_to_dataset(unannotated_paragraph),\n",
144 |     "    context_col=\"input\",\n",
145 |     "    task_type=\"nli\",\n",
146 |     "    sampling_params=sampling_params,\n",
147 |     ")\n",
148 |     "pprint(\"----Generated Instructions----\")\n",
149 |     "pprint(f'Input: {synthetic_dataset[0][\"input\"]}')\n",
150 |     "pprint(f'Output: {synthetic_dataset[0][\"output\"]}')"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {
156 |     "id": "2IFs82gLJJFk"
157 |    },
158 |    "source": [
159 |     "Now we change the task type from NLI (nli) to multiple choice question answering (mcqa). For more details, see [supported task types](https://github.com/BatsResearch/bonito?tab=readme-ov-file#supported-task-types)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {
166 |     "id": "CUtgkf8EJKxF"
167 |    },
168 |    "outputs": [],
169 |    "source": [
170 |     "set_seed(0)\n",
171 |     "sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.7, n=1)\n",
172 |     "synthetic_dataset = bonito.generate_tasks(\n",
173 |     "    convert_to_dataset(unannotated_paragraph),\n",
174 |     "    context_col=\"input\",\n",
175 |     "    task_type=\"mcqa\",  # changed\n",
176 |     "    sampling_params=sampling_params,\n",
177 |     ")\n",
178 |     "pprint(\"----Generated Instructions----\")\n",
179 |     "pprint(f'Input: {synthetic_dataset[0][\"input\"]}')\n",
180 |     "pprint(f'Output: {synthetic_dataset[0][\"output\"]}')"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {
186 |     "id": "mEU1lp5TVjGj"
187 |    },
188 |    "source": [
189 |     "### Small dataset\n",
190 |     "We select 10 unannotated samples from the ContractNLI dataset and convert them into an NLI instruction tuning dataset.\n"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "id": "qMrbj4dbC2Lm"
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "# load dataset with unannotated text\n",
202 |     "from datasets import load_dataset\n",
203 |     "\n",
204 |     "unannotated_dataset = load_dataset(\n",
205 |     "    \"BatsResearch/bonito-experiment\", \"unannotated_contract_nli\"\n",
206 |     ")[\"train\"].select(range(10))"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {
212 |     "id": "HKZEbZuiGMuZ"
213 |    },
214 |    "source": [
215 |     "Generate the synthetic NLI dataset."
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "metadata": {
222 |     "id": "52hWL50gDQnH"
223 |    },
224 |    "outputs": [],
225 |    "source": [
226 |     "# Generate synthetic instruction tuning dataset\n",
227 |     "from pprint import pprint\n",
228 |     "from vllm import SamplingParams\n",
229 |     "from transformers import set_seed\n",
230 |     "\n",
231 |     "set_seed(42)\n",
232 |     "\n",
233 |     "sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.5, n=1)\n",
234 |     "synthetic_dataset = bonito.generate_tasks(\n",
235 |     "    unannotated_dataset,\n",
236 |     "    context_col=\"input\",\n",
237 |     "    task_type=\"nli\",\n",
238 |     "    sampling_params=sampling_params,\n",
239 |     ")\n",
240 |     "pprint(\"----Generated Instructions----\")\n",
241 |     "pprint(f'Input: {synthetic_dataset[0][\"input\"]}')\n",
242 |     "pprint(f'Output: {synthetic_dataset[0][\"output\"]}')"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "markdown",
247 |    "metadata": {
248 |     "id": "fBDHJVXhIXyG"
249 |    },
250 |    "source": [
251 |     "Now go try it out with your own datasets! You can vary the `task_type` for different types of generated instructions.\n",
252 |     "You can also play around with the sampling hyperparameters such as `top_p` and `temperature`.\n"
253 |    ]
254 |   }
255 |  ],
256 |  "metadata": {
257 |   "accelerator": "GPU",
258 |   "colab": {
259 |    "gpuType": "V100",
260 |    "provenance": []
261 |   },
262 |   "kernelspec": {
263 |    "display_name": "Python 3",
264 |    "name": "python3"
265 |   },
266 |   "language_info": {
267 |    "name": "python"
268 |   }
269 |  },
270 |  "nbformat": 4,
271 |  "nbformat_minor": 0
272 | }
273 | 


--------------------------------------------------------------------------------