├── .gitattributes ├── assets └── workflow.png ├── bonito ├── __init__.py ├── model.py └── abstract.py ├── tests └── test_model.py ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── tutorials ├── README.md └── Bonito_Tutorial_with_A100.ipynb ├── setup.py ├── LICENSE ├── .gitignore └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-detectable=false 2 | -------------------------------------------------------------------------------- /assets/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BatsResearch/bonito/HEAD/assets/workflow.png -------------------------------------------------------------------------------- /bonito/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import AbstractBonito 2 | from .model import Bonito 3 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | def test_model_import(): 2 | """ 3 | Test that the Bonito class can be imported. 4 | """ 5 | from bonito import Bonito 6 | 7 | assert Bonito is not None 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | 4 | repos: 5 | - repo: https://github.com/psf/black 6 | rev: 22.8.0 7 | hooks: 8 | - id: black 9 | - id: black-jupyter 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | If you notice any bugs, feel free to open an issue. If you want to contribute, please open a pull request. 4 | 5 | ## Formatting 6 | The repo uses [black](https://github.com/psf/black) and [isort](https://github.com/PyCQA/isort) for code formatting. 7 | Please setup a pre-commit hook to format the code before each commit. 8 | It helps to minimize the diffs and avoid formatting commits. 9 | 10 | Run the following to install the hooks using [pre-commit](https://pre-commit.com/). 11 | 12 | ```bash 13 | pip install pre-commit 14 | pre-commit install 15 | ``` 16 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | # Tutorials 2 | 3 | Tutorials for Bonito will be located in this directory. 4 | 5 | - `Quantized_Bonito_Tutorial.ipynb` 6 | - This tutorial demonstrates how to run a quantized version of Bonito using the `transformers` package without `vllm`. 7 | This can run in a [Google Colab T4 instance](https://colab.research.google.com/drive/12OCh4OYo1vr9ZvwIWK4JwZT7rkMrYrx2?usp=sharing). 8 | 9 | - `Bonito_Tutorial_with_A100.ipynb.ipynb` 10 | - This tutorial demonstrates how to run Bonito on NVIDIA A100 GPU. 11 | This can run in a [Google Colab A100 instance](https://colab.research.google.com/drive/1XuDRVKpUUqdjrqg2-P2FIqkdAQBnqoNL?usp=sharing). 12 | Google Colab A100 instances are available to users with a Google Colab Pro subscription. 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | requirements = [ 5 | "transformers", 6 | "datasets", 7 | "vllm", 8 | ] 9 | 10 | this_directory = os.path.abspath(os.path.dirname(__file__)) 11 | with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f: 12 | long_description = f.read() 13 | 14 | setup( 15 | name="bonito-llm", 16 | version="0.1.0", 17 | author="Nihal Nayak, Yiyang Nan, Avi Trost, and Stephen Bach", 18 | author_email="nnayak2@cs.brown.edu", 19 | license="BSD-3-Clause", 20 | url="https://github.com/BatsResearch/bonito", 21 | python_requires=">=3.9", 22 | install_requires=requirements, 23 | classifiers=[ 24 | "License :: OSI Approved :: BSD License", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12", 29 | ], 30 | description=( 31 | "A lightweight library for generating synthetic instruction tuning " 32 | "datasets for your data without GPT." 33 | ), 34 | long_description=long_description, 35 | long_description_content_type="text/markdown", 36 | packages=find_packages(), 37 | ) 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, Brown University 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /bonito/model.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | from vllm import LLM, SamplingParams 3 | from .abstract import AbstractBonito 4 | 5 | 6 | class Bonito(LLM, AbstractBonito): 7 | def generate_tasks( 8 | self, 9 | text_dataset: Dataset, 10 | context_col: str, 11 | task_type: str, 12 | sampling_params: SamplingParams, 13 | **kwargs, 14 | ): 15 | """ 16 | Generates tasks using the Bonito model. 17 | 18 | This method takes a text dataset, a context column name, 19 | a task type, and sampling parameters, and generates tasks 20 | using the Bonito model. It processes the input dataset, 21 | generates outputs, collects multiple generations into 22 | one dataset object, and filters out the examples that 23 | cannot be parsed. 24 | 25 | Args: 26 | text_dataset (Dataset): The dataset that provides the text 27 | for the tasks. 28 | context_col (str): The name of the column in the dataset 29 | that provides the context for the tasks. 30 | task_type (str): The type of the tasks. This can be a 31 | short form or a full form. 32 | sampling_params (SamplingParams): The parameters for 33 | sampling. 34 | **kwargs: Additional keyword arguments. 35 | 36 | Returns: 37 | Dataset: The synthetic dataset with the generated tasks. 38 | """ 39 | processed_dataset = self._prepare_bonito_input( 40 | text_dataset, task_type, context_col, **kwargs 41 | ) 42 | outputs = self.generate(processed_dataset["input"], sampling_params) 43 | 44 | # collect multiple generations into one dataset object 45 | examples = [] 46 | for i, example in enumerate(text_dataset.to_list()): 47 | for output in outputs[i].outputs: 48 | examples.append( 49 | {"context": example[context_col], "prediction": output.text.strip()} 50 | ) 51 | 52 | synthetic_dataset = Dataset.from_list(examples) 53 | 54 | # filter out the examples that cannot be parsed 55 | synthetic_dataset = self._postprocess_dataset( 56 | synthetic_dataset, context_col="context", **kwargs 57 | ) 58 | 59 | return synthetic_dataset 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bonito 2 | 3 | Bonito is an open-source model for conditional task generation: the task of converting unannotated text into task-specific training datasets for instruction tuning. This repo is a lightweight library for Bonito to easily create synthetic datasets built on top of the Hugging Face `transformers` and `vllm` libraries. 4 | 5 | - Paper: [Learning to Generate Instruction Tuning Datasets for 6 | Zero-Shot Task Adaptation](https://arxiv.org/abs/2402.18334) 7 | - Model: [bonito-v1](https://huggingface.co/BatsResearch/bonito-v1) 8 | - Demo: [Bonito on Spaces](https://huggingface.co/spaces/nihalnayak/bonito) 9 | - Dataset: [ctga-v1](https://huggingface.co/datasets/BatsResearch/ctga-v1) 10 | - Code: To reproduce experiments in our paper, see [nayak-aclfindings24-code](https://github.com/BatsResearch/nayak-aclfindings24-code). 11 | 12 | ![Bonito](https://nihalnayak.github.io/assets/img/workflow.png) 13 | 14 | ## News 15 | - 🐠 February 2025: Uploaded `bonito-llm` to PyPI. 16 | - 🐡 August 2024: Released [new Bonito model](https://huggingface.co/BatsResearch/Llama-3.1-8B-bonito-v1) with Meta Llama 3.1 as the base model. 17 | - 🐟 June 2024: Bonito is accepted to ACL Findings 2024. 18 | 19 | ## Installation 20 | Create an environment and install the package using the following command: 21 | ```bash 22 | pip3 install bonito-llm 23 | ``` 24 | 25 | ## Basic Usage 26 | To generate synthetic instruction tuning dataset using Bonito, you can use the following code: 27 | ```python 28 | from bonito import Bonito 29 | from vllm import SamplingParams 30 | from datasets import load_dataset 31 | 32 | # Initialize the Bonito model 33 | bonito = Bonito("BatsResearch/bonito-v1") 34 | 35 | # load dataset with unannotated text 36 | unannotated_text = load_dataset( 37 | "BatsResearch/bonito-experiment", 38 | "unannotated_contract_nli" 39 | )["train"].select(range(10)) 40 | 41 | # Generate synthetic instruction tuning dataset 42 | sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.5, n=1) 43 | synthetic_dataset = bonito.generate_tasks( 44 | unannotated_text, 45 | context_col="input", 46 | task_type="nli", 47 | sampling_params=sampling_params 48 | ) 49 | ``` 50 | 51 | ## Supported Task Types 52 | Here we include the supported task types [full name (short form)]: `extractive question answering` (`exqa`), `multiple-choice question answering` (`mcqa`), `question generation` (`qg`), `question answering without choices` (`qa`), `yes-no question answering` (`ynqa`), `coreference resolution` (`coref`), `paraphrase generation` (`paraphrase`), `paraphrase identification` (`paraphrase_id`), `sentence completion` (`sent_comp`), `sentiment` (`sentiment`), `summarization` (`summarization`), `text generation` (`text_gen`), `topic classification` (`topic_class`), `word sense disambiguation` (`wsd`), `textual entailment` (`te`), `natural language inference` (`nli`) 53 | 54 | You can use either the full name or the short form to specify the `task_type` in `generate_tasks`. 55 | 56 | ## Tutorial 57 | We have created a tutorial [here](https://colab.research.google.com/drive/12OCh4OYo1vr9ZvwIWK4JwZT7rkMrYrx2?usp=sharing) for how to use a quantized version of the model in a Google Colab T4 instance. The quantized version was graciously contributed by user [alexandreteles](https://github.com/alexandreteles). 58 | We have an additional tutorial to try out the Bonito model on A100 GPU on Google Colab [here](https://colab.research.google.com/drive/1XuDRVKpUUqdjrqg2-P2FIqkdAQBnqoNL?usp=sharing). 59 | 60 | 61 | ## Citation 62 | If you use Bonito in your research, please cite the following paper: 63 | ``` 64 | @inproceedings{bonito:aclfindings24, 65 | title = {Learning to Generate Instruction Tuning Datasets for Zero-Shot Task Adaptation}, 66 | author = {Nayak, Nihal V. and Nan, Yiyang and Trost, Avi and Bach, Stephen H.}, 67 | booktitle = {Findings of the Association for Computational Linguistics: ACL 2024}, 68 | year = {2024}} 69 | ``` 70 | -------------------------------------------------------------------------------- /bonito/abstract.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Union 2 | from datasets import Dataset 3 | 4 | 5 | SHORTFORM_TO_FULL_TASK_TYPES = { 6 | "exqa": "extractive question answering", 7 | "mcqa": "multiple-choice question answering", 8 | "qg": "question generation", 9 | "qa": "question answering without choices", 10 | "ynqa": "yes-no question answering", 11 | "coref": "coreference resolution", 12 | "paraphrase": "paraphrase generation", 13 | "paraphrase_id": "paraphrase identification", 14 | "sent_comp": "sentence completion", 15 | "sentiment": "sentiment", 16 | "summarization": "summarization", 17 | "text_gen": "text generation", 18 | "topic_class": "topic classification", 19 | "wsd": "word sense disambiguation", 20 | "te": "textual entailment", 21 | "nli": "natural language inference", 22 | } 23 | 24 | 25 | class AbstractBonito: 26 | def _prepare_bonito_input( 27 | self, context_dataset: Dataset, task_type: str, context_col: str, **kwargs 28 | ) -> Dataset: 29 | """ 30 | Prepares the input for the Bonito model. 31 | 32 | This method takes a context dataset, a task type, and a context 33 | column name, and prepares the dataset for the Bonito model. 34 | If the task type is not recognized, it raises a ValueError. 35 | 36 | Args: 37 | context_dataset (Dataset): The dataset that provides the 38 | context for the task. 39 | task_type (str): The type of the task. This can be a 40 | short form or a full form. If the task type is not 41 | recognized, a ValueError is raised. 42 | context_col (str): The name of the column in the dataset 43 | that provides the context for the task. 44 | **kwargs: Additional keyword arguments. 45 | 46 | Returns: 47 | Dataset: The prepared dataset for the Bonito model. 48 | """ 49 | # get the task type name 50 | if task_type in SHORTFORM_TO_FULL_TASK_TYPES.values(): 51 | full_task_type = task_type 52 | elif task_type in SHORTFORM_TO_FULL_TASK_TYPES: 53 | full_task_type = SHORTFORM_TO_FULL_TASK_TYPES[task_type] 54 | else: 55 | raise ValueError(f"Task type {task_type} not recognized") 56 | 57 | def process(example): 58 | input_text = "<|tasktype|>\n" + full_task_type.strip() 59 | input_text += ( 60 | "\n<|context|>\n" + example[context_col].strip() + "\n<|task|>\n" 61 | ) 62 | return { 63 | "input": input_text, 64 | } 65 | 66 | return context_dataset.map( 67 | process, 68 | remove_columns=context_dataset.column_names, 69 | num_proc=kwargs.get("num_proc", 1), 70 | ) 71 | 72 | def _postprocess_dataset( 73 | self, synthetic_dataset: Dataset, context_col: str, **kwargs 74 | ) -> Dataset: 75 | """ 76 | Post-processes the synthetic dataset. 77 | 78 | This method takes a synthetic dataset and a context column 79 | name, and post-processes the dataset. It filters out 80 | examples where the prediction does not contain exactly two 81 | parts separated by "<|pipe|>", and then maps each example to a 82 | new format where the context is inserted into the first part of 83 | the prediction and the second part of the prediction is used as 84 | the output. 85 | 86 | Args: 87 | synthetic_dataset (Dataset): The synthetic dataset to be 88 | post-processed. 89 | context_col (str): The name of the column in the dataset 90 | that provides the context for the tasks. 91 | **kwargs: Additional keyword arguments. 92 | 93 | Returns: 94 | Dataset: The post-processed synthetic dataset. 95 | """ 96 | synthetic_dataset = synthetic_dataset.filter( 97 | lambda example: len(example["prediction"].split("<|pipe|>")) == 2 98 | ) 99 | 100 | def process(example): 101 | pair = example["prediction"].split("<|pipe|>") 102 | context = example[context_col].strip() 103 | return { 104 | "input": pair[0].strip().replace("{{context}}", context), 105 | "output": pair[1].strip().replace("{{context}}", context), 106 | } 107 | 108 | synthetic_dataset = synthetic_dataset.map( 109 | process, 110 | remove_columns=synthetic_dataset.column_names, 111 | num_proc=kwargs.get("num_proc", 1), 112 | ) 113 | 114 | return synthetic_dataset 115 | -------------------------------------------------------------------------------- /tutorials/Bonito_Tutorial_with_A100.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "-K1cD9V8SDIG" 7 | }, 8 | "source": [ 9 | "# Bonito Tutorial with A100\n", 10 | "This tutorial runs Bonito on A100 GPUs to generate synthetic instruction tuning datasets.\n", 11 | "To use Bonito with A100 GPUs, you will need to purchase compute units from Google. The price starts from $9.99 for 100 compute units. See [pricing](https://colab.research.google.com/signup) for more details.\n", 12 | "\n", 13 | " If you are looking to run Bonito (for free) on the T4 GPUs, check our [quantized Bonito tutorial](https://colab.research.google.com/drive/1tfAqUsFaLWLyzhnd1smLMGcDXSzOwp9r?usp=sharing).\n", 14 | "\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "Gyh5HAFxQlaH" 21 | }, 22 | "source": [ 23 | "## Setup\n", 24 | "First we clone into the repo and install the dependencies. This will take several minutes." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "id": "-lqD8IrM8Vo0" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "!git clone https://github.com/BatsResearch/bonito.git\n", 36 | "!pip install -U bonito/" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "xWYY7FYfQyAD" 43 | }, 44 | "source": [ 45 | "## Load the Bonito Model\n", 46 | "Loads the weights from Huggingface into the Bonito class." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "id": "s5k0He_jiJeo" 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from bonito import Bonito\n", 58 | "\n", 59 | "bonito = Bonito(\"BatsResearch/bonito-v1\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "86OvwN74RcS8" 66 | }, 67 | "source": [ 68 | "## Synthetic Data Generation\n", 69 | "Here we first show how to use the Bonito model with an unannotated text and then show how to generate instruction tuning dataset with a small unannotated dataset.\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "FEAqk24gpoVO" 76 | }, 77 | "source": [ 78 | "### Single example" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 1, 84 | "metadata": { 85 | "colab": { 86 | "base_uri": "https://localhost:8080/" 87 | }, 88 | "id": "cwlNfTKLCUDp", 89 | "outputId": "0933640c-35f8-4204-8433-df57abd9827a" 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "('1. “Confidential Information”, whenever used in this Agreement, shall mean '\n", 97 | " 'any data, document, specification and other information or material, that is '\n", 98 | " 'delivered or disclosed by UNHCR to the Recipient in any form whatsoever, '\n", 99 | " 'whether orally, visually in writing or otherwise (including computerized '\n", 100 | " 'form), and that, at the time of disclosure to the Recipient, is designated '\n", 101 | " 'as confidential.')\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "from pprint import pprint\n", 107 | "\n", 108 | "unannotated_paragraph = \"\"\"1. “Confidential Information”, whenever used in this Agreement, shall mean any data, document, specification and other information or material, that is delivered or disclosed by UNHCR to the Recipient in any form whatsoever, whether orally, visually in writing or otherwise (including computerized form), and that, at the time of disclosure to the Recipient, is designated as confidential.\"\"\"\n", 109 | "pprint(unannotated_paragraph)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "u_xYp60oCjVz" 116 | }, 117 | "source": [ 118 | "Now generate a pair of synthetic instruction for unannotated paragraph." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "id": "k4lreUPb0LUX" 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "from datasets import Dataset\n", 130 | "from vllm import SamplingParams\n", 131 | "from transformers import set_seed\n", 132 | "\n", 133 | "set_seed(2)\n", 134 | "\n", 135 | "\n", 136 | "def convert_to_dataset(text):\n", 137 | " dataset = Dataset.from_list([{\"input\": text}])\n", 138 | " return dataset\n", 139 | "\n", 140 | "\n", 141 | "sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.5, n=1)\n", 142 | "synthetic_dataset = bonito.generate_tasks(\n", 143 | " convert_to_dataset(unannotated_paragraph),\n", 144 | " context_col=\"input\",\n", 145 | " task_type=\"nli\",\n", 146 | " sampling_params=sampling_params,\n", 147 | ")\n", 148 | "pprint(\"----Generated Instructions----\")\n", 149 | "pprint(f'Input: {synthetic_dataset[0][\"input\"]}')\n", 150 | "pprint(f'Output: {synthetic_dataset[0][\"output\"]}')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "2IFs82gLJJFk" 157 | }, 158 | "source": [ 159 | "Now we change the task type from NLI (nli) to multiple choice question answering (mcqa). For more details, see [supported task types](https://github.com/BatsResearch/bonito?tab=readme-ov-file#supported-task-types)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "id": "CUtgkf8EJKxF" 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "set_seed(0)\n", 171 | "sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.7, n=1)\n", 172 | "synthetic_dataset = bonito.generate_tasks(\n", 173 | " convert_to_dataset(unannotated_paragraph),\n", 174 | " context_col=\"input\",\n", 175 | " task_type=\"mcqa\", # changed\n", 176 | " sampling_params=sampling_params,\n", 177 | ")\n", 178 | "pprint(\"----Generated Instructions----\")\n", 179 | "pprint(f'Input: {synthetic_dataset[0][\"input\"]}')\n", 180 | "pprint(f'Output: {synthetic_dataset[0][\"output\"]}')" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "id": "mEU1lp5TVjGj" 187 | }, 188 | "source": [ 189 | "### Small dataset\n", 190 | "We select 10 unannoated samples from the ContractNLI dataset and convert them into NLI instruction tuning dataset.\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "id": "qMrbj4dbC2Lm" 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# load dataset with unannotated text\n", 202 | "from datasets import load_dataset\n", 203 | "\n", 204 | "unannotated_dataset = load_dataset(\n", 205 | " \"BatsResearch/bonito-experiment\", \"unannotated_contract_nli\"\n", 206 | ")[\"train\"].select(range(10))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "id": "HKZEbZuiGMuZ" 213 | }, 214 | "source": [ 215 | "Generate the synthetic NLI dataset." 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "id": "52hWL50gDQnH" 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "# Generate synthetic instruction tuning dataset\n", 227 | "from pprint import pprint\n", 228 | "from vllm import SamplingParams\n", 229 | "from transformers import set_seed\n", 230 | "\n", 231 | "set_seed(42)\n", 232 | "\n", 233 | "sampling_params = SamplingParams(max_tokens=256, top_p=0.95, temperature=0.5, n=1)\n", 234 | "synthetic_dataset = bonito.generate_tasks(\n", 235 | " unannotated_dataset,\n", 236 | " context_col=\"input\",\n", 237 | " task_type=\"nli\",\n", 238 | " sampling_params=sampling_params,\n", 239 | ")\n", 240 | "pprint(\"----Generated Instructions----\")\n", 241 | "pprint(f'Input: {synthetic_dataset[0][\"input\"]}')\n", 242 | "pprint(f'Output: {synthetic_dataset[0][\"output\"]}')" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "id": "fBDHJVXhIXyG" 249 | }, 250 | "source": [ 251 | "Now go try it out with your own datasets! You can vary the `task_type` for different types of generated instructions.\n", 252 | "You can also play around the sampling hyperparameters such as `top_p` and `temperature`.\n" 253 | ] 254 | } 255 | ], 256 | "metadata": { 257 | "accelerator": "GPU", 258 | "colab": { 259 | "gpuType": "V100", 260 | "provenance": [] 261 | }, 262 | "kernelspec": { 263 | "display_name": "Python 3", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "name": "python" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 0 272 | } 273 | --------------------------------------------------------------------------------