├── .github
│   ├── actions
│   │   └── update-progress
│   │       ├── requirements.txt
│   │       └── src
│   │           └── dashboard.py
│   └── workflows
│       └── update-fineweb-progres.yml
├── .gitignore
├── README.md
├── community-efforts
│   ├── image_preferences
│   │   ├── 00_imgsys_shuffled_deduplicated_cleaned.py
│   │   ├── 01_synthetic_data_generation_images.py
│   │   ├── 01_synthetic_data_generation_prompts.py
│   │   ├── 01_synthetic_data_generation_total.py
│   │   ├── 02_image_prefernces_cleaned_filtered_sfw.py
│   │   ├── 03_upload_to_argilla.ipynb
│   │   ├── 04_binarize_preference_results.ipynb
│   │   ├── 05_fine_tune_flux_lora.ipynb
│   │   ├── README.md
│   │   ├── requirements.txt
│   │   └── template.html
│   ├── prompt_ranking
│   │   ├── README.md
│   │   └── assets
│   │       └── synthetic-vs-human.png
│   └── prompt_translation
│       ├── 01_setup_prompt_translation_space.ipynb
│       ├── 02_upload_prompt_translation_data.ipynb
│       ├── 03_create_dashboard.ipynb
│       ├── README.md
│       ├── Translation_with_distilabel_gpt_4_turbo.ipynb
│       └── dashboard_template
│           ├── .gitattributes
│           ├── README.md
│           ├── app.py
│           ├── dumpy.py
│           ├── requirements.txt
│           └── requirements.in
└── cookbook-efforts
    ├── domain-specific-datasets
    │   ├── README.md
    │   ├── assets
    │   │   ├── pipeline.png
    │   │   └── setup.png
    │   ├── distilabel_pipelines
    │   │   ├── domain_expert_pipeline.py
    │   │   └── requirements.txt
    │   ├── parent_app
    │   │   ├── app.py
    │   │   ├── hub.py
    │   │   ├── pages
    │   │   │   └── 🧑🌾 Domain Data Grower.py
    │   │   ├── project_config.json
    │   │   └── seed_data.json
    │   └── project_app
    │       ├── .streamlit
    │       │   └── config.toml
    │       ├── DATASET_README_BASE.md
    │       ├── README.md
    │       ├── app.py
    │       ├── defaults.py
    │       ├── hub.py
    │       ├── infer.py
    │       ├── pages
    │       │   ├── 2_👩🏼🔬 Describe Domain.py
    │       │   ├── 3_🌱 Generate Dataset.py
    │       │   └── 4_🔍 Review Generated Data.py
    │       ├── pipeline.yaml
    │       ├── project_config.json
    │       ├── requirements.txt
    │       ├── seed_data.json
    │       └── utils.py
    ├── dpo-orpo-preference
    │   ├── 00_datasets_exploration.ipynb
    │   ├── 01_data_prep.ipynb
    │   ├── 02_load_from_argilla.ipynb
    │   ├── README.md
    │   ├── assets
    │   │   └── banner.webp
    │   ├── aya_dpo_gen.py
    │   ├── custom_preference_to_argilla.py
    │   ├── examples
    │   │   └── en
    │   │       ├── 01_en_data_prep.ipynb
    │   │       ├── aya_en_dpo_gen.py
    │   │       └── custom_preference_to_argilla.py
    │   ├── instructions.md
    │   ├── requirements.in
    │   └── requirements.txt
    └── kto-preference
        ├── 01_create_preference_task.ipynb
        ├── README.md
        ├── assets
        │   ├── access.png
        │   ├── app-creation.png
        │   ├── b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp
        │   ├── datasets.png
        │   ├── dpo.png
        │   ├── secrets.png
        │   ├── space.png
        │   ├── storage.png
        │   ├── task.png
        │   └── viewer.png
        ├── preference_gen.py
        ├── requirements.in
        └── requirements.txt
/.github/actions/update-progress/requirements.txt:
--------------------------------------------------------------------------------
1 | argilla
2 | huggingface-hub
3 | httpx
4 | stamina
5 | polars
6 | tqdm
7 | python-dotenv
--------------------------------------------------------------------------------
/.github/actions/update-progress/src/dashboard.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argilla as rg
3 | from huggingface_hub import HfApi, hf_hub_download
4 | import httpx
5 | import stamina
6 | import polars as pl
7 | from tqdm.contrib.concurrent import thread_map
8 | from argilla._exceptions import ArgillaAPIError
9 | from datetime import datetime, timezone
10 | from dotenv import load_dotenv
11 | from functools import lru_cache
12 | import time
13 |
14 | # Load environment variables from .env file when running locally
15 | load_dotenv()
16 |
17 | # Enable HF transfer
18 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
19 |
20 | # Validate environment variables
21 | HF_TOKEN = os.environ.get("HF_TOKEN")
22 |
23 | if not HF_TOKEN:
24 | raise ValueError("HF_TOKEN environment variable is not set")
25 |
26 | if ARGILLA_API_KEY := os.environ.get("ARGILLA_API_KEY"):
27 | client = rg.Argilla(
28 | api_url="https://data-is-better-together-fineweb-c.hf.space",
29 | api_key=ARGILLA_API_KEY,
30 | timeout=120,
31 | headers={"Authorization": f"Bearer {HF_TOKEN}"},
32 | )
33 | else:
34 | raise ValueError("ARGILLA_API_KEY environment variable is not set")
35 |
36 |
37 | @lru_cache(maxsize=1)
38 | def get_all_datasets():
39 | return client.datasets.list()
40 |
41 |
42 | def get_dataset_for_language(language_code):
43 | all_datasets = get_all_datasets()
44 | dataset = [
45 | dataset for dataset in all_datasets if dataset.name.startswith(language_code)
46 | ]
47 | if len(dataset) != 1:
48 | raise ValueError(
49 | f"Found {len(dataset)} datasets for language code {language_code}"
50 | )
51 | dataset_name = dataset[0].name
52 | return client.datasets(dataset_name)
53 |
54 |
55 | # Get all datasets
56 | all_datasets = get_all_datasets()
57 | language_datasets_names = [dataset.name for dataset in all_datasets]
58 |
59 |
60 | @stamina.retry(
61 | on=(httpx.HTTPStatusError, ArgillaAPIError),
62 | attempts=5,
63 | wait_initial=15,
64 | )
65 | def get_dataset_progress(language_dataset_name):
66 | time.sleep(2)
67 | dataset = client.datasets(language_dataset_name)
68 | return {
69 | "language_dataset_name": language_dataset_name,
70 | **dataset.progress(with_users_distribution=True),
71 | }
72 |
73 |
74 | def flatten_user_stats(dataset):
75 | dataset_name = dataset["language_dataset_name"]
76 | current_timestamp = datetime.now(timezone.utc)
77 | user_stats = []
78 |
79 | if dataset["users"]:
80 | user_stats.extend(
81 | {
82 | "language_dataset_name": dataset_name,
83 | "username": str(username),
84 | "submitted": int(
85 | stats["completed"]["submitted"] + stats["pending"]["submitted"]
86 | ),
87 | "total": int(dataset["total"]),
88 | "timestamp": current_timestamp,
89 | }
90 | for username, stats in dataset["users"].items()
91 | )
92 | else:
93 | user_stats.append(
94 | {
95 | "language_dataset_name": dataset_name,
96 | "username": None,
97 | "submitted": 0,
98 | "total": int(dataset["total"]),
99 | "timestamp": current_timestamp,
100 | }
101 | )
102 |
103 | return user_stats
104 |
105 |
106 | def update_progress_data(new_data, filename="argilla_progress.ndjson"):
107 | # Process new data
108 | all_user_stats = []
109 | for dataset in new_data:
110 | all_user_stats.extend(flatten_user_stats(dataset))
111 |
112 | new_df = pl.DataFrame(
113 | all_user_stats,
114 | schema={
115 | "language_dataset_name": pl.Utf8,
116 | "username": pl.Utf8,
117 | "submitted": pl.Int64,
118 | "total": pl.Int64,
119 | "timestamp": pl.Datetime,
120 | },
121 | )
122 |
123 | try:
124 | fname = hf_hub_download(
125 | repo_id="davanstrien/progress",
126 | filename="argilla_progress.ndjson",
127 | repo_type="dataset",
128 | )
129 | existing_df = pl.read_ndjson(fname)
130 | combined_df = pl.concat([existing_df, new_df])
131 | except FileNotFoundError:
132 | print("No existing data found, creating new dataset")
133 | combined_df = new_df
134 | except Exception as e:
135 | print(f"Error loading existing data: {e}")
136 | combined_df = new_df
137 |
138 | combined_df.write_ndjson(filename)
139 | return combined_df
140 |
141 |
142 | def main():
143 | print("Starting data collection...")
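    # max_workers=1 runs the progress requests sequentially; together with the
    # sleep/retry in get_dataset_progress, this throttles calls to the Argilla server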
144 | all_data = thread_map(
145 | get_dataset_progress, language_datasets_names, max_workers=1)
146 |
147 | print("Updating progress data...")
148 | df = update_progress_data(all_data)
149 | df = df.sort("language_dataset_name")
150 |
151 | print("Saving data...")
152 | df.write_ndjson("argilla_progress.ndjson")
153 |
154 | print("Uploading to Hugging Face Hub...")
155 | api = HfApi()
156 | api.create_repo(
157 | "data-is-better-together/fineweb-c-progress", repo_type="dataset", exist_ok=True
158 | )
159 | api.upload_file(
160 | path_or_fileobj="argilla_progress.ndjson",
161 | repo_id="data-is-better-together/fineweb-c-progress",
162 | repo_type="dataset",
163 | path_in_repo="argilla_progress.ndjson",
164 | )
165 | print("Done!")
166 |
167 |
168 | if __name__ == "__main__":
169 | main()
170 |
--------------------------------------------------------------------------------
/.github/workflows/update-fineweb-progres.yml:
--------------------------------------------------------------------------------
1 | name: Update Progress Data
2 |
3 | on:
4 | schedule:
5 | - cron: "0 */6 * * *" # Runs every 6 hours
6 | workflow_dispatch: # Allows manual triggering
7 |
8 | jobs:
9 | update-progress:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - uses: actions/checkout@v4
14 |
15 | - name: Set up Python
16 | uses: actions/setup-python@v5
17 | with:
18 | python-version: "3.12"
19 |
20 | - name: Install dependencies
21 | run: |
22 | python -m pip install --upgrade pip
23 | pip install -r .github/actions/update-progress/requirements.txt
24 |
25 | - name: Run update script
26 | env:
27 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
28 | ARGILLA_API_KEY: ${{ secrets.ARGILLA_API_KEY }}
29 | run: |
30 | python .github/actions/update-progress/src/dashboard.py
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.oauth.yaml
2 | /.venv
3 | kto-preference/.env
4 | kto-preference/.vscode/settings.json
5 | .DS_Store
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | cover/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | .pybuilder/
82 | target/
83 |
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 |
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 |
91 | # pyenv
92 | # For a library or package, you might want to ignore these files since the code is
93 | # intended to run in multiple environments; otherwise, check them in:
94 | # .python-version
95 |
96 | # pipenv
97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
100 | # install all needed dependencies.
101 | #Pipfile.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/#use-with-ide
116 | .pdm.toml
117 |
118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119 | __pypackages__/
120 |
121 | # Celery stuff
122 | celerybeat-schedule
123 | celerybeat.pid
124 |
125 | # SageMath parsed files
126 | *.sage.py
127 |
128 | # Environments
129 | .env
130 | .venv
131 | env/
132 | venv/
133 | ENV/
134 | env.bak/
135 | venv.bak/
136 |
137 | # Spyder project settings
138 | .spyderproject
139 | .spyproject
140 |
141 | # Rope project settings
142 | .ropeproject
143 |
144 | # mkdocs documentation
145 | /site
146 |
147 | # mypy
148 | .mypy_cache/
149 | .dmypy.json
150 | dmypy.json
151 |
152 | # Pyre type checker
153 | .pyre/
154 |
155 | # pytype static type analyzer
156 | .pytype/
157 |
158 | # Cython debug symbols
159 | cython_debug/
160 |
161 | # PyCharm
162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164 | # and can be added to the global gitignore or merged into this file. For a more nuclear
165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166 | #.idea/
167 | # vscode
168 | **/.vscode/settings.json
169 | .vscode/
170 | community-efforts/image_preferences/images
171 | community-efforts/image_preferences/image_quality_dev
172 | community-efforts/image_preferences/image_simplified_dev
173 | community-efforts/image_preferences/image_quality_sd
174 | community-efforts/image_preferences/image_simplified_sd
175 | community-efforts/image_preferences/assets
176 | logs/
177 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 🤗 Spaces & Datasets
6 |
7 | # Data is Better Together
8 |
9 | > If you are working on a valuable community-developed dataset but are limited by available resources, please reach out to us on the Hugging Face Discord. We may be able to provide support to enhance your project.
10 |
11 | Data is Better Together is a collaboration between 🤗 Hugging Face, 🏓 Argilla, and the Open-Source ML community. We aim to empower the open-source community to build impactful datasets collectively. This initiative consists of two main components: the community efforts and the cookbook efforts.
12 |
13 |
14 | Community Efforts: hands-on projects guided by the HF team and focused on creating valuable datasets. These projects relied on the participation of the community and have been successfully completed.
15 |
16 |
17 |
18 | Prompt ranking
19 |
20 | - **Goal**: This project aimed to create a dataset of 10k prompts ranked by quality. These prompts included both synthetic and human-generated prompts from various datasets. The intention was to use the final dataset for prompt ranking tasks or synthetic data generation. You can find more information about this project in the [prompt ranking README](community-efforts/prompt_ranking/README.md).
21 | - **How**: First, we prepared a dataset with the prompts to be ranked using Argilla in a Hugging Face Space. Then, we invited the community to rank the prompts based on their quality. Finally, we collected the annotations and released the dataset.
22 | - **Result**: Over 385 people joined this initiative! Thanks to their contributions, we released [data-is-better-together/10k_prompts_ranked](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked). This dataset can be used for different tasks: for instance, you can filter for the higher-quality prompts (as the MPEP project did) and generate the corresponding completions; a minimal sketch of the filtering follows below. You can also find some models built on top of it [here](https://huggingface.co/models?dataset=dataset:data-is-better-together/10k_prompts_ranked).
23 |
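A sketch of that filtering step, assuming the dataset exposes an `avg_rating` column (check the dataset card for the exact schema):

```python
from datasets import load_dataset

# Load the community-ranked prompts
ds = load_dataset("data-is-better-together/10k_prompts_ranked", split="train")

# Keep only the highly rated prompts (the threshold of 4 and the
# `avg_rating` column name are illustrative; see the dataset card)
high_quality = ds.filter(
    lambda row: row["avg_rating"] is not None and row["avg_rating"] >= 4
)
print(high_quality)
```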
24 |
25 |
26 |
27 | Multilingual Prompt Evaluation Project (MPEP)
28 |
29 | - **Goal**: There are not enough language-specific benchmarks for open LLMs! So, we wanted to create a leaderboard for more languages by leveraging the community. This way, we could evaluate the performance of models using [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval). You can find more information about this project in the [MPEP README](community-efforts/prompt_translation/README.md).
30 | - **How**: We selected a subset of 500 high-quality prompts from the [data-is-better-together/10k_prompts_ranked](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked) (see the prompt ranking project) and asked the community to help us translate this curated prompt dataset into different languages.
31 | - **Result**: We translated the whole dataset into Dutch and Russian, and almost finished Spanish. Many other languages also joined this initiative. You can take a look at the resulting datasets [here](https://huggingface.co/datasets?search=MPEP).
32 |
33 |
34 |
35 | Image Preferences
36 |
37 | - **Goal**: This project aimed to create 10K text-to-image preference pairs. These pairs can be used to evaluate the performance of image generation models across a wide variety of common image categories, based on prompts with varying levels of difficulty. You can find more information about this project in the [image preferences README](community-efforts/image_preferences/README.md) or in the [blogpost](https://huggingface.co/blog/image-preferences).
38 | - **How**: We used the prompts from [fal/imgsys-results](https://huggingface.co/datasets/fal/imgsys-results), which we evolved for complexity and quality across various image categories. We then asked the community to annotate the preference between two generated images for each prompt.
39 | - **Result**: We annotated 10K preference pairs. You can take a look at the resulting dataset [here](https://huggingface.co/datasets/data-is-better-together/open-image-preferences-v1-binarized).
40 |
41 |
42 |
43 |
44 | Cookbook Efforts: guides and tools that help the community build valuable datasets. They are not guided by the HF team and are designed to stand alone, so you can freely contribute to them or use them to create your own unique dataset.
45 |
46 |
47 |
48 | Domain Specific Datasets
49 |
50 | This project aims to bootstrap the creation of more domain-specific datasets for training models. The **goal** is to create a set of tools that help users collaborate with domain experts. Find out more in the [Domain Specific Datasets README](cookbook-efforts/domain-specific-datasets/README.md).
51 |
52 |
53 |
54 | DPO/ORPO Datasets
55 |
56 | Many languages do not have DPO datasets openly shared on the Hugging Face Hub. The [data-is-better-together/preference_data_by_language](https://huggingface.co/spaces/data-is-better-together/preference_data_by_language) Space gives you an overview of DPO dataset coverage across languages. The **goal** of this project is to help foster a community of people building more DPO-style datasets for different languages. Find out more in this [DPO/ORPO datasets README](cookbook-efforts/dpo-orpo-preference/README.md).
57 |
58 |
59 |
60 | KTO Datasets
61 |
62 | KTO is another type of preference dataset that can be used to train models to make decisions. Unlike DPO, it doesn't require two candidate responses. Instead, it relies on a simple binary preference, i.e. 👍👎, which makes the data easier to collect and annotate (see the sketch below). The **goal** of this project is to help the community create their own KTO dataset. Find out more in this [KTO datasets README](cookbook-efforts/kto-preference/README.md).
63 |
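To make the binary format concrete, here is an illustrative KTO-style record; the field names are examples, not a schema prescribed by the project:

```python
# Illustrative KTO-style record: one completion plus a binary 👍/👎 signal
# (field names are an example, not a fixed schema)
kto_example = {
    "prompt": "Write a haiku about the sea.",
    "completion": "Grey waves fold to foam,\nsalt wind combs the empty shore,\ngulls stitch sky to sea.",
    "label": True,  # True = 👍 (desirable), False = 👎 (undesirable)
}
```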
64 |
65 |
66 |
67 | **🤝 How can I contribute to the cookbook efforts?** That's easy! You can contribute by following the instructions in the README of the project you are interested in. Then, share your results with the community!
68 |
69 |
70 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/00_imgsys_shuffled_deduplicated_cleaned.py:
--------------------------------------------------------------------------------
1 | from datasets import Dataset, load_dataset
2 | from fast_langdetect import detect
3 |
4 | dataset = load_dataset("fal/imgsys-results", split="train")
5 | dataset = dataset.shuffle()
6 | df = dataset.to_pandas()
7 | df = df.drop_duplicates(subset=["prompt"])
8 | df = df.reset_index(drop=True)
9 | df = df[["prompt"]]
10 | df = df.dropna(subset=["prompt"])
11 | df["language"], df["score"] = zip(
12 | *df["prompt"].apply(lambda x: detect(x.replace("\n", "")).values())
13 | )
14 | df = df[df["language"] == "en"]
15 | df = df[["prompt"]].reset_index(drop=True)  # keep a DataFrame: Dataset.from_pandas does not accept a Series
16 | dataset = Dataset.from_pandas(df)
17 | dataset.push_to_hub(
18 | "data-is-better-together/imgsys-results-prompts-shuffled-cleaned-deduplicated-english"
19 | )
20 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/01_synthetic_data_generation_prompts.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | os.environ["DISTILABEL_LOG_LEVEL"] = "DEBUG"
5 |
6 | from distilabel.llms import InferenceEndpointsLLM
7 |
8 | # from distilabel.llms.huggingface import InferenceEndpointsLLM
9 | from distilabel.pipeline import Pipeline
10 | from distilabel.steps import GroupColumns, KeepColumns, LoadDataFromHub, StepInput, step
11 | from distilabel.steps.base import StepInput
12 | from distilabel.steps.tasks import TextGeneration
13 | from distilabel.steps.typing import StepOutput
14 |
15 | ## At the time of writing this, the distilabel library does not support the image generation endpoint.
16 | ## This is a temporary fix to allow us to use the image generation endpoint.
17 |
18 | ## Let's determine the categories and subcategories for the image generation task
19 | # https://huggingface.co/spaces/google/sdxl/blob/main/app.py#L55
20 | categories = {
21 | # included
22 | "Cinematic": [
23 | # included
24 | "emotional",
25 | "harmonious",
26 | "vignette",
27 | "highly detailed",
28 | "high budget",
29 | "bokeh",
30 | "cinemascope",
31 | "moody",
32 | "epic",
33 | "gorgeous",
34 | "film grain",
35 | "grainy",
36 | ],
37 | # included
38 | "Photographic": [
39 | # included
40 | "film",
41 | "bokeh",
42 | "professional",
43 | "4k",
44 | "highly detailed",
45 | ## not included
46 | "Landscape",
47 | "Portrait",
48 | "Macro",
49 | "Portra",
50 | "Gold",
51 | "ColorPlus",
52 | "Ektar",
53 | "Superia",
54 | "C200",
55 | "CineStill",
56 | "CineStill 50D",
57 | "CineStill 800T",
58 | "Tri-X",
59 | "HP5",
60 | "Delta",
61 | "T-Max",
62 | "Fomapan",
63 | "StreetPan",
64 | "Provia",
65 | "Ektachrome",
66 | "Velvia",
67 | ],
68 | # included
69 | "Anime": [
70 | # included
71 | "anime style",
72 | "key visual",
73 | "vibrant",
74 | "studio anime",
75 | "highly detailed",
76 | ],
77 | # included
78 | "Manga": [
79 | # included
80 | "vibrant",
81 | "high-energy",
82 | "detailed",
83 | "iconic",
84 | "Japanese comic style",
85 | ],
86 | # included
87 | "Digital art": [
88 | # included
89 | "digital artwork",
90 | "illustrative",
91 | "painterly",
92 | "matte painting",
93 | "highly detailed",
94 | ],
95 | # included
96 | "Pixel art": [
97 | # included
98 | "low-res",
99 | "blocky",
100 | "pixel art style",
101 | "8-bit graphics",
102 | ],
103 | # included
104 | "Fantasy art": [
105 | # included
106 | "magnificent",
107 | "celestial",
108 | "ethereal",
109 | "painterly",
110 | "epic",
111 | "majestic",
112 | "magical",
113 | "fantasy art",
114 | "cover art",
115 | "dreamy",
116 | ],
117 | # included
118 | "Neonpunk": [
119 | # included
120 | "cyberpunk",
121 | "vaporwave",
122 | "neon",
123 | "vibes",
124 | "vibrant",
125 | "stunningly beautiful",
126 | "crisp",
127 | "detailed",
128 | "sleek",
129 | "ultramodern",
130 | "magenta highlights",
131 | "dark purple shadows",
132 | "high contrast",
133 | "cinematic",
134 | "ultra detailed",
135 | "intricate",
136 | "professional",
137 | ],
138 | # included
139 | "3D Model": [
140 | # included
141 | "octane render",
142 | "highly detailed",
143 | "volumetric",
144 | "dramatic lighting",
145 | ],
146 | # not included
147 | "Painting": [
148 | "Oil",
149 | "Acrylic",
150 | "Watercolor",
151 | "Digital",
152 | "Mural",
153 | "Sketch",
154 | "Gouache",
155 | "Renaissance",
156 | "Baroque",
157 | "Romanticism",
158 | "Impressionism",
159 | "Expressionism",
160 | "Cubism",
161 | "Surrealism",
162 | "Pop Art",
163 | "Minimalism",
164 | "Realism",
165 | "Encaustic",
166 | "Tempera",
167 | "Fresco",
168 | "Ink Wash",
169 | "Spray Paint",
170 | "Mixed Media",
171 | ],
172 | # not included
173 | "Animation": [
174 | # not included
175 | "Animation",
176 | "Stop motion",
177 | "Claymation",
178 | "Pixel Art",
179 | "Vector",
180 | "Hand-drawn",
181 | "Cutout",
182 | "Whiteboard",
183 | ],
184 | # not included
185 | "Illustration": [
186 | # not included
187 | "Book",
188 | "Comics",
189 | "Editorial",
190 | "Advertising",
191 | "Technical",
192 | "Fantasy",
193 | "Scientific",
194 | "Fashion",
195 | "Storyboard",
196 | "Concept Art",
197 | "Manga",
198 | "Anime",
199 | "Digital",
200 | "Vector",
201 | "Design",
202 | ],
203 | }
204 |
205 | ## We will use a Llama 3.1 Instruct model served via a dedicated Inference Endpoint for the text generation task; this will help us to generate the quality and style prompts
206 |
207 | model_id = (
208 | "meta-llama/Llama-3.1-8B-Instruct"
209 | ) # "meta-llama/Meta-Llama-3.1-70B-Instruct"
210 |
211 |
212 | llm = InferenceEndpointsLLM(
213 | # model_id=model_id,
214 | # tokenizer_id=model_id,
215 | generation_kwargs={"temperature": 0.8, "max_new_tokens": 2048},
216 | base_url="https://rti2mzernqmo00qy.us-east-1.aws.endpoints.huggingface.cloud",
217 | api_key=os.getenv("HF_TOKEN"),
218 | )
219 |
220 |
221 | ## We will use three prompt templates: style, quality and simplification. The style prompt generates the style-enhanced prompts, the quality prompt generates the quality-enhanced prompts, and the simplification prompt produces simplified versions of the style-enhanced prompts.
222 | quality_prompt = """
223 | You are an expert at refining prompts for image generation models. Your task is to enhance the given prompt by adding descriptive details and quality-improving elements, while maintaining the original intent and core concept.
224 |
225 | Follow these guidelines:
226 | 1. Preserve the main subject and action of the original prompt.
227 | 2. Add specific, vivid details to enhance visual clarity.
228 | 3. Incorporate elements that improve overall image quality and aesthetics.
229 | 4. Keep the prompt concise and avoid unnecessary words.
230 | 5. Use modifiers that are appropriate for the subject matter.
231 |
232 | Example modifiers (use as reference, adapt based on some aspect that's suitable for the original prompt):
233 | - Lighting: "soft golden hour light", "dramatic chiaroscuro", "ethereal glow"
234 | - Composition: "rule of thirds", "dynamic perspective", "symmetrical balance"
235 | - Texture: "intricate details", "smooth gradients", "rich textures"
236 | - Color: "vibrant color palette", "monochromatic scheme", "complementary colors"
237 | - Atmosphere: "misty ambiance", "serene mood", "energetic atmosphere"
238 | - Technical: "high resolution", "photorealistic", "sharp focus"
239 |
240 | The enhanced prompt should be short, concise, direct, avoid unnecessary words and be written as if a human expert had written it.
241 |
242 | Output only one enhanced prompt without any additional text or explanations.
243 |
244 | ## Original Prompt
245 | {{ style_prompt }}
246 |
247 | ## Quality-Enhanced Prompt
248 | """
249 |
250 | style_prompt = """
251 | You are an expert at refining prompts for image generation models. Your task is to enhance the given prompt by transforming it into a specific artistic style, technique, or genre, while maintaining the original core concept.
252 |
253 | Follow these guidelines:
254 | 1. Preserve the main subject and action of the original prompt but rewrite stylistic elements already present in the prompt.
255 | 2. Transform the prompt into a distinctive visual style (e.g., impressionism, surrealism, cyberpunk, art nouveau).
256 | 3. Incorporate style-specific elements and techniques.
257 | 4. Keep the prompt concise and avoid unnecessary words.
258 | 5. Use modifiers that are appropriate for the chosen style.
259 |
260 | You should use the following style, technique, genre to enhance the prompt:
261 | {{ category }} / {{ subcategory }}
262 |
263 | The enhanced prompt should be short, concise, direct, avoid unnecessary words and be written as if a human expert had written it.
264 |
265 | Output only one style-enhanced prompt without any additional text or explanations.
266 |
267 | ## Original Prompt
268 | {{ prompt }}
269 |
270 | ## Style-Enhanced Prompt
271 | """
272 |
273 | simplification_prompt = """
274 | You are an expert at simplifying image descriptions. Your task is to simplify the description by removing any unnecessary words and phrases, while maintaining the original intent and core concept of the description.
275 |
276 | Follow these guidelines:
277 | 1. Preserve the main subject of the original description.
278 | 2. Remove any unnecessary words and phrases.
279 | 3. Ensure the simplified description could have been quickly written by a human.
280 |
281 | ## Original Description
282 | {{ style_prompt }}
283 |
284 | ## Simplified Description
285 | """
286 |
287 | ## Let's create the pipeline to generate the quality and style prompts
288 |
289 | with Pipeline(name="image_preferences_synthetic_data_generation") as pipeline:
290 | load_data = LoadDataFromHub(name="load_dataset")
291 |
292 | @step(inputs=["prompt"], outputs=["category", "subcategory", "prompt"])
293 | def CategorySelector(inputs: StepInput) -> "StepOutput":
294 | result = []
295 | for input in inputs:
296 | # Randomly select a category
297 | category = random.choice(list(categories.keys()))
298 | # Randomly select a subcategory from the chosen category
299 | subcategory = random.choice(categories[category])
300 |
301 | result.append(
302 | {
303 | "category": category,
304 | "subcategory": subcategory,
305 | "prompt": input["prompt"],
306 | }
307 | )
308 | yield result
309 |
310 | category_selector = CategorySelector(name="category_selector")
311 |
312 | style_augmentation = TextGeneration(
313 | llm=llm,
314 | template=style_prompt,
315 | columns=["prompt", "category", "subcategory"],
316 | name="style_augmentation",
317 | output_mappings={"generation": "style_prompt"},
318 | input_batch_size=4,
319 | )
320 |
321 | simplification_augmentation = TextGeneration(
322 | llm=llm,
323 | template=simplification_prompt,
324 | columns=["style_prompt"],
325 | name="simplification_augmentation",
326 | output_mappings={"generation": "simplified_prompt"},
327 | input_batch_size=2,
328 | )
329 |
330 | quality_augmentation = TextGeneration(
331 | llm=llm,
332 | template=quality_prompt,
333 | columns=["style_prompt"],
334 | name="quality_augmentation",
335 | output_mappings={"generation": "quality_prompt"},
336 | input_batch_size=2,
337 | )
338 |
339 | group_columns = GroupColumns(columns=["model_name"])
340 | keep_columns = KeepColumns(
341 | columns=[
342 | "prompt",
343 | "category",
344 | "subcategory",
345 | "style_prompt",
346 | "quality_prompt",
347 | "simplified_prompt",
348 | ]
349 | )
350 |
351 | (
352 | load_data
353 | >> category_selector
354 | >> style_augmentation
355 | >> [quality_augmentation, simplification_augmentation]
356 | >> group_columns
357 | >> keep_columns
358 | )
359 |
360 | ## Let's run the pipeline and push the resulting dataset to the hub
361 |
362 | if __name__ == "__main__":
363 | num_examples = 15000
364 | distiset = pipeline.run(
365 | use_cache=True,
366 | parameters={
367 | load_data.name: {
368 | "num_examples": num_examples,
369 | "repo_id": "data-is-better-together/imgsys-results-prompts-shuffled-cleaned-deduplicated-english",
370 | }
371 | },
372 | )
373 | dataset_name = "data-is-better-together/imgsys-results-prompts-style_v2_part1"
374 | distiset.push_to_hub(
375 | repo_id=dataset_name,
376 | include_script=True,
377 | generate_card=False,
378 | token=os.getenv("HF_TOKEN"),
379 | )
380 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/02_image_prefernces_cleaned_filtered_sfw.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | from datasets import load_dataset
4 | from transformers import pipeline
5 |
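# NOTE: device="mps" targets Apple Silicon GPUs; switch to "cuda" or "cpu" on other hardware.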
6 | pipe_text = pipeline(
7 | "text-classification",
8 | model="ezb/NSFW-Prompt-Detector",
9 | device="mps",
10 | )
11 | pipe_text_2 = pipeline(
12 | "text-classification",
13 | model="michellejieli/NSFW_text_classifier",
14 | device="mps",
15 | )
16 | pipe_image = pipeline(
17 | "image-classification",
18 | model="MichalMlodawski/nsfw-image-detection-large",
19 | device="mps",
20 | )
21 |
22 | label_to_category_text = {
23 | "LABEL_0": "Safe",
24 | "LABEL_1": "Questionable",
25 | "LABEL_2": "Unsafe",
26 | }
27 |
28 |
29 | def clean_dataset(batch):
30 | try:
31 | batch["nsfw_text"] = []
32 | batch["nsfw_image"] = []
33 | evaluated_results_image = defaultdict(list)
34 | evaluated_results_text = defaultdict(list)
35 |
36 | image_columns = [
37 | "image_quality_dev",
38 | "image_simplified_dev",
39 | "image_quality_sd",
40 | "image_simplified_sd",
41 | ]
42 |
43 | for image_column in image_columns:
44 | results_image = pipe_image(batch[image_column])
45 | evaluated_results_image[image_column] = [
46 | res[0]["label"] in ["UNSAFE", "QUESTIONABLE"] for res in results_image
47 | ]
48 |
49 | try:
50 | results_text = pipe_text(batch["prompt"])
51 | results_text_2 = pipe_text_2(batch["prompt"])
52 | evaluated_results_text["text"] = [
53 | res["label"] == "NSFW" for res in results_text
54 | ]
55 | evaluated_results_text["text_2"] = [
56 | res["label"] == "NSFW" for res in results_text_2
57 | ]
58 | except Exception:
59 | try:
60 | results_text_2 = pipe_text_2(batch["prompt"])
61 | evaluated_results_text["text_2"] = [
62 | res["label"] == "NSFW" for res in results_text_2
63 | ]
64 | evaluated_results_text["text"] = [False] * len(results_text_2)
65 | except Exception:
66 | try:
67 | results_text = pipe_text(batch["prompt"])
68 | evaluated_results_text["text"] = [
69 | res["label"] == "NSFW" for res in results_text
70 | ]
71 | evaluated_results_text["text_2"] = [False] * len(results_text)
72 | except Exception:
73 | for item in batch["prompt"]:
74 | try:
75 | evaluated_results_text["text"].append(
76 | pipe_text(item)[0]["label"] == "NSFW"  # the pipeline returns a list of dicts
77 | )
78 | except Exception:
79 | evaluated_results_text["text"].append(True)
80 | try:
81 | evaluated_results_text["text_2"].append(
82 | pipe_text_2(item)[0]["label"] == "NSFW"
83 | )
84 | except Exception:
85 | evaluated_results_text["text_2"].append(True)
86 |
87 | for i in range(len(evaluated_results_text["text"])):
88 | if any(evaluated_results_text[col][i] for col in evaluated_results_text):
89 | batch["nsfw_text"].append(True)
90 | else:
91 | batch["nsfw_text"].append(False)
92 | for i in range(len(evaluated_results_image["image_quality_dev"])):
93 | if any(evaluated_results_image[col][i] for col in evaluated_results_image):
94 | batch["nsfw_image"].append(True)
95 | else:
96 | batch["nsfw_image"].append(False)
97 | except Exception:
98 | raise  # preserve the original traceback instead of re-wrapping it
99 | return batch
100 |
101 |
102 | ds = load_dataset(
103 | "data-is-better-together/open-image-preferences-v1-unfiltered", split="train"
104 | )
105 | df = ds.filter(
106 | lambda x: x["image_quality_dev"]
107 | and x["image_simplified_dev"]
108 | and x["image_quality_sd"]
109 | and x["image_simplified_sd"]
110 | )
111 | ds = df.map(clean_dataset, batched=True, batch_size=100)
112 | ds = ds.filter(lambda x: not x["nsfw_text"] and not x["nsfw_image"])
113 | ds = ds.remove_columns(["nsfw_text", "nsfw_image"])
114 | ds.push_to_hub(
115 | "data-is-better-together/open-image-preferences-v1",
116 | split="cleaned",
117 | private=True,
118 | )
119 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/04_binarize_preference_results.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "!pip install datasets"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Load and transform the dataset\n",
17 | "\n",
18 | "First, we load the dataset.\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stderr",
28 | "output_type": "stream",
29 | "text": [
30 | "/Users/davidberenstein/Documents/programming/argilla/data-is-better-together/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
31 | " from .autonotebook import tqdm as notebook_tqdm\n"
32 | ]
33 | },
34 | {
35 | "data": {
36 | "text/plain": [
37 | "Dataset({\n",
38 | " features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status'],\n",
39 | " num_rows: 5000\n",
40 | "})"
41 | ]
42 | },
43 | "execution_count": 1,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "from datasets import load_dataset\n",
50 | "\n",
51 | "ds = load_dataset(\"data-is-better-together/image-preferences-v1-results\", split=\"train\")\n",
52 | "ds"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/plain": [
63 | "{'id': '3368-quality',\n",
64 | " 'status': 'completed',\n",
65 | " '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n",
66 | " 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n",
67 | " 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n",
68 | " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n",
69 | " 'model_1': 'dev',\n",
70 | " 'model_2': 'sd',\n",
71 | " 'evolution': 'quality',\n",
72 | " 'category': 'Manga',\n",
73 | " 'sub_category': 'detailed',\n",
74 | " 'preference.responses': ['both_good', 'image_1', 'image_1'],\n",
75 | " 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n",
76 | " 'caf19767-2989-4b3c-a653-9c30afc6361d',\n",
77 | " 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n",
78 | " 'preference.responses.status': ['submitted', 'submitted', 'submitted']}"
79 | ]
80 | },
81 | "execution_count": 2,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 | "ds[0]"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 3,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "ds = ds.filter(lambda example: example['preference.responses'] is not None)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 9,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stderr",
106 | "output_type": "stream",
107 | "text": [
108 | "Map: 100%|██████████| 4997/4997 [00:00<00:00, 12626.85 examples/s]\n"
109 | ]
110 | },
111 | {
112 | "data": {
113 | "text/plain": [
114 | "{'id': '3368-quality',\n",
115 | " 'status': 'completed',\n",
116 | " '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n",
117 | " 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n",
118 | " 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n",
119 | " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n",
120 | " 'model_1': 'dev',\n",
121 | " 'model_2': 'sd',\n",
122 | " 'evolution': 'quality',\n",
123 | " 'category': 'Manga',\n",
124 | " 'sub_category': 'detailed',\n",
125 | " 'preference.responses': ['both_good', 'image_1', 'image_1'],\n",
126 | " 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n",
127 | " 'caf19767-2989-4b3c-a653-9c30afc6361d',\n",
128 | " 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n",
129 | " 'preference.responses.status': ['submitted', 'submitted', 'submitted'],\n",
130 | " 'chosen': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n",
131 | " 'chosen_model': 'black-forest-labs/FLUX.1-dev',\n",
132 | " 'rejected': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n",
133 | " 'rejected_model': 'stabilityai/stable-diffusion-3.5-large',\n",
134 | " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'}"
135 | ]
136 | },
137 | "execution_count": 9,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "from collections import Counter\n",
144 | "\n",
145 | "def get_preference_winner(batch):\n",
146 | " responses = batch['preference.responses']\n",
147 | " cleaned_responses = []\n",
148 | " for response in responses:\n",
149 | " if response == 'both_good':\n",
150 | " cleaned_responses.append('image_1')\n",
151 | " cleaned_responses.append('image_2')\n",
152 | " else:\n",
153 | " cleaned_responses.append(response)\n",
154 | " counts = Counter(cleaned_responses)\n",
155 | " if counts['image_1'] > counts['image_2'] and counts['image_1'] > counts['both_bad']:\n",
156 | " batch['chosen'] = batch['images']['image_1']\n",
157 | " batch['chosen_model'] = batch[\"model_1\"]\n",
158 | " batch['rejected'] = batch['images']['image_2']\n",
159 | " batch['rejected_model'] = batch[\"model_2\"]\n",
160 | " elif counts['image_2'] > counts['image_1'] and counts['image_2'] > counts['both_bad']:\n",
161 | " batch['chosen'] = batch['images']['image_2']\n",
162 | " batch['chosen_model'] = batch[\"model_2\"]\n",
163 | " batch['rejected'] = batch['images']['image_1']\n",
164 | " batch['rejected_model'] = batch[\"model_1\"]\n",
165 | " else:\n",
166 | " batch['chosen'] = None\n",
167 | " batch['chosen_model'] = None\n",
168 | " batch['rejected'] = None\n",
169 | " batch['rejected_model'] = None\n",
170 | "\n",
171 | " batch[\"prompt\"] = batch[\"images\"][\"prompt\"]\n",
172 | " \n",
173 | " if batch['chosen_model'] == 'dev':\n",
174 | " batch['chosen_model'] = 'black-forest-labs/FLUX.1-dev'\n",
175 | " batch['rejected_model'] = 'stabilityai/stable-diffusion-3.5-large'\n",
176 | " else:\n",
177 | " batch['rejected_model'] = 'black-forest-labs/FLUX.1-dev'\n",
178 | " batch['chosen_model'] = 'stabilityai/stable-diffusion-3.5-large'\n",
179 | " \n",
180 | " return batch\n",
181 | "\n",
182 | "\n",
183 | "ds_formatted = ds.map(get_preference_winner)\n",
184 | "ds_formatted[0]\n"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 10,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "name": "stderr",
194 | "output_type": "stream",
195 | "text": [
196 | "Filter: 100%|██████████| 4997/4997 [00:00<00:00, 48227.03 examples/s]\n"
197 | ]
198 | },
199 | {
200 | "data": {
201 | "text/plain": [
202 | "Dataset({\n",
203 | " features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status', 'chosen', 'chosen_model', 'rejected', 'rejected_model', 'prompt'],\n",
204 | " num_rows: 3007\n",
205 | "})"
206 | ]
207 | },
208 | "execution_count": 10,
209 | "metadata": {},
210 | "output_type": "execute_result"
211 | }
212 | ],
213 | "source": [
214 | "ds_formatted_filtered = ds_formatted.filter(lambda example: example['chosen'] is not None)\n",
215 | "ds_formatted_filtered"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 11,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stderr",
225 | "output_type": "stream",
226 | "text": [
227 | "Map: 100%|██████████| 1504/1504 [28:41<00:00, 1.14s/ examples]\n",
228 | "Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 70.73ba/s]\n",
229 | "Map: 100%|██████████| 1503/1503 [27:23<00:00, 1.09s/ examples]\n",
230 | "Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 90.22ba/s]\n",
231 | "Uploading the dataset shards: 100%|██████████| 2/2 [56:40<00:00, 1700.25s/it]\n"
232 | ]
233 | },
234 | {
235 | "data": {
236 | "text/plain": [
237 | "CommitInfo(commit_url='https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized/commit/a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', commit_message='Upload dataset', commit_description='', oid='a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='data-is-better-together/image-preferences-results-binarized'), pr_revision=None, pr_num=None)"
238 | ]
239 | },
240 | "execution_count": 11,
241 | "metadata": {},
242 | "output_type": "execute_result"
243 | }
244 | ],
245 | "source": [
246 | "from datasets import Image\n",
247 | "relevant_columns = ['id', 'prompt', 'chosen', 'rejected', 'chosen_model', 'rejected_model', 'evolution', 'category', 'sub_category']\n",
248 | "ds_formatted_filtered_columns = ds_formatted_filtered.select_columns(relevant_columns)\n",
249 | "ds_formatted_filtered_columns = ds_formatted_filtered_columns.cast_column('chosen', Image())\n",
250 | "ds_formatted_filtered_columns = ds_formatted_filtered_columns.cast_column('rejected', Image())\n",
251 | "ds_formatted_filtered_columns.push_to_hub(\"data-is-better-together/open-image-preferences-v1-binarized\")\n"
252 | ]
253 | }
254 | ],
255 | "metadata": {
256 | "kernelspec": {
257 | "display_name": ".venv",
258 | "language": "python",
259 | "name": "python3"
260 | },
261 | "language_info": {
262 | "codemirror_mode": {
263 | "name": "ipython",
264 | "version": 3
265 | },
266 | "file_extension": ".py",
267 | "mimetype": "text/x-python",
268 | "name": "python",
269 | "nbconvert_exporter": "python",
270 | "pygments_lexer": "ipython3",
271 | "version": "3.11.9"
272 | }
273 | },
274 | "nbformat": 4,
275 | "nbformat_minor": 2
276 | }
277 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/05_fine_tune_flux_lora.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tune Flux LoRA on the image preferences dataset\n",
8 | "\n",
9 | "Note, we will not use preferences from the dev set for this fine-tuning. We will only use the chosen images for a supervised fine-tuning phase. Additionally, we recommend using an A100 GPU ($4/hour on Hugging Face) for this fine-tuning because of the memory requirements. The fine-tuning script will take about 4 hours for a single epoch.\n",
10 | "\n",
11 | "## Install dependencies\n",
12 | "\n",
13 | "We first make sure we have the latest version of diffusers installed. This is a development version of diffusers, so we need to install it from source. Additionally, we install the other dependencies that are required for the fine-tuning script."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {
20 | "vscode": {
21 | "languageId": "plaintext"
22 | }
23 | },
24 | "outputs": [],
25 | "source": [
26 | "!git clone https://github.com/huggingface/diffusers\n",
27 | "!pip install -e diffusers/.\n",
28 | "!pip install datasets sentencepiece protobuf accelerate peft wandb torchvision prodigyopt"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Logins and config\n",
36 | "\n",
37 | "We will use Weights & Biases to log the training process. Additionally, we log in to Hugging Face to push the finetuned model to the Hub."
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "!huggingface-cli login --token \"hf_xxx\"\n",
47 | "!wandb login \"xxx\"\n",
48 | "!accelerate config default"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "## Fine-tune the model\n",
56 | "\n",
57 | "Lastly, we fine-tune the Flux LoRA on the chosen images from the image preferences dataset. We took heavy inspiration from the [Dreambooth fine-tuning script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_flux.md) and modified it to work for our use case."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "!accelerate launch diffusers/examples/dreambooth/train_dreambooth_lora_flux.py \\\n",
67 | " --pretrained_model_name_or_path \"black-forest-labs/FLUX.1-dev\" \\\n",
68 | " --dataset_name \"data-is-better-together/open-image-preferences-v1-binarized\" \\\n",
69 | " --hub_model_id \"davidberenstein1957/open-image-preferences-v1-flux-dev-lora\" \\\n",
70 | " --push_to_hub \\\n",
71 | " --output_dir \"open-image-preferences-v1-flux-dev-lora\" \\\n",
72 | " --image_column \"chosen\" \\\n",
73 | " --caption_column \"prompt\" \\\n",
74 | " --mixed_precision=\"bf16\" \\\n",
75 | " --resolution=1024 \\\n",
76 | " --train_batch_size=1 \\\n",
77 | " --repeats=1 \\\n",
78 | " --report_to=\"wandb\"\\\n",
79 | " --gradient_accumulation_steps=1 \\\n",
80 | " --gradient_checkpointing \\\n",
81 | " --learning_rate=1.0 \\\n",
82 | " --text_encoder_lr=1.0 \\\n",
83 | " --optimizer=\"prodigy\"\\\n",
84 | " --lr_scheduler=\"constant\" \\\n",
85 | " --lr_warmup_steps=0 \\\n",
86 | " --rank=8 \\\n",
87 | " --checkpointing_steps=2000 \\\n",
88 | " --seed=\"0\" "
89 | ]
90 | }
91 | ],
92 | "metadata": {
93 | "language_info": {
94 | "name": "python"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 2
99 | }
100 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/README.md:
--------------------------------------------------------------------------------
1 | # Open Image Preferences Dataset
2 |
3 | ## What is it?
4 |
5 | This is a project for the community to contribute image preferences to an open-source dataset that can be used for training and evaluating text-to-image models. You can find the full blogpost [here](https://huggingface.co/blog/image-preferences).
6 |
7 | ## What did we achieve?
8 |
9 | We annotated 10K preference pairs. You can take a look at the resulting dataset [here](https://huggingface.co/datasets/data-is-better-together/open-image-preferences-v1-results), and [its version that is ready for training](https://huggingface.co/datasets/data-is-better-together/open-image-preferences-v1-binarized). Additionally, we showcased its effectiveness with a [FLUX-dev LoRA fine-tune](https://huggingface.co/data-is-better-together/open-image-preferences-v1-flux-dev-lora).
10 |
11 | ## How to use the dataset
12 |
13 | The dataset is hosted on Hugging Face and is free for anyone to use under an Apache 2.0 license. Here are some [examples of how to use the dataset for fine-tuning or post-analysis](https://huggingface.co/blog/image-preferences#what-is-next); a minimal loading sketch follows below.
14 |
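For instance, the training-ready version can be loaded directly with 🤗 Datasets (the `chosen`/`rejected` image columns come from the binarization step in `04_binarize_preference_results.ipynb`):

```python
from datasets import load_dataset

# Training-ready split with chosen/rejected image pairs per prompt
ds = load_dataset(
    "data-is-better-together/open-image-preferences-v1-binarized", split="train"
)
print(ds)
```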
15 | ## Which tools were used?
16 |
17 | For the image preferences project, we used three tools to help us manage the annotation process.
18 |
19 | - [Argilla](https://github.com/argilla-io/argilla): an open-source data annotation tool that we used for collecting the image preferences. Argilla has the option of using Hugging Face for authentication, which makes it easier for the community to contribute.
20 | - [distilabel](https://github.com/argilla-io/distilabel): a framework for creating synthetic datasets. We used distilabel to evolve the prompts and to create the image preferences dataset.
21 | - [Hugging Face Spaces](https://huggingface.co/spaces): a platform for hosting machine learning applications and demos. We used Spaces to host the Argilla instance for the preference annotation.
--------------------------------------------------------------------------------
/community-efforts/image_preferences/requirements.txt:
--------------------------------------------------------------------------------
1 | distilabel[hf-inference-endpoints,argilla]==1.4.1
2 | pillow
3 |
--------------------------------------------------------------------------------
/community-efforts/image_preferences/template.html:
--------------------------------------------------------------------------------
1 |
2 |
42 |
43 |
44 | Prompt: {{record.fields.images.prompt}}
45 |
46 |
47 | 
48 | Image 1
49 |
50 |
51 | 
52 | Image 2
53 |
54 |
55 |
--------------------------------------------------------------------------------
/community-efforts/prompt_ranking/README.md:
--------------------------------------------------------------------------------
1 | # Prompt Ranking Project
2 |
3 | ## What is it?
4 |
5 | The Prompt Ranking Project is a pioneering community-driven initiative to explore the use of Argilla and Hugging Face Spaces for collaboratively creating impactful datasets. As part of the project, we built a dataset of 10k human and synthetic prompts, which users ranked by quality. This dataset serves various purposes: it can be used to train and evaluate language models on prompt ranking tasks or as seed data for generating synthetic prompts and completions by filtering those with the highest quality.
6 |
7 | In addition, as the first crowdsourcing effort involving the community, it provides valuable insights into the behavior of annotators. This includes exploring the distribution of prompt rankings based on the source of the prompt, its type, length, or other features. We can also examine the agreement levels among annotators and identify factors that influence this agreement.
8 |
9 | ## How did we make it possible?
10 |
11 | First, we created a prompt dataset with a mix of human and synthetic prompts from various sources. You can find the list of sources in the "Source Data" section [here](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked). Then, we set up an instance of Argilla in a Hugging Face Space to enable the annotation process. This preparation stage took around a week.
12 |
13 | Finally, during the next two weeks, we invited the community to participate in the ranking process and evaluate the quality of the prompts.
14 |
15 | ## How did people contribute?
16 |
17 | The community contributed to the project by ranking the prompts in the dataset. For this, they just needed a Hugging Face account to log in to the Hugging Face Space where the Argilla instance was hosted and start ranking the prompts.
18 |
19 | ## Which tools were used?
20 |
21 | For the prompt ranking project, we used two tools to help us manage the annotation process.
22 |
23 | - [Argilla](https://github.com/argilla-io/argilla): an open-source data annotation tool that we used for the prompt ranking. Argilla has the option of using Hugging Face for authentication, which makes it easier for the community to contribute.
24 | - [Hugging Face Spaces](https://huggingface.co/spaces): a platform for hosting machine learning applications and demos. We used Spaces to host the Argilla tool for prompt ranking.
25 |
26 | ## What did we achieve?
27 |
28 | Thanks to the contribution of over 385 people, we were able to create the [data-is-better-together/10k_prompts_ranked](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked) dataset with 10,331 examples.
29 |
30 | Moreover, we could analyze the decision behavior of the annotators. Below, you can see that the human-generated prompts were ranked higher than the synthetic ones. This is an interesting observation that can be explored further in future research; a sketch for reproducing this kind of analysis follows after the figure.
31 |
32 | > The "unknown" kind is a result of the fact that the source of the prompt was not known for some of the prompts in the dataset.
33 |
34 | 
35 |
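As a minimal sketch of such an analysis (this assumes the dataset's `kind` and `avg_rating` columns; check the dataset card for the exact schema):

```python
from datasets import load_dataset

ds = load_dataset("data-is-better-together/10k_prompts_ranked", split="train")
df = ds.to_pandas()

# Mean quality rating per prompt source (e.g. human / synthetic / unknown)
print(df.groupby("kind")["avg_rating"].agg(["mean", "count"]))
```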
36 | Check the dataset [here](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked)! Don't miss it!
--------------------------------------------------------------------------------
/community-efforts/prompt_ranking/assets/synthetic-vs-human.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/community-efforts/prompt_ranking/assets/synthetic-vs-human.png
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/README.md:
--------------------------------------------------------------------------------
1 | # Multilingual Prompt Evaluation Project (MPEP)
2 |
3 | *🏅 There were not enough language-specific benchmarks for open LLMs. We wanted to create a leaderboard for more languages by leveraging the community!🏅*
4 |
5 | ## What is it?
6 |
7 | The Multilingual Prompt Evaluation Project (MPEP) is a community-driven effort to evaluate the performance of open language models across different languages. We translated a curated set of 500 high-quality prompts into multiple languages so that we could evaluate models in those languages using [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval), an automated, LLM-based tool for evaluating instruction/chat models.
8 |
9 | ## How did we make it possible?
10 |
11 | The community had created a dataset of 10k prompts with quality ratings, [data-is-better-together/10k_prompts_ranked](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked), as part of the Data is Better Together initiative. From this dataset, we curated a subset of 500 high-quality prompts covering a diverse range of model capabilities, such as math, coding, relationships, and email generation.
12 |
13 | However, these prompts were originally in English, so we asked the community to help us translate this curated dataset into different languages so that we could use the translated prompts to evaluate the performance of models for the languages we translate into.
14 |
15 | ## How did people contribute?
16 |
17 | There were two ways to contribute to this effort: by becoming a language lead or by joining as a community contributor.
18 |
19 | * The language leads were responsible for setting up a Hub organization and creating an Argilla Space for their language. They also gathered a community of people to help them translate the prompts and, with the guidance of Daniel van Strien, created a dashboard to track the progress of the translation effort. We want to thank them for their hard work!
20 |
21 | * People who spoke the languages that were being translated into could contribute to the translation of prompts. They just needed a Hugging Face account to log in to the relevant Space and start translating the prompts.
22 |
23 | ## Which tools were used?
24 |
25 | For the MPEP project, we used two main tools to help us manage the translation process.
26 |
27 | - [Argilla](https://github.com/argilla-io/argilla): an open-source data annotation tool that we used for the translation of prompts. Argilla has the option of using Hugging Face for authentication, which makes it easier for the community to contribute to the translation of prompts.
28 | - [Hugging Face Spaces](https://huggingface.co/spaces): a platform for hosting machine learning applications and demos. We used Spaces to host the Argilla tool for the translation of prompts.
29 |
30 | To make the translation setup easier for users, we also created a series of notebooks that serve as guidance.
31 |
32 | ## What did we achieve?
33 |
34 | We started efforts to translate the prompts into the languages shown below. Dutch and Russian were completed, Spanish was nearly finished, and many other groups began translating the prompts into additional languages. You can look at the resulting datasets [here](https://huggingface.co/datasets?search=MPEP_).
35 |
36 | | Languages | | | | |
37 | |:---|:---|:---|:---|:---|
38 | | Arabic | Cantonese | Czech | Dutch | Filipino |
39 | | French | German | Hungarian | Malagasy | Portuguese |
40 | | Russian | Slovak | Spanish | Swahili | Tagalog |
41 | | Tamil | Telugu | Turkish | Vietnamese | |
65 |
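66 | To enumerate the resulting translation datasets from code, one option is to search the Hub for the `MPEP_` naming prefix (a sketch using `huggingface_hub`):
67 |
68 | ```python
69 | from huggingface_hub import HfApi
70 |
71 | # List Hub datasets whose names match the MPEP_ prefix used by the translation efforts
72 | for ds in HfApi().list_datasets(search="MPEP_"):
73 |     print(ds.id)
74 | ```
75 |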
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/Translation_with_distilabel_gpt_4_turbo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "# Setup"
21 | ],
22 | "metadata": {
23 | "id": "mTYjyCl_1dAO"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "id": "MZhTFpbXzPYM"
31 | },
32 | "outputs": [],
33 | "source": [
34 | "HF_ORG_NAME = None # update with the ID of the org you just created\n",
35 | "LANGUAGE = None # update this with the language you will work on"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "source": [
41 | "assert HF_ORG_NAME is not None, \"Please set HF_ORG_NAME to the ID of the Hugging Face org you just created\"\n",
42 | "assert LANGUAGE is not None, \"Please set LANGUAGE to the language your effort focuses on\""
43 | ],
44 | "metadata": {
45 | "id": "TVZF5-b3zRBJ"
46 | },
47 | "execution_count": null,
48 | "outputs": []
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "import argilla as rg\n",
54 | "\n",
55 | "OWNER_API_KEY = \"owner.apikey\" # if you haven't setup the secret this is the default owner api key\n",
56 | "assert OWNER_API_KEY is not None, \"Please set OWNER_API_KEY to the API token you just set in the Space settings\"\n",
57 | "\n",
58 |         "homepage_url = None  # update with the URL of your Argilla Space, e.g. https://<org>-<space>.hf.space\n",
59 |         "assert homepage_url is not None, \"Please set homepage_url to the URL of your Argilla Space\"\n",
60 |         "\n",
61 |         "rg.init(api_url=homepage_url, api_key=OWNER_API_KEY)"
59 | ],
60 | "metadata": {
61 | "id": "NdTtXc_v1YBD"
62 | },
63 | "execution_count": null,
64 | "outputs": []
65 | },
66 | {
67 | "cell_type": "code",
68 | "source": [
69 | "from openai import OpenAI\n",
70 | "from google.colab import userdata\n",
71 | "\n",
72 | "from distilabel.llm.openai import OpenAILLM\n",
73 | "from distilabel.tasks import TextGenerationTask\n",
74 | "from distilabel.pipeline import Pipeline"
75 | ],
76 | "metadata": {
77 | "id": "cQG-OX9DzWmA"
78 | },
79 | "execution_count": null,
80 | "outputs": []
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "source": [
85 | "# Get original dataset and translate it\n",
86 | "\n",
87 | "This assumes you have already pushed the untranslated dataset"
88 | ],
89 | "metadata": {
90 | "id": "nB9Mquww1gcD"
91 | }
92 | },
93 | {
94 | "cell_type": "code",
95 | "source": [
96 | "# let's load the dataset and prepare the source col for distilabel\n",
97 | "argilla_ds = rg.FeedbackDataset.from_argilla(f\"DIBT Translation for {LANGUAGE}\", workspace=\"admin\")\n",
98 | "hf_ds = argilla_ds.format_as(\"datasets\").rename_columns({'source': \"input\"})"
99 | ],
100 | "metadata": {
101 | "id": "WBwjwNdq0LN-"
102 | },
103 | "execution_count": null,
104 | "outputs": []
105 | },
106 | {
107 | "cell_type": "code",
108 | "source": [
109 | "api_key=userdata.get(\"OPENAI_API_KEY\")\n",
110 | "\n",
111 |         "target_lang = \"Spanish\"  # change this to your target language name\n",
112 | "\n",
113 | "llm = OpenAILLM(\n",
114 |         "    model=\"gpt-4-0613\",  # or swap in a GPT-4 Turbo model id (e.g. gpt-4-turbo-preview)\n",
115 | " api_key=api_key,\n",
116 | " task=TextGenerationTask(system_prompt=f\"You will be provided with a text in English, and your task is to translate it into {target_lang}. If it's code please don't translate the actual code, only the comments and the explanation.\"),\n",
117 | " num_threads=8,\n",
118 | " max_new_tokens=8192,\n",
119 | ")\n",
120 | "\n",
121 | "pipe = Pipeline(\n",
122 | " generator=llm\n",
123 | ")"
124 | ],
125 | "metadata": {
126 | "id": "BygNfRFyzYWv"
127 | },
128 | "execution_count": null,
129 | "outputs": []
130 | },
131 | {
132 | "cell_type": "code",
133 | "source": [
134 | "# test everything is working so far\n",
135 | "ds = pipe.generate(\n",
136 | " dataset=hf_ds.select(range(10)),\n",
137 | " batch_size=4,\n",
138 | " display_progress_bar=True\n",
139 | ")\n",
140 | "# check the translations before running the full pipeline\n",
141 | "ds.to_pandas().head(5)"
142 | ],
143 | "metadata": {
144 | "id": "ZdeX71YdzbX_"
145 | },
146 | "execution_count": null,
147 | "outputs": []
148 | },
149 | {
150 | "cell_type": "code",
151 | "source": [
152 | "# if everything is working as expected, run with the full dataset\n",
153 | "ds = pipe.generate(\n",
154 | " dataset=hf_ds,\n",
155 | " batch_size=4,\n",
156 | " display_progress_bar=True\n",
157 | ")"
158 | ],
159 | "metadata": {
160 | "id": "SGdugR9kzf79"
161 | },
162 | "execution_count": null,
163 | "outputs": []
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "source": [
168 | "# Update the translations in the Argilla Space\n"
169 | ],
170 | "metadata": {
171 | "id": "18GUbdg01lD4"
172 | }
173 | },
174 | {
175 | "cell_type": "code",
176 | "source": [
177 | "translations = [gen[0] for gen in ds['generations']]\n",
178 | "len(translations)"
179 | ],
180 | "metadata": {
181 | "id": "yukaSFwFzk27"
182 | },
183 | "execution_count": null,
184 | "outputs": []
185 | },
186 | {
187 | "cell_type": "code",
188 | "source": [
189 | "altered_records = []\n",
190 | "\n",
191 | "for rec, translation in zip(argilla_ds.records, translations):\n",
192 | " rec.suggestions = [\n",
193 | " {\n",
194 | " \"question_name\": \"target\",\n",
195 | " \"value\": translation\n",
196 | " }\n",
197 | " ]\n",
198 | " altered_records.append(rec)\n",
199 | "\n",
200 | "altered_records[0]"
201 | ],
202 | "metadata": {
203 | "id": "IJWw41v4zndL"
204 | },
205 | "execution_count": null,
206 | "outputs": []
207 | },
208 | {
209 | "cell_type": "code",
210 | "source": [
211 | "argilla_ds.update_records(altered_records)"
212 | ],
213 | "metadata": {
214 | "id": "IgkY5M4oztQz"
215 | },
216 | "execution_count": null,
217 | "outputs": []
218 | }
219 | ]
220 | }
221 |
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/dashboard_template/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tar filter=lfs diff=lfs merge=lfs -text
29 | *.tflite filter=lfs diff=lfs merge=lfs -text
30 | *.tgz filter=lfs diff=lfs merge=lfs -text
31 | *.wasm filter=lfs diff=lfs merge=lfs -text
32 | *.xz filter=lfs diff=lfs merge=lfs -text
33 | *.zip filter=lfs diff=lfs merge=lfs -text
34 | *.zst filter=lfs diff=lfs merge=lfs -text
35 | *tfevents* filter=lfs diff=lfs merge=lfs -text
36 |
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/dashboard_template/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Template for Dashboards - Multilingual Prompt Evaluation Project
3 | emoji: 📊
4 | colorFrom: indigo
5 | colorTo: indigo
6 | sdk: gradio
7 | sdk_version: 4.21.0
8 | app_file: app.py
9 | pinned: false
10 | license: apache-2.0
11 | ---
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/dashboard_template/dumpy.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 |
5 | import argilla as rg
6 | from huggingface_hub import HfApi
7 |
8 | logger = logging.getLogger(__name__)
9 | logger.setLevel(logging.INFO)
10 |
11 | if __name__ == "__main__":
12 | logger.info("*** Initializing Argilla session ***")
13 | rg.init(
14 | api_url=os.getenv("ARGILLA_API_URL"),
15 | api_key=os.getenv("ARGILLA_API_KEY"),
16 | extra_headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
17 | )
18 |
19 | logger.info("*** Fetching dataset from Argilla ***")
20 | dataset = rg.FeedbackDataset.from_argilla(
21 | os.getenv("SOURCE_DATASET"),
22 | workspace=os.getenv("SOURCE_WORKSPACE"),
23 | )
24 | logger.info("*** Filtering records by `response_status` ***")
25 | dataset = dataset.filter_by(response_status=["submitted"]) # type: ignore
26 |
27 | logger.info("*** Calculating users and annotation count ***")
28 | output = {}
29 | for record in dataset.records:
30 | for response in record.responses:
31 | if response.user_id not in output:
32 | output[response.user_id] = 0
33 | output[response.user_id] += 1
34 |
35 | for key in list(output.keys()):
36 | output[rg.User.from_id(key).username] = output.pop(key)
37 |
38 | logger.info("*** Users and annotation count successfully calculated! ***")
39 |
40 | logger.info("*** Dumping Python dict into `stats.json` ***")
41 | with open("stats.json", "w") as file:
42 | json.dump(output, file, indent=4)
43 |
44 | logger.info("*** Uploading `stats.json` to Hugging Face Hub ***")
45 | api = HfApi(token=os.getenv("HF_TOKEN"))
46 | api.upload_file(
47 | path_or_fileobj="stats.json",
48 | path_in_repo="stats.json",
49 | repo_id="data-is-better-together/prompt-collective-dashboard",
50 | repo_type="space",
51 | )
52 | logger.info("*** `stats.json` successfully uploaded to Hugging Face Hub! ***")
53 |
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/dashboard_template/requirements.txt:
--------------------------------------------------------------------------------
1 | aiofiles==23.2.1
2 | altair==5.2.0
3 | annotated-types==0.6.0
4 | anyio==4.2.0
5 | apscheduler==3.10.4
6 | argilla==1.23.0
7 | attrs==23.2.0
8 | backoff==2.2.1
9 | certifi==2024.2.2
10 | charset-normalizer==3.3.2
11 | click==8.1.7
12 | colorama==0.4.6
13 | contourpy==1.2.0
14 | cycler==0.12.1
15 | Deprecated==1.2.14
16 | exceptiongroup==1.2.0
17 | fastapi==0.109.2
18 | ffmpy==0.3.1
19 | filelock==3.13.1
20 | fonttools==4.48.1
21 | fsspec==2024.2.0
22 | gradio==4.17.0
23 | gradio_client==0.9.0
24 | h11==0.14.0
25 | httpcore==1.0.2
26 | httpx==0.26.0
27 | huggingface-hub==0.20.3
28 | idna==3.6
29 | importlib-resources==6.1.1
30 | Jinja2==3.1.3
31 | jsonschema==4.21.1
32 | jsonschema-specifications==2023.12.1
33 | kiwisolver==1.4.5
34 | markdown-it-py==3.0.0
35 | MarkupSafe==2.1.5
36 | matplotlib==3.8.2
37 | mdurl==0.1.2
38 | monotonic==1.6
39 | numpy==1.23.5
40 | orjson==3.9.13
41 | packaging==23.2
42 | pandas==1.5.3
43 | pillow==10.2.0
44 | pydantic==2.6.1
45 | pydantic_core==2.16.2
46 | pydub==0.25.1
47 | Pygments==2.17.2
48 | pyparsing==3.1.1
49 | python-dateutil==2.8.2
50 | python-multipart==0.0.7
51 | pytz==2024.1
52 | PyYAML==6.0.1
53 | referencing==0.33.0
54 | requests==2.31.0
55 | rich==13.7.0
56 | rpds-py==0.17.1
57 | ruff==0.2.1
58 | semantic-version==2.10.0
59 | shellingham==1.5.4
60 | six==1.16.0
61 | sniffio==1.3.0
62 | starlette==0.36.3
63 | tomlkit==0.12.0
64 | toolz==0.12.1
65 | tqdm==4.66.1
66 | typer==0.9.0
67 | typing_extensions==4.9.0
68 | urllib3==2.2.0
69 | uvicorn==0.27.0.post1
70 | vega-datasets==0.9.0
71 | websockets==11.0.3
72 | wrapt==1.14.1
73 |
--------------------------------------------------------------------------------
/community-efforts/prompt_translation/requirements.in:
--------------------------------------------------------------------------------
1 | ipykernel
2 | huggingface_hub
3 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/README.md:
--------------------------------------------------------------------------------
1 | # Domain Specific Dataset Project
2 |
3 | The domain specific dataset project aims to bootstrap the creation of domain-specific datasets for training models. This set of tools helps users collaborate with domain experts. That matters because models are trained on large-scale datasets that are often biased, incomplete, or unrepresentative. By joining forces, domain experts and ML engineers can plant the seed for meaningful data.
4 |
5 | ## What is the goal of this project?
6 |
7 | The goal of this project is to collaborate with domain experts to create domain-specific datasets that can be used to train models. We aim to build a set of tools that help users work with domain experts to create datasets representative of the domain, to share those datasets openly on the Hub, and to share the tools and skills needed to build them.
8 |
9 | ## Why do we need domain specific datasets?
10 |
11 | LLMs are increasingly used as economical alternatives to human participants across various domains, such as computational social science, user testing, annotation tasks, and opinion surveys. However, the utility of LLMs in replicating specific human nuances and expertise is limited by inherent training constraints. Models are trained on large-scale datasets that are often biased, incomplete, or unrepresentative of the diverse human experiences they aim to replicate. This problem affects specific expert domains as well as groups underrepresented in the training data.
12 |
13 | Also, building synthetic datasets that are representative of the domain can help to improve the performance of models in the domain.
14 |
15 | ## How can you contribute?
16 |
17 | 🧑🏼🔬 If you are a domain expert, you can contribute by sharing your expertise and collaborating with us to create domain-specific datasets. We're working with user-friendly, easy-to-use applications that help you define the seed data and create the dataset. We're also working on tools that help you to annotate the dataset and improve the quality of the dataset.
18 |
19 | 🧑🏻🔧 If you are an (aspiring) Machine Learning engineer, you can set up the project and its tools, run the synthetic data generation pipelines, and maybe even get around to training models.
20 |
21 | ## Project Overview
22 |
23 | ### 1. Select a domain and find collaborators
24 |
25 | We start by selecting a domain and finding collaborators who can help us to create the dataset.
26 |
27 | 🧑🏼🔬 If you are a domain expert, you could find an ML engineer to help you to create the dataset.
28 |
29 | 🧑🏻🔧 If you are an ML engineer, you could find a domain expert to help you to create the dataset.
30 |
31 | 🧑🚀 If you're both, you could start by defining the seed data and creating the dataset.
32 |
33 | ### 2. Setup your project
34 |
35 | First, you need to set up the project and its tools. For this, we use [this application](https://huggingface.co/spaces/argilla/domain-specific-datasets-welcome).
36 |
37 | ### 3. Define the domain knowledge
38 |
39 | Next, we need to get the domain expert to define the seed data, which is used to create the dataset. Once the seed data is defined, we add it to the dataset repo.
40 |
41 | 
42 |
43 | > **Domain topics** are the topics the domain expert wants to include in the dataset. For example, if the domain is farming, the domain topics could be "soil", "crops", "weather", etc.
44 |
45 | > **Domain description** is a description of the domain. For example, if the domain is farming, the domain description could be "Farming is the practice of cultivating crops and livestock for food, fiber, biofuel, medicinal plants, and other products used to sustain and enhance human life."
46 |
47 | > **Domain perspectives** are the perspectives the domain expert wants to include in the dataset. For example, if the domain is farming, the domain perspectives could be "farmer", "agricultural scientist", "agricultural economist", etc.
48 |
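49 | Concretely, these pieces are written to a `seed_data.json` file in your dataset repo. A trimmed sketch of its shape, based on the farming defaults shipped with this project (the example question/answer strings are elided):
50 |
51 | ```json
52 | {
53 |   "domain": "farming",
54 |   "perspectives": ["Family Farming"],
55 |   "topics": ["animal welfare"],
56 |   "examples": [{"question": "...", "answer": "..."}],
57 |   "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics..."
58 | }
59 | ```
60 |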
49 | ### 4. Generate the dataset
50 |
51 | Next, we can move on to generating the dataset from the seed data.
52 |
53 | 
54 |
55 | To generate instructions and responses, you're going to need an endpoint. You can find compatible models from the Hugging Face Inference API here:
56 |
57 | - 🔋Projects with sufficient resources could take advantage of [Llama 3 70B](https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B)
58 | - 🪫Projects with fewer resources could take advantage of [Llama 3 8B](https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B)
59 | - 🍃Projects with even fewer resources could take advantage of [Phi-2](https://api-inference.huggingface.co/models/microsoft/phi-2)
60 |
61 | [Hugging Face Pro](https://huggingface.co/pricing) gives access to more compute resources.
62 |
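63 | As a minimal sketch (not the full pipeline), an endpoint like the ones above can be plugged into `distilabel` via `InferenceEndpointsLLM`, the same class the pipeline script in this project uses; the token here is read from an environment variable and the model URL is one of the examples above:
64 |
65 | ```python
66 | import os
67 |
68 | from distilabel.llms import InferenceEndpointsLLM
69 |
70 | # Point the LLM at one of the serverless Inference API URLs listed above.
71 | llm = InferenceEndpointsLLM(
72 |     api_key=os.environ["HF_TOKEN"],  # your Hugging Face Hub token
73 |     base_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B",
74 | )
75 | ```
76 |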
63 | #### 4.1. Generate Instructions
64 |
65 | The pipeline takes the topic and perspective and generates instructions for the dataset, then the instructions are evolved by an LLM to create more instructions.
66 |
67 | #### 4.2 Generate Responses
68 |
69 | The pipeline takes the instructions and generates responses for the dataset, then the responses are evolved by an LLM to create higher quality responses.
70 |
71 | #### 4.3 Refine the dataset
72 |
73 | Finally, the pipeline pushes the dataset to the Hub and to the Argilla Space. The domain expert can then refine it by annotating the records and improving their quality.
74 |
75 | ### Video Tutorial
76 |
77 | Here's a video guide that walks you through the process from end-to-end.
78 |
79 | [](https://www.loom.com/embed/99f32d7882764d9d8f4dc6ce3d824319?sid=c273876f-6715-4491-a79d-a27220e7a7d8)
80 |
81 | ### Run the `distilabel` pipeline
82 |
85 | With the pipeline configuration defined in the app and pushed to the dataset repo `{hub_username}/{project_name}`, you can run the pipeline from this repo.
86 |
87 | You'll need to change directory, install dependencies, and log in to the Hugging Face Hub. You can do this by running the following commands:
88 |
89 | ```bash
90 | cd data-is-better-together/cookbook-efforts/domain-specific-datasets/distilabel_pipelines
91 | pip install -r requirements.txt
92 | huggingface-cli login
93 | ```
94 |
95 | Then you can run the pipeline using the following command:
96 |
97 | ```bash
98 | python domain_expert_pipeline.py {hub_username}/{project_name}
99 | ```
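100 |
101 | For reference, the `pipeline_params.json` file in the dataset repo holds the parameters that `domain_expert_pipeline.py` reads at startup. A sketch of its shape (the URLs are placeholders and the numeric values shown are the script's defaults):
102 |
103 | ```json
104 | {
105 |   "argilla_api_url": "https://<org>-<argilla-space>.hf.space",
106 |   "argilla_dataset_name": "farming",
107 |   "self_instruct_base_url": "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B",
108 |   "domain_expert_base_url": "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B",
109 |   "self_instruct_num_generations": 2,
110 |   "domain_expert_num_generations": 2,
111 |   "self_instruct_temperature": 0.9,
112 |   "domain_expert_temperature": 0.9,
113 |   "self_instruct_max_new_tokens": 2048,
114 |   "domain_expert_max_new_tokens": 2048
115 | }
116 | ```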
100 |
101 | ### Project Structure
102 |
103 | - `app/` : A streamlit app that helps domain experts define seed data, like the system prompt and topics, by creating an empty dataset on the Hub.
104 | - `distilabel_pipelines/domain_expert_pipeline.py` : The distilabel pipeline code that is used to create the dataset.
105 | - `scripts/` : Ad hoc scripts that we used to ease annotation with vector search.
106 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/assets/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/domain-specific-datasets/assets/pipeline.png
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/assets/setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/domain-specific-datasets/assets/setup.png
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/distilabel_pipelines/domain_expert_pipeline.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any, Dict
3 |
4 | import argilla as rg
5 | from distilabel.llms import InferenceEndpointsLLM
6 | from distilabel.pipeline import Pipeline
7 | from distilabel.steps import (
8 | LoadDataFromDicts,
9 | TextGenerationToArgilla,
10 | ExpandColumns,
11 | )
12 | from distilabel.steps.tasks import (
13 | TextGeneration,
14 | SelfInstruct,
15 | )
16 | from distilabel.steps.tasks.typing import ChatType
17 | from huggingface_hub import hf_hub_download
18 |
19 |
20 | ################################################################################
21 | # Define custom Argilla Dataset
22 | ################################################################################
23 |
24 |
25 | def create_argilla_dataset(
26 | api_url: str,
27 | api_key: str,
28 | dataset_name: str,
29 | workspace: str,
30 | ):
31 | """Create a dataset in Argilla."""
32 |
33 | rg.init(api_url, api_key)
34 | rg_dataset = rg.FeedbackDataset(
35 | fields=[
36 | rg.TextField(name="id", title="id"), # type: ignore
37 | rg.TextField(name="instruction", title="instruction"), # type: ignore
38 | rg.TextField(name="generation", title="generation"), # type: ignore
39 | ],
40 | questions=[
41 | rg.LabelQuestion( # type: ignore
42 | name="quality",
43 |                 title="What's the quality of the generation for the given instruction?",
44 | labels={"bad": "👎", "good": "👍"},
45 | ),
46 | rg.TextQuestion(
47 | name="improved_instruction",
48 | title="How would you improve the instruction?",
49 | required=False,
50 | ),
51 | rg.TextQuestion(
52 | name="improved_response",
53 | title="How would you improve the response?",
54 | required=False,
55 | ),
56 | ],
57 | )
58 | try:
59 | rg_dataset.push_to_argilla(name=dataset_name, workspace=workspace)
60 | except RuntimeError as e:
61 | print(f"Failed to create the dataset in Argilla: {e} Moving on...")
62 |
63 |
64 | ################################################################################
65 | # Define out custom step for the domain expert
66 | ################################################################################
67 |
68 |
69 | class DomainExpert(TextGeneration):
70 | """A customized task to generate text as a domain expert in the domain of farming and agriculture."""
71 |
72 | system_prompt: str
73 |     template: str = """This is the instruction: {instruction}"""
74 |
75 | def format_input(self, input: Dict[str, Any]) -> "ChatType":
76 | return [
77 | {
78 | "role": "system",
79 | "content": self.system_prompt,
80 | },
81 | {
82 | "role": "user",
83 | "content": self.template.format(**input),
84 | },
85 | ]
86 |
87 |
88 | ################################################################################
89 | # Main script to run the pipeline
90 | ################################################################################
91 |
92 |
93 | if __name__ == "__main__":
94 |     import os
95 |     import sys
97 |
98 | # get some args
99 | repo_id = sys.argv[1]
100 |
101 | # Get super secret tokens
102 |
103 | hub_token = os.environ.get("HF_TOKEN")
104 | argilla_api_key = os.environ.get("ARGILLA_API_KEY", "owner.apikey")
105 |
106 | # load pipeline parameters
107 |
108 | with open(
109 | hf_hub_download(
110 | repo_id=repo_id, filename="pipeline_params.json", repo_type="dataset"
111 | ),
112 | "r",
113 | ) as f:
114 | params = json.load(f)
115 |
116 | argilla_api_url = params.get("argilla_api_url")
117 | argilla_dataset_name = params.get("argilla_dataset_name")
118 | self_instruct_base_url = params.get("self_instruct_base_url")
119 | domain_expert_base_url = params.get("domain_expert_base_url")
120 |     self_instruct_num_generations = params.get("self_instruct_num_generations", 2)
121 | domain_expert_num_generations = params.get("domain_expert_num_generations", 2)
122 | self_instruct_temperature = params.get("self_instruct_temperature", 0.9)
123 | domain_expert_temperature = params.get("domain_expert_temperature", 0.9)
124 | self_instruct_max_new_tokens = params.get("self_instruct_max_new_tokens", 2048)
125 | domain_expert_max_new_tokens = params.get("domain_expert_max_new_tokens", 2048)
126 |
127 | if not all(
128 | [
129 | argilla_api_url,
130 | argilla_dataset_name,
131 | self_instruct_base_url,
132 | domain_expert_base_url,
133 | ]
134 | ):
135 | raise ValueError("Some of the pipeline parameters are missing")
136 |
137 | # collect our seed prompts defined in the space
138 |
139 | with open(
140 | hf_hub_download(
141 | repo_id=repo_id, filename="seed_data.json", repo_type="dataset"
142 | ),
143 | "r",
144 | ) as f:
145 | seed_data = json.load(f)
146 |
147 | application_instruction = seed_data.get("application_instruction")
148 | domain_expert_prompt = seed_data.get("domain_expert_prompt")
149 | domain_name = seed_data.get("domain")
150 | terms = seed_data.get("seed_terms")
151 |
152 | # Create the Argilla dataset
153 |
154 | create_argilla_dataset(
155 | api_url=argilla_api_url,
156 | api_key=argilla_api_key,
157 | dataset_name=argilla_dataset_name,
158 | workspace="admin",
159 | )
160 |
161 | # Define the distilabel pipeline
162 |
163 | with Pipeline(domain_name) as pipeline:
164 | load_data = LoadDataFromDicts(
165 | name="load_data",
166 | batch_size=64,
167 | data=[{"input": term} for term in terms],
168 | )
169 |
170 | self_instruct = SelfInstruct(
171 | name="self_instruct",
172 |             num_instructions=self_instruct_num_generations,
173 | input_batch_size=8,
174 | llm=InferenceEndpointsLLM(
175 | api_key=hub_token,
176 | base_url=self_instruct_base_url,
177 | ),
178 | application_description=application_instruction,
179 | )
180 |
181 | expand_columns = ExpandColumns(
182 | name="expand_columns",
183 | columns=["instructions"],
184 | output_mappings={"instructions": "instruction"},
185 | )
186 |
187 | domain_expert = DomainExpert(
188 | name="domain_expert",
189 | llm=InferenceEndpointsLLM(
190 | api_key=hub_token,
191 | base_url=domain_expert_base_url,
192 | ),
193 | input_batch_size=8,
194 | num_generations=domain_expert_num_generations,
195 | system_prompt=domain_expert_prompt,
196 | )
197 |
198 | # Push the generated dataset to Argilla
199 | to_argilla = TextGenerationToArgilla(
200 | name="to_argilla",
201 | dataset_workspace="admin",
202 | )
203 |
204 | # Connect up the pipeline
205 |
206 | load_data.connect(self_instruct)
207 | self_instruct.connect(expand_columns)
208 | expand_columns.connect(domain_expert)
209 | domain_expert.connect(to_argilla)
210 |
211 | # Run the pipeline
212 |
213 | pipeline.run(
214 | parameters={
215 | "self_instruct": {
216 | "llm": {
217 | "generation_kwargs": {
218 | "max_new_tokens": self_instruct_max_new_tokens,
219 | "temperature": self_instruct_temperature,
220 | },
221 | }
222 | },
223 | "domain_expert": {
224 | "llm": {
225 | "generation_kwargs": {
226 |                         "max_new_tokens": domain_expert_max_new_tokens,
227 | "temperature": domain_expert_temperature,
228 | },
229 | }
230 | },
231 | "to_argilla": {
232 | "dataset_name": argilla_dataset_name,
233 | "api_key": argilla_api_key,
234 | "api_url": argilla_api_url,
235 | },
236 | },
237 | use_cache=False,
238 | )
239 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/distilabel_pipelines/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets
2 | python-dotenv
3 | streamlit
4 | huggingface_hub
5 | argilla
6 | git+https://github.com/argilla-io/distilabel.git
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/parent_app/app.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from hub import (
4 | setup_dataset_on_hub,
5 | duplicate_space_on_hub,
6 | add_project_config_to_space_repo,
7 | )
8 |
9 | import streamlit as st
10 |
11 |
12 | # Constants
13 | # Written here to avoid defaults.py
14 | DEFAULT_DOMAIN = "farming"
15 |
16 | st.set_page_config(
17 | "Domain Data Grower", page_icon="🧑🌾", initial_sidebar_state="collapsed"
18 | )
19 |
20 | st.header("🧑🌾 Domain Data Grower")
21 | st.divider()
22 |
23 | st.sidebar.link_button(
24 | "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
25 | )
26 |
27 | ################################################################################
28 | # APP MARKDOWN
29 | ################################################################################
30 |
31 | st.header("🌱 Create a domain specific dataset")
32 |
33 | st.markdown(
34 | """This space will set up your domain specific dataset project. It will
35 | create the resources that you need to build a dataset. Those resources include:
36 |
37 | - A dataset repository on the Hub
38 | - Another Space to define the expert domain and run generation pipelines
39 |
40 | For a complete overview of the project, check out the README.
41 | """
42 | )
43 |
44 | st.page_link(
45 | "pages/🧑🌾 Domain Data Grower.py",
46 | label="Domain Data Grower",
47 | icon="🧑🌾",
48 | )
49 |
50 | ################################################################################
51 | # CONFIGURATION
52 | ################################################################################
53 |
54 | st.subheader("🌾 Project Configuration")
55 |
56 | project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
57 | hub_username = st.text_input("Hub Username", "argilla")
58 | hub_token = st.text_input("Hub Token", type="password")
59 | private_selector = st.checkbox("Private Space", value=False)
60 |
61 | if st.button("🤗 Setup Project Resources"):
62 | repo_id = f"{hub_username}/{project_name}"
63 |
64 | setup_dataset_on_hub(
65 | repo_id=repo_id,
66 | hub_token=hub_token,
67 | )
68 |
69 | st.success(
70 |         f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on to the repo_id: {repo_id}; we will need it in the next steps."
71 | )
72 |
73 | space_name = f"{project_name}_config_space"
74 |
75 | duplicate_space_on_hub(
76 | source_repo="argilla/domain-specific-datasets-template",
77 | target_repo=space_name,
78 | hub_token=hub_token,
79 | private=private_selector,
80 | )
81 |
82 | st.success(
83 | f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
84 | )
85 |
86 | argilla_name = f"{project_name}_argilla_space"
87 |
88 | duplicate_space_on_hub(
89 | source_repo="argilla/argilla-template-space",
90 | target_repo=argilla_name,
91 | hub_token=hub_token,
92 | private=private_selector,
93 | )
94 |
95 | st.success(
96 | f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
97 | )
98 |
99 | seconds = 5
100 |
101 | with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
102 | time.sleep(seconds)
103 | add_project_config_to_space_repo(
104 | dataset_repo_id=repo_id,
105 | hub_token=hub_token,
106 | project_name=project_name,
107 | argilla_space_repo_id=f"{hub_username}/{argilla_name}",
108 | project_space_repo_id=f"{hub_username}/{space_name}",
109 | )
110 |
111 | st.subheader("👢 Next Steps")
112 |
113 | st.write("Go to your project-specific space!")
114 |
115 | st.link_button(
116 | "🧑🌾 Open Configuration Space",
117 | f"https://huggingface.co/spaces/{hub_username}/{space_name}",
118 | )
119 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/parent_app/hub.py:
--------------------------------------------------------------------------------
1 | import json
3 |
4 |
5 | from huggingface_hub import duplicate_space, HfApi
6 |
7 |
8 | hf_api = HfApi()
9 |
10 |
11 | def setup_dataset_on_hub(repo_id, hub_token):
12 | # create an empty dataset repo on the hub
13 | hf_api.create_repo(
14 | repo_id=repo_id,
15 | token=hub_token,
16 | repo_type="dataset",
17 | )
18 |
19 | # upload the seed data
20 | hf_api.upload_file(
21 | path_or_fileobj="seed_data.json",
22 | path_in_repo="seed_data.json",
23 | repo_id=repo_id,
24 | repo_type="dataset",
25 | token=hub_token,
26 | )
27 |
28 |
29 | def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
30 | duplicate_space(
31 | from_id=source_repo,
32 | to_id=target_repo,
33 | token=hub_token,
34 | private=private,
35 | exist_ok=True,
36 | )
37 |
38 |
39 | def add_project_config_to_space_repo(
40 | dataset_repo_id,
41 | hub_token,
42 | project_name,
43 | argilla_space_repo_id,
44 | project_space_repo_id,
45 | ):
46 | # upload the seed data and readme to the hub
47 |
48 | with open("project_config.json", "w") as f:
49 | json.dump(
50 | {
51 | "project_name": project_name,
52 | "argilla_space_repo_id": argilla_space_repo_id,
53 | "project_space_repo_id": project_space_repo_id,
54 | "dataset_repo_id": dataset_repo_id,
55 | },
56 | f,
57 | )
58 |
59 | hf_api.upload_file(
60 | path_or_fileobj="project_config.json",
61 | path_in_repo="project_config.json",
62 | token=hub_token,
63 | repo_id=project_space_repo_id,
64 | repo_type="space",
65 | )
66 |
67 |
68 | def pull_seed_data_from_repo(repo_id, hub_token):
69 |     # download seed_data.json from the dataset repo and return its parsed contents
70 |     local_path = hf_api.hf_hub_download(
71 |         repo_id=repo_id, token=hub_token, repo_type="dataset", filename="seed_data.json"
72 |     )
73 |     return json.load(open(local_path))
75 |
76 |
77 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/parent_app/pages/🧑🌾 Domain Data Grower.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import requests
3 |
4 |
5 | readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/51f29e67165d8277d9f9d1e4be60869f4b705a08/domain-specific-datasets/README.md"
6 |
7 |
8 | def open_markdown_file(url):
9 | response = requests.get(url)
10 | return response.text
11 |
12 |
13 | readme = open_markdown_file(readme_location)
14 |
15 | st.markdown(readme)
16 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/parent_app/project_config.json:
--------------------------------------------------------------------------------
1 | {"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"}
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/parent_app/seed_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "domain": "farming",
3 | "perspectives": [
4 | "Family Farming"
5 | ],
6 | "topics": [
7 | "animal welfare"
8 | ],
9 | "examples": [
10 | {
11 | "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
12 | "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
13 | }
14 | ],
15 | "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
16 | }
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/.streamlit/config.toml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/domain-specific-datasets/project_app/.streamlit/config.toml
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/DATASET_README_BASE.md:
--------------------------------------------------------------------------------
1 | # Domain Dataset Grower
2 |
3 | This dataset was generated by [distilabel](https://distilabel.argilla.io/latest/) as a domain specific dataset for the domain of farming. The dataset used this seed data to generate the samples. The seed data was defined by a domain expert, and the generated data can be reviewed in this [Argilla](https://argilla.io/) space: [Argilla](https://huggingface.co/spaces/argilla/farming)
4 |
5 | If you want to define a domain specific seed dataset for your own domain, you can use the distilabel tool to generate the dataset, and seed your dataset [here](https://huggingface.co/spaces/argilla/domain-specific-seed)
6 |
7 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Domain Specific Seed
3 | emoji: 💻
4 | colorFrom: purple
5 | colorTo: red
6 | sdk: streamlit
7 | sdk_version: 1.33.0
8 | app_file: app.py
9 | pinned: false
10 | license: apache-2.0
11 | ---
12 |
13 | Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | from defaults import (
4 | PROJECT_NAME,
5 | ARGILLA_SPACE_REPO_ID,
6 | DATASET_REPO_ID,
7 | ARGILLA_URL,
8 | PROJECT_SPACE_REPO_ID,
9 | DIBT_PARENT_APP_URL,
10 | )
11 | from utils import project_sidebar
12 |
13 | st.set_page_config("Domain Data Grower", page_icon="🧑🌾")
14 |
15 | project_sidebar()
16 |
17 | if PROJECT_NAME == "DEFAULT_DOMAIN":
18 | st.warning(
19 | "Please set up the project configuration in the parent app before proceeding."
20 | )
21 | st.stop()
22 |
23 |
24 | st.header("🧑🌾 Domain Data Grower")
25 | st.divider()
26 |
27 | st.markdown(
28 | """
29 | ## 🌱 Create a dataset seed for aligning models to a specific domain
30 |
31 | This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
32 | Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
33 | """
34 | )
35 | st.markdown(
36 | """
37 | ## 🚜 How it works
38 |
39 | You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
40 | The dataset seed is then used to generate synthetic data for training a language model.
41 |
42 | """
43 | )
44 | st.markdown(
45 | """
46 | ## 🗺️ The process
47 |
48 | ### Step 1: ~~Setup the project~~
49 |
50 | ~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
51 | """
52 | )
53 | st.link_button("🚀 ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
54 |
55 | st.markdown(
56 | """
57 | ### Step 2: Describe the Domain
58 |
59 | Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
60 | You can collaborate with domain experts to define the domain expertise and perspectives.
61 | """
62 | )
63 |
64 | st.page_link(
65 | "pages/2_👩🏼🔬 Describe Domain.py",
66 | label="Describe Domain",
67 | icon="👩🏼🔬",
68 | )
69 |
70 | st.markdown(
71 | """
72 | ### Step 3: Generate Synthetic Data
73 |
74 | Use distilabel to generate synthetic data for your domain-specific dataset.
75 | You can run the pipeline locally or in this space to generate synthetic data.
76 | """
77 | )
78 |
79 | st.page_link(
80 | "pages/3_🌱 Generate Dataset.py",
81 | label="Generate Dataset",
82 | icon="🌱",
83 | )
84 |
85 | st.markdown(
86 | """
87 | ### Step 4: Review the Dataset
88 |
89 | Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
90 |
91 |
92 | """
93 | )
94 | st.link_button("🔍 Review the dataset in Argilla", ARGILLA_URL)
95 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/defaults.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 | SEED_DATA_PATH = "seed_data.json"
5 | PIPELINE_PATH = "pipeline.yaml"
6 | REMOTE_CODE_PATHS = ["requirements.txt"]
7 | DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
8 | N_PERSPECTIVES = 5
9 | N_TOPICS = 5
10 | N_EXAMPLES = 5
11 | # environment variables are strings, so parse the flag explicitly
12 | CODELESS_DISTILABEL = os.environ.get("CODELESS_DISTILABEL", "true").lower() in ("1", "true", "yes")
12 |
13 | ################################################
14 | # DEFAULTS ON FARMING
15 | ################################################
16 |
17 | with open(SEED_DATA_PATH) as f:
18 | DEFAULT_DATA = json.load(f)
19 |
20 | DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
21 | DEFAULT_PERSPECTIVES = DEFAULT_DATA["perspectives"]
22 | if DEFAULT_PERSPECTIVES is None or len(DEFAULT_PERSPECTIVES) == 0:
23 | DEFAULT_PERSPECTIVES = [""]
24 | DEFAULT_TOPICS = DEFAULT_DATA["topics"]
25 | if DEFAULT_TOPICS is None or len(DEFAULT_TOPICS) == 0:
26 | DEFAULT_TOPICS = [""]
27 | DEFAULT_EXAMPLES = DEFAULT_DATA["examples"]
28 | if DEFAULT_EXAMPLES is None or len(DEFAULT_EXAMPLES) == 0:
29 | DEFAULT_EXAMPLES = [{"question": "", "answer": ""}]
30 | DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]
31 |
32 | ################################################
33 | # PROJECT CONFIG FROM PARENT APP
34 | ################################################
35 |
36 | try:
37 | with open("project_config.json") as f:
38 | PROJECT_CONFIG = json.load(f)
39 |
40 | PROJECT_NAME = PROJECT_CONFIG["project_name"]
41 | ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
42 | DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
43 | ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
44 | ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
45 | PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
46 | DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
47 | HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
48 | except FileNotFoundError:
49 | PROJECT_NAME = "DEFAULT_DOMAIN"
50 | ARGILLA_SPACE_REPO_ID = ""
51 | DATASET_REPO_ID = ""
52 | ARGILLA_URL = ""
53 | PROJECT_SPACE_REPO_ID = ""
54 | DATASET_URL = ""
55 | HUB_USERNAME = ""
56 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/hub.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tempfile import mktemp
3 |
4 | import argilla as rg
5 | from huggingface_hub import HfApi
6 |
7 | from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
8 |
9 |
10 | hf_api = HfApi()
11 |
12 | with open("DATASET_README_BASE.md") as f:
13 | DATASET_README_BASE = f.read()
14 |
15 |
16 | def create_readme(domain_seed_data, project_name, domain):
17 | # create a readme for the project that shows the domain and project name
18 | readme = DATASET_README_BASE
19 | readme += f"# {project_name}\n\n## Domain: {domain}"
20 | perspectives = domain_seed_data.get("perspectives")
21 | topics = domain_seed_data.get("topics")
22 | examples = domain_seed_data.get("examples")
23 | if perspectives:
24 | readme += "\n\n## Perspectives\n\n"
25 | for p in perspectives:
26 | readme += f"- {p}\n"
27 | if topics:
28 | readme += "\n\n## Topics\n\n"
29 | for t in topics:
30 | readme += f"- {t}\n"
31 | if examples:
32 | readme += "\n\n## Examples\n\n"
33 | for example in examples:
34 | readme += f"### {example['question']}\n\n{example['answer']}\n\n"
35 | temp_file = mktemp()
36 |
37 | with open(temp_file, "w") as f:
38 | f.write(readme)
39 | return temp_file
40 |
41 |
42 | def setup_dataset_on_hub(repo_id, hub_token):
43 | # create an empty dataset repo on the hub
44 | hf_api.create_repo(
45 | repo_id=repo_id,
46 | token=hub_token,
47 | repo_type="dataset",
48 | exist_ok=True,
49 | )
50 |
51 |
52 | def push_dataset_to_hub(
53 | domain_seed_data_path,
54 | project_name,
55 | domain,
56 | pipeline_path,
57 | hub_username,
58 | hub_token: str,
59 | ):
60 | repo_id = f"{hub_username}/{project_name}"
61 |
62 | setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
63 |
64 | # upload the seed data and readme to the hub
65 | hf_api.upload_file(
66 | path_or_fileobj=domain_seed_data_path,
67 | path_in_repo="seed_data.json",
68 | token=hub_token,
69 | repo_id=repo_id,
70 | repo_type="dataset",
71 | )
72 |
73 | # upload the readme to the hub
74 | domain_seed_data = json.load(open(domain_seed_data_path))
75 | hf_api.upload_file(
76 | path_or_fileobj=create_readme(
77 | domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
78 | ),
79 | path_in_repo="README.md",
80 | token=hub_token,
81 | repo_id=repo_id,
82 | repo_type="dataset",
83 | )
84 |
85 |
86 | def push_pipeline_to_hub(
87 | pipeline_path,
88 | hub_username,
89 | hub_token: str,
90 | project_name,
91 | ):
92 | repo_id = f"{hub_username}/{project_name}"
93 |
94 | # upload the pipeline to the hub
95 | hf_api.upload_file(
96 | path_or_fileobj=pipeline_path,
97 | path_in_repo="pipeline.py",
98 | token=hub_token,
99 | repo_id=repo_id,
100 | repo_type="dataset",
101 | )
102 |
103 | for code_path in REMOTE_CODE_PATHS:
104 | hf_api.upload_file(
105 | path_or_fileobj=code_path,
106 | path_in_repo=code_path,
107 | token=hub_token,
108 | repo_id=repo_id,
109 | repo_type="dataset",
110 | )
111 |
112 | print(f"Dataset uploaded to {repo_id}")
113 |
114 |
115 | def pull_seed_data_from_repo(repo_id, hub_token):
116 |     # download the seed data file from the dataset repo and return its parsed contents
117 |     local_path = hf_api.hf_hub_download(
118 |         repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
119 |     )
120 |     return json.load(open(local_path))
121 |
122 |
123 | def push_argilla_dataset_to_hub(
124 | name: str,
125 | repo_id: str,
126 | url: str,
127 | api_key: str,
128 | hub_token: str,
129 | workspace: str = "admin",
130 | ):
131 | rg.init(api_url=url, api_key=api_key)
132 | feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
133 | local_dataset = feedback_dataset.pull()
134 | local_dataset.push_to_huggingface(repo_id=repo_id, token=hub_token)
135 |
136 |
137 | def push_pipeline_params(
138 | pipeline_params,
139 | hub_username,
140 | hub_token: str,
141 | project_name,
142 | ):
143 | repo_id = f"{hub_username}/{project_name}"
144 | temp_path = mktemp()
145 | with open(temp_path, "w") as f:
146 | json.dump(pipeline_params, f)
147 | # upload the pipeline to the hub
148 | hf_api.upload_file(
149 | path_or_fileobj=temp_path,
150 | path_in_repo="pipeline_params.json",
151 | token=hub_token,
152 | repo_id=repo_id,
153 | repo_type="dataset",
154 | )
155 |
156 | print(f"Pipeline params uploaded to {repo_id}")
157 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/infer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 |
4 | API_URL = (
5 | "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
6 | )
7 |
8 |
9 | def query(question, hub_token: str):
10 |     payload = {
11 |         "inputs": question,
12 |         "parameters": {
13 |             "return_full_text": False,
14 |         },
15 |         # the serverless Inference API expects `wait_for_model` under `options`, not `parameters`
16 |         "options": {
17 |             "wait_for_model": True,
18 |         },
19 |     }
17 | headers = {"Authorization": f"Bearer {hub_token}"}
18 | response = requests.post(API_URL, headers=headers, json=payload)
19 | try:
20 | return response.json()[0]["generated_text"]
21 | except Exception:
22 | return "Error occurred while querying the model."
23 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/pages/2_👩🏼🔬 Describe Domain.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import streamlit as st
4 |
5 | from hub import push_dataset_to_hub, pull_seed_data_from_repo
6 | from infer import query
7 | from defaults import (
8 | N_PERSPECTIVES,
9 | N_TOPICS,
10 | SEED_DATA_PATH,
11 | PIPELINE_PATH,
12 | DATASET_REPO_ID,
13 | )
14 | from utils import project_sidebar, create_seed_terms, create_application_instruction
15 |
16 |
17 | st.set_page_config(
18 | page_title="Domain Data Grower",
19 | page_icon="🧑🌾",
20 | )
21 | project_sidebar()
22 |
23 |
24 | ################################################################################
25 | # HEADER
26 | ################################################################################
27 |
28 | st.header("🧑🌾 Domain Data Grower")
29 | st.divider()
30 | st.subheader(
31 | "Step 2. Define the specific domain that you want to generate synthetic data for.",
32 | )
33 | st.write(
34 | "Define the project details, including the project name, domain, and API credentials"
35 | )
36 |
37 |
38 | ################################################################################
39 | # LOAD EXISTING DOMAIN DATA
40 | ################################################################################
41 |
42 | DATASET_REPO_ID = (
43 | f"{st.session_state['hub_username']}/{st.session_state['project_name']}"
44 | )
45 | SEED_DATA = pull_seed_data_from_repo(
46 | DATASET_REPO_ID, hub_token=st.session_state["hub_token"]
47 | )
48 | DEFAULT_DOMAIN = SEED_DATA.get("domain", "")
49 | DEFAULT_PERSPECTIVES = SEED_DATA.get("perspectives", [""])
50 | DEFAULT_TOPICS = SEED_DATA.get("topics", [""])
51 | DEFAULT_EXAMPLES = SEED_DATA.get("examples", [{"question": "", "answer": ""}])
52 | DEFAULT_SYSTEM_PROMPT = SEED_DATA.get("domain_expert_prompt", "")
53 |
54 | ################################################################################
55 | # Domain Expert Section
56 | ################################################################################
57 |
58 | (
59 | tab_domain_expert,
60 | tab_domain_perspectives,
61 | tab_domain_topics,
62 | tab_examples,
63 | tab_raw_seed,
64 | ) = st.tabs(
65 | tabs=[
66 | "👩🏼🔬 Domain Expert",
67 | "🔍 Domain Perspectives",
68 | "🕸️ Domain Topics",
69 | "📚 Examples",
70 | "🌱 Raw Seed Data",
71 | ]
72 | )
73 |
74 | with tab_domain_expert:
75 |     st.text("Define the domain expertise that you want to train a language model on")
76 | st.info(
77 | "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
78 | )
79 |
80 | domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
81 |
82 | domain_expert_prompt = st.text_area(
83 | label="Domain Expert Definition",
84 | value=DEFAULT_SYSTEM_PROMPT,
85 | height=200,
86 | )
87 |
88 | ################################################################################
89 | # Domain Perspectives
90 | ################################################################################
91 |
92 | with tab_domain_perspectives:
93 | st.text("Define the different perspectives from which the domain can be viewed")
94 | st.info(
95 | """
96 | Perspectives are different viewpoints or angles from which a domain can be viewed.
97 | For example, the domain of farming can be viewed from the perspective of a commercial
98 | farmer or an independent family farmer."""
99 | )
100 |
101 | perspectives = st.session_state.get(
102 | "perspectives",
103 | [DEFAULT_PERSPECTIVES[0]],
104 | )
105 | perspectives_container = st.container()
106 |
107 | perspectives = [
108 | perspectives_container.text_input(
109 | f"Domain Perspective {i + 1}", value=perspective
110 | )
111 | for i, perspective in enumerate(perspectives)
112 | ]
113 |
114 | if st.button("Add Perspective", key="add_perspective"):
115 | n = len(perspectives)
116 | perspectives.append(
117 | perspectives_container.text_input(f"Domain Perspective {n + 1}", value="")
118 | )
119 |
120 | st.session_state["perspectives"] = perspectives
121 |
122 |
123 | ################################################################################
124 | # Domain Topics
125 | ################################################################################
126 |
127 | with tab_domain_topics:
128 | st.text("Define the main themes or subjects that are relevant to the domain")
129 | st.info(
130 | """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
131 | )
132 | topics = st.session_state.get(
133 | "topics",
134 | [DEFAULT_TOPICS[0]],
135 | )
136 | topics_container = st.container()
137 | topics = [
138 | topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
139 | for i, topic in enumerate(topics)
140 | ]
141 |
142 | if st.button("Add Topic", key="add_topic"):
143 | n = len(topics)
144 |         topics.append(topics_container.text_input(f"Domain Topic {n + 1}", value=""))
145 |
146 | st.session_state["topics"] = topics
147 |
148 |
149 | ################################################################################
150 | # Examples Section
151 | ################################################################################
152 |
153 | with tab_examples:
154 | st.text(
155 | "Add high-quality questions and answers that can be used to generate synthetic data"
156 | )
157 | st.info(
158 | """
159 | Examples are high-quality questions and answers that can be used to generate
160 | synthetic data for the domain. These examples will be used to train the language model
161 | to generate questions and answers.
162 | """
163 | )
164 |
165 | examples = st.session_state.get(
166 | "examples",
167 | [
168 | {
169 | "question": "",
170 | "answer": "",
171 | }
172 | ],
173 | )
174 |
175 | for n, example in enumerate(examples, 1):
176 | question = example["question"]
177 | answer = example["answer"]
178 | examples_container = st.container()
179 | question_column, answer_column = examples_container.columns(2)
180 |
181 | if st.button(f"Generate Answer {n}"):
182 | if st.session_state["hub_token"] is None:
183 | st.error("Please provide a Hub token to generate answers")
184 | else:
185 | answer = query(question, st.session_state["hub_token"])
186 | with question_column:
187 | question = st.text_area(f"Question {n}", value=question)
188 |
189 | with answer_column:
190 | answer = st.text_area(f"Answer {n}", value=answer)
191 | examples[n - 1] = {"question": question, "answer": answer}
192 | st.session_state["examples"] = examples
193 | st.divider()
194 |
195 | if st.button("Add Example"):
196 | examples.append({"question": "", "answer": ""})
197 | st.session_state["examples"] = examples
198 | st.rerun()
199 |
200 | ################################################################################
201 | # Save Domain Data
202 | ################################################################################
203 |
204 | perspectives = list(filter(None, perspectives))
205 | topics = list(filter(None, topics))
206 |
207 | domain_data = {
208 | "domain": domain,
209 | "perspectives": perspectives,
210 | "topics": topics,
211 | "examples": examples,
212 | "domain_expert_prompt": domain_expert_prompt,
213 | "application_instruction": create_application_instruction(domain, examples),
214 | "seed_terms": create_seed_terms(topics, perspectives),
215 | }
216 |
217 | with open(SEED_DATA_PATH, "w") as f:
218 | json.dump(domain_data, f, indent=2)
219 |
220 | with tab_raw_seed:
221 | st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True)
222 |
223 | ################################################################################
224 | # Setup Dataset on the Hub
225 | ################################################################################
226 |
227 | st.divider()
228 |
229 |
230 | if st.button("🤗 Push Dataset Seed") and all(
231 | (
232 | domain,
233 | domain_expert_prompt,
234 | perspectives,
235 | topics,
236 | examples,
237 | )
238 | ):
239 | if all(
240 | (
241 | st.session_state.get("project_name"),
242 | st.session_state.get("hub_username"),
243 | st.session_state.get("hub_token"),
244 | )
245 | ):
246 | project_name = st.session_state["project_name"]
247 | hub_username = st.session_state["hub_username"]
248 | hub_token = st.session_state["hub_token"]
249 | else:
250 | st.error(
251 | "Please create a dataset repo on the Hub before pushing the dataset seed"
252 | )
253 | st.stop()
254 |
255 | push_dataset_to_hub(
256 | domain_seed_data_path=SEED_DATA_PATH,
257 | project_name=project_name,
258 | domain=domain,
259 | hub_username=hub_username,
260 | hub_token=hub_token,
261 | pipeline_path=PIPELINE_PATH,
262 | )
263 |
264 | st.success(
265 | f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
266 | )
267 |
268 | st.write("You can now move on to runnning your distilabel pipeline.")
269 |
270 | st.page_link(
271 | page="pages/3_🌱 Generate Dataset.py",
272 | label="Generate Dataset",
273 | icon="🌱",
274 | )
275 |
276 | else:
277 | st.info(
278 | "Please fill in all the required domain fields to push the dataset seed to the Hub"
279 | )
280 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/pages/3_🌱 Generate Dataset.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | from defaults import ARGILLA_URL
4 | from hub import push_pipeline_params
5 | from utils import project_sidebar
6 |
7 | st.set_page_config(
8 | page_title="Domain Data Grower",
9 | page_icon="🧑🌾",
10 | )
11 |
12 | project_sidebar()
13 |
14 | ################################################################################
15 | # HEADER
16 | ################################################################################
17 |
18 | st.header("🧑🌾 Domain Data Grower")
19 | st.divider()
20 | st.subheader("Step 3. Run the pipeline to generate synthetic data")
21 | st.write("Define the distilabel pipeline for generating the dataset.")
22 |
23 | hub_username = st.session_state.get("hub_username")
24 | project_name = st.session_state.get("project_name")
25 | hub_token = st.session_state.get("hub_token")
26 |
27 | ###############################################################
28 | # CONFIGURATION
29 | ###############################################################
30 |
31 | st.divider()
32 |
33 | st.markdown("## 🧰 Data Generation Pipeline")
34 |
35 | st.markdown(
36 | """
37 | Now we need to define the configuration for the pipeline that will generate the synthetic data.
38 | The pipeline will generate synthetic data by combining self-instruction and domain expert responses.
39 | The self-instruction step generates instructions based on seed terms, and the domain expert step generates \
40 | responses to those instructions. Take a look at the [distilabel docs](https://distilabel.argilla.io/latest/sections/learn/tasks/text_generation/#self-instruct) for more information.
41 | """
42 | )
43 |
44 | ###############################################################
45 | # INFERENCE
46 | ###############################################################
47 |
48 | st.markdown("#### 🤖 Inference configuration")
49 |
50 | st.write(
51 | """Add the url of the Huggingface inference API or endpoint that your pipeline should use to generate instruction and response pairs. \
52 | Some domain tasks may be challenging for smaller models, so you may need to iterate over your task definition and model selection. \
53 | This is a part of the process of generating high-quality synthetic data, human feedback is key to this process. \
54 | You can find compatible models here:"""
55 | )
56 |
57 | with st.expander("🤗 Recommended Models"):
58 | st.write("All inference endpoint compatible models can be found via the link below")
59 | st.link_button(
60 | "🤗 Inference compaptible models on the hub",
61 | "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
62 | )
63 | st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
64 | st.code(
65 | "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
66 | )
67 |
68 | st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
69 | st.code(
70 | "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
71 | )
72 |
73 | st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
74 | st.code(
75 | "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
76 | )
77 |
78 | st.write("Note Hugggingface Pro gives access to more compute resources")
79 | st.link_button(
80 | "🤗 Huggingface Pro",
81 | "https://huggingface.co/pricing",
82 | )
83 |
84 |
85 | self_instruct_base_url = st.text_input(
86 | label="Model base URL for instruction generation",
87 | value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
88 | )
89 | domain_expert_base_url = st.text_input(
90 | label="Model base URL for domain expert response",
91 | value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
92 | )
93 |
94 | ###############################################################
95 | # PARAMETERS
96 | ###############################################################
97 |
98 | st.divider()
99 | st.markdown("#### 🧮 Parameters configuration")
100 |
101 | st.write(
102 | "⚠️ Model and parameter choices significantly affect the quality of the generated data. \
103 |     We recommend that you start by generating a few samples and reviewing the data, then scale up from there. \
104 |     You can run the pipeline multiple times with different configurations and append the results to the same Argilla dataset."
105 | )
106 |
107 | st.markdown(
108 | "Number of generations are the samples that each model will generate for each seed term, \
109 | so if you have 10 seed terms, 2 instruction generations, and 2 response generations, you will have 40 samples in total."
110 | )
111 |
112 | self_intruct_num_generations = st.slider(
113 | "Number of generations for self-instruction", 1, 10, 2
114 | )
115 | domain_expert_num_generations = st.slider(
116 | "Number of generations for domain expert response", 1, 10, 2
117 | )
118 |
119 | with st.expander("🔥 Advanced parameters"):
120 | st.markdown(
121 | "Temperature is a hyperparameter that controls the randomness of the generated text. \
122 | Lower temperatures will generate more deterministic text, while higher temperatures \
123 | will add more variation to generations."
124 | )
125 |
126 | self_instruct_temperature = st.slider(
127 | "Temperature for self-instruction", 0.1, 1.0, 0.9
128 | )
129 | domain_expert_temperature = st.slider(
130 | "Temperature for domain expert", 0.1, 1.0, 0.9
131 | )
132 |
133 | st.markdown(
134 | "`max_new_tokens` is the maximum number of tokens (word like things) that can be generated by each model call. \
135 | This is a way to control the length of the generated text. in some cases, you may want to increase this to \
136 | generate longer responses. You should adapt this value to your model chice, but default of 2096 works \
137 | in most cases."
138 | )
139 |
140 | self_instruct_max_new_tokens = st.number_input(
141 | "Max new tokens for self-instruction", value=2096
142 | )
143 | domain_expert_max_new_tokens = st.number_input(
144 | "Max new tokens for domain expert", value=2096
145 | )
146 |
147 | ###############################################################
148 | # ARGILLA API
149 | ###############################################################
150 |
151 | st.divider()
152 | st.markdown("#### 🔬 Argilla API details to push the generated dataset")
153 | st.markdown(
154 | "Here you can define the Argilla API details to push the generated dataset to your Argilla space. \
155 | These are the defaults that were set up for the project. You can change them if needed."
156 | )
157 | argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
158 | argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
159 | argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
160 | st.divider()
161 |
162 | ###############################################################
163 | # Pipeline Run
164 | ###############################################################
165 |
166 | st.markdown("## Run the pipeline")
167 |
168 | st.markdown(
169 | "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
170 | )
171 |
172 |
173 | if all(
174 | [
175 | argilla_api_key,
176 | argilla_url,
177 | self_instruct_base_url,
178 | domain_expert_base_url,
179 | self_intruct_num_generations,
180 | domain_expert_num_generations,
181 | self_instruct_temperature,
182 | domain_expert_temperature,
183 | hub_username,
184 | project_name,
185 | hub_token,
186 | argilla_dataset_name,
187 | ]
188 | ) and st.button("💾 Save Pipeline Config"):
189 | with st.spinner("Pushing pipeline to the Hub..."):
190 | push_pipeline_params(
191 | pipeline_params={
192 | "argilla_api_url": argilla_url,
193 | "argilla_dataset_name": argilla_dataset_name,
194 | "self_instruct_base_url": self_instruct_base_url,
195 | "domain_expert_base_url": domain_expert_base_url,
196 | "self_instruct_temperature": self_instruct_temperature,
197 | "domain_expert_temperature": domain_expert_temperature,
198 | "self_intruct_num_generations": self_intruct_num_generations,
199 | "domain_expert_num_generations": domain_expert_num_generations,
200 | "self_instruct_max_new_tokens": self_instruct_max_new_tokens,
201 | "domain_expert_max_new_tokens": domain_expert_max_new_tokens,
202 | },
203 | hub_username=hub_username,
204 | hub_token=hub_token,
205 | project_name=project_name,
206 | )
207 |
208 | st.success(
209 | f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
210 | )
211 |
212 | st.markdown(
213 | "To run the pipeline locally, you need to have the `distilabel` library installed. \
214 | You can install it using the following command:"
215 | )
216 |
217 | st.code(
218 | body="""
219 | # Install the distilabel library
220 | pip install distilabel
221 | """,
222 | language="bash",
223 | )
224 |
225 | st.markdown(
226 | "Next, you'll need to clone the pipeline code and install dependencies:"
227 | )
228 |
229 | st.code(
230 | """
231 | git clone https://github.com/huggingface/data-is-better-together
232 | cd data-is-better-together/cookbook-efforts/domain-specific-datasets/distilabel_pipelines
233 | pip install -r requirements.txt
234 | huggingface-cli login
235 | """,
236 | language="bash",
237 | )
238 |
239 | st.markdown("Finally, you can run the pipeline using the following command:")
240 |
241 | st.code(
242 | f"""
243 | python domain_expert_pipeline.py {hub_username}/{project_name}""",
244 | language="bash",
245 | )
246 | st.markdown(
247 | "👩🚀 If you want to customise the pipeline take a look in `domain_expert_pipeline.py` \
248 | and the [distilabel docs](https://distilabel.argilla.io/)"
249 | )
250 |
251 | st.markdown(
252 | "🚀 Once you've run the pipeline your records will be available in the Argilla space"
253 | )
254 |
255 | st.link_button("🔗 Argilla Space", argilla_url)
256 |
257 | st.markdown("Once you've reviewed the data, you can publish it on the next page:")
258 |
259 | st.page_link(
260 | page="pages/4_🔍 Review Generated Data.py",
261 | label="Review Generated Data",
262 | icon="🔍",
263 | )
264 |
265 | else:
266 | st.info("Please fill all the required fields.")
267 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/pages/4_🔍 Review Generated Data.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
4 | from utils import project_sidebar
5 | from hub import push_argilla_dataset_to_hub
6 |
7 | st.set_page_config(
8 | page_title="Domain Data Grower",
9 | page_icon="🧑🌾",
10 | )
11 |
12 | project_sidebar()
13 |
14 | ################################################################################
15 | # HEADER
16 | ################################################################################
17 |
18 | st.header("🧑🌾 Domain Data Grower")
19 | st.divider()
20 |
21 | st.write(
22 | """Once you have reviewed the synthetic data in Argilla, you can publish the
23 | generated dataset to the Hub."""
24 | )
25 |
26 |
27 | ################################################################################
28 | # Configuration
29 | ################################################################################
30 |
31 | st.divider()
32 | st.write("🔬 Argilla API details to push the generated dataset")
33 | argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
34 | argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
35 | argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME)
36 | dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID)
37 | st.divider()
38 |
39 | if st.button("🚀 Publish the generated dataset"):
40 | with st.spinner("Publishing the generated dataset..."):
41 | push_argilla_dataset_to_hub(
42 | name=argilla_dataset_name,
43 | repo_id=dataset_repo_id,
44 | url=argilla_url,
45 | api_key=argilla_api_key,
46 | workspace="admin",
47 | hub_token=st.session_state["hub_token"],
48 | )
49 | st.success("The generated dataset has been published to the Hub.")
50 |
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/project_config.json:
--------------------------------------------------------------------------------
1 | {"project_name": "DEFAULT_DOMAIN", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"}
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets
2 | python-dotenv
3 | streamlit
4 | huggingface_hub
5 | argilla
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/seed_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "domain": "farming",
3 | "perspectives": [
4 | "Family Farming",
5 | "Agribusiness",
6 | "Permaculture",
7 | "Agroforestery",
8 | "Conventional Farming"
9 | ],
10 | "topics": [
11 | "animal welfare",
12 | "economic growth",
13 | "land",
14 | "resources",
15 | "efficiency"
16 | ],
17 | "examples": [
18 | {
19 | "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
20 | "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
21 | },
22 | {
23 | "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
24 | "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
25 | },
26 | {
27 | "question": "Analyze the economic implications of transitioning from conventional to organic farming.",
28 | "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
29 | },
30 | {
31 | "question": "Analyze the social, economic and environnmental impacts of land consolidation vs small-scale farmers.",
32 | "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
33 | },
34 | {
35 | "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ",
36 | "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
37 | }
38 | ],
39 | "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
40 | }
--------------------------------------------------------------------------------
/cookbook-efforts/domain-specific-datasets/project_app/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import streamlit as st
4 |
5 | from defaults import (
6 | PROJECT_NAME,
7 | ARGILLA_URL,
8 | DIBT_PARENT_APP_URL,
9 | DATASET_URL,
10 | DATASET_REPO_ID,
11 | )
12 |
13 |
14 | def project_sidebar():
15 | if PROJECT_NAME == "DEFAULT_DOMAIN":
16 | st.warning(
17 | "Please set up the project configuration in the parent app before proceeding."
18 | )
19 | st.stop()
20 |
21 | st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
22 | st.sidebar.markdown(
23 | """
24 | This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
25 | """
26 | )
27 | st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
28 | st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
29 | hub_username = DATASET_REPO_ID.split("/")[0]
30 | project_name = DATASET_REPO_ID.split("/")[1]
31 | st.session_state["project_name"] = project_name
32 | st.session_state["hub_username"] = hub_username
33 | st.session_state["hub_token"] = st.sidebar.text_input(
34 | "Hub Token", type="password", value=os.environ.get("HF_TOKEN", None)
35 | )
36 | if st.session_state["hub_token"] is not None:
37 | os.environ["HF_TOKEN"] = st.session_state["hub_token"]
38 | st.sidebar.link_button(
39 | "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
40 | )
41 | if all(
42 | (
43 | st.session_state.get("project_name"),
44 | st.session_state.get("hub_username"),
45 | st.session_state.get("hub_token"),
46 | )
47 | ):
48 | st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
49 |
50 | st.sidebar.divider()
51 |
52 | st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
53 |
54 | if st.session_state["hub_token"] is None:
55 | st.error("Please provide a Hub token to generate answers")
56 | st.stop()
57 |
58 |
59 | def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]:
60 | """Create seed terms for self intruct to start from."""
61 |
62 | return [
63 | f"{topic} from a {perspective} perspective"
64 | for topic in topics
65 | for perspective in perspectives
66 | ]
67 |
68 |
69 | def create_application_instruction(
70 | domain: str, system_prompt: str, examples: list[dict[str, str]]
71 | ) -> str:
72 | """Create the instruction for Self-Instruct task."""
73 | system_prompt = f"""AI assistant in the domain of {domain}. {system_prompt}"""
74 | examples_str = ""
75 | for example in examples:
76 | question = example["question"]
77 | answer = example["answer"]
78 | if len(answer) and len(question):
79 | examples_str += f"""\n- Question: {question}\n- Answer: {answer}\n"""
81 | if len(examples_str):
82 | system_prompt += """Below are some examples of questions and answers \
83 | that the AI assistant would generate:"""
84 | system_prompt += "\nExamples:"
85 | system_prompt += f"\n{examples_str}"
86 | return system_prompt
87 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/01_data_prep.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 01. Creating our subsample of Aya to prepare for creating a DPO dataset"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook walks through the steps required to create a sample from the full Aya dataset for the language you are interested in working in. \n",
15 | "In this notebook and the subsequent notebooks we'll focus on Dutch as an example but the process will be rather similar for other languages."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "from collections import Counter\n",
25 | "from datasets import Dataset\n",
26 | "from datasets import load_dataset\n",
27 | "from statistics import mean, median"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "Let's start by loading the Aya dataset!"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "aya_ds = load_dataset(\"CohereForAI/aya_dataset\",split='train')"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/plain": [
54 | "Dataset({\n",
55 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n",
56 | " num_rows: 202362\n",
57 | "})"
58 | ]
59 | },
60 | "execution_count": 3,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "aya_ds"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "We want to only include the data that is relevant to the language we are interested in. This means we need to filter out the data that is not in Dutch. "
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/plain": [
84 | "Dataset({\n",
85 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n",
86 | " num_rows: 1733\n",
87 | "})"
88 | ]
89 | },
90 | "execution_count": 4,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "dutch_only = aya_ds.filter(lambda x: x['language'] == 'Dutch')\n",
97 | "dutch_only"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "### Getting some statistics about the data\n",
105 | "\n",
106 | "To help with the next stages of this process we'll get some statistics about the data. "
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 5,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "def get_stats(ds: Dataset):\n",
116 | " input_lengths = []\n",
117 | " output_lengths = []\n",
118 | " annotator_counts: Counter = Counter()\n",
119 | " for row in ds:\n",
120 | " input_lengths.append(len(row[\"inputs\"]))\n",
121 | " output_lengths.append(len(row[\"targets\"]))\n",
122 | " annotator_counts.update(ds[\"user_id\"])\n",
123 | " mean_input_length = mean(input_lengths)\n",
124 | " median_input_length = median(input_lengths)\n",
125 | " mean_output_length = mean(output_lengths)\n",
126 | " median_output_length = median(output_lengths)\n",
127 | " max_input_length = max(input_lengths)\n",
128 | " max_output_length = max(output_lengths)\n",
129 | " return {\n",
130 | " \"number_of_unique_annotators\": len(annotator_counts),\n",
131 | " \"input_lengths\": input_lengths,\n",
132 | " \"output_lengths\": output_lengths,\n",
133 | " \"annotator_counts\": dict(annotator_counts),\n",
134 | " \"mean_input_length\": mean_input_length,\n",
135 | " \"median_input_length\": median_input_length,\n",
136 | " \"mean_output_length\": mean_output_length,\n",
137 | " \"median_output_length\": median_output_length,\n",
138 | " \"max_input_length\": max_input_length,\n",
139 | " \"max_output_length\": max_output_length,\n",
140 | " }"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "stats = get_stats(dutch_only)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "There are various things we might be interest in from these stats but some of the most relevant are the length of input and outputs of the data. This may help us decide which LLMs to use in the next stage of the process. "
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 7,
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "name": "stdout",
166 | "output_type": "stream",
167 | "text": [
168 | "Max input length: 3030\n",
169 | "Max output length: 21707\n",
170 | "Mean input length: 223.67109059434506\n",
171 | "Mean output length: 352.1806116560877\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "print(f\"Max input length: {stats['max_input_length']}\")\n",
177 | "print(f\"Max output length: {stats['max_output_length']}\")\n",
178 | "print(f\"Mean input length: {stats['mean_input_length']}\")\n",
179 | "print(f\"Mean output length: {stats['mean_output_length']}\")"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "## Push the subset to the Hub \n",
187 | "\n",
188 | "To help us make testing our pipelines easier we'll create a very small test split (10 samples) that we can use when we're testing out our pipelines. "
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 8,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "dutch_only = dutch_only.train_test_split(test_size=100)"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "We'll now push this subset to the Hub so that we can use it in the next stage of the process. Don't forget to update this to point to your own Hub workspace. If you are not already authenticated on the Hub uncomment the cell below and run it. \n"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 9,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "# from huggingface_hub import login \n",
214 | "# login()"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "dutch_only.push_to_hub(\"data-is-better-together/aya_dataset_dutch_example\")"
224 | ]
225 | }
226 | ],
227 | "metadata": {
228 | "kernelspec": {
229 | "display_name": ".venv",
230 | "language": "python",
231 | "name": "python3"
232 | },
233 | "language_info": {
234 | "codemirror_mode": {
235 | "name": "ipython",
236 | "version": 3
237 | },
238 | "file_extension": ".py",
239 | "mimetype": "text/x-python",
240 | "name": "python",
241 | "nbconvert_exporter": "python",
242 | "pygments_lexer": "ipython3",
243 | "version": "3.11.1"
244 | }
245 | },
246 | "nbformat": 4,
247 | "nbformat_minor": 2
248 | }
249 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # Multilingual DPO/ORPO Dataset Project
6 |
7 | This project aims to encourage the creation of DPO/ORPO datasets for more languages. By providing these tools, we aim to foster a community of people building DPO/ORPO datasets for different languages. Currently, many languages do not have DPO/ORPO datasets openly shared on the Hugging Face Hub. The [data-is-better-together/preference_data_by_language](https://huggingface.co/spaces/data-is-better-together/preference_data_by_language) Space gives an overview of the language coverage. At the time of this commit, only 14 languages with DPO/ORPO datasets are available on the Hugging Face Hub. Following this recipe, you can easily generate a DPO/ORPO dataset for a new language.
8 |
9 | ## What are the goals of this project?
10 |
11 | This project has the following goals:
12 |
13 | - An Argilla interface for ranking a response written by a human Aya annotator against a model-generated response. See the [aya_dutch_dpo](https://dibt-demo-argilla-space.hf.space/dataset/f47eac1c-8763-4513-ab02-b08eb66f7f65/annotation-mode) example.
14 | - A "raw dataset" with LLM feedback for each prompt. See the [data-is-better-together/aya_dutch_dpo_raw](https://huggingface.co/datasets/data-is-better-together/aya_dutch_dpo_raw) for an example.
15 | - A growing dataset with human-verified preferences for each response. See the [data-is-better-together/aya_dutch_dpo](https://huggingface.co/datasets/data-is-better-together/aya_dutch_dpo) for an example dataset.
16 |
17 | ## Why do we need DPO/ORPO datasets for more languages?
18 |
19 |
20 | **What is Direct Preference Optimization (DPO)?**
21 | Direct Preference Optimization (DPO) is a technique for training models to optimize for human preferences.
22 |
23 | > [Direct Preference Optimization (DPO)](https://huggingface.co/papers/2305.18290) has emerged as a promising alternative for aligning Large Language Models (LLMs) to human or AI preferences. Unlike [traditional alignment methods](https://huggingface.co/blog/rlhf), which are based on reinforcement learning, DPO recasts the alignment formulation as a simple loss function that can be optimized directly on a dataset of preferences ${(x, y_w, y_l)}$, where $x$ is a prompt and $(y_w,y_l)$ are the preferred and dispreferred responses. [source](https://huggingface.co/blog/pref-tuning)
24 |
25 | Or, in other words, to train a model using DPO you need a dataset of prompts and responses where one response is preferred over the other. This type of data is also used for ORPO, another alignment technique we'll describe in the next section.
26 |
27 | 
28 | _Example of a preference tuning dataset. Each row contains a prompt and a "chosen" and "rejected" response._
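
Concretely, each row of a preference dataset is just a prompt plus a ranked pair of responses. Here is a minimal sketch using the `datasets` library (the column names `prompt`, `chosen`, and `rejected` are a common convention rather than a requirement):

```python
from datasets import Dataset

# One preference pair per row: "chosen" is preferred over "rejected".
preference_rows = [
    {
        "prompt": "Explain the difference between DPO and ORPO in one sentence.",
        "chosen": (
            "DPO aligns an already fine-tuned model in a separate preference step, "
            "while ORPO folds preference alignment directly into supervised fine-tuning."
        ),
        "rejected": "They are the same thing.",
    },
]

dpo_dataset = Dataset.from_list(preference_rows)
print(dpo_dataset)
```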
29 |
30 |
31 | DPO datasets are a powerful tool for fine-tuning language models to generate responses that are more aligned with human preferences, so they are a valuable resource for improving the quality of chatbots and other generative models. However, DPO datasets are currently available for only a limited number of languages. By generating DPO datasets for more languages, we can help improve the quality of generative models in a wider range of languages.
32 |
33 | Recently, Odds Ratio Preference Optimization (ORPO) has been proposed as an alternative to DPO. ORPO is a novel approach to fine-tuning language models that incorporates preference alignment directly into the supervised fine-tuning (SFT) process by using the odds ratio to contrast favored and disfavored generation styles. By applying a minor penalty to the disfavored style during SFT, ORPO effectively guides the model toward the desired behavior without the need for an additional alignment step.
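
In symbols (roughly, following the ORPO paper), the objective adds an odds-ratio penalty to the standard supervised fine-tuning loss:

$$\mathcal{L}_\text{ORPO} = \mathcal{L}_\text{SFT} - \lambda \log \sigma\left(\log \frac{\text{odds}_\theta(y_w \mid x)}{\text{odds}_\theta(y_l \mid x)}\right), \qquad \text{odds}_\theta(y \mid x) = \frac{P_\theta(y \mid x)}{1 - P_\theta(y \mid x)}$$

so the favored response $y_w$ is reinforced and the disfavored response $y_l$ is penalized within the same training pass, with $\lambda$ controlling the strength of the penalty.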
34 |
35 | _tl;dr_: if you have a DPO-style dataset and a strong base model, you can use ORPO to train a chat model. Recently, Argilla, KAIST, and Hugging Face used this approach to train [HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1), a very strong chat model, using only 7k preference pairs!
36 |
37 | ## How can you contribute?
38 |
39 | As part of Data Is Better Together, we're supporting the community in generating more DPO/ORPO datasets for different languages. If you would like to help, you can follow the steps below to generate a DPO/ORPO dataset for a language that you are interested in. There are already many language communities working together on the Hugging Face Discord server, so you can also join the server to collaborate with others on this project 🤗.
40 |
41 | ## Project Overview
42 |
43 | [Aya](https://cohere.com/blog/aya-multilingual), an open science initiative to accelerate multilingual AI progress, has released a dataset of human-annotated prompt-completion pairs across 71 languages. We can use this dataset to generate DPO/ORPO datasets for languages for which they don't currently exist.
44 |
45 | Here are the steps we'll take to generate a DPO/ORPO dataset for a new language:
46 |
47 | - Start from the [CohereForAI/aya_dataset](https://huggingface.co/datasets/CohereForAI/aya_dataset).
48 | - Filter the Aya dataset to the language you are focusing on (see the sketch below).
49 | - Use [`distilabel`](https://github.com/argilla-io/distilabel) to generate a second response for each prompt in the filtered Aya dataset.
50 | - (Optional) Send the generated dataset to [Argilla](https://argilla.io/) for annotation where the community can choose which response is better.
51 | - (Optional) Train a model using the generated DPO/ORPO dataset and push forward the state of the art in your language 🚀🚀🚀
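
As a minimal sketch, the first two steps look like this (mirroring the `01_data_prep.ipynb` notebook; Dutch is just the running example):

```python
from datasets import load_dataset

# Load the human-annotated Aya dataset and keep only the target language
aya_ds = load_dataset("CohereForAI/aya_dataset", split="train")
dutch_only = aya_ds.filter(lambda x: x["language"] == "Dutch")
```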
52 |
53 | You can find more detailed instructions on how to generate a DPO/ORPO dataset for a new language in the [instructions](./instructions.md).
54 |
55 | ### I'm GPU-poor, can I still get involved?
56 |
57 | Yes! The example scripts in this repository use Hugging Face Inference Endpoints for the inference component. This means you can run the scripts on your local machine without needing a GPU. We can provide you with GPU grants to run the `distilabel` script if you need them. Please reach out to us on the Hugging Face Discord server if you need a GPU grant. **Note**: We will want to ensure that you have a plan for how you will use the GPU grant before providing it. In particular, we'll want to see that you have already set up an Argilla Space for your project and done some work to identify the language you want to work on and the models you want to use.
58 |
59 | ### Next steps
60 |
61 | The current notebooks and code only show how to generate the synthetic data and create a preference dataset annotation Space. The next steps would be to collect human feedback on the synthetic data and then use this feedback to train a model. We will cover this in a future notebook.
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/assets/banner.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/dpo-orpo-preference/assets/banner.webp
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/aya_dpo_gen.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from typing import Any, Dict
4 |
5 | import argilla as rg
6 | from custom_preference_to_argilla import CustomPreferenceToArgilla
7 | from distilabel.llms import InferenceEndpointsLLM
8 | from distilabel.pipeline import Pipeline
9 | from distilabel.steps import (
10 | LoadHubDataset,
11 | StepInput,
12 | StepOutput,
13 | step,
14 | )
15 | from distilabel.steps.tasks import TextGeneration, UltraFeedback
16 | from distilabel.steps.tasks.typing import ChatType
17 | from dotenv import load_dotenv
18 | from huggingface_hub import InferenceClient, login
19 |
20 | load_dotenv()
21 |
22 | ##################################
23 | # Configuration
24 | # This section contains the configuration for the pipeline.
25 | # This is where you can define the model ID, the maximum number of new tokens to generate, the input batch size for the model via the Inference Endpoints API, and the Argilla configuration.
26 | ##################################
27 |
28 |
29 | # Model Configuration
30 | MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
31 | MAX_NEW_TOKENS = 2000 # Maximum number of new tokens to generate
32 |
33 | # Inference Endpoints Configuration
34 | # INFERENCE_ENDPOINTS_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct" # Inference endpoints URL
35 | # ENDPOINT_NAME = "meta-llama/Meta-Llama-3-70B-Instruct"
36 | INPUT_BATCH_SIZE = 10  # Input batch size for the model via the Inference Endpoints API; adjust this based on the model's requirements and the hardware used to deploy the model
37 |
38 | # Argilla Configuration
39 | ARGILLA_SPACE_URL = "https://dibt-demo-argilla-space.hf.space" # Argilla Space URL
40 | ARGILLA_DATASET_NAME = "aya_dutch_dpo" # Argilla dataset name
41 | ARGILLA_WORKSPACE_NAME = "admin" # Argilla workspace name
42 | # Dataset Configuration
43 | INPUT_DATASET_HUB_ID = "data-is-better-together/aya_dataset_dutch_example" # Input dataset hub ID (created in the previous step)
44 | OUTPUT_DATASET_HUB_ID = (
45 | "data-is-better-together/aya_dutch_dpo_raw" # Output dataset hub ID
46 | )
47 | SPLIT = "test" # Split of the dataset to use. Start with test whilst you are testing the pipeline and then switch to train when you are ready to generate the full dataset
48 |
49 | HUGGINGFACE_TOKEN = os.getenv("HF_API_KEY")
50 |
51 | #######################################
52 | # Check required environment variables
53 | #######################################
54 | assert (
55 | HUGGINGFACE_TOKEN is not None
56 | ), "Please set the HF_API_KEY environment variable or authenticate with the Hugging Face CLI using `huggingface-cli login`"
57 | login(token=HUGGINGFACE_TOKEN)
58 | ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY")
59 |
60 | # Check if the API key is set
61 | assert (
62 | ARGILLA_API_KEY is not None
63 | ), "Please set the ARGILLA_API_KEY environment variable or pass it as a parameter"
64 |
65 | #####################
66 | # Helper functions
67 | #####################
68 |
69 |
70 | def remove_existing_dataset(argilla_dataset_name: str):
71 | """Remove an existing dataset from Argilla. This is useful when re-running the pipeline multiple times."""
72 | try:
73 | rg.init(
74 | api_url=ARGILLA_SPACE_URL,
75 | api_key=ARGILLA_API_KEY,
76 | workspace=ARGILLA_WORKSPACE_NAME,
77 | )
78 | argilla_ds = rg.FeedbackDataset.from_argilla(argilla_dataset_name)
79 | argilla_ds.delete()
80 | except ValueError as e:
81 | print(e)
82 |
83 |
84 | #####################################
85 | # Define distilabel custom steps
86 | #####################################
87 |
88 |
89 | @step(
90 | inputs=["generation"],
91 | outputs=["predicted_generation_language", "predicted_generation_language_score"],
92 | )
93 | def language_predict(inputs: StepInput) -> StepOutput:
94 | """
95 | A step to predict the language of the generated text.
96 | Sometimes models fail to generate text in the desired language.
97 | This step helps to identify such cases using an external language prediction model.
98 | """
99 | for input in inputs:
100 | try:
101 | cleaned_input = input["generation"].replace("\n", " ")
102 | resp = InferenceClient("laurievb/OpenLID").text_classification(
103 | cleaned_input
104 | )
105 | top_prediction = resp[0] # top prediction is the first element in the list
106 | input["predicted_generation_language"] = top_prediction.label
107 | input["predicted_generation_language_score"] = min(
108 | 1.0, top_prediction.score
109 | ) # ensure score is between 0 and 1
110 | except Exception as e:
111 | print(e)
112 | input["predicted_generation_language"] = "error"
113 | input["predicted_generation_language_score"] = 0.0
114 | yield inputs
115 |
116 |
117 | @step(inputs=["targets", "generation"], outputs=["generations"])
118 | def CombineAyaAndModelResponse(
119 | inputs: StepInput,
120 | ) -> StepOutput:
121 | """A step to combine the Aya and model responses and add the response sources."""
122 | for input in inputs:
123 | input["generations"] = [input["targets"], input["generation"]]
124 | input["generation_models"] = ["aya", MODEL_ID]
125 | yield inputs
126 |
127 |
128 | #######################################################################
129 | # Define a custom TextGeneration task focused on our target language
130 | #######################################################################
131 |
132 |
133 | # Custom system prompt
134 | # This translates to something like:
135 | # You are an AI assistant. Your primary language is Dutch. Answer most questions and prompts in Dutch, unless specifically asked to use another language.
136 | # If you are asked to translate between two other languages, for example from French to English, perform the requested translation.
137 | # When quotes or passages in another language are given in a prompt, assume that the user wants you to understand them and refer to them when formulating your Dutch response. Do not translate the foreign text yourself, unless specifically requested.
138 |
139 |
140 | system_prompt = """Je bent een AI-assistent. Je primaire taal is Nederlands. Beantwoord de meeste vragen en prompts in het Nederlands, tenzij specifiek gevraagd wordt om een andere taal te gebruiken.
141 | Als je gevraagd wordt om te vertalen tussen twee andere talen, bijvoorbeeld van Frans naar Engels, voer dan de gevraagde vertaling uit. Wanneer citaten of passages in een andere taal in een prompt worden gegeven, ga er dan van uit dat de gebruiker wil dat je ze begrijpt en ernaar verwijst bij het formuleren van je Nederlandse antwoord. Vertaal de anderstalige tekst zelf niet, tenzij dit specifiek wordt gevraagd."""
142 |
143 |
144 | class DutchTextGeneration(TextGeneration):
145 | """A TextGeneration task adds an additional system prompt."""
146 |
147 | def format_input(self, input: Dict[str, Any]) -> "ChatType":
148 | return [
149 | {"role": "system", "content": system_prompt},
150 | {"role": "user", "content": input["instruction"]},
151 | ]
152 |
153 |
154 | #####################################
155 | # Define the pipeline
156 | #####################################
157 |
158 | with Pipeline(name="generate-dpo-responses") as pipeline:
159 | # Load the dataset from the Hugging Face Hub
160 | load_hub_dataset = LoadHubDataset(
161 | name="load_dataset",
162 | output_mappings={"inputs": "instruction"},
163 | )
164 | #####################################
165 | # Define the LLM
166 | #####################################
167 | llm = InferenceEndpointsLLM(
168 | model_id=MODEL_ID,
169 | tokenizer_id=MODEL_ID,
170 | model_display_name=MODEL_ID,
171 | api_key=HUGGINGFACE_TOKEN,
172 | )
173 | # Generate responses using the model
174 | text_generation = DutchTextGeneration(
175 | name="text_generation",
176 | llm=llm,
177 | input_batch_size=INPUT_BATCH_SIZE,
178 | output_mappings={"model_name": "generation_model"},
179 | num_generations=1,
180 | )
181 | load_hub_dataset.connect(text_generation)
182 | language_prediction = language_predict(name="language_prediction")
183 | text_generation.connect(language_prediction)
184 | combine_columns = CombineAyaAndModelResponse(
185 | name="combine_columns",
186 | )
187 |
188 | language_prediction.connect(combine_columns)
189 | ultrafeedback = UltraFeedback(
190 | name="ultrafeedback", aspect="overall-rating", llm=llm
191 | )
192 | combine_columns.connect(ultrafeedback)
193 | to_argilla = CustomPreferenceToArgilla(
194 | name="to_argilla",
195 | api_url=ARGILLA_SPACE_URL,
196 | api_key=ARGILLA_API_KEY,
197 | dataset_name=ARGILLA_DATASET_NAME,
198 | dataset_workspace=ARGILLA_WORKSPACE_NAME,
199 | num_generations=2,
200 | metadata_properties=[
201 | rg.TermsMetadataProperty(name="predicted_generation_language").dict(), # type: ignore
202 | rg.FloatMetadataProperty( # type: ignore
203 | name="predicted_generation_language_score", min=0.0, max=1.0
204 | ).dict(),
205 | ],
206 | )
207 | ultrafeedback.connect(to_argilla)
208 |
209 | #####################################
210 | # Run the pipeline
211 | #####################################
212 |
213 | if __name__ == "__main__":
214 | start_time = time.time()
215 | if ARGILLA_DATASET_NAME:
216 | print(f"Removing existing dataset: {ARGILLA_DATASET_NAME}")
217 | remove_existing_dataset(ARGILLA_DATASET_NAME)
218 | dataset = pipeline.run(
219 | parameters={
220 | "load_dataset": {
221 | "repo_id": INPUT_DATASET_HUB_ID,
222 | "split": SPLIT,
223 | },
224 | "text_generation": {
225 | "llm": {
226 | "generation_kwargs": {
227 | "max_new_tokens": MAX_NEW_TOKENS,
228 | "do_sample": True,
229 | "stop_sequences": ["<|end_of_text|>", "<|eot_id|>"],
230 | }
231 | }
232 | },
233 | "to_argilla": {"dataset_name": ARGILLA_DATASET_NAME},
234 | }
235 | )
236 | dataset.push_to_hub(OUTPUT_DATASET_HUB_ID, token=HUGGINGFACE_TOKEN)
237 | end_time = time.time()
238 | print(f"Output dataset: https://huggingface.co/datasets/{OUTPUT_DATASET_HUB_ID}")
239 | print(f"Time taken: {end_time - start_time} seconds")
240 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/custom_preference_to_argilla.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import hashlib
3 | from typing import TYPE_CHECKING, Any, Dict, List, Union
4 |
5 | from typing_extensions import override
6 |
7 | with contextlib.suppress(ImportError):
8 | import argilla as rg
9 | from distilabel.steps import PreferenceToArgilla, StepInput
10 |
11 | if TYPE_CHECKING:
12 | from distilabel.steps.typing import StepOutput, RatingQuestion, TextQuestion
13 |
14 |
15 | class CustomPreferenceToArgilla(PreferenceToArgilla):
16 | """
17 | Custom PreferenceToArgilla step that adds metadata properties to the feedback records.
18 | This allows filtering based on metadata properties in the Argilla UI.
19 | """
20 |
21 | metadata_properties: List[Dict[str, Any]]
22 |
23 | def load(self) -> None:
24 | super().load()
25 | for metadata_property in self.metadata_properties:
26 | metadata_property_type = metadata_property.pop("type", None)
27 | if metadata_property_type == "float":
28 | metadata_property = rg.FloatMetadataProperty.parse_obj(
29 | metadata_property
30 | )
31 | elif metadata_property_type == "integer":
32 | metadata_property = rg.IntegerMetadataProperty.parse_obj(
33 | metadata_property
34 | )
35 | elif metadata_property_type == "terms":
36 | metadata_property = rg.TermsMetadataProperty.parse_obj(
37 | metadata_property
38 | )
39 | else:
40 |                 continue  # skip properties with an unrecognized type instead of aborting the loop
41 | self._rg_dataset.add_metadata_property(metadata_property) # type: ignore
42 |
43 | def _rating_rationale_pairs(
44 | self,
45 | ) -> List[Union["RatingQuestion", "TextQuestion"]]:
46 | questions = super()._rating_rationale_pairs()
47 | questions.append(
48 | rg.TextQuestion( # type: ignore
49 | name="improved_response",
50 | title="How would you improve the response?",
51 | required=False,
52 | )
53 | )
54 | return questions
55 |
56 | @override
57 | def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
58 | records = []
59 | for input in inputs:
60 | # Generate the SHA-256 hash of the instruction to use it as the metadata
61 | instruction_id = hashlib.sha256(
62 | input["instruction"].encode("utf-8") # type: ignore
63 | ).hexdigest()
64 |
65 | generations = {
66 | f"{self._generations}-{idx}": generation
67 | for idx, generation in enumerate(input["generations"]) # type: ignore
68 | }
69 | records.append( # type: ignore
70 | rg.FeedbackRecord( # type: ignore
71 | fields={
72 | "id": instruction_id,
73 | "instruction": input["instruction"], # type: ignore
74 | **generations,
75 | },
76 | suggestions=self._add_suggestions_if_any(input), # type: ignore
77 | metadata={
78 | metadata_property["name"]: input[metadata_property["name"]]
79 | for metadata_property in self.metadata_properties
80 | if metadata_property["name"] in input
81 | },
82 | )
83 | )
84 | self._rg_dataset.add_records(records) # type: ignore
85 | yield inputs
86 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/examples/en/01_en_data_prep.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 01. Creating our subsample of Aya to prepare for creating a DPO dataset"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook walks through the steps required to create a sample from the full Aya dataset for the language you are interested in working in. \n",
15 | "In this notebook and the subsequent notebooks we'll focus on Dutch as an example but the process will be rather similar for other languages."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "from collections import Counter\n",
25 | "from datasets import Dataset\n",
26 | "from datasets import load_dataset\n",
27 | "from statistics import mean, median"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "aya_ds = load_dataset(\"CohereForAI/aya_dataset\",split='train')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "Dataset({\n",
48 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n",
49 | " num_rows: 202362\n",
50 | "})"
51 | ]
52 | },
53 | "execution_count": 3,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "aya_ds"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "We want to only include the data that is relevant to the language we are interested in. This means we need to filter out the data that is not in Dutch. "
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 4,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "Dataset({\n",
78 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n",
79 | " num_rows: 1733\n",
80 | "})"
81 | ]
82 | },
83 | "execution_count": 4,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "dutch_only = aya_ds.filter(lambda x: x['language'] == 'Dutch')\n",
90 | "dutch_only"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "### Getting some statistics about the data\n",
98 | "\n",
99 | "To help with the next stages of this process we'll get some statistics about the data. "
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "def get_stats(ds: Dataset):\n",
109 | " input_lengths = []\n",
110 | " output_lengths = []\n",
111 | " annotator_counts: Counter = Counter()\n",
112 | " for row in ds:\n",
113 | " input_lengths.append(len(row[\"inputs\"]))\n",
114 | " output_lengths.append(len(row[\"targets\"]))\n",
115 | " annotator_counts.update(ds[\"user_id\"])\n",
116 | " mean_input_length = mean(input_lengths)\n",
117 | " median_input_length = median(input_lengths)\n",
118 | " mean_output_length = mean(output_lengths)\n",
119 | " median_output_length = median(output_lengths)\n",
120 | " max_input_length = max(input_lengths)\n",
121 | " max_output_length = max(output_lengths)\n",
122 | " return {\n",
123 | " \"number_of_unique_annotators\": len(annotator_counts),\n",
124 | " \"input_lengths\": input_lengths,\n",
125 | " \"output_lengths\": output_lengths,\n",
126 | " \"annotator_counts\": dict(annotator_counts),\n",
127 | " \"mean_input_length\": mean_input_length,\n",
128 | " \"median_input_length\": median_input_length,\n",
129 | " \"mean_output_length\": mean_output_length,\n",
130 | " \"median_output_length\": median_output_length,\n",
131 | " \"max_input_length\": max_input_length,\n",
132 | " \"max_output_length\": max_output_length,\n",
133 | " }"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 6,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "stats = get_stats(dutch_only)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "There are various things we might be interest in from these stats but some of the most relevant are the length of input and outputs of the data. This may help us decide which LLMs to use in the next stage of the process. "
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 7,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "Max input length: 3030\n",
162 | "Max output length: 21707\n",
163 | "Mean input length: 223.67109059434506\n",
164 | "Mean output length: 352.1806116560877\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "print(f\"Max input length: {stats['max_input_length']}\")\n",
170 | "print(f\"Max output length: {stats['max_output_length']}\")\n",
171 | "print(f\"Mean input length: {stats['mean_input_length']}\")\n",
172 | "print(f\"Mean output length: {stats['mean_output_length']}\")"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "## Push the subset to the Hub \n",
180 | "\n",
181 | "To help us make testing our pipelines easier we'll create a very small test split (10 samples) that we can use when we're testing out our pipelines. "
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 8,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "dutch_only = dutch_only.train_test_split(test_size=100)"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "We'll now push this subset to the Hub so that we can use it in the next stage of the process. Don't forget to update this to point to your own Hub workspace. If you are not already authenticated on the Hub uncomment the cell below and run it. \n"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 9,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "# from huggingface_hub import login \n",
207 | "# login()"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 10,
213 | "metadata": {},
214 | "outputs": [
215 | {
216 | "data": {
217 | "application/vnd.jupyter.widget-view+json": {
218 | "model_id": "97f87da40e5f44a5b72c0afd35b3b37c",
219 | "version_major": 2,
220 | "version_minor": 0
221 | },
222 | "text/plain": [
223 | "Uploading the dataset shards: 0%| | 0/1 [00:00, ?it/s]"
224 | ]
225 | },
226 | "metadata": {},
227 | "output_type": "display_data"
228 | },
229 | {
230 | "data": {
231 | "application/vnd.jupyter.widget-view+json": {
232 | "model_id": "d8f33070fda746538649b941b1b633e5",
233 | "version_major": 2,
234 | "version_minor": 0
235 | },
236 | "text/plain": [
237 | "Creating parquet from Arrow format: 0%| | 0/2 [00:00, ?ba/s]"
238 | ]
239 | },
240 | "metadata": {},
241 | "output_type": "display_data"
242 | },
243 | {
244 | "data": {
245 | "application/vnd.jupyter.widget-view+json": {
246 | "model_id": "10aaae1911a34e5082900715e893ad80",
247 | "version_major": 2,
248 | "version_minor": 0
249 | },
250 | "text/plain": [
251 | "Uploading the dataset shards: 0%| | 0/1 [00:00, ?it/s]"
252 | ]
253 | },
254 | "metadata": {},
255 | "output_type": "display_data"
256 | },
257 | {
258 | "data": {
259 | "application/vnd.jupyter.widget-view+json": {
260 | "model_id": "e9871c70c56643be9dd43f365e92964f",
261 | "version_major": 2,
262 | "version_minor": 0
263 | },
264 | "text/plain": [
265 | "Creating parquet from Arrow format: 0%| | 0/1 [00:00, ?ba/s]"
266 | ]
267 | },
268 | "metadata": {},
269 | "output_type": "display_data"
270 | },
271 | {
272 | "data": {
273 | "application/vnd.jupyter.widget-view+json": {
274 | "model_id": "3b1f1ee268cf4e9fa7fb5a2ebaf3be28",
275 | "version_major": 2,
276 | "version_minor": 0
277 | },
278 | "text/plain": [
279 | "README.md: 0%| | 0.00/600 [00:00, ?B/s]"
280 | ]
281 | },
282 | "metadata": {},
283 | "output_type": "display_data"
284 | },
285 | {
286 | "data": {
287 | "text/plain": [
288 | "CommitInfo(commit_url='https://huggingface.co/datasets/data-is-better-together/aya_dataset_dutch_example/commit/2b05a37669edb925254e2ce18feea1fff121f5e9', commit_message='Upload dataset', commit_description='', oid='2b05a37669edb925254e2ce18feea1fff121f5e9', pr_url=None, pr_revision=None, pr_num=None)"
289 | ]
290 | },
291 | "execution_count": 10,
292 | "metadata": {},
293 | "output_type": "execute_result"
294 | }
295 | ],
296 | "source": [
297 | "dutch_only.push_to_hub(\"data-is-better-together/aya_dataset_dutch_example\")"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": []
306 | }
307 | ],
308 | "metadata": {
309 | "kernelspec": {
310 | "display_name": ".venv",
311 | "language": "python",
312 | "name": "python3"
313 | },
314 | "language_info": {
315 | "codemirror_mode": {
316 | "name": "ipython",
317 | "version": 3
318 | },
319 | "file_extension": ".py",
320 | "mimetype": "text/x-python",
321 | "name": "python",
322 | "nbconvert_exporter": "python",
323 | "pygments_lexer": "ipython3",
324 | "version": "3.11.1"
325 | }
326 | },
327 | "nbformat": 4,
328 | "nbformat_minor": 2
329 | }
330 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/examples/en/aya_en_dpo_gen.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import argilla as rg
5 | from custom_preference_to_argilla import CustomPreferenceToArgilla
6 | from distilabel.llms import InferenceEndpointsLLM
7 | from distilabel.pipeline import Pipeline
8 | from distilabel.steps import (
9 | LoadHubDataset,
10 | StepInput,
11 | StepOutput,
12 | step,
13 | )
14 | from distilabel.steps.tasks import TextGeneration, UltraFeedback
15 | from dotenv import load_dotenv
16 | from huggingface_hub import InferenceClient, login
17 |
18 | load_dotenv()
19 |
20 | ##################################
21 | # Configuration
22 | # This section contains the configuration for the pipeline.
23 | # This is where you can define the model ID, the maximum number of new tokens to generate, the input batch size for the model via the Inference Endpoints API, and the Argilla configuration.
24 | ##################################
25 |
26 |
27 | # Model Configuration
28 | MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
29 | MAX_NEW_TOKENS = 4000 # Maximum number of new tokens to generate
30 |
31 | # Inference Endpoints Configuration
32 | # INFERENCE_ENDPOINTS_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct" # Inference endpoints URL
33 | # ENDPOINT_NAME = "meta-llama/Meta-Llama-3-70B-Instruct"
34 | INPUT_BATCH_SIZE = 10  # Input batch size for the model via the Inference Endpoints API; adjust this based on the model's requirements and the hardware you are using to deploy the model
35 |
36 | # Argilla Configuration
37 | ARGILLA_SPACE_URL = "https://dibt-demo-argilla-space.hf.space" # Argilla Space URL
38 | ARGILLA_DATASET_NAME = "aya_english_dpo" # Argilla dataset name
39 | ARGILLA_WORKSPACE_NAME = "admin" # Argilla workspace name
40 | # Dataset Configuration
41 | INPUT_DATASET_HUB_ID = "data-is-better-together/aya_dataset_english_example" # Input dataset hub ID (created in the previous step)
42 | OUTPUT_DATASET_HUB_ID = (
43 | "data-is-better-together/aya_english_dpo_raw" # Output dataset hub ID
44 | )
45 | SPLIT = "train" # Split of the dataset to use. Start with test whilst you are testing the pipeline and then switch to train when you are ready to generate the full dataset
46 |
47 | HUGGINGFACE_TOKEN = os.getenv("HF_API_KEY")
48 |
49 | #######################################
50 | # Check required environment variables
51 | #######################################
52 | assert (
53 | HUGGINGFACE_TOKEN is not None
54 | ), "Please set the HF_API_KEY environment variable or authenticate with the Hugging Face CLI using `huggingface-cli login`"
55 | login(token=HUGGINGFACE_TOKEN)
56 | ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY")
57 |
58 | # Check if the API key is set
59 | assert (
60 | ARGILLA_API_KEY is not None
61 | ), "Please set the ARGILLA_API_KEY environment variable or pass it as a parameter"
62 |
63 | #####################
64 | # Helper functions
65 | #####################
66 |
67 |
68 | def remove_existing_dataset(argilla_dataset_name: str):
69 | """Remove an existing dataset from Argilla. This is useful when re-running the pipeline multiple times."""
70 | try:
71 | rg.init(
72 | api_url=ARGILLA_SPACE_URL,
73 | api_key=ARGILLA_API_KEY,
74 | workspace=ARGILLA_WORKSPACE_NAME,
75 | )
76 | argilla_ds = rg.FeedbackDataset.from_argilla(argilla_dataset_name)
77 | argilla_ds.delete()
78 | except ValueError as e:
79 | print(e)
80 |
81 |
82 | #####################################
83 | # Define distilabel custom steps
84 | #####################################
85 |
86 |
87 | @step(
88 | inputs=["generation"],
89 | outputs=["predicted_generation_language", "predicted_generation_language_score"],
90 | )
91 | def language_predict(inputs: StepInput) -> StepOutput:
92 | """
93 | A step to predict the language of the generated text.
94 | Sometimes models fail to generate text in the desired language.
95 | This step helps to identify such cases using an external language prediction model.
96 | """
97 | for input in inputs:
98 | try:
99 | cleaned_input = input["generation"].replace("\n", " ")
100 | resp = InferenceClient("laurievb/OpenLID").text_classification(
101 | cleaned_input
102 | )
103 | top_prediction = resp[0] # top prediction is the first element in the list
104 | input["predicted_generation_language"] = top_prediction.label
105 | input["predicted_generation_language_score"] = min(
106 | 1.0, top_prediction.score
107 |             )  # cap the score at 1.0 so it fits the 0-1 range of the Argilla metadata property
108 | except Exception as e:
109 | print(e)
110 | input["predicted_generation_language"] = "error"
111 | input["predicted_generation_language_score"] = 0.0
112 | yield inputs
113 |
114 |
115 | @step(inputs=["targets", "generation"], outputs=["generations"])
116 | def CombineAyaAndModelResponse(
117 | inputs: StepInput,
118 | ) -> StepOutput:
119 | """A step to combine the Aya and model responses and add the response sources."""
120 | for input in inputs:
121 | input["generations"] = [input["targets"], input["generation"]]
122 | input["generation_models"] = ["aya", MODEL_ID]
123 | yield inputs
124 |
125 |
126 | #######################################################################
127 | # Define a custom TextGeneration task focused on our target language
128 | #######################################################################
129 |
130 |
131 | # Custom system prompt
132 | # This translates to something like:
133 | # You are an AI assistant. Your primary language is Dutch. Answer most questions and prompts in Dutch, unless specifically asked to use another language.
134 | # If you are asked to translate between two other languages, for example from French to English, perform the requested translation.
135 | # When quotes or passages in another language are given in a prompt, assume that the user wants you to understand them and refer to them when formulating your English response. Do not translate the foreign text yourself, unless specifically requested.
136 |
137 |
138 | # system_prompt = """Je bent een AI-assistent. Je primaire taal is Nederlands. Beantwoord de meeste vragen en prompts in het Nederlands, tenzij specifiek gevraagd wordt om een andere taal te gebruiken.
139 | # Als je gevraagd wordt om te vertalen tussen twee andere talen, bijvoorbeeld van Frans naar Engels, voer dan de gevraagde vertaling uit. Wanneer citaten of passages in een andere taal in een prompt worden gegeven, ga er dan van uit dat de gebruiker wil dat je ze begrijpt en ernaar verwijst bij het formuleren van je Nederlandse antwoord. Vertaal de anderstalige tekst zelf niet, tenzij dit specifiek wordt gevraagd."""
140 |
141 |
142 | # class DutchTextGeneration(TextGeneration):
143 | # """A TextGeneration task adds an additional system prompt."""
144 |
145 | # def format_input(self, input: Dict[str, Any]) -> "ChatType":
146 | # return [
147 | # {"role": "system", "content": system_prompt},
148 | # {"role": "user", "content": input["instruction"]},
149 | # ]
150 |
151 |
152 | #####################################
153 | # Define the pipeline
154 | #####################################
155 |
156 | with Pipeline(name="generate-dpo-responses") as pipeline:
157 | # Load the dataset from the Hugging Face Hub
158 | load_hub_dataset = LoadHubDataset(
159 | name="load_dataset",
160 | output_mappings={"inputs": "instruction"},
161 | )
162 | #####################################
163 | # Define the LLM
164 | #####################################
165 | llm = InferenceEndpointsLLM(
166 | model_id=MODEL_ID,
167 | tokenizer_id=MODEL_ID,
168 | model_display_name=MODEL_ID,
169 | api_key=HUGGINGFACE_TOKEN,
170 | )
171 | # Generate responses using the model
172 | text_generation = TextGeneration(
173 | name="text_generation",
174 | llm=llm,
175 | input_batch_size=INPUT_BATCH_SIZE,
176 | output_mappings={"model_name": "generation_model"},
177 | num_generations=1,
178 | )
179 | load_hub_dataset.connect(text_generation)
180 | language_prediction = language_predict(name="language_prediction")
181 | text_generation.connect(language_prediction)
182 | combine_columns = CombineAyaAndModelResponse(
183 | name="combine_columns",
184 | )
185 |
186 | language_prediction.connect(combine_columns)
187 | ultrafeedback = UltraFeedback(
188 | name="ultrafeedback", aspect="overall-rating", llm=llm
189 | )
190 | combine_columns.connect(ultrafeedback)
191 | to_argilla = CustomPreferenceToArgilla(
192 | name="to_argilla",
193 | api_url=ARGILLA_SPACE_URL,
194 | api_key=ARGILLA_API_KEY,
195 | dataset_name=ARGILLA_DATASET_NAME,
196 | dataset_workspace=ARGILLA_WORKSPACE_NAME,
197 | num_generations=2,
198 | metadata_properties=[
199 | rg.TermsMetadataProperty(name="predicted_generation_language").dict(), # type: ignore
200 | rg.FloatMetadataProperty( # type: ignore
201 | name="predicted_generation_language_score", min=0.0, max=1.0
202 | ).dict(),
203 | ],
204 | )
205 | ultrafeedback.connect(to_argilla)
206 |
207 | #####################################
208 | # Run the pipeline
209 | #####################################
210 |
211 | if __name__ == "__main__":
212 | start_time = time.time()
213 | if ARGILLA_DATASET_NAME:
214 | print(f"Removing existing dataset: {ARGILLA_DATASET_NAME}")
215 | remove_existing_dataset(ARGILLA_DATASET_NAME)
216 | dataset = pipeline.run(
217 | parameters={
218 | "load_dataset": {
219 | "repo_id": INPUT_DATASET_HUB_ID,
220 | "split": SPLIT,
221 | },
222 | "text_generation": {
223 | "llm": {
224 | "generation_kwargs": {
225 | "max_new_tokens": MAX_NEW_TOKENS,
226 | "do_sample": True,
227 | "stop_sequences": ["<|end_of_text|>", "<|eot_id|>"],
228 | }
229 | }
230 | },
231 | "to_argilla": {"dataset_name": ARGILLA_DATASET_NAME},
232 | }
233 | )
234 | dataset.push_to_hub(OUTPUT_DATASET_HUB_ID, token=HUGGINGFACE_TOKEN)
235 | end_time = time.time()
236 | print(f"Output dataset: https://huggingface.co/datasets/{OUTPUT_DATASET_HUB_ID}")
237 | print(f"Time taken: {end_time - start_time} seconds")
238 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/examples/en/custom_preference_to_argilla.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import hashlib
3 | from typing import TYPE_CHECKING, Any, Dict, List, Union
4 |
5 | from typing_extensions import override
6 |
7 | with contextlib.suppress(ImportError):
8 | import argilla as rg
9 | from distilabel.steps import PreferenceToArgilla, StepInput
10 |
11 | if TYPE_CHECKING:
12 | from distilabel.steps.typing import StepOutput, RatingQuestion, TextQuestion
13 |
14 |
15 | class CustomPreferenceToArgilla(PreferenceToArgilla):
16 | """
17 | Custom PreferenceToArgilla step that adds metadata properties to the feedback records.
18 | This allows filtering based on metadata properties in the Argilla UI.
19 | """
20 |
21 | metadata_properties: List[Dict[str, Any]]
22 |
23 | def load(self) -> None:
24 | super().load()
25 | for metadata_property in self.metadata_properties:
26 | metadata_property_type = metadata_property.pop("type", None)
27 | if metadata_property_type == "float":
28 | metadata_property = rg.FloatMetadataProperty.parse_obj(
29 | metadata_property
30 | )
31 | elif metadata_property_type == "integer":
32 | metadata_property = rg.IntegerMetadataProperty.parse_obj(
33 | metadata_property
34 | )
35 | elif metadata_property_type == "terms":
36 | metadata_property = rg.TermsMetadataProperty.parse_obj(
37 | metadata_property
38 | )
39 | else:
40 |                 continue  # skip metadata properties with an unknown or missing type
41 | self._rg_dataset.add_metadata_property(metadata_property) # type: ignore
42 |
43 | def _rating_rationale_pairs(
44 | self,
45 | ) -> List[Union["RatingQuestion", "TextQuestion"]]:
46 | questions = super()._rating_rationale_pairs()
47 | questions.append(
48 | rg.TextQuestion( # type: ignore
49 | name="improved_response",
50 | title="How would you improve the response?",
51 | required=False,
52 | )
53 | )
54 | return questions
55 |
56 | @override
57 | def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
58 | records = []
59 | for input in inputs:
60 |             # Generate the SHA-256 hash of the instruction to use as the record's "id" field
61 | instruction_id = hashlib.sha256(
62 | input["instruction"].encode("utf-8") # type: ignore
63 | ).hexdigest()
64 |
65 | generations = {
66 | f"{self._generations}-{idx}": generation
67 | for idx, generation in enumerate(input["generations"]) # type: ignore
68 | }
69 | records.append( # type: ignore
70 | rg.FeedbackRecord( # type: ignore
71 | fields={
72 | "id": instruction_id,
73 | "instruction": input["instruction"], # type: ignore
74 | **generations,
75 | },
76 | suggestions=self._add_suggestions_if_any(input), # type: ignore
77 | metadata={
78 | metadata_property["name"]: input[metadata_property["name"]]
79 | for metadata_property in self.metadata_properties
80 | if metadata_property["name"] in input
81 | },
82 | )
83 | )
84 | self._rg_dataset.add_records(records) # type: ignore
85 | yield inputs
86 |
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/requirements.in:
--------------------------------------------------------------------------------
1 | distilabel
2 | argilla==1.27.0
3 | ipykernel
4 | python-dotenv
5 | transformers
6 | ipywidgets
7 | huggingface_hub
--------------------------------------------------------------------------------
/cookbook-efforts/dpo-orpo-preference/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # uv pip compile requirements.in -o requirements.txt
3 | aiohttp==3.9.5
4 | # via
5 | # datasets
6 | # fsspec
7 | aiosignal==1.3.1
8 | # via aiohttp
9 | annotated-types==0.6.0
10 | # via pydantic
11 | anyio==4.3.0
12 | # via httpx
13 | appnope==0.1.4
14 | # via ipykernel
15 | argilla==1.27.0
16 | asttokens==2.4.1
17 | # via stack-data
18 | attrs==23.2.0
19 | # via aiohttp
20 | backoff==2.2.1
21 | # via argilla
22 | certifi==2024.2.2
23 | # via
24 | # httpcore
25 | # httpx
26 | # requests
27 | charset-normalizer==3.3.2
28 | # via requests
29 | click==8.1.7
30 | # via typer
31 | comm==0.2.2
32 | # via
33 | # ipykernel
34 | # ipywidgets
35 | datasets==2.19.0
36 | # via distilabel
37 | debugpy==1.8.1
38 | # via ipykernel
39 | decorator==5.1.1
40 | # via ipython
41 | deprecated==1.2.14
42 | # via argilla
43 | dill==0.3.8
44 | # via
45 | # datasets
46 | # multiprocess
47 | distilabel==1.0.3
48 | executing==2.0.1
49 | # via stack-data
50 | filelock==3.14.0
51 | # via
52 | # datasets
53 | # huggingface-hub
54 | # transformers
55 | frozenlist==1.4.1
56 | # via
57 | # aiohttp
58 | # aiosignal
59 | fsspec==2024.3.1
60 | # via
61 | # datasets
62 | # huggingface-hub
63 | h11==0.14.0
64 | # via httpcore
65 | httpcore==1.0.5
66 | # via httpx
67 | httpx==0.26.0
68 | # via
69 | # argilla
70 | # distilabel
71 | huggingface-hub==0.23.0
72 | # via
73 | # datasets
74 | # tokenizers
75 | # transformers
76 | idna==3.7
77 | # via
78 | # anyio
79 | # httpx
80 | # requests
81 | # yarl
82 | ipykernel==6.29.4
83 | ipython==8.24.0
84 | # via
85 | # ipykernel
86 | # ipywidgets
87 | ipywidgets==8.1.2
88 | jedi==0.19.1
89 | # via ipython
90 | jinja2==3.1.3
91 | # via distilabel
92 | jupyter-client==8.6.1
93 | # via ipykernel
94 | jupyter-core==5.7.2
95 | # via
96 | # ipykernel
97 | # jupyter-client
98 | jupyterlab-widgets==3.0.10
99 | # via ipywidgets
100 | markdown-it-py==3.0.0
101 | # via rich
102 | markupsafe==2.1.5
103 | # via jinja2
104 | matplotlib-inline==0.1.7
105 | # via
106 | # ipykernel
107 | # ipython
108 | mdurl==0.1.2
109 | # via markdown-it-py
110 | monotonic==1.6
111 | # via argilla
112 | multidict==6.0.5
113 | # via
114 | # aiohttp
115 | # yarl
116 | multiprocess==0.70.16
117 | # via
118 | # datasets
119 | # distilabel
120 | nest-asyncio==1.6.0
121 | # via
122 | # distilabel
123 | # ipykernel
124 | networkx==3.3
125 | # via distilabel
126 | numpy==1.23.5
127 | # via
128 | # argilla
129 | # datasets
130 | # pandas
131 | # pyarrow
132 | # scipy
133 | # transformers
134 | packaging==24.0
135 | # via
136 | # argilla
137 | # datasets
138 | # huggingface-hub
139 | # ipykernel
140 | # transformers
141 | pandas==2.2.2
142 | # via
143 | # argilla
144 | # datasets
145 | parso==0.8.4
146 | # via jedi
147 | pexpect==4.9.0
148 | # via ipython
149 | platformdirs==4.2.1
150 | # via jupyter-core
151 | prompt-toolkit==3.0.43
152 | # via ipython
153 | psutil==5.9.8
154 | # via ipykernel
155 | ptyprocess==0.7.0
156 | # via pexpect
157 | pure-eval==0.2.2
158 | # via stack-data
159 | pyarrow==16.0.0
160 | # via datasets
161 | pyarrow-hotfix==0.6
162 | # via datasets
163 | pydantic==2.7.1
164 | # via
165 | # argilla
166 | # distilabel
167 | pydantic-core==2.18.2
168 | # via pydantic
169 | pygments==2.17.2
170 | # via
171 | # ipython
172 | # rich
173 | python-dateutil==2.9.0.post0
174 | # via
175 | # jupyter-client
176 | # pandas
177 | python-dotenv==1.0.1
178 | pytz==2024.1
179 | # via pandas
180 | pyyaml==6.0.1
181 | # via
182 | # datasets
183 | # huggingface-hub
184 | # transformers
185 | pyzmq==26.0.3
186 | # via
187 | # ipykernel
188 | # jupyter-client
189 | regex==2024.4.28
190 | # via transformers
191 | requests==2.31.0
192 | # via
193 | # datasets
194 | # huggingface-hub
195 | # transformers
196 | rich==13.7.1
197 | # via
198 | # argilla
199 | # distilabel
200 | safetensors==0.4.3
201 | # via transformers
202 | scipy==1.13.0
203 | # via distilabel
204 | six==1.16.0
205 | # via
206 | # asttokens
207 | # python-dateutil
208 | sniffio==1.3.1
209 | # via
210 | # anyio
211 | # httpx
212 | stack-data==0.6.3
213 | # via ipython
214 | tblib==3.0.0
215 | # via distilabel
216 | tokenizers==0.19.1
217 | # via transformers
218 | tornado==6.4
219 | # via
220 | # ipykernel
221 | # jupyter-client
222 | tqdm==4.66.4
223 | # via
224 | # argilla
225 | # datasets
226 | # huggingface-hub
227 | # transformers
228 | traitlets==5.14.3
229 | # via
230 | # comm
231 | # ipykernel
232 | # ipython
233 | # ipywidgets
234 | # jupyter-client
235 | # jupyter-core
236 | # matplotlib-inline
237 | transformers==4.40.1
238 | typer==0.9.4
239 | # via
240 | # argilla
241 | # distilabel
242 | typing-extensions==4.11.0
243 | # via
244 | # huggingface-hub
245 | # ipython
246 | # pydantic
247 | # pydantic-core
248 | # typer
249 | tzdata==2024.1
250 | # via pandas
251 | urllib3==2.2.1
252 | # via requests
253 | wcwidth==0.2.13
254 | # via prompt-toolkit
255 | widgetsnbextension==4.0.10
256 | # via ipywidgets
257 | wrapt==1.14.1
258 | # via
259 | # argilla
260 | # deprecated
261 | xxhash==3.4.1
262 | # via datasets
263 | yarl==1.9.4
264 | # via aiohttp
265 |
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/README.md:
--------------------------------------------------------------------------------
1 | ![KTO Dataset Project banner](assets/b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp)
2 | 
5 | # KTO Dataset Project
6 |
7 | The KTO Dataset Project aims to create more preference data in the KTO format. With the provided tools, the community can easily generate a KTO dataset in any language or domain they are interested in. This type of preference data is easier to collect than alternatives like DPO and can be used to train models to better align with human preferences. By following a few simple steps, you will be able to create your own KTO dataset.
8 |
9 | ## What is the goal of this project?
10 |
11 | The goal of this project is to create more KTO datasets for different languages or domains. This will help the community to train models that better align with human preferences. The project will provide the tools and resources to easily generate a KTO dataset.
12 |
13 | ### Why do we need more KTO datasets?
14 |
15 |
16 | #### What is a preference dataset?
17 |
18 | Preference tuning is a step often performed when creating a chat/instruction-following model, with the goal of aligning the model's outputs more closely with "human preferences" (or, more accurately, one particular set of human preferences). Often this is done through some form of reinforcement learning. Increasingly, instead of training a separate reward model, we can use a preference dataset to train the model directly. Two prominent approaches to this are:
19 |
20 | - Direct Preference Optimization (DPO)
21 | - Kahneman-Tversky Optimization (KTO)
22 |
23 | We won't dive into all of the technical details here, but will instead focus on what the data for each of these approaches looks like. The overall steps are something like this:
24 |
25 | - Have some prompts
26 | - Generate responses to these prompts
27 | - Rank/rate the responses to the prompts
28 |
29 | We'll use the example of haiku here but this could be any kind of text generation task.
30 |
31 |
32 |
33 | #### What is the difference between DPO and KTO?
34 |
35 | Whilst both DPO and KTO are methods for preference tuning (and sound like things that would be shouted at the end of a Street Fighter level), they differ in the kinds of data they require. DPO requires a preference dataset with two sets of responses, one "chosen" and one "rejected". We can take a look at a screenshot of a DPO dataset in the dataset viewer below:
36 |
37 | 
38 |
39 | As you can see, we have one column containing "chosen" responses and another containing "rejected" responses. This is the kind of data we would need for DPO. How would we collect this data once we have our candidate haiku responses? If we want to stick to using human feedback rather than a judge LM, we would need humans to indicate their preferences between the different haiku.
40 |
41 | There are different ways we could do this: we could ask humans to rate the haiku on a scale of 1-5, pick their favorite haiku from a set of five, or rank the haiku from best to worst. One disadvantage of DPO is that generating this kind of data from humans is quite cognitively demanding. It can be hard to compare two things and say which one is better, and even with an optimized interface it can be quite time-consuming. This is where KTO can provide an alternative.
42 |
43 | In contrast to DPO, KTO doesn't require two candidate responses, i.e. a "chosen" and a "rejected" one. Instead, it can rely on a simple binary preference, i.e. 👍/👎. This is arguably much easier for an annotator to provide.
44 |
45 |
46 | As we know, preference data is crucial for training models that better align with human preferences. However, collecting DPO-formatted data can be time-consuming and expensive. This is where KTO datasets come in: because they only require a prompt-response pair with a binary 👍/👎 judgement, they are much easier to collect. By creating more KTO datasets, we aim to make improving models simpler; a sketch of both formats follows below.
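   | 
   | To make the difference concrete, here is a minimal sketch of what a single row might look like in each format. The field names are illustrative rather than a fixed schema, and the labels are invented for the example:
   | 
   | ```python
   | # Illustrative only: exact field names vary between datasets and training libraries.
   | dpo_example = {
   |     "prompt": "Can you compose a haiku about the serenity of mountain peaks?",
   |     "chosen": "Snow-capped peaks rise high...",  # the preferred response
   |     "rejected": "Mountain peaks, serene...",  # the less preferred response
   | }
   | 
   | kto_example = {
   |     "prompt": "Can you compose a haiku about the serenity of mountain peaks?",
   |     "completion": "Snow-capped peaks rise high...",
   |     "label": True,  # a single binary judgement: 👍 = True, 👎 = False
   | }
   | ```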
47 |
48 |
49 | #### Why should we generate responses to prompts?
50 | We could of course collect all of our preference data by hand, i.e. we could write a prompt like "Write a recipe for banana bread" and then write two sets of responses, one of which we prefer over the other. However, this is time-consuming and not scalable. Instead, we can use a model to generate responses to our prompts and then use human feedback to determine which response we prefer. In our case, we can ask different LLMs to write haiku based on a prompt and then ask humans to rate the haiku.
51 |
52 | 
53 |
54 |
55 | ## How can you contribute?
56 |
57 | As part of Data Is Better Together, we're supporting the community in generating more KTO datasets for the languages or domains they are interested in. If you would like to help, you can follow the steps below to generate a KTO dataset. There are already many communities working together on the Hugging Face Discord server, so you can also join the server to collaborate with others on this project 🤗.
58 |
59 | ## Project Overview
60 |
61 | Here we will walk through a simple example of how you might create a KTO dataset using synthetic data and human feedback. We will use haiku as our example but this could be any kind of text generation task.
62 |
63 | ### 1. Prerequisites
64 |
65 | * A 🤗 Hugging Face account: We'll extensively use the Hugging Face Hub both to generate our data via hosted model APIs and to share our generated datasets. You can sign up for a Hugging Face account [here](https://huggingface.co/join).
66 |
67 | * For the workflow we describe here, we assume you already have a dataset of prompts. This [notebook](https://github.com/davanstrien/haiku-dpo/blob/main/01_generate_haiku_prompts.ipynb) shows how you could generate a dataset of haiku prompts. This approach could be adapted to any kind of text-generation task. The [instruction generation](https://distilabel.argilla.io/latest/tutorials/create-a-math-preference-dataset/#instruction-generation) section of this Distilabel tutorial provides a good overview of how you might generate a dataset of prompts for a different kind of text generation task.
68 |
69 | ### 2. Produce generations with various open models
70 |
71 | We will use [Distilabel](https://github.com/argilla-io/distilabel) to generate our haiku responses based on our initial prompt dataset. To generate the dataset, we will use the following models:
72 |
73 | - [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B)
74 | - [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
75 | - [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
76 |
77 | However, you could swap these out for other models depending on your goals, budget, the domain you are working in, etc.
78 |
79 | You will find the code to generate the haiku responses in [preference_gen.py](preference_gen.py).
80 |
81 | #### Hosted Model APIs
82 |
83 | We can use Hugging Face's free inference API to generate our haiku responses. This is a great way to get started with generating synthetic data. You can find more information on the supported models and how to use the API [here](https://huggingface.co/blog/inference-pro#supported-models).
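   | 
   | If you want to sanity-check a model before running the full `distilabel` pipeline, a minimal sketch using `huggingface_hub` looks something like this (the prompt and `max_new_tokens` value are just examples):
   | 
   | ```python
   | from huggingface_hub import InferenceClient
   | 
   | # Assumes you are already authenticated (e.g. via `huggingface-cli login`),
   | # or pass token="hf_..." to the client explicitly.
   | client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
   | haiku = client.text_generation(
   |     "[INST] Write a haiku about the serenity of mountain peaks. [/INST]",
   |     max_new_tokens=64,
   | )
   | print(haiku)
   | ```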
84 |
85 | One of our models, "NousResearch/Nous-Hermes-2-Yi-34B", is hosted using [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated) instead. In the code, this part is commented out, so it should be possible to run the script without setting up a dedicated inference endpoint.
86 |
87 | > [!TIP]
88 | > If you have local GPUs available, you can also adapt this approach using other [inference frameworks](https://distilabel.argilla.io/latest/components-gallery/llms/) such as Ollama or vLLM.
89 |
90 | #### The dataset produced
91 |
92 | A single row from the dataset produced by this code looks like this:
93 |
94 | ```python
95 | {
96 | "input": "Can you compose a haiku about the serenity of mountain peaks?",
97 | "generation_model": [
98 | "mistralai/Mistral-7B-Instruct-v0.2",
99 | "meta-llama/Llama-2-70b-chat-hf",
100 | "NousResearch/Nous-Hermes-2-Yi-34B",
101 | ],
102 | "generation_prompt": [
103 | "[INST] <>\nYou are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n<>\n\nCan you compose a haiku about the serenity of mountain peaks? [/INST]",
104 | "[INST] <>\nYou are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n<>\n\nCan you compose a haiku about the serenity of mountain peaks? [/INST]",
105 | "<|im_start|>system\nYou are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n<|im_end|>\n<|im_start|>user\nCan you compose a haiku about the serenity of mountain peaks?<|im_end|>\n<|im_start|>assistant\n",
106 | ],
107 | "raw_generation_responses": [
108 | " Peaceful summit rests,\nSky's reflection in still lake,\nSilence whispers on.",
109 | " Snow-capped peaks rise high\nSilent, majestic, and serene\nNature's peaceful throne",
110 | "Mountain peaks, serene\nPeaceful silence, whispers breeze\nNature's tranquil song",
111 | ],
112 | "generations": [
113 | " Peaceful summit rests,\nSky's reflection in still lake,\nSilence whispers on.",
114 | " Snow-capped peaks rise high\nSilent, majestic, and serene\nNature's peaceful throne",
115 | "Mountain peaks, serene\nPeaceful silence, whispers breeze\nNature's tranquil song",
116 | ],
117 | }
118 | ```
119 |
120 | As you can hopefully see, we have a single prompt and three haiku responses, along with the model that generated each response. This kind of data could be used to create either a DPO or a KTO dataset. We will focus on KTO here.
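   | 
   | Once annotators have given each generation a 👍/👎, a row like the one above can be flattened into KTO-format records. Here is a sketch using a trimmed version of the example row; the `labels` values are hypothetical, as in practice they come from human feedback:
   | 
   | ```python
   | example_row = {
   |     "input": "Can you compose a haiku about the serenity of mountain peaks?",
   |     "generations": [
   |         "Peaceful summit rests...",
   |         "Snow-capped peaks rise high...",
   |         "Mountain peaks, serene...",
   |     ],
   | }
   | 
   | # Hypothetical human judgements, one 👍/👎 per generation
   | labels = [True, True, False]
   | 
   | # One KTO record per (prompt, completion) pair
   | kto_records = [
   |     {"prompt": example_row["input"], "completion": completion, "label": label}
   |     for completion, label in zip(example_row["generations"], labels)
   | ]
   | ```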
121 |
122 | ### I'm GPU-poor, can I still get involved?
123 |
124 | Yes! The example scripts in this repository use Hugging Face Inference Endpoints for the inference component. This means you can run the scripts on your local machine without needing a GPU. We can provide you with GPU grants to run the `distilabel` script if you need them. Please reach out to us on the Hugging Face Discord server if you need a GPU grant. **Note**: We will want to ensure that you have a plan for how you will use the GPU grant before providing it, in particular, we'll want to see that you have set up an Argilla Space for your project already and have already done some work to identify the language you want to work on and the models you want to use.
125 |
126 | ### 3. Create a preference dataset annotation Space in Argilla hosted on Spaces with HF authentication
127 |
128 | Hugging Face Spaces offer a simple way to host ML demo apps directly on your profile or your organization’s profile. [Argilla](https://argilla.io/) is a powerful data annotation tool that is strongly integrated with Hugging Face Spaces and other parts of the Hugging Face ecosystem.
129 |
130 | 
131 |
132 | The [01_create_preference_task.ipynb](01_create_preference_task.ipynb) notebook shows how you could create an Argilla Space for preference dataset annotation that anyone with a Hugging Face account can contribute to. This is a great way to collect human feedback on your synthetic data.
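   | 
   | At its core, the notebook defines a `FeedbackDataset` with the fields to show and the question to ask, then pushes it to your Argilla instance. A minimal sketch in Argilla 1.x (the URL, API key, and dataset/workspace names below are placeholders) might look like:
   | 
   | ```python
   | import argilla as rg
   | 
   | # Point the client at your Argilla Space (placeholder values)
   | rg.init(api_url="https://<your-argilla-space>.hf.space", api_key="<owner-api-key>")
   | 
   | dataset = rg.FeedbackDataset(
   |     fields=[
   |         rg.TextField(name="prompt"),
   |         rg.TextField(name="response"),
   |     ],
   |     questions=[
   |         # A binary 👍/👎 judgement is all KTO needs
   |         rg.LabelQuestion(
   |             name="preference",
   |             title="Is this a good response to the prompt?",
   |             labels=["👍", "👎"],
   |         )
   |     ],
   | )
   | dataset.push_to_argilla(name="kto-preference", workspace="admin")
   | ```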
133 |
134 | This will create a task that looks like this:
135 |
136 | 
137 |
138 | ## Next steps
139 |
140 | The notebooks and code currently only show how to generate the synthetic data and create a preference dataset annotation Space. The next steps are to collect human feedback on the synthetic data and then use it to train a model. We will cover this in a future notebook.
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/access.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/access.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/app-creation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/app-creation.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/datasets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/datasets.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/dpo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/dpo.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/secrets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/secrets.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/space.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/space.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/storage.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/task.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/assets/viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/viewer.png
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/preference_gen.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | from datasets import load_dataset
5 | from distilabel.llm import LLM, InferenceEndpointsLLM, LLMPool, ProcessLLM
6 | from distilabel.pipeline import Pipeline
7 | from distilabel.tasks import Task, TextGenerationTask
8 | from dotenv import load_dotenv
9 |
10 | load_dotenv()
11 |
12 | # You need to set the HF_TOKEN environment variable to your Hugging Face API token
13 | HF_TOKEN = os.getenv("HF_TOKEN")
14 | assert HF_TOKEN is not None, "Please set HF_TOKEN to your Hugging Face API token"
15 | HF_USER_NAME = None  # set this to your Hugging Face username before running
16 | assert HF_USER_NAME, "Please set HF_USER_NAME to your Hugging Face username"
17 |
18 | # If you want to sample from the dataset, set this to the number of samples you want.
19 | # If your sample size is larger than the dataset, the full dataset will be used.
20 | SAMPLE_SIZE = None
21 |
22 |
23 | ## Load the dataset of prompts
24 | def prepare_data():
25 | prompts = load_dataset("davanstrien/haiku_prompts", split="train")
26 | print(f"Loaded {len(prompts)} prompts")
27 | return prompts.rename_column("instructions", "input")
28 |
29 |
30 | dataset = prepare_data()
31 |
32 | ## Define the task
33 |
34 | task = TextGenerationTask(
35 | system_prompt="""You are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n""",
36 | )
37 |
38 | print(task.system_prompt)
39 |
40 |
41 | # load llms
42 | def load_llama2(task: Task) -> LLM:
43 | return InferenceEndpointsLLM(
44 | "meta-llama/Llama-2-70b-chat-hf",
45 | token=HF_TOKEN,
46 | task=task,
47 | max_new_tokens=512,
48 | prompt_format="llama2",
49 | )
50 |
51 |
52 | def load_mistral(task: Task) -> LLM:
53 | checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
54 | return InferenceEndpointsLLM(
55 | checkpoint,
56 | token=HF_TOKEN,
57 | task=task,
58 | max_new_tokens=512,
59 | prompt_format="llama2",
60 | )
61 |
62 |
63 | # uncomment to use nous-hermes-2-yi-34b-aug
64 |
65 | # def load_nous_yi(task: Task) -> LLM:
66 | # checkpoint = "nous-hermes-2-yi-34b-aug"
67 | # return InferenceEndpointsLLM(
68 | # checkpoint,
69 | # token=HF_TOKEN,
70 | # task=task,
71 | # max_new_tokens=488,
72 | # prompt_format="chatml",
73 | # )
74 |
75 |
76 | mistral = ProcessLLM(task=task, load_llm_fn=load_mistral)
77 | llama2 = ProcessLLM(task=task, load_llm_fn=load_llama2)
78 | # uncomment to use nous-hermes-2-yi-34b-aug
79 | # nous_yi = ProcessLLM(task=task, load_llm_fn=load_nous_yi)
80 |
81 | llms = [
82 | mistral,
83 | llama2,
84 | ] # nous_yi] # uncomment to use nous-hermes-2-yi-34b-aug
85 |
86 |
87 | pool = LLMPool(llms=llms)
88 |
89 |
90 | pipeline = Pipeline(generator=pool)
91 |
92 | if SAMPLE_SIZE is not None:
93 | sample_idx = random.sample(range(len(dataset)), min(SAMPLE_SIZE, len(dataset)))
94 | dataset = dataset.select(sample_idx)
95 | print(f"Using {len(dataset)} prompts")
96 |
97 | print("Generating haiku...")
98 | haiku = pipeline.generate(
99 | dataset,
100 | num_generations=3,
101 | batch_size=1,
102 | display_progress_bar=True,
103 | shuffle_before_labelling=False,
104 | )
105 |
106 | print(haiku)
107 | print("Pushing to hub...")
108 | haiku.push_to_hub(f"{HF_USER_NAME}/haiku_dpo", "aesthetic-preference", token=HF_TOKEN)
109 |
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/requirements.in:
--------------------------------------------------------------------------------
1 | argilla
2 | datasets
3 | distilabel[hf-inference-endpoints]
4 | huggingface_hub
5 | ipywidgets
6 | python-dotenv
--------------------------------------------------------------------------------
/cookbook-efforts/kto-preference/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # uv pip compile requirements.in -o requirements.txt
3 | aiohttp==3.9.3
4 | # via
5 | # datasets
6 | # fsspec
7 | aiosignal==1.3.1
8 | # via aiohttp
9 | annotated-types==0.6.0
10 | # via pydantic
11 | anyio==4.3.0
12 | # via httpx
13 | argilla==1.25.0
14 | asttokens==2.4.1
15 | # via stack-data
16 | attrs==23.2.0
17 | # via aiohttp
18 | backoff==2.2.1
19 | # via argilla
20 | certifi==2024.2.2
21 | # via
22 | # httpcore
23 | # httpx
24 | # requests
25 | charset-normalizer==3.3.2
26 | # via requests
27 | click==8.1.7
28 | # via
29 | # nltk
30 | # typer
31 | comm==0.2.2
32 | # via ipywidgets
33 | datasets==2.18.0
34 | # via distilabel
35 | decorator==5.1.1
36 | # via ipython
37 | deprecated==1.2.14
38 | # via argilla
39 | dill==0.3.8
40 | # via
41 | # datasets
42 | # multiprocess
43 | distilabel==0.6.0
44 | executing==2.0.1
45 | # via stack-data
46 | filelock==3.13.1
47 | # via
48 | # datasets
49 | # huggingface-hub
50 | frozenlist==1.4.1
51 | # via
52 | # aiohttp
53 | # aiosignal
54 | fsspec==2024.2.0
55 | # via
56 | # datasets
57 | # huggingface-hub
58 | h11==0.14.0
59 | # via httpcore
60 | httpcore==1.0.4
61 | # via httpx
62 | httpx==0.26.0
63 | # via argilla
64 | huggingface-hub==0.21.4
65 | # via
66 | # datasets
67 | # distilabel
68 | idna==3.6
69 | # via
70 | # anyio
71 | # httpx
72 | # requests
73 | # yarl
74 | ipython==8.22.2
75 | # via ipywidgets
76 | ipywidgets==8.1.2
77 | jedi==0.19.1
78 | # via ipython
79 | jinja2==3.1.3
80 | # via distilabel
81 | joblib==1.3.2
82 | # via nltk
83 | jupyterlab-widgets==3.0.10
84 | # via ipywidgets
85 | markdown-it-py==3.0.0
86 | # via rich
87 | markupsafe==2.1.5
88 | # via jinja2
89 | matplotlib-inline==0.1.6
90 | # via ipython
91 | mdurl==0.1.2
92 | # via markdown-it-py
93 | monotonic==1.6
94 | # via argilla
95 | multidict==6.0.5
96 | # via
97 | # aiohttp
98 | # yarl
99 | multiprocess==0.70.16
100 | # via
101 | # datasets
102 | # distilabel
103 | nltk==3.8.1
104 | # via argilla
105 | numpy==1.23.5
106 | # via
107 | # argilla
108 | # datasets
109 | # pandas
110 | # pyarrow
111 | packaging==24.0
112 | # via
113 | # argilla
114 | # datasets
115 | # huggingface-hub
116 | pandas==2.2.1
117 | # via
118 | # argilla
119 | # datasets
120 | parso==0.8.3
121 | # via jedi
122 | pexpect==4.9.0
123 | # via ipython
124 | prompt-toolkit==3.0.43
125 | # via ipython
126 | ptyprocess==0.7.0
127 | # via pexpect
128 | pure-eval==0.2.2
129 | # via stack-data
130 | pyarrow==15.0.1
131 | # via datasets
132 | pyarrow-hotfix==0.6
133 | # via datasets
134 | pydantic==2.6.4
135 | # via argilla
136 | pydantic-core==2.16.3
137 | # via pydantic
138 | pygments==2.17.2
139 | # via
140 | # ipython
141 | # rich
142 | python-dateutil==2.9.0.post0
143 | # via pandas
144 | python-dotenv==1.0.1
145 | pytz==2024.1
146 | # via pandas
147 | pyyaml==6.0.1
148 | # via
149 | # datasets
150 | # huggingface-hub
151 | regex==2023.12.25
152 | # via nltk
153 | requests==2.31.0
154 | # via
155 | # datasets
156 | # huggingface-hub
157 | rich==13.7.1
158 | # via
159 | # argilla
160 | # distilabel
161 | six==1.16.0
162 | # via
163 | # asttokens
164 | # python-dateutil
165 | sniffio==1.3.1
166 | # via
167 | # anyio
168 | # httpx
169 | stack-data==0.6.3
170 | # via ipython
171 | tenacity==8.2.3
172 | # via distilabel
173 | tqdm==4.66.2
174 | # via
175 | # argilla
176 | # datasets
177 | # huggingface-hub
178 | # nltk
179 | traitlets==5.14.2
180 | # via
181 | # comm
182 | # ipython
183 | # ipywidgets
184 | # matplotlib-inline
185 | typer==0.9.0
186 | # via argilla
187 | typing-extensions==4.10.0
188 | # via
189 | # huggingface-hub
190 | # pydantic
191 | # pydantic-core
192 | # typer
193 | tzdata==2024.1
194 | # via pandas
195 | urllib3==2.2.1
196 | # via requests
197 | wcwidth==0.2.13
198 | # via prompt-toolkit
199 | widgetsnbextension==4.0.10
200 | # via ipywidgets
201 | wrapt==1.14.1
202 | # via
203 | # argilla
204 | # deprecated
205 | xxhash==3.4.1
206 | # via datasets
207 | yarl==1.9.4
208 | # via aiohttp
209 |
--------------------------------------------------------------------------------