├── .github ├── actions │ └── update-progress │ │ ├── requirements.txt │ │ └── src │ │ └── dashboard.py └── workflows │ └── update-fineweb-progres.yml ├── .gitignore ├── README.md ├── community-efforts ├── image_preferences │ ├── 00_imgsys_shuffled_deduplicated_cleaned.py │ ├── 01_synthetic_data_generation_images.py │ ├── 01_synthetic_data_generation_prompts.py │ ├── 01_synthetic_data_generation_total.py │ ├── 02_image_prefernces_cleaned_filtered_sfw.py │ ├── 03_upload_to_argilla.ipynb │ ├── 04_binarize_preference_results.ipynb │ ├── 05_fine_tune_flux_lora.ipynb │ ├── README.md │ ├── requirements.txt │ └── template.html ├── prompt_ranking │ ├── README.md │ └── assets │ │ └── synthetic-vs-human.png └── prompt_translation │ ├── 01_setup_prompt_translation_space.ipynb │ ├── 02_upload_prompt_translation_data.ipynb │ ├── 03_create_dashboard.ipynb │ ├── README.md │ ├── Translation_with_distilabel_gpt_4_turbo.ipynb │ ├── dashboard_template │ ├── .gitattributes │ ├── README.md │ ├── app.py │ ├── dumpy.py │ └── requirements.txt │ └── requirements.in └── cookbook-efforts ├── domain-specific-datasets ├── README.md ├── assets │ ├── pipeline.png │ └── setup.png ├── distilabel_pipelines │ ├── domain_expert_pipeline.py │ └── requirements.txt ├── parent_app │ ├── app.py │ ├── hub.py │ ├── pages │ │ └── 🧑‍🌾 Domain Data Grower.py │ ├── project_config.json │ └── seed_data.json └── project_app │ ├── .streamlit │ └── config.toml │ ├── DATASET_README_BASE.md │ ├── README.md │ ├── app.py │ ├── defaults.py │ ├── hub.py │ ├── infer.py │ ├── pages │ ├── 2_👩🏼‍🔬 Describe Domain.py │ ├── 3_🌱 Generate Dataset.py │ └── 4_🔍 Review Generated Data.py │ ├── pipeline.yaml │ ├── project_config.json │ ├── requirements.txt │ ├── seed_data.json │ └── utils.py ├── dpo-orpo-preference ├── 00_datasets_exploration.ipynb ├── 01_data_prep.ipynb ├── 02_load_from_argilla.ipynb ├── README.md ├── assets │ └── banner.webp ├── aya_dpo_gen.py ├── custom_preference_to_argilla.py ├── examples │ └── en │ │ ├── 01_en_data_prep.ipynb │ │ ├── aya_en_dpo_gen.py │ │ └── custom_preference_to_argilla.py ├── instructions.md ├── requirements.in └── requirements.txt └── kto-preference ├── 01_create_preference_task.ipynb ├── README.md ├── assets ├── access.png ├── app-creation.png ├── b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp ├── datasets.png ├── dpo.png ├── secrets.png ├── space.png ├── storage.png ├── task.png └── viewer.png ├── preference_gen.py ├── requirements.in └── requirements.txt /.github/actions/update-progress/requirements.txt: -------------------------------------------------------------------------------- 1 | argilla 2 | huggingface-hub 3 | httpx 4 | stamina 5 | polars 6 | tqdm 7 | python-dotenv -------------------------------------------------------------------------------- /.github/actions/update-progress/src/dashboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argilla as rg 3 | from huggingface_hub import HfApi, hf_hub_download 4 | import httpx 5 | import stamina 6 | import polars as pl 7 | from tqdm.contrib.concurrent import thread_map 8 | from argilla._exceptions import ArgillaAPIError 9 | from datetime import datetime, timezone 10 | from dotenv import load_dotenv 11 | from functools import lru_cache 12 | import time 13 | 14 | # Load environment variables from .env file when running locally 15 | load_dotenv() 16 | 17 | # Enable HF transfer 18 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 19 | 20 | # Validate environment variables 21 | HF_TOKEN = 
os.environ.get("HF_TOKEN") 22 | 23 | if not HF_TOKEN: 24 | raise ValueError("HF_TOKEN environment variable is not set") 25 | 26 | if ARGILLA_API_KEY := os.environ.get("ARGILLA_API_KEY"): 27 | client = rg.Argilla( 28 | api_url="https://data-is-better-together-fineweb-c.hf.space", 29 | api_key=ARGILLA_API_KEY, 30 | timeout=120, 31 | headers={"Authorization": f"Bearer {HF_TOKEN}"}, 32 | ) 33 | else: 34 | raise ValueError("ARGILLA_API_KEY environment variable is not set") 35 | 36 | 37 | @lru_cache(maxsize=1) 38 | def get_all_datasets(): 39 | return client.datasets.list() 40 | 41 | 42 | def get_dataset_for_language(language_code): 43 | all_datasets = get_all_datasets() 44 | dataset = [ 45 | dataset for dataset in all_datasets if dataset.name.startswith(language_code) 46 | ] 47 | if len(dataset) != 1: 48 | raise ValueError( 49 | f"Found {len(dataset)} datasets for language code {language_code}" 50 | ) 51 | dataset_name = dataset[0].name 52 | return client.datasets(dataset_name) 53 | 54 | 55 | # Get all datasets 56 | all_datasets = get_all_datasets() 57 | language_datasets_names = [dataset.name for dataset in all_datasets] 58 | 59 | 60 | @stamina.retry( 61 | on=(httpx.HTTPStatusError, ArgillaAPIError), 62 | attempts=5, 63 | wait_initial=15, 64 | ) 65 | def get_dataset_progress(language_dataset_name): 66 | time.sleep(2) 67 | dataset = client.datasets(language_dataset_name) 68 | return { 69 | "language_dataset_name": language_dataset_name, 70 | **dataset.progress(with_users_distribution=True), 71 | } 72 | 73 | 74 | def flatten_user_stats(dataset): 75 | dataset_name = dataset["language_dataset_name"] 76 | current_timestamp = datetime.now(timezone.utc) 77 | user_stats = [] 78 | 79 | if dataset["users"]: 80 | user_stats.extend( 81 | { 82 | "language_dataset_name": dataset_name, 83 | "username": str(username), 84 | "submitted": int( 85 | stats["completed"]["submitted"] + stats["pending"]["submitted"] 86 | ), 87 | "total": int(dataset["total"]), 88 | "timestamp": current_timestamp, 89 | } 90 | for username, stats in dataset["users"].items() 91 | ) 92 | else: 93 | user_stats.append( 94 | { 95 | "language_dataset_name": dataset_name, 96 | "username": None, 97 | "submitted": 0, 98 | "total": int(dataset["total"]), 99 | "timestamp": current_timestamp, 100 | } 101 | ) 102 | 103 | return user_stats 104 | 105 | 106 | def update_progress_data(new_data, filename="argilla_progress.ndjson"): 107 | # Process new data 108 | all_user_stats = [] 109 | for dataset in new_data: 110 | all_user_stats.extend(flatten_user_stats(dataset)) 111 | 112 | new_df = pl.DataFrame( 113 | all_user_stats, 114 | schema={ 115 | "language_dataset_name": pl.Utf8, 116 | "username": pl.Utf8, 117 | "submitted": pl.Int64, 118 | "total": pl.Int64, 119 | "timestamp": pl.Datetime, 120 | }, 121 | ) 122 | 123 | try: 124 | fname = hf_hub_download( 125 | repo_id="davanstrien/progress", 126 | filename="argilla_progress.ndjson", 127 | repo_type="dataset", 128 | ) 129 | existing_df = pl.read_ndjson(fname) 130 | combined_df = pl.concat([existing_df, new_df]) 131 | except FileNotFoundError: 132 | print("No existing data found, creating new dataset") 133 | combined_df = new_df 134 | except Exception as e: 135 | print(f"Error loading existing data: {e}") 136 | combined_df = new_df 137 | 138 | combined_df.write_ndjson(filename) 139 | return combined_df 140 | 141 | 142 | def main(): 143 | print("Starting data collection...") 144 | all_data = thread_map( 145 | get_dataset_progress, language_datasets_names, max_workers=1) 146 | 147 | print("Updating progress 
data...") 148 | df = update_progress_data(all_data) 149 | df = df.sort("language_dataset_name") 150 | 151 | print("Saving data...") 152 | df.write_ndjson("argilla_progress.ndjson") 153 | 154 | print("Uploading to Hugging Face Hub...") 155 | api = HfApi() 156 | api.create_repo( 157 | "data-is-better-together/fineweb-c-progress", repo_type="dataset", exist_ok=True 158 | ) 159 | api.upload_file( 160 | path_or_fileobj="argilla_progress.ndjson", 161 | repo_id="data-is-better-together/fineweb-c-progress", 162 | repo_type="dataset", 163 | path_in_repo="argilla_progress.ndjson", 164 | ) 165 | print("Done!") 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | -------------------------------------------------------------------------------- /.github/workflows/update-fineweb-progres.yml: -------------------------------------------------------------------------------- 1 | name: Update Progress Data 2 | 3 | on: 4 | schedule: 5 | - cron: "0 */6 * * *" # Runs every 6 hours 6 | workflow_dispatch: # Allows manual triggering 7 | 8 | jobs: 9 | update-progress: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.12" 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r .github/actions/update-progress/requirements.txt 24 | 25 | - name: Run update script 26 | env: 27 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 28 | ARGILLA_API_KEY: ${{ secrets.ARGILLA_API_KEY }} 29 | run: | 30 | python .github/actions/update-progress/src/dashboard.py 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.oauth.yaml 2 | /.venv 3 | kto-preference/.env 4 | kto-preference/.vscode/settings.json 5 | .DS_Store 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
166 | #.idea/ 167 | # vscode 168 | **/.vscode/settings.json 169 | .vscode/ 170 | community-efforts/image_preferences/images 171 | community-efforts/image_preferences/image_quality_dev 172 | community-efforts/image_preferences/image_simplified_dev 173 | community-efforts/image_preferences/image_quality_sd 174 | community-efforts/image_preferences/image_simplified_sd 175 | community-efforts/image_preferences/assets 176 | logs/ 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 🤗 Spaces & Datasets
6 | 7 | # Data is Better Together 8 | 9 | > If you are working on a valuable community-developed dataset but are limited by available resources, please reach out to us on the Hugging Face Discord. We may be able to provide support to enhance your project. 10 | 11 | Data is Better Together is a collaboration between 🤗 Hugging Face, 🏓 Argilla, and the Open-Source ML community. We aim to empower the open-source community to build impactful datasets collectively. This initiative consists of two main components: the community efforts and the cookbook efforts. 12 | 13 | 
14 | Community Efforts: hands-on projects, guided by the HF team, focused on creating valuable datasets. These projects required the participation of the community and have been successfully completed. 15 | 16 | 42 | 43 | 
44 | Cookbook Efforts: guides and tools that help the community build valuable datasets. They are not guided by the HF team and are designed to be used standalone, so you can freely contribute to them or use them to create your own unique dataset. 45 | 46 | 66 | 67 | **🤝 How can I contribute to the cookbook efforts?** That's easy! You can contribute by following the instructions in the README of the project you are interested in. Then, share your results with the community! 68 | 69 | 
70 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/00_imgsys_shuffled_deduplicated_cleaned.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset, load_dataset 2 | from fast_langdetect import detect 3 | 4 | dataset = load_dataset("fal/imgsys-results", split="train") 5 | dataset = dataset.shuffle() 6 | df = dataset.to_pandas() 7 | df = df.drop_duplicates(subset=["prompt"]) 8 | df = df.reset_index(drop=True) 9 | df = df[["prompt"]] 10 | df = df.dropna(subset=["prompt"]) 11 | df["language"], df["score"] = zip( 12 | *df["prompt"].apply(lambda x: detect(x.replace("\n", "")).values()) 13 | ) 14 | df = df[df["language"] == "en"] 15 | df = df["prompt"] 16 | dataset = Dataset.from_pandas(df) 17 | dataset.push_to_hub( 18 | "data-is-better-together/imgsys-results-prompts-shuffled-cleaned-deduplicated-english" 19 | ) 20 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/01_synthetic_data_generation_prompts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | os.environ["DISTILABEL_LOG_LEVEL"] = "DEBUG" 5 | 6 | from distilabel.llms import InferenceEndpointsLLM 7 | 8 | # from distilabel.llms.huggingface import InferenceEndpointsLLM 9 | from distilabel.pipeline import Pipeline 10 | from distilabel.steps import GroupColumns, KeepColumns, LoadDataFromHub, StepInput, step 11 | from distilabel.steps.base import StepInput 12 | from distilabel.steps.tasks import TextGeneration 13 | from distilabel.steps.typing import StepOutput 14 | 15 | ## At the time of writing this, the distilabel library does not support the image generation endpoint. 16 | ## This is a temporary fix to allow us to use the image generation endpoint. 
17 | 18 | ## Let's determine the categories and subcategories for the image generation task 19 | # https://huggingface.co/spaces/google/sdxl/blob/main/app.py#L55 20 | categories = { 21 | # included 22 | "Cinematic": [ 23 | # included 24 | "emotional", 25 | "harmonious", 26 | "vignette", 27 | "highly detailed", 28 | "high budget", 29 | "bokeh", 30 | "cinemascope", 31 | "moody", 32 | "epic", 33 | "gorgeous", 34 | "film grain", 35 | "grainy", 36 | ], 37 | # included 38 | "Photographic": [ 39 | # included 40 | "film", 41 | "bokeh", 42 | "professional", 43 | "4k", 44 | "highly detailed", 45 | ## not included 46 | "Landscape", 47 | "Portrait", 48 | "Macro", 49 | "Portra", 50 | "Gold", 51 | "ColorPlus", 52 | "Ektar", 53 | "Superia", 54 | "C200", 55 | "CineStill", 56 | "CineStill 50D", 57 | "CineStill 800T", 58 | "Tri-X", 59 | "HP5", 60 | "Delta", 61 | "T-Max", 62 | "Fomapan", 63 | "StreetPan", 64 | "Provia", 65 | "Ektachrome", 66 | "Velvia", 67 | ], 68 | # included 69 | "Anime": [ 70 | # included 71 | "anime style", 72 | "key visual", 73 | "vibrant", 74 | "studio anime", 75 | "highly detailed", 76 | ], 77 | # included 78 | "Manga": [ 79 | # included 80 | "vibrant", 81 | "high-energy", 82 | "detailed", 83 | "iconic", 84 | "Japanese comic style", 85 | ], 86 | # included 87 | "Digital art": [ 88 | # included 89 | "digital artwork", 90 | "illustrative", 91 | "painterly", 92 | "matte painting", 93 | "highly detailed", 94 | ], 95 | # included 96 | "Pixel art": [ 97 | # included 98 | "low-res", 99 | "blocky", 100 | "pixel art style", 101 | "8-bit graphics", 102 | ], 103 | # included 104 | "Fantasy art": [ 105 | # included 106 | "magnificent", 107 | "celestial", 108 | "ethereal", 109 | "painterly", 110 | "epic", 111 | "majestic", 112 | "magical", 113 | "fantasy art", 114 | "cover art", 115 | "dreamy", 116 | ], 117 | # included 118 | "Neonpunk": [ 119 | # included 120 | "cyberpunk", 121 | "vaporwave", 122 | "neon", 123 | "vibes", 124 | "vibrant", 125 | "stunningly beautiful", 126 | "crisp", 127 | "detailed", 128 | "sleek", 129 | "ultramodern", 130 | "magenta highlights", 131 | "dark purple shadows", 132 | "high contrast", 133 | "cinematic", 134 | "ultra detailed", 135 | "intricate", 136 | "professional", 137 | ], 138 | # included 139 | "3D Model": [ 140 | # included 141 | "octane render", 142 | "highly detailed", 143 | "volumetric", 144 | "dramatic lighting", 145 | ], 146 | # not included 147 | "Painting": [ 148 | "Oil", 149 | "Acrylic", 150 | "Watercolor", 151 | "Digital", 152 | "Mural", 153 | "Sketch", 154 | "Gouache", 155 | "Renaissance", 156 | "Baroque", 157 | "Romanticism", 158 | "Impressionism", 159 | "Expressionism", 160 | "Cubism", 161 | "Surrealism", 162 | "Pop Art", 163 | "Minimalism", 164 | "Realism", 165 | "Encaustic", 166 | "Tempera", 167 | "Fresco", 168 | "Ink Wash", 169 | "Spray Paint", 170 | "Mixed Media", 171 | ], 172 | # not included 173 | "Animation": [ 174 | # not included 175 | "Animation", 176 | "Stop motion", 177 | "Claymation", 178 | "Pixel Art", 179 | "Vector", 180 | "Hand-drawn", 181 | "Cutout", 182 | "Whiteboard", 183 | ], 184 | # not included 185 | "Illustration": [ 186 | # not included 187 | "Book", 188 | "Comics", 189 | "Editorial", 190 | "Advertising", 191 | "Technical", 192 | "Fantasy", 193 | "Scientific", 194 | "Fashion", 195 | "Storyboard", 196 | "Concept Art", 197 | "Manga", 198 | "Anime", 199 | "Digital", 200 | "Vector", 201 | "Design", 202 | ], 203 | } 204 | 205 | ## We will use the Qwen2.5-72B-Instruct model for the text generation task, this will help us to generate 
the quality and style prompts 206 | 207 | model_id = ( 208 | "meta-llama/Llama-3.1-8B-Instruct" 209 | ) # "meta-llama/Meta-Llama-3.1-70B-Instruct" 210 | 211 | 212 | llm = InferenceEndpointsLLM( 213 | # model_id=model_id, 214 | # tokenizer_id=model_id, 215 | generation_kwargs={"temperature": 0.8, "max_new_tokens": 2048}, 216 | base_url="https://rti2mzernqmo00qy.us-east-1.aws.endpoints.huggingface.cloud", 217 | api_key=os.getenv("HF_TOKEN"), 218 | ) 219 | 220 | 221 | ## We will use two types of prompts: quality and style. The quality prompt will help us to generate the quality-enhanced prompts and the style prompt will help us to generate the style-enhanced prompts. 222 | quality_prompt = """ 223 | You are an expert at refining prompts for image generation models. Your task is to enhance the given prompt by adding descriptive details and quality-improving elements, while maintaining the original intent and core concept. 224 | 225 | Follow these guidelines: 226 | 1. Preserve the main subject and action of the original prompt. 227 | 2. Add specific, vivid details to enhance visual clarity. 228 | 3. Incorporate elements that improve overall image quality and aesthetics. 229 | 4. Keep the prompt concise and avoid unnecessary words. 230 | 5. Use modifiers that are appropriate for the subject matter. 231 | 232 | Example modifiers (use as reference, adapt based on some aspect that's suitable for the original prompt): 233 | - Lighting: "soft golden hour light", "dramatic chiaroscuro", "ethereal glow" 234 | - Composition: "rule of thirds", "dynamic perspective", "symmetrical balance" 235 | - Texture: "intricate details", "smooth gradients", "rich textures" 236 | - Color: "vibrant color palette", "monochromatic scheme", "complementary colors" 237 | - Atmosphere: "misty ambiance", "serene mood", "energetic atmosphere" 238 | - Technical: "high resolution", "photorealistic", "sharp focus" 239 | 240 | The enhanced prompt should be short, concise, direct, avoid unnecessary words and written as it was a human expert writing the prompt. 241 | 242 | Output only one enhanced prompt without any additional text or explanations. 243 | 244 | ## Original Prompt 245 | {{ style_prompt }} 246 | 247 | ## Quality-Enhanced Prompt 248 | """ 249 | 250 | style_prompt = """ 251 | You are an expert at refining prompts for image generation models. Your task is to enhance the given prompt by transforming it into a specific artistic style, technique, or genre, while maintaining the original core concept. 252 | 253 | Follow these guidelines: 254 | 1. Preserve the main subject and action of the original prompt but rewrite stylistic elements already present in the prompt. 255 | 2. Transform the prompt into a distinctive visual style (e.g., impressionism, surrealism, cyberpunk, art nouveau). 256 | 3. Incorporate style-specific elements and techniques. 257 | 4. Keep the prompt concise and avoid unnecessary words. 258 | 5. Use modifiers that are appropriate for the chosen style. 259 | 260 | You should use the following style, technique, genre to enhance the prompt: 261 | {{ category }} / {{ subcategory }} 262 | 263 | The enhanced prompt should be short, concise, direct, avoid unnecessary words and written as it was a human expert writing the prompt. 264 | 265 | Output only one style-enhanced prompt without any additional text or explanations. 266 | 267 | ## Original Prompt 268 | {{ prompt }} 269 | 270 | ## Style-Enhanced Prompt 271 | """ 272 | 273 | simplification_prompt = """ 274 | You are an expert at simplifying image descriptions. 
Your task is to simplify the description by removing any unnecessary words and phrases, while maintaining the original intent and core concept of the description. 275 | 276 | Follow these guidelines: 277 | 1. Preserve the main subject of the original description. 278 | 2. Remove all any unnecessary words and phrases. 279 | 3. Ensure the simplified description could have been quickly written by a human. 280 | 281 | ## Original Description 282 | {{ style_prompt }} 283 | 284 | ## Simplified Description 285 | """ 286 | 287 | ## Let's create the pipeline to generate the quality and style prompts 288 | 289 | with Pipeline(name="image_preferences_synthetic_data_generation") as pipeline: 290 | load_data = LoadDataFromHub(name="load_dataset") 291 | 292 | @step(inputs=["prompt"], outputs=["category", "subcategory", "prompt"]) 293 | def CategorySelector(inputs: StepInput) -> "StepOutput": 294 | result = [] 295 | for input in inputs: 296 | # Randomly select a category 297 | category = random.choice(list(categories.keys())) 298 | # Randomly select a subcategory from the chosen category 299 | subcategory = random.choice(categories[category]) 300 | 301 | result.append( 302 | { 303 | "category": category, 304 | "subcategory": subcategory, 305 | "prompt": input["prompt"], 306 | } 307 | ) 308 | yield result 309 | 310 | category_selector = CategorySelector(name="category_selector") 311 | 312 | style_augmentation = TextGeneration( 313 | llm=llm, 314 | template=style_prompt, 315 | columns=["prompt", "category", "subcategory"], 316 | name="style_augmentation", 317 | output_mappings={"generation": "style_prompt"}, 318 | input_batch_size=4, 319 | ) 320 | 321 | simplification_augmentation = TextGeneration( 322 | llm=llm, 323 | template=simplification_prompt, 324 | columns=["style_prompt"], 325 | name="simplification_augmentation", 326 | output_mappings={"generation": "simplified_prompt"}, 327 | input_batch_size=2, 328 | ) 329 | 330 | quality_augmentation = TextGeneration( 331 | llm=llm, 332 | template=quality_prompt, 333 | columns=["style_prompt"], 334 | name="quality_augmentation", 335 | output_mappings={"generation": "quality_prompt"}, 336 | input_batch_size=2, 337 | ) 338 | 339 | group_columns = GroupColumns(columns=["model_name"]) 340 | keep_columns = KeepColumns( 341 | columns=[ 342 | "prompt", 343 | "category", 344 | "subcategory", 345 | "style_prompt", 346 | "quality_prompt", 347 | "simplified_prompt", 348 | ] 349 | ) 350 | 351 | ( 352 | load_data 353 | >> category_selector 354 | >> style_augmentation 355 | >> [quality_augmentation, simplification_augmentation] 356 | >> group_columns 357 | >> keep_columns 358 | ) 359 | 360 | ## Let's run the pipeline and push the resulting dataset to the hub 361 | 362 | if __name__ == "__main__": 363 | num_examples = 15000 364 | distiset = pipeline.run( 365 | use_cache=True, 366 | parameters={ 367 | load_data.name: { 368 | "num_examples": num_examples, 369 | "repo_id": "data-is-better-together/imgsys-results-prompts-shuffled-cleaned-deduplicated-english", 370 | } 371 | }, 372 | ) 373 | dataset_name = "data-is-better-together/imgsys-results-prompts-style_v2_part1" 374 | distiset.push_to_hub( 375 | repo_id=dataset_name, 376 | include_script=True, 377 | generate_card=False, 378 | token=os.getenv("HF_TOKEN"), 379 | ) 380 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/02_image_prefernces_cleaned_filtered_sfw.py: -------------------------------------------------------------------------------- 1 | from 
collections import defaultdict 2 | 3 | from datasets import load_dataset 4 | from transformers import pipeline 5 | 6 | pipe_text = pipeline( 7 | "text-classification", 8 | model="ezb/NSFW-Prompt-Detector", 9 | device="mps", 10 | ) 11 | pipe_text_2 = pipeline( 12 | "text-classification", 13 | model="michellejieli/NSFW_text_classifier", 14 | device="mps", 15 | ) 16 | pipe_image = pipeline( 17 | "image-classification", 18 | model="MichalMlodawski/nsfw-image-detection-large", 19 | device="mps", 20 | ) 21 | 22 | label_to_category_text = { 23 | "LABEL_0": "Safe", 24 | "LABEL_1": "Questionable", 25 | "LABEL_2": "Unsafe", 26 | } 27 | 28 | 29 | def clean_dataset(batch): 30 | try: 31 | batch["nsfw_text"] = [] 32 | batch["nsfw_image"] = [] 33 | evaluated_results_image = defaultdict(list) 34 | evaluated_results_text = defaultdict(list) 35 | 36 | image_columns = [ 37 | "image_quality_dev", 38 | "image_simplified_dev", 39 | "image_quality_sd", 40 | "image_simplified_sd", 41 | ] 42 | 43 | for image_column in image_columns: 44 | results_image = pipe_image(batch[image_column]) 45 | evaluated_results_image[image_column] = [ 46 | res[0]["label"] in ["UNSAFE", "QUESTIONABLE"] for res in results_image 47 | ] 48 | 49 | try: 50 | results_text = pipe_text(batch["prompt"]) 51 | results_text_2 = pipe_text_2(batch["prompt"]) 52 | evaluated_results_text["text"] = [ 53 | res["label"] == "NSFW" for res in results_text 54 | ] 55 | evaluated_results_text["text_2"] = [ 56 | res["label"] == "NSFW" for res in results_text_2 57 | ] 58 | except Exception: 59 | try: 60 | results_text_2 = pipe_text_2(batch["prompt"]) 61 | evaluated_results_text["text_2"] = [ 62 | res["label"] == "NSFW" for res in results_text_2 63 | ] 64 | evaluated_results_text["text"] = [False] * len(results_text_2) 65 | except Exception: 66 | try: 67 | results_text = pipe_text(batch["prompt"]) 68 | evaluated_results_text["text"] = [ 69 | res["label"] == "NSFW" for res in results_text 70 | ] 71 | evaluated_results_text["text_2"] = [False] * len(results_text) 72 | except Exception: 73 | for item in batch["prompt"]: 74 | try: 75 | evaluated_results_text["text"].append( 76 | pipe_text(item)["label"] == "NSFW" 77 | ) 78 | except Exception: 79 | evaluated_results_text["text"].append(True) 80 | try: 81 | evaluated_results_text["text_2"].append( 82 | pipe_text_2(item)["label"] == "NSFW" 83 | ) 84 | except Exception: 85 | evaluated_results_text["text_2"].append(True) 86 | 87 | for i in range(len(evaluated_results_text["text"])): 88 | if any(evaluated_results_text[col][i] for col in evaluated_results_text): 89 | batch["nsfw_text"].append(True) 90 | else: 91 | batch["nsfw_text"].append(False) 92 | for i in range(len(evaluated_results_image["image_quality_dev"])): 93 | if any(evaluated_results_image[col][i] for col in evaluated_results_image): 94 | batch["nsfw_image"].append(True) 95 | else: 96 | batch["nsfw_image"].append(False) 97 | except Exception as e: 98 | raise Exception(e) 99 | return batch 100 | 101 | 102 | ds = load_dataset( 103 | "data-is-better-together/open-image-preferences-v1-unfiltered", split="train" 104 | ) 105 | df = ds.filter( 106 | lambda x: x["image_quality_dev"] 107 | and x["image_simplified_dev"] 108 | and x["image_quality_sd"] 109 | and x["image_simplified_sd"] 110 | ) 111 | ds = df.map(clean_dataset, batched=True, batch_size=100) 112 | ds = ds.filter(lambda x: not x["nsfw_text"] and not x["nsfw_image"]) 113 | ds = ds.remove_columns(["nsfw_text", "nsfw_image"]) 114 | ds.push_to_hub( 115 | "data-is-better-together/open-image-preferences-v1", 116 
| split="cleaned", 117 | private=True, 118 | ) 119 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/04_binarize_preference_results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install datasets" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Load and transform the dataset\n", 17 | "\n", 18 | "First, we load the dataset.\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "/Users/davidberenstein/Documents/programming/argilla/data-is-better-together/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 31 | " from .autonotebook import tqdm as notebook_tqdm\n" 32 | ] 33 | }, 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "Dataset({\n", 38 | " features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status'],\n", 39 | " num_rows: 5000\n", 40 | "})" 41 | ] 42 | }, 43 | "execution_count": 1, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "from datasets import load_dataset\n", 50 | "\n", 51 | "ds = load_dataset(\"data-is-better-together/image-preferences-v1-results\", split=\"train\")\n", 52 | "ds" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "{'id': '3368-quality',\n", 64 | " 'status': 'completed',\n", 65 | " '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n", 66 | " 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n", 67 | " 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n", 68 | " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n", 69 | " 'model_1': 'dev',\n", 70 | " 'model_2': 'sd',\n", 71 | " 'evolution': 'quality',\n", 72 | " 'category': 'Manga',\n", 73 | " 'sub_category': 'detailed',\n", 74 | " 'preference.responses': ['both_good', 'image_1', 'image_1'],\n", 75 | " 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n", 76 | " 'caf19767-2989-4b3c-a653-9c30afc6361d',\n", 77 | " 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n", 78 | " 'preference.responses.status': ['submitted', 'submitted', 'submitted']}" 79 | ] 80 | }, 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "ds[0]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "ds = ds.filter(lambda example: example['preference.responses'] is not None)" 97 | ] 98 | }, 99 | { 100 | 
"cell_type": "code", 101 | "execution_count": 9, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stderr", 106 | "output_type": "stream", 107 | "text": [ 108 | "Map: 100%|██████████| 4997/4997 [00:00<00:00, 12626.85 examples/s]\n" 109 | ] 110 | }, 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "{'id': '3368-quality',\n", 115 | " 'status': 'completed',\n", 116 | " '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n", 117 | " 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n", 118 | " 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n", 119 | " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n", 120 | " 'model_1': 'dev',\n", 121 | " 'model_2': 'sd',\n", 122 | " 'evolution': 'quality',\n", 123 | " 'category': 'Manga',\n", 124 | " 'sub_category': 'detailed',\n", 125 | " 'preference.responses': ['both_good', 'image_1', 'image_1'],\n", 126 | " 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n", 127 | " 'caf19767-2989-4b3c-a653-9c30afc6361d',\n", 128 | " 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n", 129 | " 'preference.responses.status': ['submitted', 'submitted', 'submitted'],\n", 130 | " 'chosen': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n", 131 | " 'chosen_model': 'black-forest-labs/FLUX.1-dev',\n", 132 | " 'rejected': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n", 133 | " 'rejected_model': 'stabilityai/stable-diffusion-3.5-large',\n", 134 | " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'}" 135 | ] 136 | }, 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "from collections import Counter\n", 144 | "\n", 145 | "def get_preference_winner(batch):\n", 146 | " responses = batch['preference.responses']\n", 147 | " cleaned_responses = []\n", 148 | " for response in responses:\n", 149 | " if response == 'both_good':\n", 150 | " cleaned_responses.append('image_1')\n", 151 | " cleaned_responses.append('image_2')\n", 152 | " else:\n", 153 | " cleaned_responses.append(response)\n", 154 | " counts = Counter(cleaned_responses)\n", 155 | " if counts['image_1'] > counts['image_2'] and counts['image_1'] > counts['both_bad']:\n", 156 | " batch['chosen'] = batch['images']['image_1']\n", 157 | " batch['chosen_model'] = batch[\"model_1\"]\n", 158 | " batch['rejected'] = batch['images']['image_2']\n", 159 | " batch['rejected_model'] = batch[\"model_2\"]\n", 160 | " elif counts['image_2'] > counts['image_1'] and counts['image_2'] > counts['both_bad']:\n", 161 | " batch['chosen'] = batch['images']['image_2']\n", 162 | " batch['chosen_model'] = batch[\"model_2\"]\n", 163 | " batch['rejected'] = batch['images']['image_1']\n", 164 | " batch['rejected_model'] = batch[\"model_1\"]\n", 165 | " else:\n", 166 | " 
batch['chosen'] = None\n", 167 | " batch['chosen_model'] = None\n", 168 | " batch['rejected'] = None\n", 169 | " batch['rejected_model'] = None\n", 170 | "\n", 171 | " batch[\"prompt\"] = batch[\"images\"][\"prompt\"]\n", 172 | " \n", 173 | " if batch['chosen_model'] == 'dev':\n", 174 | " batch['chosen_model'] = 'black-forest-labs/FLUX.1-dev'\n", 175 | " batch['rejected_model'] = 'stabilityai/stable-diffusion-3.5-large'\n", 176 | " else:\n", 177 | " batch['rejected_model'] = 'black-forest-labs/FLUX.1-dev'\n", 178 | " batch['chosen_model'] = 'stabilityai/stable-diffusion-3.5-large'\n", 179 | " \n", 180 | " return batch\n", 181 | "\n", 182 | "\n", 183 | "ds_formatted = ds.map(get_preference_winner)\n", 184 | "ds_formatted[0]\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 10, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "Filter: 100%|██████████| 4997/4997 [00:00<00:00, 48227.03 examples/s]\n" 197 | ] 198 | }, 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "Dataset({\n", 203 | " features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status', 'chosen', 'chosen_model', 'rejected', 'rejected_model', 'prompt'],\n", 204 | " num_rows: 3007\n", 205 | "})" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "ds_formatted_filtered = ds_formatted.filter(lambda example: example['chosen'] is not None)\n", 215 | "ds_formatted_filtered" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stderr", 225 | "output_type": "stream", 226 | "text": [ 227 | "Map: 100%|██████████| 1504/1504 [28:41<00:00, 1.14s/ examples]t/s]\n", 228 | "Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 70.73ba/s]\n", 229 | "Map: 100%|██████████| 1503/1503 [27:23<00:00, 1.09s/ examples], 1737.29s/it]\n", 230 | "Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 90.22ba/s]\n", 231 | "Uploading the dataset shards: 100%|██████████| 2/2 [56:40<00:00, 1700.25s/it]\n" 232 | ] 233 | }, 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "CommitInfo(commit_url='https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized/commit/a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', commit_message='Upload dataset', commit_description='', oid='a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='data-is-better-together/image-preferences-results-binarized'), pr_revision=None, pr_num=None)" 238 | ] 239 | }, 240 | "execution_count": 11, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "from datasets import Image\n", 247 | "relevant_columns = ['id', 'prompt', 'chosen', 'rejected', 'chosen_model', 'rejected_model', 'evolution', 'category', 'sub_category']\n", 248 | "ds_formatted_filtered_columns = ds_formatted_filtered.select_columns(relevant_columns)\n", 249 | "ds_formatted_filtered_columns = ds_formatted_filtered_columns.cast_column('chosen', Image())\n", 250 | "ds_formatted_filtered_columns = 
ds_formatted_filtered_columns.cast_column('rejected', Image())\n", 251 | "ds_formatted_filtered_columns.push_to_hub(\"data-is-better-together/open-image-preferences-v1-binarized\")\n" 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": ".venv", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.11.9" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 2 276 | } 277 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/05_fine_tune_flux_lora.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tune Flux LoRA on the image preferences dataset\n", 8 | "\n", 9 | "Note, we will not use preferences from the dev set for this fine-tuning. We will only use the chosen images for an Supervised fine-tuning phase. Additionally, we recommend using a A100 GPU (4$/hour on Hugging Face) for this fine-tuning because of the memory requirements. The fine-tuning script will take about 4 hours to complete for a single epoch.\n", 10 | "\n", 11 | "## Install dependencies\n", 12 | "\n", 13 | "We first make sure we have the latest version of diffusers installed. This is a development version of diffusers, so we need to install it from source. Additionally, we install the other dependencies that are required for the fine-tuning script." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "vscode": { 21 | "languageId": "plaintext" 22 | } 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "!git clone https://github.com/huggingface/diffusers\n", 27 | "!pip install -e diffusers/.\n", 28 | "!pip install datasets sentencepiece protobuf accelerate peft wandb torchvision prodigyopt" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Logins and config\n", 36 | "\n", 37 | "We will use Weights & Biases to log the training process. Additionally, we log in to Hugging Face to push the finetuned model to the Hub." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "!huggingface-cli login --token \"hf_xxx\"\n", 47 | "!wandb login \"xxx\"\n", 48 | "!accelerate config default" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Fine-tune the model\n", 56 | "\n", 57 | "Lastly, we fine-tune the Flux LoRA on the chosen images from the image preferences dataset. We heavily inspired from the [Dreambooth fine-tuning script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_flux.md) and modified it to work for our use case." 
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "!accelerate launch diffusers/examples/dreambooth/train_dreambooth_lora_flux.py \\\n", 67 | " --pretrained_model_name_or_path \"black-forest-labs/FLUX.1-dev\" \\\n", 68 | " --dataset_name \"data-is-better-together/open-image-preferences-v1-binarized\" \\\n", 69 | " --hub_model_id \"davidberenstein1957/open-image-preferences-v1-flux-dev-lora\" \\\n", 70 | " --push_to_hub \\\n", 71 | " --output_dir \"open-image-preferences-v1-flux-dev-lora\" \\\n", 72 | " --image_column \"chosen\" \\\n", 73 | " --caption_column \"prompt\" \\\n", 74 | " --mixed_precision=\"bf16\" \\\n", 75 | " --resolution=1024 \\\n", 76 | " --train_batch_size=1 \\\n", 77 | " --repeats=1 \\\n", 78 | " --report_to=\"wandb\"\\\n", 79 | " --gradient_accumulation_steps=1 \\\n", 80 | " --gradient_checkpointing \\\n", 81 | " --learning_rate=1.0 \\\n", 82 | " --text_encoder_lr=1.0 \\\n", 83 | " --optimizer=\"prodigy\"\\\n", 84 | " --lr_scheduler=\"constant\" \\\n", 85 | " --lr_warmup_steps=0 \\\n", 86 | " --rank=8 \\\n", 87 | " --checkpointing_steps=2000 \\\n", 88 | " --seed=\"0\" " 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "language_info": { 94 | "name": "python" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/README.md: -------------------------------------------------------------------------------- 1 | # Open Image Preferences Dataset 2 | 3 | ## What is it? 4 | 5 | This is a project for the community to contribute image preferences for an open source dataset, that could be used for training and evaluating text to image models. You can find a full blogpost [here](https://huggingface.co/blog/image-preferences). 6 | 7 | ## What did we achieve? 8 | 9 | We achieved to annotate 10K preference pairs. You can take a look at the resulting dataset [here](https://huggingface.co/datasets/data-is-better-together/open-image-preferences-v1-results), and [its version that is ready for training](https://huggingface.co/datasets/data-is-better-together/open-image-preferences-v1-binarized). Additionally, we showcased the effectiveness along with a [FLUX-dev LoRA fine-tune](https://huggingface.co/data-is-better-together/open-image-preferences-v1-flux-dev-lora). 10 | 11 | ## How to use the dataset 12 | 13 | The dataset is hosted on Hugging Face, and free for anyone to use under an Apache 2.0 license. Here are some [examples of how to use the dataset for fine-tuning or post-analysis](https://huggingface.co/blog/image-preferences#what-is-next). 14 | 15 | ## Which tools were used? 16 | 17 | For the prompt ranking project, we used two tools to help us manage the annotation process. 18 | 19 | - [Argilla](https://github.com/argilla-io/argilla): an open-source data annotation tool that we used for the prompt ranking. Argilla has the option of using Hugging Face for authentication, which makes it easier for the community to contribute. 20 | - [distilabel](https://github.com/argilla-io/distilabel): a tool for creating and sythetic datasets. We used distilabel to evolve prompt and to create the image preferences dataset. 21 | - [Hugging Face Spaces](https://huggingface.co/spaces): a platform for hosting machine learning applications and demos. We used Spaces to host the Argilla tool for prompt ranking. 
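
As a quick start for the "How to use the dataset" section above, here is a minimal sketch of loading the training-ready dataset with the `datasets` library. The split name is assumed to be the default `train`, and the column names follow `04_binarize_preference_results.ipynb` (`prompt`, `chosen`, `rejected`, `chosen_model`, `rejected_model`, ...):

```python
# Minimal sketch: load the binarized preference pairs for post-analysis or fine-tuning.
# Assumes the default "train" split and the columns produced by
# 04_binarize_preference_results.ipynb.
from collections import Counter

from datasets import load_dataset

ds = load_dataset(
    "data-is-better-together/open-image-preferences-v1-binarized", split="train"
)

example = ds[0]
print(example["prompt"])  # the text prompt; "chosen"/"rejected" are PIL images
print(example["chosen_model"], "was preferred over", example["rejected_model"])

# How often was each model preferred overall?
print(Counter(ds["chosen_model"]))
```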
-------------------------------------------------------------------------------- /community-efforts/image_preferences/requirements.txt: -------------------------------------------------------------------------------- 1 | distilabel[hf-inference-endpoints,argilla]==1.4.1 2 | pillow 3 | -------------------------------------------------------------------------------- /community-efforts/image_preferences/template.html: -------------------------------------------------------------------------------- 1 | 2 | 42 | 43 |
44 | Prompt: {{record.fields.images.prompt}} 45 | 46 | 47 | 48 | Image 1 49 | 50 | 51 | 52 | Image 2 53 | 54 | 55 | 
-------------------------------------------------------------------------------- /community-efforts/prompt_ranking/README.md: -------------------------------------------------------------------------------- 1 | # Prompt Ranking Project 2 | 3 | ## What is it? 4 | 5 | The Prompt Ranking Project is a pioneering community-driven initiative to explore the use of Argilla and Hugging Face Spaces for collaboratively creating impactful datasets. As part of the project, we built a dataset of 10k human and synthetic prompts, which users ranked by quality. This dataset serves various purposes: it can be used to train and evaluate language models on prompt ranking tasks or as seed data for generating synthetic prompts and completions by filtering those with the highest quality. 6 | 7 | In addition, as the first crowdsourcing effort involving the community, it provides valuable insights into the behavior of annotators. This includes exploring the distribution of prompt rankings based on the source of the prompt, its type, length, or other features. We can also examine the agreement levels among annotators and identify factors that influence this agreement. 8 | 9 | ## How did we make it possible? 10 | 11 | First, we created a prompt dataset with a mix of human and synthetic prompts from various sources. You can find the list of sources in the "Source Data" section [here](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked). Then, we set up an instance of Argilla in a Hugging Face Space to enable the annotation process. This preparation stage took around a week. 12 | 13 | Finally, during the next two weeks, we invited the community to participate in the ranking process to evaluate their quality. 14 | 15 | ## How did people contribute? 16 | 17 | The community contributed to the project by ranking the prompts in the dataset. For this, they just needed a Hugging Face account to log in to the Hugging Face Space where the Argilla instance was hosted and start ranking the prompts. 18 | 19 | ## Which tools were used? 20 | 21 | For the prompt ranking project, we used two tools to help us manage the annotation process. 22 | 23 | - [Argilla](https://github.com/argilla-io/argilla): an open-source data annotation tool that we used for the prompt ranking. Argilla has the option of using Hugging Face for authentication, which makes it easier for the community to contribute. 24 | - [Hugging Face Spaces](https://huggingface.co/spaces): a platform for hosting machine learning applications and demos. We used Spaces to host the Argilla tool for prompt ranking. 25 | 26 | ## What did we achieve? 27 | 28 | Thanks to the contribution of over 385 people, we were able to create the [data-is-better-together/10k_prompts_ranked](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked) dataset with 10,331 examples. 29 | 30 | Moreover, we could analyze the decision behavior of the annotators. Below, you can see that the human-generated prompts were ranked higher than the synthetic ones. This is an interesting observation that can be further explored in future research. 31 | 32 | > The "unknown" kind is a result of the fact that the source of the prompt was not known for some of the prompts in the dataset. 33 | 34 | ![Synthetic vs Human-Generated Prompts](assets/synthetic-vs-human.png) 35 | 36 | Check the dataset [here](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked)! Don't miss it! 
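
As a starting point for this kind of analysis, here is a rough sketch of loading the dataset and counting prompts per source. The dataset id comes from this README; the column name `kind` (human vs. synthetic vs. unknown) is an assumption based on the description above and may differ on the Hub.

```python
# Rough sketch: explore the prompt-ranking results.
# The "kind" column name is assumed from the README text and may differ on the Hub.
from collections import Counter

from datasets import load_dataset

prompts = load_dataset("data-is-better-together/10k_prompts_ranked", split="train")
print(prompts)  # should report roughly 10,331 examples

# Distribution of prompt sources: human-generated vs. synthetic vs. unknown.
print(Counter(prompts["kind"]))
```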
-------------------------------------------------------------------------------- /community-efforts/prompt_ranking/assets/synthetic-vs-human.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/community-efforts/prompt_ranking/assets/synthetic-vs-human.png -------------------------------------------------------------------------------- /community-efforts/prompt_translation/README.md: -------------------------------------------------------------------------------- 1 | # Multilingual Prompt Evaluation Project (MPEP) 2 | 3 | *🏅 There were not enough language-specific benchmarks for open LLMs. We wanted to create a leaderboard for more languages by leveraging the community!🏅* 4 | 5 | ## What is it? 6 | 7 | The Multilingual Prompt Evaluation Project (MPEP) is a community-driven effort to evaluate the performance of open language models across different languages. We translated a curated set of 500 high-quality prompts into multiple languages with the aim of evaluating the performance of models in different languages using [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval), an automated tool for evaluating instruction/chat models based on LLM evaluation. 8 | 9 | ## How did we make it possible? 10 | 11 | As the community created a dataset of 10k prompts [data-is-better-together/10k_prompts_ranked](https://huggingface.co/datasets/data-is-better-together/10k_prompts_ranked) with quality ratings as part of the Data is Better Together initiative. From this dataset, we curated a subset of 500 high-quality prompts that cover a diverse range of capabilities for a model, such as math, coding, relationships, email generation, etc. 12 | 13 | However, these prompts were originally in English, so we asked the community to help us translate this curated dataset into different languages so that we could use the translated prompts to evaluate the performance of models for the languages we translate into. 14 | 15 | ## How did people contribute? 16 | 17 | There were two ways to contribute to this effort: by becoming a language lead or as community contributor. 18 | 19 | * The language leads were responsible for setting up a Hub organization and creating an Argilla Space for their language. They also gathered a community of people to help them translate the prompts and created a dashboard to track the progress of the translation effort with the guidance of Daniel van Strien. We need to thank them for their hard work! 20 | 21 | * People who spoke the languages that were being translated into could contribute to the translation of prompts. They just needed a Hugging Face account to log in to the relevant Space and start translating the prompts. 22 | 23 | ## Which tools were used? 24 | 25 | For the MPEP project, we used two main tools to help us manage the translation process. 26 | 27 | - [Argilla](https://github.com/argilla-io/argilla): an open-source data annotation tool that we used for the translation of prompts. Argilla has the option of using Hugging Face for authentication, which makes it easier for the community to contribute to the translation of prompts. 28 | - [Hugging Face Spaces](https://huggingface.co/spaces): a platform for hosting machine learning applications and demos. We'll use Spaces to host the Argilla tool for the translation of prompts. 
29 | 30 | To make the translation setup easier for users, we also created a series of notebooks that served as guidance. 31 | 32 | ## What did we achieve? 33 | 34 | We started efforts to translate the prompts into several languages (listed below). Some of these were completed successfully: Dutch and Russian were finished, and Spanish was almost done. Many other groups began working on translating the prompts into additional languages. You can look at the resulting datasets [here](https://huggingface.co/datasets?search=MPEP_), or find them programmatically as sketched below. 35 | 36 | Arabic, Cantonese, Czech, Dutch, Filipino, French, German, Hungarian, Malagasy, Portuguese, Russian, Slovak, Spanish, Swahili, Tagalog, Tamil, Telugu, Turkish, Vietnamese
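
The same search can also be run from Python with the `huggingface_hub` client; a small sketch mirroring the `MPEP_` search link above:

```python
# Small sketch: list the translated MPEP datasets on the Hub programmatically.
from huggingface_hub import HfApi

api = HfApi()
for dataset in api.list_datasets(search="MPEP_"):
    print(dataset.id)
```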
65 | -------------------------------------------------------------------------------- /community-efforts/prompt_translation/Translation_with_distilabel_gpt_4_turbo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "# Setup" 21 | ], 22 | "metadata": { 23 | "id": "mTYjyCl_1dAO" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "MZhTFpbXzPYM" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "HF_ORG_NAME = None # update with the ID of the org you just created\n", 35 | "LANGUAGE = None # update this with the language you will work on" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "source": [ 41 | "assert HF_ORG_NAME is not None, \"Please set HF_ORG_NAME to the ID of the Hugging Face org you just created\"\n", 42 | "assert LANGUAGE is not None, \"Please set LANGUAGE to the language your effort focuses on\"" 43 | ], 44 | "metadata": { 45 | "id": "TVZF5-b3zRBJ" 46 | }, 47 | "execution_count": null, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "import argilla as rg\n", 54 | "\n", 55 | "OWNER_API_KEY = \"owner.apikey\" # if you haven't setup the secret this is the default owner api key\n", 56 | "assert OWNER_API_KEY is not None, \"Please set OWNER_API_KEY to the API token you just set in the Space settings\"\n", 57 | "\n", 58 | "rg.init(api_url=homepage_url, api_key=OWNER_API_KEY)" 59 | ], 60 | "metadata": { 61 | "id": "NdTtXc_v1YBD" 62 | }, 63 | "execution_count": null, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "source": [ 69 | "from openai import OpenAI\n", 70 | "from google.colab import userdata\n", 71 | "\n", 72 | "from distilabel.llm.openai import OpenAILLM\n", 73 | "from distilabel.tasks import TextGenerationTask\n", 74 | "from distilabel.pipeline import Pipeline" 75 | ], 76 | "metadata": { 77 | "id": "cQG-OX9DzWmA" 78 | }, 79 | "execution_count": null, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "source": [ 85 | "# Get original dataset and translate it\n", 86 | "\n", 87 | "This assumes you have already pushed the untranslated dataset" 88 | ], 89 | "metadata": { 90 | "id": "nB9Mquww1gcD" 91 | } 92 | }, 93 | { 94 | "cell_type": "code", 95 | "source": [ 96 | "# let's load the dataset and prepare the source col for distilabel\n", 97 | "argilla_ds = rg.FeedbackDataset.from_argilla(f\"DIBT Translation for {LANGUAGE}\", workspace=\"admin\")\n", 98 | "hf_ds = argilla_ds.format_as(\"datasets\").rename_columns({'source': \"input\"})" 99 | ], 100 | "metadata": { 101 | "id": "WBwjwNdq0LN-" 102 | }, 103 | "execution_count": null, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "source": [ 109 | "api_key=userdata.get(\"OPENAI_API_KEY\")\n", 110 | "\n", 111 | "target_lang = \"Spanish\" # change this with your target language name\n", 112 | "\n", 113 | "llm = OpenAILLM(\n", 114 | " model=\"gpt-4-0613\", # gpt4-turbo\n", 115 | " api_key=api_key,\n", 116 | " task=TextGenerationTask(system_prompt=f\"You will be provided with a text in English, and your task is to translate it into {target_lang}. 
If it's code please don't translate the actual code, only the comments and the explanation.\"),\n", 117 | " num_threads=8,\n", 118 | " max_new_tokens=8192,\n", 119 | ")\n", 120 | "\n", 121 | "pipe = Pipeline(\n", 122 | " generator=llm\n", 123 | ")" 124 | ], 125 | "metadata": { 126 | "id": "BygNfRFyzYWv" 127 | }, 128 | "execution_count": null, 129 | "outputs": [] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "source": [ 134 | "# test everything is working so far\n", 135 | "ds = pipe.generate(\n", 136 | " dataset=hf_ds.select(range(10)),\n", 137 | " batch_size=4,\n", 138 | " display_progress_bar=True\n", 139 | ")\n", 140 | "# check the translations before running the full pipeline\n", 141 | "ds.to_pandas().head(5)" 142 | ], 143 | "metadata": { 144 | "id": "ZdeX71YdzbX_" 145 | }, 146 | "execution_count": null, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "source": [ 152 | "# if everything is working as expected, run with the full dataset\n", 153 | "ds = pipe.generate(\n", 154 | " dataset=hf_ds,\n", 155 | " batch_size=4,\n", 156 | " display_progress_bar=True\n", 157 | ")" 158 | ], 159 | "metadata": { 160 | "id": "SGdugR9kzf79" 161 | }, 162 | "execution_count": null, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "source": [ 168 | "# Update the translations in the Argilla Space\n" 169 | ], 170 | "metadata": { 171 | "id": "18GUbdg01lD4" 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "source": [ 177 | "translations = [gen[0] for gen in ds['generations']]\n", 178 | "len(translations)" 179 | ], 180 | "metadata": { 181 | "id": "yukaSFwFzk27" 182 | }, 183 | "execution_count": null, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [ 189 | "altered_records = []\n", 190 | "\n", 191 | "for rec, translation in zip(argilla_ds.records, translations):\n", 192 | " rec.suggestions = [\n", 193 | " {\n", 194 | " \"question_name\": \"target\",\n", 195 | " \"value\": translation\n", 196 | " }\n", 197 | " ]\n", 198 | " altered_records.append(rec)\n", 199 | "\n", 200 | "altered_records[0]" 201 | ], 202 | "metadata": { 203 | "id": "IJWw41v4zndL" 204 | }, 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "source": [ 211 | "argilla_ds.update_records(altered_records)" 212 | ], 213 | "metadata": { 214 | "id": "IgkY5M4oztQz" 215 | }, 216 | "execution_count": null, 217 | "outputs": [] 218 | } 219 | ] 220 | } 221 | -------------------------------------------------------------------------------- /community-efforts/prompt_translation/dashboard_template/.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs 
merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tar filter=lfs diff=lfs merge=lfs -text 29 | *.tflite filter=lfs diff=lfs merge=lfs -text 30 | *.tgz filter=lfs diff=lfs merge=lfs -text 31 | *.wasm filter=lfs diff=lfs merge=lfs -text 32 | *.xz filter=lfs diff=lfs merge=lfs -text 33 | *.zip filter=lfs diff=lfs merge=lfs -text 34 | *.zst filter=lfs diff=lfs merge=lfs -text 35 | *tfevents* filter=lfs diff=lfs merge=lfs -text 36 | -------------------------------------------------------------------------------- /community-efforts/prompt_translation/dashboard_template/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Template for Dashboards - Multilingual Prompt Evaluation Project 3 | emoji: 📊 4 | colorFrom: indigo 5 | colorTo: indigo 6 | sdk: gradio 7 | sdk_version: 4.21.0 8 | app_file: app.py 9 | pinned: false 10 | license: apache-2.0 11 | --- -------------------------------------------------------------------------------- /community-efforts/prompt_translation/dashboard_template/dumpy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import argilla as rg 6 | from huggingface_hub import HfApi 7 | 8 | logger = logging.getLogger(__name__) 9 | logger.setLevel(logging.INFO) 10 | 11 | if __name__ == "__main__": 12 | logger.info("*** Initializing Argilla session ***") 13 | rg.init( 14 | api_url=os.getenv("ARGILLA_API_URL"), 15 | api_key=os.getenv("ARGILLA_API_KEY"), 16 | extra_headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}, 17 | ) 18 | 19 | logger.info("*** Fetching dataset from Argilla ***") 20 | dataset = rg.FeedbackDataset.from_argilla( 21 | os.getenv("SOURCE_DATASET"), 22 | workspace=os.getenv("SOURCE_WORKSPACE"), 23 | ) 24 | logger.info("*** Filtering records by `response_status` ***") 25 | dataset = dataset.filter_by(response_status=["submitted"]) # type: ignore 26 | 27 | logger.info("*** Calculating users and annotation count ***") 28 | output = {} 29 | for record in dataset.records: 30 | for response in record.responses: 31 | if response.user_id not in output: 32 | output[response.user_id] = 0 33 | output[response.user_id] += 1 34 | 35 | for key in list(output.keys()): 36 | output[rg.User.from_id(key).username] = output.pop(key) 37 | 38 | logger.info("*** Users and annotation count successfully calculated! ***") 39 | 40 | logger.info("*** Dumping Python dict into `stats.json` ***") 41 | with open("stats.json", "w") as file: 42 | json.dump(output, file, indent=4) 43 | 44 | logger.info("*** Uploading `stats.json` to Hugging Face Hub ***") 45 | api = HfApi(token=os.getenv("HF_TOKEN")) 46 | api.upload_file( 47 | path_or_fileobj="stats.json", 48 | path_in_repo="stats.json", 49 | repo_id="data-is-better-together/prompt-collective-dashboard", 50 | repo_type="space", 51 | ) 52 | logger.info("*** `stats.json` successfully uploaded to Hugging Face Hub! 
***") 53 | -------------------------------------------------------------------------------- /community-efforts/prompt_translation/dashboard_template/requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==23.2.1 2 | altair==5.2.0 3 | annotated-types==0.6.0 4 | anyio==4.2.0 5 | apscheduler==3.10.4 6 | argilla==1.23.0 7 | attrs==23.2.0 8 | backoff==2.2.1 9 | certifi==2024.2.2 10 | charset-normalizer==3.3.2 11 | click==8.1.7 12 | colorama==0.4.6 13 | contourpy==1.2.0 14 | cycler==0.12.1 15 | Deprecated==1.2.14 16 | exceptiongroup==1.2.0 17 | fastapi==0.109.2 18 | ffmpy==0.3.1 19 | filelock==3.13.1 20 | fonttools==4.48.1 21 | fsspec==2024.2.0 22 | gradio==4.17.0 23 | gradio_client==0.9.0 24 | h11==0.14.0 25 | httpcore==1.0.2 26 | httpx==0.26.0 27 | huggingface-hub==0.20.3 28 | idna==3.6 29 | importlib-resources==6.1.1 30 | Jinja2==3.1.3 31 | jsonschema==4.21.1 32 | jsonschema-specifications==2023.12.1 33 | kiwisolver==1.4.5 34 | markdown-it-py==3.0.0 35 | MarkupSafe==2.1.5 36 | matplotlib==3.8.2 37 | mdurl==0.1.2 38 | monotonic==1.6 39 | numpy==1.23.5 40 | orjson==3.9.13 41 | packaging==23.2 42 | pandas==1.5.3 43 | pillow==10.2.0 44 | pydantic==2.6.1 45 | pydantic_core==2.16.2 46 | pydub==0.25.1 47 | Pygments==2.17.2 48 | pyparsing==3.1.1 49 | python-dateutil==2.8.2 50 | python-multipart==0.0.7 51 | pytz==2024.1 52 | PyYAML==6.0.1 53 | referencing==0.33.0 54 | requests==2.31.0 55 | rich==13.7.0 56 | rpds-py==0.17.1 57 | ruff==0.2.1 58 | semantic-version==2.10.0 59 | shellingham==1.5.4 60 | six==1.16.0 61 | sniffio==1.3.0 62 | starlette==0.36.3 63 | tomlkit==0.12.0 64 | toolz==0.12.1 65 | tqdm==4.66.1 66 | typer==0.9.0 67 | typing_extensions==4.9.0 68 | urllib3==2.2.0 69 | uvicorn==0.27.0.post1 70 | vega-datasets==0.9.0 71 | websockets==11.0.3 72 | wrapt==1.14.1 73 | -------------------------------------------------------------------------------- /community-efforts/prompt_translation/requirements.in: -------------------------------------------------------------------------------- 1 | ipykernel 2 | huggingface_hub 3 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/README.md: -------------------------------------------------------------------------------- 1 | # Domain Specific Dataset Project 2 | 3 | The domain specific dataset project aims to bootstrap the creation of domain-specific datasets for training models. The creation of this set of tools will help users to collaborate with domain experts. This can be really significant as models are trained on large-scale datasets that are often biased, incomplete, or unrepresentative. By simply joining forces between domain experts and ML engineers, we can plant the seed to generate meaningful data. 4 | 5 | ## What is the goal of this project? 6 | 7 | The goal of this project is to share and collaborate with domain experts to create domain-specific datasets that can be used to train models. We aim to create a set of tools that help users to collaborate with domain experts to create datasets that are representative of the domain. We aim to share the datasets openly on the hub and share the tools and skills to build these datasets. 8 | 9 | ## Why do we need domain specific datasets? 10 | 11 | LLMs are increasingly used as economical alternatives to human participants across various domains such as computational social science, user testing, annotation tasks, and opinion surveys. 
However, the utility of LLMs in replicating specific human nuances and expertise is limited by inherent training constraints. Models are trained on large-scale datasets that are often biased, incomplete, or unrepresentative of the diverse human experiences they aim to replicate. This problem impacts specific expert domains as well as underrepresented groups in the training data. 12 | 13 | Also, building synthetic datasets that are representative of the domain can help to improve the performance of models in that domain. 14 | 15 | ## How can you contribute? 16 | 17 | 🧑🏼‍🔬 If you are a domain expert, you can contribute by sharing your expertise and collaborating with us to create domain-specific datasets. We're working with user-friendly, easy-to-use applications that help you define the seed data and create the dataset. We're also working on tools that help you to annotate the dataset and improve its quality. 18 | 19 | 🧑🏻‍🔧 If you are an (aspiring) Machine Learning engineer, you can set up the project and its tools, run the synthetic data generation pipelines, and maybe even get around to training models. 20 | 21 | ## Project Overview 22 | 23 | ### 1. Select a domain and find collaborators 24 | 25 | We start by selecting a domain and finding collaborators who can help us to create the dataset. 26 | 27 | 🧑🏼‍🔬 If you are a domain expert, you could find an ML engineer to help you to create the dataset. 28 | 29 | 🧑🏻‍🔧 If you are an ML engineer, you could find a domain expert to help you to create the dataset. 30 | 31 | 🧑‍🚀 If you're both, you could start by defining the seed data and creating the dataset. 32 | 33 | ### 2. Set up your project 34 | 35 | First, you need to set up the project and its tools. For this, we use [this application](https://huggingface.co/spaces/argilla/domain-specific-datasets-welcome). 36 | 37 | ### 3. Define the domain knowledge 38 | 39 | Next, we need to get the domain expert to define the seed data, which is used to create the dataset. Once the seed data is defined, we add it to the dataset repo. 40 | 41 | ![Setup the project](https://raw.githubusercontent.com/huggingface/data-is-better-together/3ac24642454764c8c7d56f0ffdd1a134c1cd37b1/domain-specific-datasets/assets/setup.png) 42 | 43 | > **Domain topics** are the topics the domain expert wants to include in the dataset. For example, if the domain is farming, the domain topics could be "soil", "crops", "weather", etc. 44 | 45 | > **Domain description** is a description of the domain. For example, if the domain is farming, the domain description could be "Farming is the practice of cultivating crops and livestock for food, fiber, biofuel, medicinal plants, and other products used to sustain and enhance human life." 46 | 47 | > **Domain perspectives** are the perspectives the domain expert wants to include in the dataset. For example, if the domain is farming, the domain perspectives could be "farmer", "agricultural scientist", "agricultural economist", etc. 48 | 49 | ### 4. Generate the dataset 50 | 51 | Next, we can move on to generating the dataset from the seed data. 52 | 53 | ![Run the pipeline](https://raw.githubusercontent.com/huggingface/data-is-better-together/3ac24642454764c8c7d56f0ffdd1a134c1cd37b1/domain-specific-datasets/assets/pipeline.png) 54 | 55 | To generate instructions and responses, you're going to need an inference endpoint.
You can find compatible models from the Hugging Face Inference API here: 56 | 57 | - 🔋Projects with sufficient resources could take advantage of [Llama 3 70B](https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B) 58 | - 🪫Projects with fewer resources could take advantage of [Llama 3 8B](https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B) 59 | - 🍃Projects with even fewer resources could take advantage of [Phi-2](https://api-inference.huggingface.co/models/microsoft/phi-2) 60 | 61 | [Hugging Face Pro](https://huggingface.co/pricing) gives access to more compute resources. 62 | 63 | #### 4.1. Generate Instructions 64 | 65 | The pipeline takes the topic and perspective and generates instructions for the dataset; the instructions are then evolved by an LLM to create more varied instructions. 66 | 67 | #### 4.2. Generate Responses 68 | 69 | The pipeline takes the instructions and generates responses for the dataset; the responses are then evolved by an LLM to create higher-quality responses. 70 | 71 | #### 4.3. Refine the dataset 72 | 73 | Finally, the pipeline pushes the dataset to the Hub and to the Argilla space. The domain expert can then refine the dataset by annotating it and improving its quality. 74 | 75 | ### Video Tutorial 76 | 77 | Here's a video guide that walks you through the process from end to end. 78 | 79 | [![Walkthrough](https://cdn-uploads.huggingface.co/production/uploads/62d648291fa3e4e7ae3fa6e8/2e-QzuIv2dtkaPKL446yi.png)](https://www.loom.com/embed/99f32d7882764d9d8f4dc6ce3d824319?sid=c273876f-6715-4491-a79d-a27220e7a7d8) 80 | 81 | ### Run the `distilabel` pipeline 82 | 83 | 84 | 85 | With the pipeline configuration defined in the app and pushed to the dataset repo `{hub_username}/{project_name}`, you can run the pipeline via this repo (see the example `pipeline_params.json` sketch at the end of this README). 86 | 87 | You'll need to change directory, install dependencies, and log in to the Hugging Face Hub. You can do this by running the following commands: 88 | 89 | ```bash 90 | cd data-is-better-together/domain-specific-datasets/distilabel_pipelines 91 | pip install -r requirements.txt 92 | huggingface-cli login 93 | ``` 94 | 95 | Then you can run the pipeline using the following command: 96 | 97 | ```bash 98 | python domain_expert_pipeline.py {hub_username}/{project_name} 99 | ``` 100 | 101 | ### Project Structure 102 | 103 | - `app/` : A Streamlit app that helps domain experts define seed data such as the system prompt and topics, by creating an empty dataset on the Hub. 104 | - `distilabel_pipelines/domain_expert_pipeline.py` : The distilabel pipeline code that is used to create the dataset. 105 | - `scripts/` : Ad hoc scripts that we used to ease annotation with vector search.
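#### Example pipeline configuration

For reference, here is a minimal sketch (not a file from this repo) of the `pipeline_params.json` contents that the project app pushes to the dataset repo and that `domain_expert_pipeline.py` reads at startup. The key names mirror the parameters the pipeline script looks up; the Argilla URL and dataset name are placeholders, and the numeric values are the script's own fallback defaults.

```python
# Minimal sketch, assuming placeholder URLs/names; adjust to your own Argilla
# Space and chosen inference endpoints before running the pipeline.
import json

pipeline_params = {
    "argilla_api_url": "https://<your-argilla-space>.hf.space",  # placeholder
    "argilla_dataset_name": "farming",  # placeholder: your Argilla dataset name
    "self_instruct_base_url": "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    "domain_expert_base_url": "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    "self_instruct_num_generations": 2,   # script default
    "domain_expert_num_generations": 2,   # script default
    "self_instruct_temperature": 0.9,
    "domain_expert_temperature": 0.9,
    "self_instruct_max_new_tokens": 2048,
    "domain_expert_max_new_tokens": 2048,
}

# Write the file that domain_expert_pipeline.py expects to find in the dataset repo.
with open("pipeline_params.json", "w") as f:
    json.dump(pipeline_params, f, indent=2)
```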
106 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/domain-specific-datasets/assets/pipeline.png -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/assets/setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/domain-specific-datasets/assets/setup.png -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/distilabel_pipelines/domain_expert_pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | import argilla as rg 5 | from distilabel.llms import InferenceEndpointsLLM 6 | from distilabel.pipeline import Pipeline 7 | from distilabel.steps import ( 8 | LoadDataFromDicts, 9 | TextGenerationToArgilla, 10 | ExpandColumns, 11 | ) 12 | from distilabel.steps.tasks import ( 13 | TextGeneration, 14 | SelfInstruct, 15 | ) 16 | from distilabel.steps.tasks.typing import ChatType 17 | from huggingface_hub import hf_hub_download 18 | 19 | 20 | ################################################################################ 21 | # Define custom Argilla Dataset 22 | ################################################################################ 23 | 24 | 25 | def create_argilla_dataset( 26 | api_url: str, 27 | api_key: str, 28 | dataset_name: str, 29 | workspace: str, 30 | ): 31 | """Create a dataset in Argilla.""" 32 | 33 | rg.init(api_url, api_key) 34 | rg_dataset = rg.FeedbackDataset( 35 | fields=[ 36 | rg.TextField(name="id", title="id"), # type: ignore 37 | rg.TextField(name="instruction", title="instruction"), # type: ignore 38 | rg.TextField(name="generation", title="generation"), # type: ignore 39 | ], 40 | questions=[ 41 | rg.LabelQuestion( # type: ignore 42 | name="quality", 43 | title=f"What's the quality of the generation for the given instruction?", 44 | labels={"bad": "👎", "good": "👍"}, 45 | ), 46 | rg.TextQuestion( 47 | name="improved_instruction", 48 | title="How would you improve the instruction?", 49 | required=False, 50 | ), 51 | rg.TextQuestion( 52 | name="improved_response", 53 | title="How would you improve the response?", 54 | required=False, 55 | ), 56 | ], 57 | ) 58 | try: 59 | rg_dataset.push_to_argilla(name=dataset_name, workspace=workspace) 60 | except RuntimeError as e: 61 | print(f"Failed to create the dataset in Argilla: {e} Moving on...") 62 | 63 | 64 | ################################################################################ 65 | # Define out custom step for the domain expert 66 | ################################################################################ 67 | 68 | 69 | class DomainExpert(TextGeneration): 70 | """A customized task to generate text as a domain expert in the domain of farming and agriculture.""" 71 | 72 | system_prompt: str 73 | template: str = """This is the the instruction: {instruction}""" 74 | 75 | def format_input(self, input: Dict[str, Any]) -> "ChatType": 76 | return [ 77 | { 78 | "role": "system", 79 | "content": self.system_prompt, 80 | }, 
81 | { 82 | "role": "user", 83 | "content": self.template.format(**input), 84 | }, 85 | ] 86 | 87 | 88 | ################################################################################ 89 | # Main script to run the pipeline 90 | ################################################################################ 91 | 92 | 93 | if __name__ == "__main__": 94 | import os 95 | import json 96 | import sys 97 | 98 | # get some args 99 | repo_id = sys.argv[1] 100 | 101 | # Get super secret tokens 102 | 103 | hub_token = os.environ.get("HF_TOKEN") 104 | argilla_api_key = os.environ.get("ARGILLA_API_KEY", "owner.apikey") 105 | 106 | # load pipeline parameters 107 | 108 | with open( 109 | hf_hub_download( 110 | repo_id=repo_id, filename="pipeline_params.json", repo_type="dataset" 111 | ), 112 | "r", 113 | ) as f: 114 | params = json.load(f) 115 | 116 | argilla_api_url = params.get("argilla_api_url") 117 | argilla_dataset_name = params.get("argilla_dataset_name") 118 | self_instruct_base_url = params.get("self_instruct_base_url") 119 | domain_expert_base_url = params.get("domain_expert_base_url") 120 | self_intruct_num_generations = params.get("self_instruct_num_generations", 2) 121 | domain_expert_num_generations = params.get("domain_expert_num_generations", 2) 122 | self_instruct_temperature = params.get("self_instruct_temperature", 0.9) 123 | domain_expert_temperature = params.get("domain_expert_temperature", 0.9) 124 | self_instruct_max_new_tokens = params.get("self_instruct_max_new_tokens", 2048) 125 | domain_expert_max_new_tokens = params.get("domain_expert_max_new_tokens", 2048) 126 | 127 | if not all( 128 | [ 129 | argilla_api_url, 130 | argilla_dataset_name, 131 | self_instruct_base_url, 132 | domain_expert_base_url, 133 | ] 134 | ): 135 | raise ValueError("Some of the pipeline parameters are missing") 136 | 137 | # collect our seed prompts defined in the space 138 | 139 | with open( 140 | hf_hub_download( 141 | repo_id=repo_id, filename="seed_data.json", repo_type="dataset" 142 | ), 143 | "r", 144 | ) as f: 145 | seed_data = json.load(f) 146 | 147 | application_instruction = seed_data.get("application_instruction") 148 | domain_expert_prompt = seed_data.get("domain_expert_prompt") 149 | domain_name = seed_data.get("domain") 150 | terms = seed_data.get("seed_terms") 151 | 152 | # Create the Argilla dataset 153 | 154 | create_argilla_dataset( 155 | api_url=argilla_api_url, 156 | api_key=argilla_api_key, 157 | dataset_name=argilla_dataset_name, 158 | workspace="admin", 159 | ) 160 | 161 | # Define the distilabel pipeline 162 | 163 | with Pipeline(domain_name) as pipeline: 164 | load_data = LoadDataFromDicts( 165 | name="load_data", 166 | batch_size=64, 167 | data=[{"input": term} for term in terms], 168 | ) 169 | 170 | self_instruct = SelfInstruct( 171 | name="self_instruct", 172 | num_instructions=self_intruct_num_generations, 173 | input_batch_size=8, 174 | llm=InferenceEndpointsLLM( 175 | api_key=hub_token, 176 | base_url=self_instruct_base_url, 177 | ), 178 | application_description=application_instruction, 179 | ) 180 | 181 | expand_columns = ExpandColumns( 182 | name="expand_columns", 183 | columns=["instructions"], 184 | output_mappings={"instructions": "instruction"}, 185 | ) 186 | 187 | domain_expert = DomainExpert( 188 | name="domain_expert", 189 | llm=InferenceEndpointsLLM( 190 | api_key=hub_token, 191 | base_url=domain_expert_base_url, 192 | ), 193 | input_batch_size=8, 194 | num_generations=domain_expert_num_generations, 195 | system_prompt=domain_expert_prompt, 196 | ) 197 | 198 | # 
Push the generated dataset to Argilla 199 | to_argilla = TextGenerationToArgilla( 200 | name="to_argilla", 201 | dataset_workspace="admin", 202 | ) 203 | 204 | # Connect up the pipeline 205 | 206 | load_data.connect(self_instruct) 207 | self_instruct.connect(expand_columns) 208 | expand_columns.connect(domain_expert) 209 | domain_expert.connect(to_argilla) 210 | 211 | # Run the pipeline 212 | 213 | pipeline.run( 214 | parameters={ 215 | "self_instruct": { 216 | "llm": { 217 | "generation_kwargs": { 218 | "max_new_tokens": self_instruct_max_new_tokens, 219 | "temperature": self_instruct_temperature, 220 | }, 221 | } 222 | }, 223 | "domain_expert": { 224 | "llm": { 225 | "generation_kwargs": { 226 | "max_new_tokens": self_instruct_max_new_tokens, 227 | "temperature": domain_expert_temperature, 228 | }, 229 | } 230 | }, 231 | "to_argilla": { 232 | "dataset_name": argilla_dataset_name, 233 | "api_key": argilla_api_key, 234 | "api_url": argilla_api_url, 235 | }, 236 | }, 237 | use_cache=False, 238 | ) 239 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/distilabel_pipelines/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | python_dotenv 3 | streamlit 4 | huggingface_hub 5 | argilla 6 | git+https://github.com/argilla-io/distilabel.git -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/parent_app/app.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from hub import ( 4 | setup_dataset_on_hub, 5 | duplicate_space_on_hub, 6 | add_project_config_to_space_repo, 7 | ) 8 | 9 | import streamlit as st 10 | 11 | 12 | # Constants 13 | # Written here to avoid defaults.py 14 | DEFAULT_DOMAIN = "farming" 15 | 16 | st.set_page_config( 17 | "Domain Data Grower", page_icon="🧑‍🌾", initial_sidebar_state="collapsed" 18 | ) 19 | 20 | st.header("🧑‍🌾 Domain Data Grower") 21 | st.divider() 22 | 23 | st.sidebar.link_button( 24 | "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens" 25 | ) 26 | 27 | ################################################################################ 28 | # APP MARKDOWN 29 | ################################################################################ 30 | 31 | st.header("🌱 Create a domain specific dataset") 32 | 33 | st.markdown( 34 | """This space will set up your domain specific dataset project. It will 35 | create the resources that you need to build a dataset. Those resources include: 36 | 37 | - A dataset repository on the Hub 38 | - Another space to define expert domain and run generation pipelines 39 | 40 | For a complete overview of the project. 
Check out the README 41 | """ 42 | ) 43 | 44 | st.page_link( 45 | "pages/🧑‍🌾 Domain Data Grower.py", 46 | label="Domain Data Grower", 47 | icon="🧑‍🌾", 48 | ) 49 | 50 | ################################################################################ 51 | # CONFIGURATION 52 | ################################################################################ 53 | 54 | st.subheader("🌾 Project Configuration") 55 | 56 | project_name = st.text_input("Project Name", DEFAULT_DOMAIN) 57 | hub_username = st.text_input("Hub Username", "argilla") 58 | hub_token = st.text_input("Hub Token", type="password") 59 | private_selector = st.checkbox("Private Space", value=False) 60 | 61 | if st.button("🤗 Setup Project Resources"): 62 | repo_id = f"{hub_username}/{project_name}" 63 | 64 | setup_dataset_on_hub( 65 | repo_id=repo_id, 66 | hub_token=hub_token, 67 | ) 68 | 69 | st.success( 70 | f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps." 71 | ) 72 | 73 | space_name = f"{project_name}_config_space" 74 | 75 | duplicate_space_on_hub( 76 | source_repo="argilla/domain-specific-datasets-template", 77 | target_repo=space_name, 78 | hub_token=hub_token, 79 | private=private_selector, 80 | ) 81 | 82 | st.success( 83 | f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})." 84 | ) 85 | 86 | argilla_name = f"{project_name}_argilla_space" 87 | 88 | duplicate_space_on_hub( 89 | source_repo="argilla/argilla-template-space", 90 | target_repo=argilla_name, 91 | hub_token=hub_token, 92 | private=private_selector, 93 | ) 94 | 95 | st.success( 96 | f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})." 
97 | ) 98 | 99 | seconds = 5 100 | 101 | with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"): 102 | time.sleep(seconds) 103 | add_project_config_to_space_repo( 104 | dataset_repo_id=repo_id, 105 | hub_token=hub_token, 106 | project_name=project_name, 107 | argilla_space_repo_id=f"{hub_username}/{argilla_name}", 108 | project_space_repo_id=f"{hub_username}/{space_name}", 109 | ) 110 | 111 | st.subheader("👢 Next Steps") 112 | 113 | st.write("Go to you project specific space!") 114 | 115 | st.link_button( 116 | "🧑‍🌾 Open Configuration Space", 117 | f"https://huggingface.co/spaces/{hub_username}/{space_name}", 118 | ) 119 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/parent_app/hub.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tempfile import mktemp 3 | 4 | 5 | from huggingface_hub import duplicate_space, HfApi 6 | 7 | 8 | hf_api = HfApi() 9 | 10 | 11 | def setup_dataset_on_hub(repo_id, hub_token): 12 | # create an empty dataset repo on the hub 13 | hf_api.create_repo( 14 | repo_id=repo_id, 15 | token=hub_token, 16 | repo_type="dataset", 17 | ) 18 | 19 | # upload the seed data 20 | hf_api.upload_file( 21 | path_or_fileobj="seed_data.json", 22 | path_in_repo="seed_data.json", 23 | repo_id=repo_id, 24 | repo_type="dataset", 25 | token=hub_token, 26 | ) 27 | 28 | 29 | def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False): 30 | duplicate_space( 31 | from_id=source_repo, 32 | to_id=target_repo, 33 | token=hub_token, 34 | private=private, 35 | exist_ok=True, 36 | ) 37 | 38 | 39 | def add_project_config_to_space_repo( 40 | dataset_repo_id, 41 | hub_token, 42 | project_name, 43 | argilla_space_repo_id, 44 | project_space_repo_id, 45 | ): 46 | # upload the seed data and readme to the hub 47 | 48 | with open("project_config.json", "w") as f: 49 | json.dump( 50 | { 51 | "project_name": project_name, 52 | "argilla_space_repo_id": argilla_space_repo_id, 53 | "project_space_repo_id": project_space_repo_id, 54 | "dataset_repo_id": dataset_repo_id, 55 | }, 56 | f, 57 | ) 58 | 59 | hf_api.upload_file( 60 | path_or_fileobj="project_config.json", 61 | path_in_repo="project_config.json", 62 | token=hub_token, 63 | repo_id=project_space_repo_id, 64 | repo_type="space", 65 | ) 66 | 67 | 68 | def pull_seed_data_from_repo(repo_id, hub_token): 69 | tempfile_path = mktemp() 70 | # pull the dataset repo from the hub 71 | hf_api.hf_hub_download( 72 | repo_id=repo_id, token=hub_token, repo_type="dataset", filename=tempfile_path 73 | ) 74 | return json.load(open(tempfile_path)) 75 | 76 | 77 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/parent_app/pages/🧑‍🌾 Domain Data Grower.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | 4 | 5 | readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/51f29e67165d8277d9f9d1e4be60869f4b705a08/domain-specific-datasets/README.md" 6 | 7 | 8 | def open_markdown_file(url): 9 | response = requests.get(url) 10 | return response.text 11 | 12 | 13 | readme = open_markdown_file(readme_location) 14 | 15 | st.markdown(readme) 16 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/parent_app/project_config.json: 
-------------------------------------------------------------------------------- 1 | {"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"} -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/parent_app/seed_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "domain": "farming", 3 | "perspectives": [ 4 | "Family Farming" 5 | ], 6 | "topics": [ 7 | "animal welfare" 8 | ], 9 | "examples": [ 10 | { 11 | "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.", 12 | "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances." 13 | } 14 | ], 15 | "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology." 16 | } -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/.streamlit/config.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/domain-specific-datasets/project_app/.streamlit/config.toml -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/DATASET_README_BASE.md: -------------------------------------------------------------------------------- 1 | # Domain Dataset Grower 2 | 3 | This dataset was generated by [distilabel](https://distilabel.argilla.io/latest/) as a domain specific dataset for the domain of farming. The dataset used this seed data to generate the samples. 
The seed data was define by a domain expert and the generated data can be reviewed in this [Argilla](https://argilla.io/) space here: [Argilla](https://huggingface.co/spaces/argilla/farming) 4 | 5 | If you want to define a domain specific seed dataset for your own domain, you can use the distilabel tool to generate the dataset, and seed your dataset [here](https://huggingface.co/spaces/argilla/domain-specific-seed) 6 | 7 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Domain Specific Seed 3 | emoji: 💻 4 | colorFrom: purple 5 | colorTo: red 6 | sdk: streamlit 7 | sdk_version: 1.33.0 8 | app_file: app.py 9 | pinned: false 10 | license: apache-2.0 11 | --- 12 | 13 | Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference 14 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from defaults import ( 4 | PROJECT_NAME, 5 | ARGILLA_SPACE_REPO_ID, 6 | DATASET_REPO_ID, 7 | ARGILLA_URL, 8 | PROJECT_SPACE_REPO_ID, 9 | DIBT_PARENT_APP_URL, 10 | ) 11 | from utils import project_sidebar 12 | 13 | st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾") 14 | 15 | project_sidebar() 16 | 17 | if PROJECT_NAME == "DEFAULT_DOMAIN": 18 | st.warning( 19 | "Please set up the project configuration in the parent app before proceeding." 20 | ) 21 | st.stop() 22 | 23 | 24 | st.header("🧑‍🌾 Domain Data Grower") 25 | st.divider() 26 | 27 | st.markdown( 28 | """ 29 | ## 🌱 Create a dataset seed for aligning models to a specific domain 30 | 31 | This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models. 32 | Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose. 33 | """ 34 | ) 35 | st.markdown( 36 | """ 37 | ## 🚜 How it works 38 | 39 | You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset. 40 | The dataset seed is then used to generate synthetic data for training a language model. 41 | 42 | """ 43 | ) 44 | st.markdown( 45 | """ 46 | ## 🗺️ The process 47 | 48 | ### Step 1: ~~Setup the project~~ 49 | 50 | ~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~ 51 | """ 52 | ) 53 | st.link_button("🚀 ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL) 54 | 55 | st.markdown( 56 | """ 57 | ### Step 2: Describe the Domain 58 | 59 | Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset. 60 | You can collaborate with domain experts to define the domain expertise and perspectives. 61 | """ 62 | ) 63 | 64 | st.page_link( 65 | "pages/2_👩🏼‍🔬 Describe Domain.py", 66 | label="Describe Domain", 67 | icon="👩🏼‍🔬", 68 | ) 69 | 70 | st.markdown( 71 | """ 72 | ### Step 3: Generate Synthetic Data 73 | 74 | Use distilabel to generate synthetic data for your domain-specific dataset. 75 | You can run the pipeline locally or in this space to generate synthetic data. 
76 | """ 77 | ) 78 | 79 | st.page_link( 80 | "pages/3_🌱 Generate Dataset.py", 81 | label="Generate Dataset", 82 | icon="🌱", 83 | ) 84 | 85 | st.markdown( 86 | """ 87 | ### Step 4: Review the Dataset 88 | 89 | Use Argilla to review the generated synthetic data and provide feedback on the quality of the data. 90 | 91 | 92 | """ 93 | ) 94 | st.link_button("🔍 Review the dataset in Argilla", ARGILLA_URL) 95 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | SEED_DATA_PATH = "seed_data.json" 5 | PIPELINE_PATH = "pipeline.yaml" 6 | REMOTE_CODE_PATHS = ["requirements.txt"] 7 | DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/" 8 | N_PERSPECTIVES = 5 9 | N_TOPICS = 5 10 | N_EXAMPLES = 5 11 | CODELESS_DISTILABEL = os.environ.get("CODELESS_DISTILABEL", True) 12 | 13 | ################################################ 14 | # DEFAULTS ON FARMING 15 | ################################################ 16 | 17 | with open(SEED_DATA_PATH) as f: 18 | DEFAULT_DATA = json.load(f) 19 | 20 | DEFAULT_DOMAIN = DEFAULT_DATA["domain"] 21 | DEFAULT_PERSPECTIVES = DEFAULT_DATA["perspectives"] 22 | if DEFAULT_PERSPECTIVES is None or len(DEFAULT_PERSPECTIVES) == 0: 23 | DEFAULT_PERSPECTIVES = [""] 24 | DEFAULT_TOPICS = DEFAULT_DATA["topics"] 25 | if DEFAULT_TOPICS is None or len(DEFAULT_TOPICS) == 0: 26 | DEFAULT_TOPICS = [""] 27 | DEFAULT_EXAMPLES = DEFAULT_DATA["examples"] 28 | if DEFAULT_EXAMPLES is None or len(DEFAULT_EXAMPLES) == 0: 29 | DEFAULT_EXAMPLES = [{"question": "", "answer": ""}] 30 | DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"] 31 | 32 | ################################################ 33 | # PROJECT CONFIG FROM PARENT APP 34 | ################################################ 35 | 36 | try: 37 | with open("project_config.json") as f: 38 | PROJECT_CONFIG = json.load(f) 39 | 40 | PROJECT_NAME = PROJECT_CONFIG["project_name"] 41 | ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"] 42 | DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"] 43 | ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-") 44 | ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space" 45 | PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"] 46 | DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}" 47 | HUB_USERNAME = DATASET_REPO_ID.split("/")[0] 48 | except FileNotFoundError: 49 | PROJECT_NAME = "DEFAULT_DOMAIN" 50 | ARGILLA_SPACE_REPO_ID = "" 51 | DATASET_REPO_ID = "" 52 | ARGILLA_URL = "" 53 | PROJECT_SPACE_REPO_ID = "" 54 | DATASET_URL = "" 55 | HUB_USERNAME = "" 56 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/hub.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tempfile import mktemp 3 | 4 | import argilla as rg 5 | from huggingface_hub import HfApi 6 | 7 | from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH 8 | 9 | 10 | hf_api = HfApi() 11 | 12 | with open("DATASET_README_BASE.md") as f: 13 | DATASET_README_BASE = f.read() 14 | 15 | 16 | def create_readme(domain_seed_data, project_name, domain): 17 | # create a readme for the project that shows the domain and project name 18 | readme = DATASET_README_BASE 19 | readme += f"# {project_name}\n\n## Domain: 
{domain}" 20 | perspectives = domain_seed_data.get("perspectives") 21 | topics = domain_seed_data.get("topics") 22 | examples = domain_seed_data.get("examples") 23 | if perspectives: 24 | readme += "\n\n## Perspectives\n\n" 25 | for p in perspectives: 26 | readme += f"- {p}\n" 27 | if topics: 28 | readme += "\n\n## Topics\n\n" 29 | for t in topics: 30 | readme += f"- {t}\n" 31 | if examples: 32 | readme += "\n\n## Examples\n\n" 33 | for example in examples: 34 | readme += f"### {example['question']}\n\n{example['answer']}\n\n" 35 | temp_file = mktemp() 36 | 37 | with open(temp_file, "w") as f: 38 | f.write(readme) 39 | return temp_file 40 | 41 | 42 | def setup_dataset_on_hub(repo_id, hub_token): 43 | # create an empty dataset repo on the hub 44 | hf_api.create_repo( 45 | repo_id=repo_id, 46 | token=hub_token, 47 | repo_type="dataset", 48 | exist_ok=True, 49 | ) 50 | 51 | 52 | def push_dataset_to_hub( 53 | domain_seed_data_path, 54 | project_name, 55 | domain, 56 | pipeline_path, 57 | hub_username, 58 | hub_token: str, 59 | ): 60 | repo_id = f"{hub_username}/{project_name}" 61 | 62 | setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token) 63 | 64 | # upload the seed data and readme to the hub 65 | hf_api.upload_file( 66 | path_or_fileobj=domain_seed_data_path, 67 | path_in_repo="seed_data.json", 68 | token=hub_token, 69 | repo_id=repo_id, 70 | repo_type="dataset", 71 | ) 72 | 73 | # upload the readme to the hub 74 | domain_seed_data = json.load(open(domain_seed_data_path)) 75 | hf_api.upload_file( 76 | path_or_fileobj=create_readme( 77 | domain_seed_data=domain_seed_data, project_name=project_name, domain=domain 78 | ), 79 | path_in_repo="README.md", 80 | token=hub_token, 81 | repo_id=repo_id, 82 | repo_type="dataset", 83 | ) 84 | 85 | 86 | def push_pipeline_to_hub( 87 | pipeline_path, 88 | hub_username, 89 | hub_token: str, 90 | project_name, 91 | ): 92 | repo_id = f"{hub_username}/{project_name}" 93 | 94 | # upload the pipeline to the hub 95 | hf_api.upload_file( 96 | path_or_fileobj=pipeline_path, 97 | path_in_repo="pipeline.py", 98 | token=hub_token, 99 | repo_id=repo_id, 100 | repo_type="dataset", 101 | ) 102 | 103 | for code_path in REMOTE_CODE_PATHS: 104 | hf_api.upload_file( 105 | path_or_fileobj=code_path, 106 | path_in_repo=code_path, 107 | token=hub_token, 108 | repo_id=repo_id, 109 | repo_type="dataset", 110 | ) 111 | 112 | print(f"Dataset uploaded to {repo_id}") 113 | 114 | 115 | def pull_seed_data_from_repo(repo_id, hub_token): 116 | # pull the dataset repo from the hub 117 | hf_api.hf_hub_download( 118 | repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH 119 | ) 120 | return json.load(open(SEED_DATA_PATH)) 121 | 122 | 123 | def push_argilla_dataset_to_hub( 124 | name: str, 125 | repo_id: str, 126 | url: str, 127 | api_key: str, 128 | hub_token: str, 129 | workspace: str = "admin", 130 | ): 131 | rg.init(api_url=url, api_key=api_key) 132 | feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace) 133 | local_dataset = feedback_dataset.pull() 134 | local_dataset.push_to_huggingface(repo_id=repo_id, token=hub_token) 135 | 136 | 137 | def push_pipeline_params( 138 | pipeline_params, 139 | hub_username, 140 | hub_token: str, 141 | project_name, 142 | ): 143 | repo_id = f"{hub_username}/{project_name}" 144 | temp_path = mktemp() 145 | with open(temp_path, "w") as f: 146 | json.dump(pipeline_params, f) 147 | # upload the pipeline to the hub 148 | hf_api.upload_file( 149 | path_or_fileobj=temp_path, 150 | 
path_in_repo="pipeline_params.json", 151 | token=hub_token, 152 | repo_id=repo_id, 153 | repo_type="dataset", 154 | ) 155 | 156 | print(f"Pipeline params uploaded to {repo_id}") 157 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/infer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | API_URL = ( 5 | "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2" 6 | ) 7 | 8 | 9 | def query(question, hub_token: str): 10 | payload = { 11 | "inputs": question, 12 | "parameters": { 13 | "wait_for_model": True, 14 | "return_full_text": False, 15 | }, 16 | } 17 | headers = {"Authorization": f"Bearer {hub_token}"} 18 | response = requests.post(API_URL, headers=headers, json=payload) 19 | try: 20 | return response.json()[0]["generated_text"] 21 | except Exception: 22 | return "Error occurred while querying the model." 23 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/pages/2_👩🏼‍🔬 Describe Domain.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import streamlit as st 4 | 5 | from hub import push_dataset_to_hub, pull_seed_data_from_repo 6 | from infer import query 7 | from defaults import ( 8 | N_PERSPECTIVES, 9 | N_TOPICS, 10 | SEED_DATA_PATH, 11 | PIPELINE_PATH, 12 | DATASET_REPO_ID, 13 | ) 14 | from utils import project_sidebar, create_seed_terms, create_application_instruction 15 | 16 | 17 | st.set_page_config( 18 | page_title="Domain Data Grower", 19 | page_icon="🧑‍🌾", 20 | ) 21 | project_sidebar() 22 | 23 | 24 | ################################################################################ 25 | # HEADER 26 | ################################################################################ 27 | 28 | st.header("🧑‍🌾 Domain Data Grower") 29 | st.divider() 30 | st.subheader( 31 | "Step 2. 
Define the specific domain that you want to generate synthetic data for.", 32 | ) 33 | st.write( 34 | "Define the project details, including the project name, domain, and API credentials" 35 | ) 36 | 37 | 38 | ################################################################################ 39 | # LOAD EXISTING DOMAIN DATA 40 | ################################################################################ 41 | 42 | DATASET_REPO_ID = ( 43 | f"{st.session_state['hub_username']}/{st.session_state['project_name']}" 44 | ) 45 | SEED_DATA = pull_seed_data_from_repo( 46 | DATASET_REPO_ID, hub_token=st.session_state["hub_token"] 47 | ) 48 | DEFAULT_DOMAIN = SEED_DATA.get("domain", "") 49 | DEFAULT_PERSPECTIVES = SEED_DATA.get("perspectives", [""]) 50 | DEFAULT_TOPICS = SEED_DATA.get("topics", [""]) 51 | DEFAULT_EXAMPLES = SEED_DATA.get("examples", [{"question": "", "answer": ""}]) 52 | DEFAULT_SYSTEM_PROMPT = SEED_DATA.get("domain_expert_prompt", "") 53 | 54 | ################################################################################ 55 | # Domain Expert Section 56 | ################################################################################ 57 | 58 | ( 59 | tab_domain_expert, 60 | tab_domain_perspectives, 61 | tab_domain_topics, 62 | tab_examples, 63 | tab_raw_seed, 64 | ) = st.tabs( 65 | tabs=[ 66 | "👩🏼‍🔬 Domain Expert", 67 | "🔍 Domain Perspectives", 68 | "🕸️ Domain Topics", 69 | "📚 Examples", 70 | "🌱 Raw Seed Data", 71 | ] 72 | ) 73 | 74 | with tab_domain_expert: 75 | st.text("Define the domain expertise that you want to train a language model") 76 | st.info( 77 | "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture." 78 | ) 79 | 80 | domain = st.text_input("Domain Name", DEFAULT_DOMAIN) 81 | 82 | domain_expert_prompt = st.text_area( 83 | label="Domain Expert Definition", 84 | value=DEFAULT_SYSTEM_PROMPT, 85 | height=200, 86 | ) 87 | 88 | ################################################################################ 89 | # Domain Perspectives 90 | ################################################################################ 91 | 92 | with tab_domain_perspectives: 93 | st.text("Define the different perspectives from which the domain can be viewed") 94 | st.info( 95 | """ 96 | Perspectives are different viewpoints or angles from which a domain can be viewed. 
97 | For example, the domain of farming can be viewed from the perspective of a commercial 98 | farmer or an independent family farmer.""" 99 | ) 100 | 101 | perspectives = st.session_state.get( 102 | "perspectives", 103 | [DEFAULT_PERSPECTIVES[0]], 104 | ) 105 | perspectives_container = st.container() 106 | 107 | perspectives = [ 108 | perspectives_container.text_input( 109 | f"Domain Perspective {i + 1}", value=perspective 110 | ) 111 | for i, perspective in enumerate(perspectives) 112 | ] 113 | 114 | if st.button("Add Perspective", key="add_perspective"): 115 | n = len(perspectives) 116 | perspectives.append( 117 | perspectives_container.text_input(f"Domain Perspective {n + 1}", value="") 118 | ) 119 | 120 | st.session_state["perspectives"] = perspectives 121 | 122 | 123 | ################################################################################ 124 | # Domain Topics 125 | ################################################################################ 126 | 127 | with tab_domain_topics: 128 | st.text("Define the main themes or subjects that are relevant to the domain") 129 | st.info( 130 | """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management.""" 131 | ) 132 | topics = st.session_state.get( 133 | "topics", 134 | [DEFAULT_TOPICS[0]], 135 | ) 136 | topics_container = st.container() 137 | topics = [ 138 | topics_container.text_input(f"Domain Topic {i + 1}", value=topic) 139 | for i, topic in enumerate(topics) 140 | ] 141 | 142 | if st.button("Add Topic", key="add_topic"): 143 | n = len(topics) 144 | topics.append(topics_container.text_input(f"Domain Topics {n + 1}", value="")) 145 | 146 | st.session_state["topics"] = topics 147 | 148 | 149 | ################################################################################ 150 | # Examples Section 151 | ################################################################################ 152 | 153 | with tab_examples: 154 | st.text( 155 | "Add high-quality questions and answers that can be used to generate synthetic data" 156 | ) 157 | st.info( 158 | """ 159 | Examples are high-quality questions and answers that can be used to generate 160 | synthetic data for the domain. These examples will be used to train the language model 161 | to generate questions and answers. 
162 | """ 163 | ) 164 | 165 | examples = st.session_state.get( 166 | "examples", 167 | [ 168 | { 169 | "question": "", 170 | "answer": "", 171 | } 172 | ], 173 | ) 174 | 175 | for n, example in enumerate(examples, 1): 176 | question = example["question"] 177 | answer = example["answer"] 178 | examples_container = st.container() 179 | question_column, answer_column = examples_container.columns(2) 180 | 181 | if st.button(f"Generate Answer {n}"): 182 | if st.session_state["hub_token"] is None: 183 | st.error("Please provide a Hub token to generate answers") 184 | else: 185 | answer = query(question, st.session_state["hub_token"]) 186 | with question_column: 187 | question = st.text_area(f"Question {n}", value=question) 188 | 189 | with answer_column: 190 | answer = st.text_area(f"Answer {n}", value=answer) 191 | examples[n - 1] = {"question": question, "answer": answer} 192 | st.session_state["examples"] = examples 193 | st.divider() 194 | 195 | if st.button("Add Example"): 196 | examples.append({"question": "", "answer": ""}) 197 | st.session_state["examples"] = examples 198 | st.rerun() 199 | 200 | ################################################################################ 201 | # Save Domain Data 202 | ################################################################################ 203 | 204 | perspectives = list(filter(None, perspectives)) 205 | topics = list(filter(None, topics)) 206 | 207 | domain_data = { 208 | "domain": domain, 209 | "perspectives": perspectives, 210 | "topics": topics, 211 | "examples": examples, 212 | "domain_expert_prompt": domain_expert_prompt, 213 | "application_instruction": create_application_instruction(domain, examples), 214 | "seed_terms": create_seed_terms(topics, perspectives), 215 | } 216 | 217 | with open(SEED_DATA_PATH, "w") as f: 218 | json.dump(domain_data, f, indent=2) 219 | 220 | with tab_raw_seed: 221 | st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True) 222 | 223 | ################################################################################ 224 | # Setup Dataset on the Hub 225 | ################################################################################ 226 | 227 | st.divider() 228 | 229 | 230 | if st.button("🤗 Push Dataset Seed") and all( 231 | ( 232 | domain, 233 | domain_expert_prompt, 234 | perspectives, 235 | topics, 236 | examples, 237 | ) 238 | ): 239 | if all( 240 | ( 241 | st.session_state.get("project_name"), 242 | st.session_state.get("hub_username"), 243 | st.session_state.get("hub_token"), 244 | ) 245 | ): 246 | project_name = st.session_state["project_name"] 247 | hub_username = st.session_state["hub_username"] 248 | hub_token = st.session_state["hub_token"] 249 | else: 250 | st.error( 251 | "Please create a dataset repo on the Hub before pushing the dataset seed" 252 | ) 253 | st.stop() 254 | 255 | push_dataset_to_hub( 256 | domain_seed_data_path=SEED_DATA_PATH, 257 | project_name=project_name, 258 | domain=domain, 259 | hub_username=hub_username, 260 | hub_token=hub_token, 261 | pipeline_path=PIPELINE_PATH, 262 | ) 263 | 264 | st.success( 265 | f"Dataset seed created and pushed to the Hub. 
Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})" 266 | ) 267 | 268 | st.write("You can now move on to runnning your distilabel pipeline.") 269 | 270 | st.page_link( 271 | page="pages/3_🌱 Generate Dataset.py", 272 | label="Generate Dataset", 273 | icon="🌱", 274 | ) 275 | 276 | else: 277 | st.info( 278 | "Please fill in all the required domain fields to push the dataset seed to the Hub" 279 | ) 280 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/pages/3_🌱 Generate Dataset.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from defaults import ARGILLA_URL 4 | from hub import push_pipeline_params 5 | from utils import project_sidebar 6 | 7 | st.set_page_config( 8 | page_title="Domain Data Grower", 9 | page_icon="🧑‍🌾", 10 | ) 11 | 12 | project_sidebar() 13 | 14 | ################################################################################ 15 | # HEADER 16 | ################################################################################ 17 | 18 | st.header("🧑‍🌾 Domain Data Grower") 19 | st.divider() 20 | st.subheader("Step 3. Run the pipeline to generate synthetic data") 21 | st.write("Define the distilabel pipeline for generating the dataset.") 22 | 23 | hub_username = st.session_state.get("hub_username") 24 | project_name = st.session_state.get("project_name") 25 | hub_token = st.session_state.get("hub_token") 26 | 27 | ############################################################### 28 | # CONFIGURATION 29 | ############################################################### 30 | 31 | st.divider() 32 | 33 | st.markdown("## 🧰 Data Generation Pipeline") 34 | 35 | st.markdown( 36 | """ 37 | Now we need to define the configuration for the pipeline that will generate the synthetic data. 38 | The pipeline will generate synthetic data by combining self-instruction and domain expert responses. 39 | The self-instruction step generates instructions based on seed terms, and the domain expert step generates \ 40 | responses to those instructions. Take a look at the [distilabel docs](https://distilabel.argilla.io/latest/sections/learn/tasks/text_generation/#self-instruct) for more information. 41 | """ 42 | ) 43 | 44 | ############################################################### 45 | # INFERENCE 46 | ############################################################### 47 | 48 | st.markdown("#### 🤖 Inference configuration") 49 | 50 | st.write( 51 | """Add the url of the Huggingface inference API or endpoint that your pipeline should use to generate instruction and response pairs. \ 52 | Some domain tasks may be challenging for smaller models, so you may need to iterate over your task definition and model selection. \ 53 | This is a part of the process of generating high-quality synthetic data, human feedback is key to this process. 
\ 54 | You can find compatible models here:""" 55 | ) 56 | 57 | with st.expander("🤗 Recommended Models"): 58 | st.write("All inference endpoint compatible models can be found via the link below") 59 | st.link_button( 60 | "🤗 Inference compatible models on the Hub", 61 | "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending", 62 | ) 63 | st.write("🔋Projects with sufficient resources could take advantage of Llama 3 70B") 64 | st.code( 65 | "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct" 66 | ) 67 | 68 | st.write("🪫Projects with fewer resources could take advantage of Llama 3 8B") 69 | st.code( 70 | "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct" 71 | ) 72 | 73 | st.write("🍃Projects with even fewer resources could use Phi-3-mini-4k-instruct") 74 | st.code( 75 | "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct" 76 | ) 77 | 78 | st.write("Note: Hugging Face Pro gives access to more compute resources") 79 | st.link_button( 80 | "🤗 Hugging Face Pro", 81 | "https://huggingface.co/pricing", 82 | ) 83 | 84 | 85 | self_instruct_base_url = st.text_input( 86 | label="Model base URL for instruction generation", 87 | value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct", 88 | ) 89 | domain_expert_base_url = st.text_input( 90 | label="Model base URL for domain expert response", 91 | value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct", 92 | ) 93 | 94 | ############################################################### 95 | # PARAMETERS 96 | ############################################################### 97 | 98 | st.divider() 99 | st.markdown("#### 🧮 Parameters configuration") 100 | 101 | st.write( 102 | "⚠️ Model and parameter choices significantly affect the quality of the generated data. \ 103 | We recommend that you start by generating a few samples and reviewing the data, then scale up from there. \ 104 | You can run the pipeline multiple times with different configurations and append the results to the same Argilla dataset." 105 | ) 106 | 107 | st.markdown( 108 | "The number of generations is the number of samples each model will generate for each seed term, \ 109 | so if you have 10 seed terms, 2 instruction generations, and 2 response generations, you will have 40 samples in total." 110 | ) 111 | 112 | self_intruct_num_generations = st.slider( 113 | "Number of generations for self-instruction", 1, 10, 2 114 | ) 115 | domain_expert_num_generations = st.slider( 116 | "Number of generations for domain expert response", 1, 10, 2 117 | ) 118 | 119 | with st.expander("🔥 Advanced parameters"): 120 | st.markdown( 121 | "Temperature is a hyperparameter that controls the randomness of the generated text. \ 122 | Lower temperatures will generate more deterministic text, while higher temperatures \ 123 | will add more variation to generations." 124 | ) 125 | 126 | self_instruct_temperature = st.slider( 127 | "Temperature for self-instruction", 0.1, 1.0, 0.9 128 | ) 129 | domain_expert_temperature = st.slider( 130 | "Temperature for domain expert", 0.1, 1.0, 0.9 131 | ) 132 | 133 | st.markdown( 134 | "`max_new_tokens` is the maximum number of tokens (roughly, word-like units) that can be generated by each model call. \ 135 | This is a way to control the length of the generated text. In some cases, you may want to increase this to \ 136 | generate longer responses. 
You should adapt this value to your model choice, but the default of 2096 works \ 137 | in most cases." 138 | ) 139 | 140 | self_instruct_max_new_tokens = st.number_input( 141 | "Max new tokens for self-instruction", value=2096 142 | ) 143 | domain_expert_max_new_tokens = st.number_input( 144 | "Max new tokens for domain expert", value=2096 145 | ) 146 | 147 | ############################################################### 148 | # ARGILLA API 149 | ############################################################### 150 | 151 | st.divider() 152 | st.markdown("#### 🔬 Argilla API details to push the generated dataset") 153 | st.markdown( 154 | "Here you can define the Argilla API details to push the generated dataset to your Argilla space. \ 155 | These are the defaults that were set up for the project. You can change them if needed." 156 | ) 157 | argilla_url = st.text_input("Argilla API URL", ARGILLA_URL) 158 | argilla_api_key = st.text_input("Argilla API Key", "owner.apikey") 159 | argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name) 160 | st.divider() 161 | 162 | ############################################################### 163 | # Pipeline Run 164 | ############################################################### 165 | 166 | st.markdown("## Run the pipeline") 167 | 168 | st.markdown( 169 | "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine." 170 | ) 171 | 172 | 173 | if all( 174 | [ 175 | argilla_api_key, 176 | argilla_url, 177 | self_instruct_base_url, 178 | domain_expert_base_url, 179 | self_intruct_num_generations, 180 | domain_expert_num_generations, 181 | self_instruct_temperature, 182 | domain_expert_temperature, 183 | hub_username, 184 | project_name, 185 | hub_token, 186 | argilla_dataset_name, 187 | ] 188 | ) and st.button("💾 Save Pipeline Config"): 189 | with st.spinner("Pushing pipeline to the Hub..."): 190 | push_pipeline_params( 191 | pipeline_params={ 192 | "argilla_api_url": argilla_url, 193 | "argilla_dataset_name": argilla_dataset_name, 194 | "self_instruct_base_url": self_instruct_base_url, 195 | "domain_expert_base_url": domain_expert_base_url, 196 | "self_instruct_temperature": self_instruct_temperature, 197 | "domain_expert_temperature": domain_expert_temperature, 198 | "self_intruct_num_generations": self_intruct_num_generations, 199 | "domain_expert_num_generations": domain_expert_num_generations, 200 | "self_instruct_max_new_tokens": self_instruct_max_new_tokens, 201 | "domain_expert_max_new_tokens": domain_expert_max_new_tokens, 202 | }, 203 | hub_username=hub_username, 204 | hub_token=hub_token, 205 | project_name=project_name, 206 | ) 207 | 208 | st.success( 209 | f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub." 210 | ) 211 | 212 | st.markdown( 213 | "To run the pipeline locally, you need to have the `distilabel` library installed. 
\ 214 | You can install it using the following command:" 215 | ) 216 | 217 | st.code( 218 | body=""" 219 | # Install the distilabel library 220 | pip install distilabel 221 | """, 222 | language="bash", 223 | ) 224 | 225 | st.markdown( 226 | "Next, you'll need to clone the pipeline code and install dependencies:" 227 | ) 228 | 229 | st.code( 230 | """ 231 | git clone https://github.com/huggingface/data-is-better-together 232 | cd data-is-better-together/domain-specific-datasets/distilabel_pipelines 233 | pip install -r requirements.txt 234 | huggingface-cli login 235 | """, 236 | language="bash", 237 | ) 238 | 239 | st.markdown("Finally, you can run the pipeline using the following command:") 240 | 241 | st.code( 242 | f""" 243 | python domain_expert_pipeline.py {hub_username}/{project_name}""", 244 | language="bash", 245 | ) 246 | st.markdown( 247 | "👩‍🚀 If you want to customise the pipeline take a look in `domain_expert_pipeline.py` \ 248 | and the [distilabel docs](https://distilabel.argilla.io/)" 249 | ) 250 | 251 | st.markdown( 252 | "🚀 Once you've run the pipeline your records will be available in the Argilla space" 253 | ) 254 | 255 | st.link_button("🔗 Argilla Space", argilla_url) 256 | 257 | st.markdown("Once you've reviewed the data, you can publish it on the next page:") 258 | 259 | st.page_link( 260 | page="pages/4_🔍 Review Generated Data.py", 261 | label="Review Generated Data", 262 | icon="🔍", 263 | ) 264 | 265 | else: 266 | st.info("Please fill all the required fields.") 267 | -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/pages/4_🔍 Review Generated Data.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID 4 | from utils import project_sidebar 5 | from hub import push_argilla_dataset_to_hub 6 | 7 | st.set_page_config( 8 | page_title="Domain Data Grower", 9 | page_icon="🧑‍🌾", 10 | ) 11 | 12 | project_sidebar() 13 | 14 | ################################################################################ 15 | # HEADER 16 | ################################################################################ 17 | 18 | st.header("🧑‍🌾 Domain Data Grower") 19 | st.divider() 20 | 21 | st.write( 22 | """Once you have reviewed the synthetic data in Argilla, you can publish the 23 | generated dataset to the Hub.""" 24 | ) 25 | 26 | 27 | ################################################################################ 28 | # Configuration 29 | ################################################################################ 30 | 31 | st.divider() 32 | st.write("🔬 Argilla API details to push the generated dataset") 33 | argilla_url = st.text_input("Argilla API URL", ARGILLA_URL) 34 | argilla_api_key = st.text_input("Argilla API Key", "owner.apikey") 35 | argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME) 36 | dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID) 37 | st.divider() 38 | 39 | if st.button("🚀 Publish the generated dataset"): 40 | with st.spinner("Publishing the generated dataset..."): 41 | push_argilla_dataset_to_hub( 42 | name=argilla_dataset_name, 43 | repo_id=dataset_repo_id, 44 | url=argilla_url, 45 | api_key=argilla_api_key, 46 | workspace="admin", 47 | hub_token=st.session_state["hub_token"], 48 | ) 49 | st.success("The generated dataset has been published to the Hub.") 50 | 
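# --- Illustrative sketch (added note; not part of the original app code) ---------
# The "Publish the generated dataset" button above delegates the actual publishing to
# `push_argilla_dataset_to_hub` from hub.py, which is not included in this listing.
# Assuming the Argilla 1.x SDK that the rest of this repo uses (rg.init /
# rg.FeedbackDataset.from_argilla), a helper with that call signature could roughly
# look like the sketch below; the function name and body are assumptions for
# illustration only, not the repository's actual implementation.
from typing import Optional

import argilla as rg
from huggingface_hub import login


def push_argilla_dataset_to_hub_sketch(
    name: str,
    repo_id: str,
    url: str,
    api_key: str,
    workspace: str = "admin",
    hub_token: Optional[str] = None,
) -> None:
    """Load a reviewed FeedbackDataset from Argilla and publish it to the Hub."""
    if hub_token:
        # Authenticate against the Hugging Face Hub before pushing
        login(token=hub_token)
    rg.init(api_url=url, api_key=api_key, workspace=workspace)
    # Fetch the annotated dataset from the Argilla Space...
    remote_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    # ...and publish it (records plus an auto-generated dataset card) to the Hub.
    remote_dataset.push_to_huggingface(repo_id)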
-------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/project_config.json: -------------------------------------------------------------------------------- 1 | {"project_name": "DEFAULT_DOMAIN", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"} -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | python_dotenv 3 | streamlit 4 | huggingface_hub 5 | argilla -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/seed_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "domain": "farming", 3 | "perspectives": [ 4 | "Family Farming", 5 | "Agribusiness", 6 | "Permaculture", 7 | "Agroforestery", 8 | "Conventional Farming" 9 | ], 10 | "topics": [ 11 | "animal welfare", 12 | "economic growth", 13 | "land", 14 | "resources", 15 | "efficiency" 16 | ], 17 | "examples": [ 18 | { 19 | "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.", 20 | "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances." 21 | }, 22 | { 23 | "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.", 24 | "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices." 25 | }, 26 | { 27 | "question": "Analyze the economic implications of transitioning from conventional to organic farming.", 28 | "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. 
However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)." 29 | }, 30 | { 31 | "question": "Analyze the social, economic and environnmental impacts of land consolidation vs small-scale farmers.", 32 | "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)." 33 | }, 34 | { 35 | "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ", 36 | "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals." 37 | } 38 | ], 39 | "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology." 40 | } -------------------------------------------------------------------------------- /cookbook-efforts/domain-specific-datasets/project_app/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | 5 | from defaults import ( 6 | PROJECT_NAME, 7 | ARGILLA_URL, 8 | DIBT_PARENT_APP_URL, 9 | DATASET_URL, 10 | DATASET_REPO_ID, 11 | ) 12 | 13 | 14 | def project_sidebar(): 15 | if PROJECT_NAME == "DEFAULT_DOMAIN": 16 | st.warning( 17 | "Please set up the project configuration in the parent app before proceeding." 18 | ) 19 | st.stop() 20 | 21 | st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}") 22 | st.sidebar.markdown( 23 | """ 24 | This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models. 
25 | """ 26 | ) 27 | st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL) 28 | st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL) 29 | hub_username = DATASET_REPO_ID.split("/")[0] 30 | project_name = DATASET_REPO_ID.split("/")[1] 31 | st.session_state["project_name"] = project_name 32 | st.session_state["hub_username"] = hub_username 33 | st.session_state["hub_token"] = st.sidebar.text_input( 34 | "Hub Token", type="password", value=os.environ.get("HF_TOKEN", None) 35 | ) 36 | if st.session_state["hub_token"] is not None: 37 | os.environ["HF_TOKEN"] = st.session_state["hub_token"] 38 | st.sidebar.link_button( 39 | "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens" 40 | ) 41 | if all( 42 | ( 43 | st.session_state.get("project_name"), 44 | st.session_state.get("hub_username"), 45 | st.session_state.get("hub_token"), 46 | ) 47 | ): 48 | st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub") 49 | 50 | st.sidebar.divider() 51 | 52 | st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL) 53 | 54 | if st.session_state["hub_token"] is None: 55 | st.error("Please provide a Hub token to generate answers") 56 | st.stop() 57 | 58 | 59 | def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]: 60 | """Create seed terms for self intruct to start from.""" 61 | 62 | return [ 63 | f"{topic} from a {perspective} perspective" 64 | for topic in topics 65 | for perspective in perspectives 66 | ] 67 | 68 | 69 | def create_application_instruction( 70 | domain: str, system_prompt: str, examples: list[dict[str, str]] 71 | ) -> str: 72 | """Create the instruction for Self-Instruct task.""" 73 | system_prompt = f"""AI assistant in the domain of {domain}. {system_prompt}""" 74 | examples_str = "" 75 | for example in examples: 76 | question = example["question"] 77 | answer = example["answer"] 78 | if len(answer) and len(question): 79 | examples_str += f"""\n- Question: {question}\n- Answer: {answer}\n""" 80 | examples_str += f"""\n- Question: {question}\n- Answer: {answer}\n""" 81 | if len(examples_str): 82 | system_prompt += """Below are some examples of questions and answers \ 83 | that the AI assistant would generate:""" 84 | system_prompt += "\nExamples:" 85 | system_prompt += f"\n{examples_str}" 86 | return system_prompt 87 | -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/01_data_prep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 01. Creating our subsample of Aya to prepare for creating a DPO dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook walks through the steps required to create a sample from the full Aya dataset for the language you are interested in working in. \n", 15 | "In this notebook and the subsequent notebooks we'll focus on Dutch as an example but the process will be rather similar for other languages." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from collections import Counter\n", 25 | "from datasets import Dataset\n", 26 | "from datasets import load_dataset\n", 27 | "from statistics import mean, median" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Let's start by loading the Aya dataset!" 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "aya_ds = load_dataset(\"CohereForAI/aya_dataset\",split='train')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "Dataset({\n", 55 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n", 56 | " num_rows: 202362\n", 57 | "})" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "aya_ds" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "We want to only include the data that is relevant to the language we are interested in. This means we need to filter out the data that is not in Dutch. " 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "Dataset({\n", 85 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n", 86 | " num_rows: 1733\n", 87 | "})" 88 | ] 89 | }, 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "dutch_only = aya_ds.filter(lambda x: x['language'] == 'Dutch')\n", 97 | "dutch_only" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Getting some statistics about the data\n", 105 | "\n", 106 | "To help with the next stages of this process we'll get some statistics about the data. " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "def get_stats(ds: Dataset):\n", 116 | " input_lengths = []\n", 117 | " output_lengths = []\n", 118 | " annotator_counts: Counter = Counter()\n", 119 | " for row in ds:\n", 120 | " input_lengths.append(len(row[\"inputs\"]))\n", 121 | " output_lengths.append(len(row[\"targets\"]))\n", 122 | " annotator_counts.update(ds[\"user_id\"])\n", 123 | " mean_input_length = mean(input_lengths)\n", 124 | " median_input_length = median(input_lengths)\n", 125 | " mean_output_length = mean(output_lengths)\n", 126 | " median_output_length = median(output_lengths)\n", 127 | " max_input_length = max(input_lengths)\n", 128 | " max_output_length = max(output_lengths)\n", 129 | " return {\n", 130 | " \"number_of_unique_annotators\": len(annotator_counts),\n", 131 | " \"input_lengths\": input_lengths,\n", 132 | " \"output_lengths\": output_lengths,\n", 133 | " \"annotator_counts\": dict(annotator_counts),\n", 134 | " \"mean_input_length\": mean_input_length,\n", 135 | " \"median_input_length\": median_input_length,\n", 136 | " \"mean_output_length\": mean_output_length,\n", 137 | " \"median_output_length\": median_output_length,\n", 138 | " \"max_input_length\": max_input_length,\n", 139 | " \"max_output_length\": max_output_length,\n", 140 | " }" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "stats = get_stats(dutch_only)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "There are various things we might be interest in from these stats but some of the most relevant are the length of input and outputs of the data. 
This may help us decide which LLMs to use in the next stage of the process. " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "Max input length: 3030\n", 169 | "Max output length: 21707\n", 170 | "Mean input length: 223.67109059434506\n", 171 | "Mean output length: 352.1806116560877\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(f\"Max input length: {stats['max_input_length']}\")\n", 177 | "print(f\"Max output length: {stats['max_output_length']}\")\n", 178 | "print(f\"Mean input length: {stats['mean_input_length']}\")\n", 179 | "print(f\"Mean output length: {stats['mean_output_length']}\")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Push the subset to the Hub \n", 187 | "\n", 188 | "To help us make testing our pipelines easier we'll create a very small test split (10 samples) that we can use when we're testing out our pipelines. " 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "dutch_only = dutch_only.train_test_split(test_size=100)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "We'll now push this subset to the Hub so that we can use it in the next stage of the process. Don't forget to update this to point to your own Hub workspace. If you are not already authenticated on the Hub uncomment the cell below and run it. \n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# from huggingface_hub import login \n", 214 | "# login()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "dutch_only.push_to_hub(\"data-is-better-together/aya_dataset_dutch_example\")" 224 | ] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": ".venv", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.11.1" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 2 248 | } 249 | -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | # Multilingual DPO/ORPO Dataset Project 6 | 7 | This project aims to encourage the creation of DPO/ORPO datasets for more languages. By providing these tools, we aim to foster a community of people building DPO/ORPO datasets for different languages. Currently, many languages do not have DPO/ORPO datasets openly shared on the Hugging Face Hub. The [data-is-better-together/preference_data_by_language](https://huggingface.co/spaces/data-is-better-together/preference_data_by_language) Space gives an overview of the language coverage. At the time of this commit, only 14 languages with DPO/ORPO datasets are available on the Hugging Face Hub. Following this recipe, you can easily generate a DPO/ORPO dataset for a new language. 8 | 9 | ## What are the goals of this project? 10 | 11 | This project has the following goals: 12 | 13 | - An Argilla Interface for ranking responses generated by a human Aya annotator and a generated response. See the [aya_dutch_dpo](https://dibt-demo-argilla-space.hf.space/dataset/f47eac1c-8763-4513-ab02-b08eb66f7f65/annotation-mode)example. 14 | - A "raw dataset" with LLM feedback for each prompt. See the [data-is-better-together/aya_dutch_dpo_raw](https://huggingface.co/datasets/data-is-better-together/aya_dutch_dpo_raw) for an example. 15 | - A growing dataset with human-verified preferences for each response. See the [data-is-better-together/aya_dutch_dpo](https://huggingface.co/datasets/data-is-better-together/aya_dutch_dpo) for an example dataset. 16 | 17 | ## Why do we need DPO/ORPO datasets for more languages? 18 | 19 |
20 | What is Direct Preference Optimization (DPO)? 21 | Direct Preference Optimization (DPO) is a technique for training models to optimize for human preferences. 22 | 23 | > [Direct Preference Optimization (DPO)](https://huggingface.co/papers/2305.18290) has emerged as a promising alternative for aligning Large Language Models (LLMs) to human or AI preferences. Unlike [traditional alignment methods](https://huggingface.co/blog/rlhf), which are based on reinforcement learning, DPO recasts the alignment formulation as a simple loss function that can be optimized directly on a dataset of preferences ${(x, y_w, y_l)}$, where $x$ is a prompt and $(y_w,y_l)$ are the preferred and dispreferred responses. [source](https://huggingface.co/blog/pref-tuning) 24 | 25 | Or, in other words, to train a model using DPO you need a dataset of prompts and responses where one response is preferred over the other. This type of data is also used for ORPO, another alignment technique we'll describe in the next section. 26 | 27 | ![Sample of a preference tuning dataset.](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/pref_tuning/data.png) 28 | _Example of a preference tuning dataset. Each row contains a prompt and a "chosen" and "rejected" response._ 29 |
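To make the shape of this data concrete, a single preference record pairs one prompt with a preferred and a dispreferred response. The snippet below is purely illustrative; the `prompt`/`chosen`/`rejected` field names follow the common convention used by preference-tuning tooling and are not taken from this project's code:

```python
# One preference pair (x, y_w, y_l): a prompt with a preferred ("chosen")
# and a dispreferred ("rejected") response.
preference_record = {
    "prompt": "Explain the difference between DPO and ORPO in one paragraph.",
    "chosen": "DPO aligns a model by optimising a loss over preference pairs after SFT, "
    "while ORPO folds the preference signal directly into the SFT objective.",
    "rejected": "They are exactly the same technique with different names.",
}
```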

30 | 31 | DPO datasets are a powerful tool for fine-tuning language models to generate responses that are more aligned with human preferences, so are a valuable resource for improving the quality of chatbots and other generative models. However, currently, there are only a few DPO datasets available for a limited number of languages. By generating more DPO datasets for different languages, we can help to improve the quality of generative models in a wider range of languages. 32 | 33 | Recently, Odds Ratio Preference Optimization (ORPO) has been proposed as an alternative to DPO. ORPO is a novel approach to fine-tuning language models that incorporates preference alignment directly into the supervised fine-tuning (SFT) process by using the odds ratio to contrast favored and disfavored generation styles. By applying a minor penalty to the disfavored style during SFT, ORPO effectively guides the model toward the desired behavior without the need for an additional alignment step. 34 | 35 | _tl;dr_: if you have a DPO-style dataset + a strong base model you can use ORPO to train a chat model. Recently Argilla, KAIST, and Hugging Face used this approach to train [HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1) a very strong chat model using only 7k data preference pairs! 36 | 37 | ## How can you contribute? 38 | 39 | As part of Data Is Better Together, we're supporting the community in generating more DPO/ORPO datasets for different languages. If you would like to help, you can follow the steps below to generate a DPO/ORPO dataset for a language that you are interested in. There are already many language communities working together on the Hugging Face Discord server, so you can also join the server to collaborate with others on this project 🤗. 40 | 41 | ## Project Overview 42 | 43 | [Aya](https://cohere.com/blog/aya-multilingual), an open science initiative to accelerate multilingual AI progress, has released a dataset of human-annotated prompt-completion pairs across 71 languages. We can use this dataset to generate DPO/ORPO datasets for languages for which they don't currently exist. 44 | 45 | Here are the steps we'll take to generate a DPO/ORPO dataset for a new language: 46 | 47 | - Start from the [CohereForAI/aya_dataset](https://huggingface.co/datasets/CohereForAI/aya_dataset). 48 | - Filter the Aya dataset to the language you are focusing on. 49 | - Use [`distilabel`](https://github.com/argilla-io/distilabel) to generate a second response for each prompt in the filtered Aya dataset. 50 | - (Optional) Send the generated dataset to [Argilla](https://argilla.io/) for annotation where the community can choose which response is better. 51 | - (Optional) Train a model using the generated DPO/ORPO dataset and push forward the state of the art in your language 🚀🚀🚀 52 | 53 | You can find more detailed instructions on how to generate a DPO/ORPO dataset for a new language in the [instructions](./instructions.md). 54 | 55 | ### I'm GPU-poor, can I still get involved? 56 | 57 | Yes! The example scripts in this repository use Hugging Face Inference Endpoints for the inference component. This means you can run the scripts on your local machine without needing a GPU. We can provide you with GPU grants to run the `distilabel` script if you need them. Please reach out to us on the Hugging Face Discord server if you need a GPU grant. 
**Note**: We will want to ensure that you have a plan for how you will use the GPU grant before providing it. In particular, we'll want to see that you have already set up an Argilla Space for your project and have done some work to identify the language you want to work on and the models you want to use. 58 | 59 | ### Next steps 60 | 61 | The current notebooks and code only show how to generate the synthetic data and create a preference dataset annotation Space. The next steps would be to collect human feedback on the synthetic data and then use this to train a model. We will cover this in a future notebook. -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/assets/banner.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/dpo-orpo-preference/assets/banner.webp -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/aya_dpo_gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict 4 | 5 | import argilla as rg 6 | from custom_preference_to_argilla import CustomPreferenceToArgilla 7 | from distilabel.llms import InferenceEndpointsLLM 8 | from distilabel.pipeline import Pipeline 9 | from distilabel.steps import ( 10 | LoadHubDataset, 11 | StepInput, 12 | StepOutput, 13 | step, 14 | ) 15 | from distilabel.steps.tasks import TextGeneration, UltraFeedback 16 | from distilabel.steps.tasks.typing import ChatType 17 | from dotenv import load_dotenv 18 | from huggingface_hub import InferenceClient, login 19 | 20 | load_dotenv() 21 | 22 | ################################## 23 | # Configuration 24 | # This section contains the configuration for the pipeline. 25 | # This is where you can define the model ID, the maximum number of new tokens to generate, the input batch size for the model via the Inference Endpoints API, and the Argilla configuration. 26 | ################################## 27 | 28 | 29 | # Model Configuration 30 | MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" 31 | MAX_NEW_TOKENS = 2000 # Maximum number of new tokens to generate 32 | 33 | # Inference Endpoints Configuration 34 | # INFERENCE_ENDPOINTS_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct" # Inference endpoints URL 35 | # ENDPOINT_NAME = "meta-llama/Meta-Llama-3-70B-Instruct" 36 | INPUT_BATCH_SIZE = 10 # Input batch size for the model via the Inference Endpoints API; you can adjust this based on the model's requirements and the hardware you are using to deploy the model 37 | 38 | # Argilla Configuration 39 | ARGILLA_SPACE_URL = "https://dibt-demo-argilla-space.hf.space" # Argilla Space URL 40 | ARGILLA_DATASET_NAME = "aya_dutch_dpo" # Argilla dataset name 41 | ARGILLA_WORKSPACE_NAME = "admin" # Argilla workspace name 42 | # Dataset Configuration 43 | INPUT_DATASET_HUB_ID = "data-is-better-together/aya_dataset_dutch_example" # Input dataset hub ID (created in the previous step) 44 | OUTPUT_DATASET_HUB_ID = ( 45 | "data-is-better-together/aya_dutch_dpo_raw" # Output dataset hub ID 46 | ) 47 | SPLIT = "test" # Split of the dataset to use. 
Start with test whilst you are testing the pipeline and then switch to train when you are ready to generate the full dataset 48 | 49 | HUGGINGFACE_TOKEN = os.getenv("HF_API_KEY") 50 | 51 | ####################################### 52 | # Check required environment variables 53 | ####################################### 54 | assert ( 55 | HUGGINGFACE_TOKEN is not None 56 | ), "Please set the HF_API_KEY environment variable or authenticate with the Hugging Face CLI using `huggingface-cli login`" 57 | login(token=HUGGINGFACE_TOKEN) 58 | ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY") 59 | 60 | # Check if the API key is set 61 | assert ( 62 | ARGILLA_API_KEY is not None 63 | ), "Please set the ARGILLA_API_KEY environment variable or pass it as a parameter" 64 | 65 | ##################### 66 | # Helper functions 67 | ##################### 68 | 69 | 70 | def remove_existing_dataset(argilla_dataset_name: str): 71 | """Remove an existing dataset from Argilla. This is useful when re-running the pipeline multiple times.""" 72 | try: 73 | rg.init( 74 | api_url=ARGILLA_SPACE_URL, 75 | api_key=ARGILLA_API_KEY, 76 | workspace=ARGILLA_WORKSPACE_NAME, 77 | ) 78 | argilla_ds = rg.FeedbackDataset.from_argilla(argilla_dataset_name) 79 | argilla_ds.delete() 80 | except ValueError as e: 81 | print(e) 82 | 83 | 84 | ##################################### 85 | # Define distilabel custom steps 86 | ##################################### 87 | 88 | 89 | @step( 90 | inputs=["generation"], 91 | outputs=["predicted_generation_language", "predicted_generation_language_score"], 92 | ) 93 | def language_predict(inputs: StepInput) -> StepOutput: 94 | """ 95 | A step to predict the language of the generated text. 96 | Sometimes models fail to generate text in the desired language. 97 | This step helps to identify such cases using an external language prediction model. 98 | """ 99 | for input in inputs: 100 | try: 101 | cleaned_input = input["generation"].replace("\n", " ") 102 | resp = InferenceClient("laurievb/OpenLID").text_classification( 103 | cleaned_input 104 | ) 105 | top_prediction = resp[0] # top prediction is the first element in the list 106 | input["predicted_generation_language"] = top_prediction.label 107 | input["predicted_generation_language_score"] = min( 108 | 1.0, top_prediction.score 109 | ) # ensure score is between 0 and 1 110 | except Exception as e: 111 | print(e) 112 | input["predicted_generation_language"] = "error" 113 | input["predicted_generation_language_score"] = 0.0 114 | yield inputs 115 | 116 | 117 | @step(inputs=["targets", "generation"], outputs=["generations"]) 118 | def CombineAyaAndModelResponse( 119 | inputs: StepInput, 120 | ) -> StepOutput: 121 | """A step to combine the Aya and model responses and add the response sources.""" 122 | for input in inputs: 123 | input["generations"] = [input["targets"], input["generation"]] 124 | input["generation_models"] = ["aya", MODEL_ID] 125 | yield inputs 126 | 127 | 128 | ####################################################################### 129 | # Define a custom TextGeneration task focused on our target language 130 | ####################################################################### 131 | 132 | 133 | # Custom system prompt 134 | # This translates to something like: 135 | # You are an AI assistant. Your primary language is Dutch. Answer most questions and prompts in Dutch, unless specifically asked to use another language. 
136 | # If you are asked to translate between two other languages, for example from French to English, perform the requested translation. 137 | # When quotes or passages in another language are given in a prompt, assume that the user wants you to understand them and refer to them when formulating your English response. Do not translate the foreign text yourself, unless specifically requested. 138 | 139 | 140 | system_prompt = """Je bent een AI-assistent. Je primaire taal is Nederlands. Beantwoord de meeste vragen en prompts in het Nederlands, tenzij specifiek gevraagd wordt om een andere taal te gebruiken. 141 | Als je gevraagd wordt om te vertalen tussen twee andere talen, bijvoorbeeld van Frans naar Engels, voer dan de gevraagde vertaling uit. Wanneer citaten of passages in een andere taal in een prompt worden gegeven, ga er dan van uit dat de gebruiker wil dat je ze begrijpt en ernaar verwijst bij het formuleren van je Nederlandse antwoord. Vertaal de anderstalige tekst zelf niet, tenzij dit specifiek wordt gevraagd.""" 142 | 143 | 144 | class DutchTextGeneration(TextGeneration): 145 | """A TextGeneration task adds an additional system prompt.""" 146 | 147 | def format_input(self, input: Dict[str, Any]) -> "ChatType": 148 | return [ 149 | {"role": "system", "content": system_prompt}, 150 | {"role": "user", "content": input["instruction"]}, 151 | ] 152 | 153 | 154 | ##################################### 155 | # Define the pipeline 156 | ##################################### 157 | 158 | with Pipeline(name="generate-dpo-responses") as pipeline: 159 | # Load the dataset from the Hugging Face Hub 160 | load_hub_dataset = LoadHubDataset( 161 | name="load_dataset", 162 | output_mappings={"inputs": "instruction"}, 163 | ) 164 | ##################################### 165 | # Define the LLM 166 | ##################################### 167 | llm = InferenceEndpointsLLM( 168 | model_id=MODEL_ID, 169 | tokenizer_id=MODEL_ID, 170 | model_display_name=MODEL_ID, 171 | api_key=HUGGINGFACE_TOKEN, 172 | ) 173 | # Generate responses using the model 174 | text_generation = DutchTextGeneration( 175 | name="text_generation", 176 | llm=llm, 177 | input_batch_size=INPUT_BATCH_SIZE, 178 | output_mappings={"model_name": "generation_model"}, 179 | num_generations=1, 180 | ) 181 | load_hub_dataset.connect(text_generation) 182 | language_prediction = language_predict(name="language_prediction") 183 | text_generation.connect(language_prediction) 184 | combine_columns = CombineAyaAndModelResponse( 185 | name="combine_columns", 186 | ) 187 | 188 | language_prediction.connect(combine_columns) 189 | ultrafeedback = UltraFeedback( 190 | name="ultrafeedback", aspect="overall-rating", llm=llm 191 | ) 192 | combine_columns.connect(ultrafeedback) 193 | to_argilla = CustomPreferenceToArgilla( 194 | name="to_argilla", 195 | api_url=ARGILLA_SPACE_URL, 196 | api_key=ARGILLA_API_KEY, 197 | dataset_name=ARGILLA_DATASET_NAME, 198 | dataset_workspace=ARGILLA_WORKSPACE_NAME, 199 | num_generations=2, 200 | metadata_properties=[ 201 | rg.TermsMetadataProperty(name="predicted_generation_language").dict(), # type: ignore 202 | rg.FloatMetadataProperty( # type: ignore 203 | name="predicted_generation_language_score", min=0.0, max=1.0 204 | ).dict(), 205 | ], 206 | ) 207 | ultrafeedback.connect(to_argilla) 208 | 209 | ##################################### 210 | # Run the pipeline 211 | ##################################### 212 | 213 | if __name__ == "__main__": 214 | start_time = time.time() 215 | if ARGILLA_DATASET_NAME: 216 | 
print(f"Removing existing dataset: {ARGILLA_DATASET_NAME}") 217 | remove_existing_dataset(ARGILLA_DATASET_NAME) 218 | dataset = pipeline.run( 219 | parameters={ 220 | "load_dataset": { 221 | "repo_id": INPUT_DATASET_HUB_ID, 222 | "split": SPLIT, 223 | }, 224 | "text_generation": { 225 | "llm": { 226 | "generation_kwargs": { 227 | "max_new_tokens": MAX_NEW_TOKENS, 228 | "do_sample": True, 229 | "stop_sequences": ["<|end_of_text|>", "<|eot_id|>"], 230 | } 231 | } 232 | }, 233 | "to_argilla": {"dataset_name": ARGILLA_DATASET_NAME}, 234 | } 235 | ) 236 | dataset.push_to_hub(OUTPUT_DATASET_HUB_ID, token=HUGGINGFACE_TOKEN) 237 | end_time = time.time() 238 | print(f"Output dataset: https://huggingface.co/datasets/{OUTPUT_DATASET_HUB_ID}") 239 | print(f"Time taken: {end_time - start_time} seconds") 240 | -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/custom_preference_to_argilla.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import hashlib 3 | from typing import TYPE_CHECKING, Any, Dict, List, Union 4 | 5 | from typing_extensions import override 6 | 7 | with contextlib.suppress(ImportError): 8 | import argilla as rg 9 | from distilabel.steps import PreferenceToArgilla, StepInput 10 | 11 | if TYPE_CHECKING: 12 | from distilabel.steps.typing import StepOutput, RatingQuestion, TextQuestion 13 | 14 | 15 | class CustomPreferenceToArgilla(PreferenceToArgilla): 16 | """ 17 | Custom PreferenceToArgilla step that adds metadata properties to the feedback records. 18 | This allows filtering based on metadata properties in the Argilla UI. 19 | """ 20 | 21 | metadata_properties: List[Dict[str, Any]] 22 | 23 | def load(self) -> None: 24 | super().load() 25 | for metadata_property in self.metadata_properties: 26 | metadata_property_type = metadata_property.pop("type", None) 27 | if metadata_property_type == "float": 28 | metadata_property = rg.FloatMetadataProperty.parse_obj( 29 | metadata_property 30 | ) 31 | elif metadata_property_type == "integer": 32 | metadata_property = rg.IntegerMetadataProperty.parse_obj( 33 | metadata_property 34 | ) 35 | elif metadata_property_type == "terms": 36 | metadata_property = rg.TermsMetadataProperty.parse_obj( 37 | metadata_property 38 | ) 39 | else: 40 | break 41 | self._rg_dataset.add_metadata_property(metadata_property) # type: ignore 42 | 43 | def _rating_rationale_pairs( 44 | self, 45 | ) -> List[Union["RatingQuestion", "TextQuestion"]]: 46 | questions = super()._rating_rationale_pairs() 47 | questions.append( 48 | rg.TextQuestion( # type: ignore 49 | name="improved_response", 50 | title="How would you improve the response?", 51 | required=False, 52 | ) 53 | ) 54 | return questions 55 | 56 | @override 57 | def process(self, inputs: StepInput) -> "StepOutput": # type: ignore 58 | records = [] 59 | for input in inputs: 60 | # Generate the SHA-256 hash of the instruction to use it as the metadata 61 | instruction_id = hashlib.sha256( 62 | input["instruction"].encode("utf-8") # type: ignore 63 | ).hexdigest() 64 | 65 | generations = { 66 | f"{self._generations}-{idx}": generation 67 | for idx, generation in enumerate(input["generations"]) # type: ignore 68 | } 69 | records.append( # type: ignore 70 | rg.FeedbackRecord( # type: ignore 71 | fields={ 72 | "id": instruction_id, 73 | "instruction": input["instruction"], # type: ignore 74 | **generations, 75 | }, 76 | suggestions=self._add_suggestions_if_any(input), # type: ignore 77 | metadata={ 
78 | metadata_property["name"]: input[metadata_property["name"]] 79 | for metadata_property in self.metadata_properties 80 | if metadata_property["name"] in input 81 | }, 82 | ) 83 | ) 84 | self._rg_dataset.add_records(records) # type: ignore 85 | yield inputs 86 | -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/examples/en/01_en_data_prep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 01. Creating our subsample of Aya to prepare for creating a DPO dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook walks through the steps required to create a sample from the full Aya dataset for the language you are interested in working in. \n", 15 | "In this notebook and the subsequent notebooks we'll focus on Dutch as an example but the process will be rather similar for other languages." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from collections import Counter\n", 25 | "from datasets import Dataset\n", 26 | "from datasets import load_dataset\n", 27 | "from statistics import mean, median" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "aya_ds = load_dataset(\"CohereForAI/aya_dataset\",split='train')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "Dataset({\n", 48 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n", 49 | " num_rows: 202362\n", 50 | "})" 51 | ] 52 | }, 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "aya_ds" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "We want to only include the data that is relevant to the language we are interested in. This means we need to filter out the data that is not in Dutch. " 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "Dataset({\n", 78 | " features: ['inputs', 'targets', 'language', 'language_code', 'annotation_type', 'user_id'],\n", 79 | " num_rows: 1733\n", 80 | "})" 81 | ] 82 | }, 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "dutch_only = aya_ds.filter(lambda x: x['language'] == 'Dutch')\n", 90 | "dutch_only" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Getting some statistics about the data\n", 98 | "\n", 99 | "To help with the next stages of this process we'll get some statistics about the data. 
" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def get_stats(ds: Dataset):\n", 109 | " input_lengths = []\n", 110 | " output_lengths = []\n", 111 | " annotator_counts: Counter = Counter()\n", 112 | " for row in ds:\n", 113 | " input_lengths.append(len(row[\"inputs\"]))\n", 114 | " output_lengths.append(len(row[\"targets\"]))\n", 115 | " annotator_counts.update(ds[\"user_id\"])\n", 116 | " mean_input_length = mean(input_lengths)\n", 117 | " median_input_length = median(input_lengths)\n", 118 | " mean_output_length = mean(output_lengths)\n", 119 | " median_output_length = median(output_lengths)\n", 120 | " max_input_length = max(input_lengths)\n", 121 | " max_output_length = max(output_lengths)\n", 122 | " return {\n", 123 | " \"number_of_unique_annotators\": len(annotator_counts),\n", 124 | " \"input_lengths\": input_lengths,\n", 125 | " \"output_lengths\": output_lengths,\n", 126 | " \"annotator_counts\": dict(annotator_counts),\n", 127 | " \"mean_input_length\": mean_input_length,\n", 128 | " \"median_input_length\": median_input_length,\n", 129 | " \"mean_output_length\": mean_output_length,\n", 130 | " \"median_output_length\": median_output_length,\n", 131 | " \"max_input_length\": max_input_length,\n", 132 | " \"max_output_length\": max_output_length,\n", 133 | " }" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "stats = get_stats(dutch_only)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "There are various things we might be interest in from these stats but some of the most relevant are the length of input and outputs of the data. This may help us decide which LLMs to use in the next stage of the process. " 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "Max input length: 3030\n", 162 | "Max output length: 21707\n", 163 | "Mean input length: 223.67109059434506\n", 164 | "Mean output length: 352.1806116560877\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print(f\"Max input length: {stats['max_input_length']}\")\n", 170 | "print(f\"Max output length: {stats['max_output_length']}\")\n", 171 | "print(f\"Mean input length: {stats['mean_input_length']}\")\n", 172 | "print(f\"Mean output length: {stats['mean_output_length']}\")" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Push the subset to the Hub \n", 180 | "\n", 181 | "To help us make testing our pipelines easier we'll create a very small test split (10 samples) that we can use when we're testing out our pipelines. " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "dutch_only = dutch_only.train_test_split(test_size=100)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "We'll now push this subset to the Hub so that we can use it in the next stage of the process. Don't forget to update this to point to your own Hub workspace. If you are not already authenticated on the Hub uncomment the cell below and run it. 
\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 9, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# from huggingface_hub import login \n", 207 | "# login()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "application/vnd.jupyter.widget-view+json": { 218 | "model_id": "97f87da40e5f44a5b72c0afd35b3b37c", 219 | "version_major": 2, 220 | "version_minor": 0 221 | }, 222 | "text/plain": [ 223 | "Uploading the dataset shards: 0%| | 0/1 [00:00 StepOutput: 92 | """ 93 | A step to predict the language of the generated text. 94 | Sometimes models fail to generate text in the desired language. 95 | This step helps to identify such cases using an external language prediction model. 96 | """ 97 | for input in inputs: 98 | try: 99 | cleaned_input = input["generation"].replace("\n", " ") 100 | resp = InferenceClient("laurievb/OpenLID").text_classification( 101 | cleaned_input 102 | ) 103 | top_prediction = resp[0] # top prediction is the first element in the list 104 | input["predicted_generation_language"] = top_prediction.label 105 | input["predicted_generation_language_score"] = min( 106 | 1.0, top_prediction.score 107 | ) # ensure score is between 0 and 1 108 | except Exception as e: 109 | print(e) 110 | input["predicted_generation_language"] = "error" 111 | input["predicted_generation_language_score"] = 0.0 112 | yield inputs 113 | 114 | 115 | @step(inputs=["targets", "generation"], outputs=["generations"]) 116 | def CombineAyaAndModelResponse( 117 | inputs: StepInput, 118 | ) -> StepOutput: 119 | """A step to combine the Aya and model responses and add the response sources.""" 120 | for input in inputs: 121 | input["generations"] = [input["targets"], input["generation"]] 122 | input["generation_models"] = ["aya", MODEL_ID] 123 | yield inputs 124 | 125 | 126 | ####################################################################### 127 | # Define a custom TextGeneration task focused on our target language 128 | ####################################################################### 129 | 130 | 131 | # Custom system prompt 132 | # This translates to something like: 133 | # You are an AI assistant. Your primary language is Dutch. Answer most questions and prompts in Dutch, unless specifically asked to use another language. 134 | # If you are asked to translate between two other languages, for example from French to English, perform the requested translation. 135 | # When quotes or passages in another language are given in a prompt, assume that the user wants you to understand them and refer to them when formulating your English response. Do not translate the foreign text yourself, unless specifically requested. 136 | 137 | 138 | # system_prompt = """Je bent een AI-assistent. Je primaire taal is Nederlands. Beantwoord de meeste vragen en prompts in het Nederlands, tenzij specifiek gevraagd wordt om een andere taal te gebruiken. 139 | # Als je gevraagd wordt om te vertalen tussen twee andere talen, bijvoorbeeld van Frans naar Engels, voer dan de gevraagde vertaling uit. Wanneer citaten of passages in een andere taal in een prompt worden gegeven, ga er dan van uit dat de gebruiker wil dat je ze begrijpt en ernaar verwijst bij het formuleren van je Nederlandse antwoord. 
Vertaal de anderstalige tekst zelf niet, tenzij dit specifiek wordt gevraagd.""" 140 | 141 | 142 | # class DutchTextGeneration(TextGeneration): 143 | # """A TextGeneration task adds an additional system prompt.""" 144 | 145 | # def format_input(self, input: Dict[str, Any]) -> "ChatType": 146 | # return [ 147 | # {"role": "system", "content": system_prompt}, 148 | # {"role": "user", "content": input["instruction"]}, 149 | # ] 150 | 151 | 152 | ##################################### 153 | # Define the pipeline 154 | ##################################### 155 | 156 | with Pipeline(name="generate-dpo-responses") as pipeline: 157 | # Load the dataset from the Hugging Face Hub 158 | load_hub_dataset = LoadHubDataset( 159 | name="load_dataset", 160 | output_mappings={"inputs": "instruction"}, 161 | ) 162 | ##################################### 163 | # Define the LLM 164 | ##################################### 165 | llm = InferenceEndpointsLLM( 166 | model_id=MODEL_ID, 167 | tokenizer_id=MODEL_ID, 168 | model_display_name=MODEL_ID, 169 | api_key=HUGGINGFACE_TOKEN, 170 | ) 171 | # Generate responses using the model 172 | text_generation = TextGeneration( 173 | name="text_generation", 174 | llm=llm, 175 | input_batch_size=INPUT_BATCH_SIZE, 176 | output_mappings={"model_name": "generation_model"}, 177 | num_generations=1, 178 | ) 179 | load_hub_dataset.connect(text_generation) 180 | language_prediction = language_predict(name="language_prediction") 181 | text_generation.connect(language_prediction) 182 | combine_columns = CombineAyaAndModelResponse( 183 | name="combine_columns", 184 | ) 185 | 186 | language_prediction.connect(combine_columns) 187 | ultrafeedback = UltraFeedback( 188 | name="ultrafeedback", aspect="overall-rating", llm=llm 189 | ) 190 | combine_columns.connect(ultrafeedback) 191 | to_argilla = CustomPreferenceToArgilla( 192 | name="to_argilla", 193 | api_url=ARGILLA_SPACE_URL, 194 | api_key=ARGILLA_API_KEY, 195 | dataset_name=ARGILLA_DATASET_NAME, 196 | dataset_workspace=ARGILLA_WORKSPACE_NAME, 197 | num_generations=2, 198 | metadata_properties=[ 199 | rg.TermsMetadataProperty(name="predicted_generation_language").dict(), # type: ignore 200 | rg.FloatMetadataProperty( # type: ignore 201 | name="predicted_generation_language_score", min=0.0, max=1.0 202 | ).dict(), 203 | ], 204 | ) 205 | ultrafeedback.connect(to_argilla) 206 | 207 | ##################################### 208 | # Run the pipeline 209 | ##################################### 210 | 211 | if __name__ == "__main__": 212 | start_time = time.time() 213 | if ARGILLA_DATASET_NAME: 214 | print(f"Removing existing dataset: {ARGILLA_DATASET_NAME}") 215 | remove_existing_dataset(ARGILLA_DATASET_NAME) 216 | dataset = pipeline.run( 217 | parameters={ 218 | "load_dataset": { 219 | "repo_id": INPUT_DATASET_HUB_ID, 220 | "split": SPLIT, 221 | }, 222 | "text_generation": { 223 | "llm": { 224 | "generation_kwargs": { 225 | "max_new_tokens": MAX_NEW_TOKENS, 226 | "do_sample": True, 227 | "stop_sequences": ["<|end_of_text|>", "<|eot_id|>"], 228 | } 229 | } 230 | }, 231 | "to_argilla": {"dataset_name": ARGILLA_DATASET_NAME}, 232 | } 233 | ) 234 | dataset.push_to_hub(OUTPUT_DATASET_HUB_ID, token=HUGGINGFACE_TOKEN) 235 | end_time = time.time() 236 | print(f"Output dataset: https://huggingface.co/datasets/{OUTPUT_DATASET_HUB_ID}") 237 | print(f"Time taken: {end_time - start_time} seconds") 238 | -------------------------------------------------------------------------------- 
/cookbook-efforts/dpo-orpo-preference/examples/en/custom_preference_to_argilla.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import hashlib 3 | from typing import TYPE_CHECKING, Any, Dict, List, Union 4 | 5 | from typing_extensions import override 6 | 7 | with contextlib.suppress(ImportError): 8 | import argilla as rg 9 | from distilabel.steps import PreferenceToArgilla, StepInput 10 | 11 | if TYPE_CHECKING: 12 | from distilabel.steps.typing import StepOutput, RatingQuestion, TextQuestion 13 | 14 | 15 | class CustomPreferenceToArgilla(PreferenceToArgilla): 16 | """ 17 | Custom PreferenceToArgilla step that adds metadata properties to the feedback records. 18 | This allows filtering based on metadata properties in the Argilla UI. 19 | """ 20 | 21 | metadata_properties: List[Dict[str, Any]] 22 | 23 | def load(self) -> None: 24 | super().load() 25 | for metadata_property in self.metadata_properties: 26 | metadata_property_type = metadata_property.pop("type", None) 27 | if metadata_property_type == "float": 28 | metadata_property = rg.FloatMetadataProperty.parse_obj( 29 | metadata_property 30 | ) 31 | elif metadata_property_type == "integer": 32 | metadata_property = rg.IntegerMetadataProperty.parse_obj( 33 | metadata_property 34 | ) 35 | elif metadata_property_type == "terms": 36 | metadata_property = rg.TermsMetadataProperty.parse_obj( 37 | metadata_property 38 | ) 39 | else: 40 | break 41 | self._rg_dataset.add_metadata_property(metadata_property) # type: ignore 42 | 43 | def _rating_rationale_pairs( 44 | self, 45 | ) -> List[Union["RatingQuestion", "TextQuestion"]]: 46 | questions = super()._rating_rationale_pairs() 47 | questions.append( 48 | rg.TextQuestion( # type: ignore 49 | name="improved_response", 50 | title="How would you improve the response?", 51 | required=False, 52 | ) 53 | ) 54 | return questions 55 | 56 | @override 57 | def process(self, inputs: StepInput) -> "StepOutput": # type: ignore 58 | records = [] 59 | for input in inputs: 60 | # Generate the SHA-256 hash of the instruction to use it as the metadata 61 | instruction_id = hashlib.sha256( 62 | input["instruction"].encode("utf-8") # type: ignore 63 | ).hexdigest() 64 | 65 | generations = { 66 | f"{self._generations}-{idx}": generation 67 | for idx, generation in enumerate(input["generations"]) # type: ignore 68 | } 69 | records.append( # type: ignore 70 | rg.FeedbackRecord( # type: ignore 71 | fields={ 72 | "id": instruction_id, 73 | "instruction": input["instruction"], # type: ignore 74 | **generations, 75 | }, 76 | suggestions=self._add_suggestions_if_any(input), # type: ignore 77 | metadata={ 78 | metadata_property["name"]: input[metadata_property["name"]] 79 | for metadata_property in self.metadata_properties 80 | if metadata_property["name"] in input 81 | }, 82 | ) 83 | ) 84 | self._rg_dataset.add_records(records) # type: ignore 85 | yield inputs 86 | -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/requirements.in: -------------------------------------------------------------------------------- 1 | distilabel 2 | argilla==1.27.0 3 | ipykernel 4 | python-dotenv 5 | transformers 6 | ipywidgets 7 | huggingface_hub -------------------------------------------------------------------------------- /cookbook-efforts/dpo-orpo-preference/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following 
command: 2 | # uv pip compile requirements.in -o requirements.txt 3 | aiohttp==3.9.5 4 | # via 5 | # datasets 6 | # fsspec 7 | aiosignal==1.3.1 8 | # via aiohttp 9 | annotated-types==0.6.0 10 | # via pydantic 11 | anyio==4.3.0 12 | # via httpx 13 | appnope==0.1.4 14 | # via ipykernel 15 | argilla==1.27.0 16 | asttokens==2.4.1 17 | # via stack-data 18 | attrs==23.2.0 19 | # via aiohttp 20 | backoff==2.2.1 21 | # via argilla 22 | certifi==2024.2.2 23 | # via 24 | # httpcore 25 | # httpx 26 | # requests 27 | charset-normalizer==3.3.2 28 | # via requests 29 | click==8.1.7 30 | # via typer 31 | comm==0.2.2 32 | # via 33 | # ipykernel 34 | # ipywidgets 35 | datasets==2.19.0 36 | # via distilabel 37 | debugpy==1.8.1 38 | # via ipykernel 39 | decorator==5.1.1 40 | # via ipython 41 | deprecated==1.2.14 42 | # via argilla 43 | dill==0.3.8 44 | # via 45 | # datasets 46 | # multiprocess 47 | distilabel==1.0.3 48 | executing==2.0.1 49 | # via stack-data 50 | filelock==3.14.0 51 | # via 52 | # datasets 53 | # huggingface-hub 54 | # transformers 55 | frozenlist==1.4.1 56 | # via 57 | # aiohttp 58 | # aiosignal 59 | fsspec==2024.3.1 60 | # via 61 | # datasets 62 | # huggingface-hub 63 | h11==0.14.0 64 | # via httpcore 65 | httpcore==1.0.5 66 | # via httpx 67 | httpx==0.26.0 68 | # via 69 | # argilla 70 | # distilabel 71 | huggingface-hub==0.23.0 72 | # via 73 | # datasets 74 | # tokenizers 75 | # transformers 76 | idna==3.7 77 | # via 78 | # anyio 79 | # httpx 80 | # requests 81 | # yarl 82 | ipykernel==6.29.4 83 | ipython==8.24.0 84 | # via 85 | # ipykernel 86 | # ipywidgets 87 | ipywidgets==8.1.2 88 | jedi==0.19.1 89 | # via ipython 90 | jinja2==3.1.3 91 | # via distilabel 92 | jupyter-client==8.6.1 93 | # via ipykernel 94 | jupyter-core==5.7.2 95 | # via 96 | # ipykernel 97 | # jupyter-client 98 | jupyterlab-widgets==3.0.10 99 | # via ipywidgets 100 | markdown-it-py==3.0.0 101 | # via rich 102 | markupsafe==2.1.5 103 | # via jinja2 104 | matplotlib-inline==0.1.7 105 | # via 106 | # ipykernel 107 | # ipython 108 | mdurl==0.1.2 109 | # via markdown-it-py 110 | monotonic==1.6 111 | # via argilla 112 | multidict==6.0.5 113 | # via 114 | # aiohttp 115 | # yarl 116 | multiprocess==0.70.16 117 | # via 118 | # datasets 119 | # distilabel 120 | nest-asyncio==1.6.0 121 | # via 122 | # distilabel 123 | # ipykernel 124 | networkx==3.3 125 | # via distilabel 126 | numpy==1.23.5 127 | # via 128 | # argilla 129 | # datasets 130 | # pandas 131 | # pyarrow 132 | # scipy 133 | # transformers 134 | packaging==24.0 135 | # via 136 | # argilla 137 | # datasets 138 | # huggingface-hub 139 | # ipykernel 140 | # transformers 141 | pandas==2.2.2 142 | # via 143 | # argilla 144 | # datasets 145 | parso==0.8.4 146 | # via jedi 147 | pexpect==4.9.0 148 | # via ipython 149 | platformdirs==4.2.1 150 | # via jupyter-core 151 | prompt-toolkit==3.0.43 152 | # via ipython 153 | psutil==5.9.8 154 | # via ipykernel 155 | ptyprocess==0.7.0 156 | # via pexpect 157 | pure-eval==0.2.2 158 | # via stack-data 159 | pyarrow==16.0.0 160 | # via datasets 161 | pyarrow-hotfix==0.6 162 | # via datasets 163 | pydantic==2.7.1 164 | # via 165 | # argilla 166 | # distilabel 167 | pydantic-core==2.18.2 168 | # via pydantic 169 | pygments==2.17.2 170 | # via 171 | # ipython 172 | # rich 173 | python-dateutil==2.9.0.post0 174 | # via 175 | # jupyter-client 176 | # pandas 177 | python-dotenv==1.0.1 178 | pytz==2024.1 179 | # via pandas 180 | pyyaml==6.0.1 181 | # via 182 | # datasets 183 | # huggingface-hub 184 | # transformers 185 | pyzmq==26.0.3 186 | # 
via 187 | # ipykernel 188 | # jupyter-client 189 | regex==2024.4.28 190 | # via transformers 191 | requests==2.31.0 192 | # via 193 | # datasets 194 | # huggingface-hub 195 | # transformers 196 | rich==13.7.1 197 | # via 198 | # argilla 199 | # distilabel 200 | safetensors==0.4.3 201 | # via transformers 202 | scipy==1.13.0 203 | # via distilabel 204 | six==1.16.0 205 | # via 206 | # asttokens 207 | # python-dateutil 208 | sniffio==1.3.1 209 | # via 210 | # anyio 211 | # httpx 212 | stack-data==0.6.3 213 | # via ipython 214 | tblib==3.0.0 215 | # via distilabel 216 | tokenizers==0.19.1 217 | # via transformers 218 | tornado==6.4 219 | # via 220 | # ipykernel 221 | # jupyter-client 222 | tqdm==4.66.4 223 | # via 224 | # argilla 225 | # datasets 226 | # huggingface-hub 227 | # transformers 228 | traitlets==5.14.3 229 | # via 230 | # comm 231 | # ipykernel 232 | # ipython 233 | # ipywidgets 234 | # jupyter-client 235 | # jupyter-core 236 | # matplotlib-inline 237 | transformers==4.40.1 238 | typer==0.9.4 239 | # via 240 | # argilla 241 | # distilabel 242 | typing-extensions==4.11.0 243 | # via 244 | # huggingface-hub 245 | # ipython 246 | # pydantic 247 | # pydantic-core 248 | # typer 249 | tzdata==2024.1 250 | # via pandas 251 | urllib3==2.2.1 252 | # via requests 253 | wcwidth==0.2.13 254 | # via prompt-toolkit 255 | widgetsnbextension==4.0.10 256 | # via ipywidgets 257 | wrapt==1.14.1 258 | # via 259 | # argilla 260 | # deprecated 261 | xxhash==3.4.1 262 | # via datasets 263 | yarl==1.9.4 264 | # via aiohttp 265 | -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | # KTO Dataset Project 6 | 7 | The KTO Dataset Project aims to create more preference data according to the KTO format. With the provided tools, the community will be able to easily generate a KTO dataset in any language or domain they are interested in. This type of preference data is easier to collect than others like DPO, and can be used to train models to better align with human preferences. By following two simple steps, you will be able to create your KTO dataset. 8 | 9 | ## What is the goal of this project? 10 | 11 | The goal of this project is to create more KTO datasets for different languages or domains. This will help the community to train models that better align with human preferences. The project will provide the tools and resources to easily generate a KTO dataset. 12 | 13 | ### Why do we need more KTO datasets? 14 | 15 |
16 | What is a preference dataset? 17 | 18 | Preference tuning is a step often performed when creating a chat/instruction following model with the goal of more closely aligning the model's outputs with the "human preferences" (or more accurately one set of human preferences). Often this is done through some form of reinforcement learning. Increasingly instead of having a separate reward model, we can use a preference dataset to directly train the model. Two prominent approaches to this are: 19 | 20 | - Direct Preference Optimization (DPO) 21 | - Kahneman-Tversky Optimisation (KTO) 22 | 23 | We won't dive into all of the technical details here but instead focus on what the data for both of these approaches look like. The overall steps are something like this: 24 | 25 | - Have some prompts 26 | - Generate responses to these prompts 27 | - Rank/rate the responses to the prompts 28 | 29 | We'll use the example of haiku here but this could be any kind of text generation task. 30 |
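As a tiny, invented illustration of the data those three steps produce (real prompts and responses would come from your prompt dataset and from the models you query):

```python
# Invented illustration of the three stages: prompts -> candidate responses -> ratings.
prompts = ["Can you compose a haiku about autumn rain?"]

# Step 2: several candidate responses per prompt (normally generated by different models).
candidate_responses = {
    prompts[0]: [
        "Grey clouds lean on roofs,\nleaves let go one at a time,\npuddles hold the sky.",
        "Autumn rain is wet and it falls on the leaves a lot.",
    ]
}

# Step 3: human feedback on each candidate, e.g. a simple thumbs up / thumbs down.
human_ratings = {
    prompts[0]: [True, False],
}
```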
31 | 32 |
33 | What is the difference between DPO vs KTO? 34 | 35 | Whilst both DPO and KTO are methods for preference tuning (and sound like things that would be shouted at the end of a street fighter level), they differ in the kinds of data they require. DPO requires a preference dataset where we have two sets of responses with one "chosen" and one "rejected". We can take a look at a screenshot from a dataset server of a DPO dataset below: 36 | 37 | ![Dataset Server](assets/viewer.png) 38 | 39 | As you can see, we have one column containing "chosen" responses and another containing "rejected" responses. This is the kind of data we would need for DPO. How would we collect this data once we have our candidate haiku responses? If we want to stick to using human feedback rather than a judge LM we would need to indicate their preferences between different haiku. 40 | 41 | There are different ways we could do this. We could ask humans to rate the haiku on a scale of 1-5, we could ask them to pick their favorite haiku from a set of 5, we could ask them to rank the haiku from best to worst etc. One disadvantage of DPO is that generating this kind of data from humans is quite cognitively demanding. It can be hard to compare two things and say which one is better and even with an optimized interface, it can be quite time-consuming. This is where KTO can provide an alternative. 42 | 43 | In contrast to DPO, KTO doesn't require two candidate responses i.e. "chosen" and "rejected". Instead, it can rely on a simple binary preference i.e. 👍👎. This is arguably much easier for an annotator to create. 44 |

45 | 46 | As we know, preference data is crucial for training models that better align with human preferences. However, collecting DPO-formatted data can be time-consuming and expensive. This is where KTO datasets come in. KTO datasets are easier to collect than DPO datasets because they only require a prompt-response dataset with a binary preference, i.e. 👍👎. By creating more KTO datasets, we aim to improve our models with much less annotation effort. 47 | 48 |
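To make the two formats concrete, here is a rough sketch of what a single training example looks like in each case. The column names follow the convention used by preference-tuning libraries such as TRL (`prompt`/`chosen`/`rejected` for DPO and `prompt`/`completion`/`label` for KTO); the prompt and the first haiku are borrowed from the example dataset row shown later in this README, and the weaker response is invented.

```python
# One DPO-style example: the annotator must compare two responses and pick a winner.
dpo_example = {
    "prompt": "Can you compose a haiku about the serenity of mountain peaks?",
    "chosen": "Peaceful summit rests,\nSky's reflection in still lake,\nSilence whispers on.",
    "rejected": "Mountains are big and quiet and nice to look at in the morning.",  # invented weak response
}

# KTO-style examples: each response only needs a binary 👍/👎 label on its own,
# so annotators never have to compare two responses directly.
kto_examples = [
    {
        "prompt": "Can you compose a haiku about the serenity of mountain peaks?",
        "completion": "Peaceful summit rests,\nSky's reflection in still lake,\nSilence whispers on.",
        "label": True,  # 👍
    },
    {
        "prompt": "Can you compose a haiku about the serenity of mountain peaks?",
        "completion": "Mountains are big and quiet and nice to look at in the morning.",
        "label": False,  # 👎
    },
]
```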
49 | Why should we generate responses to prompts? 50 | We could of course collect all of our preference data by hand, i.e. we could write a prompt like: "Write a recipe for banana bread" and then write two sets of responses, one of which we prefer over the other. However, this is time-consuming and not scalable. Instead, we can use a model to generate responses to our prompts and then use human feedback to determine which response we prefer. In our case, we can ask different LLMs to write haiku based on a prompt and then ask humans to rate the haiku. 51 | 52 | ![preference data](assets/dpo.png) 53 |
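As a rough sketch of that idea (kept deliberately simple; the project's actual generation code is the distilabel pipeline in `preference_gen.py`), you could ask a couple of hosted models for candidate responses to each prompt and store them for later rating. The model IDs are the ones used elsewhere in this project, the prompt is borrowed from the example dataset row further down, and in a real run you would also apply each model's chat/prompt template and your haiku system prompt.

```python
# Rough sketch only: generate candidate responses per prompt with hosted models,
# so humans only need to rate the results afterwards. See preference_gen.py for
# the real pipeline (which also handles prompt formatting and a system prompt).
import os

from huggingface_hub import InferenceClient

models = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Llama-2-70b-chat-hf",
]
prompts = ["Can you compose a haiku about the serenity of mountain peaks?"]

candidates = []
for prompt in prompts:
    for model_id in models:
        client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN"))
        completion = client.text_generation(prompt, max_new_tokens=100)
        candidates.append({"prompt": prompt, "model": model_id, "response": completion})

print(candidates[0])
```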
54 | 55 | ## How can you contribute? 56 | 57 | As part of Data Is Better Together, we're supporting the community in generating more KTO datasets for different languages or the domains they are interested in. If you would like to help, you can follow the steps below to generate a KTO dataset. There are already many communities working together on the Hugging Face Discord server, so you can also join the server to collaborate with others on this project 🤗. 58 | 59 | ## Project Overview 60 | 61 | Here we will walk through a simple example of how you might create a KTO dataset using synthetic data and human feedback. We will use haiku as our example but this could be any kind of text generation task. 62 | 63 | ### 1. Prerequisites 64 | 65 | * A 🤗 Hugging Face account: We'll extensively use the Hugging Face Hub both to generate our data via hosted model APIs and to share our generated datasets. You can sign up for a Hugging Face account [here](https://huggingface.co/join). 66 | 67 | * For the workflow we describe here, we assume you already have a dataset of prompts. This [notebook](https://github.com/davanstrien/haiku-dpo/blob/main/01_generate_haiku_prompts.ipynb) shows how you could generate a dataset of haiku prompts. This approach could be adapted to any kind of text-generation task. The [instruction generation](https://distilabel.argilla.io/latest/tutorials/create-a-math-preference-dataset/#instruction-generation) section of this Distilabel tutorial provides a good overview of how you might generate a dataset of prompts for a different kind of text generation task. 68 | 69 | ### 2. Produce generations with various open models 70 | 71 | We will use [Distilabel](https://github.com/argilla-io/distilabel) to generate our haiku responses based on our initial prompt dataset. To generate the dataset, we will use the following models: 72 | 73 | - [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) 74 | - [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) 75 | - [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) 76 | 77 | However, you could swap these out for other models depending on your goals, budget, the domain you are working in, etc. (a sketch of how you might add another model follows at the end of the Hosted Model APIs subsection below). 78 | 79 | You will find the code to generate the haiku responses in [preference_gen.py](preference_gen.py). 80 | 81 | #### Hosted Model APIs 82 | 83 | We can use Hugging Face's free inference API to generate our haiku responses. This is a great way to get started with generating synthetic data. You can find more information on the supported models and how to use the API [here](https://huggingface.co/blog/inference-pro#supported-models). 84 | 85 | One of our models, "NousResearch/Nous-Hermes-2-Yi-34B", is hosted using [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated) instead. In the code, this part is commented out so it should be possible to run the code without needing to set up dedicated inference endpoints. 86 | 87 | > [!WARNING] 88 | > If you have local GPUs available, you can also adapt this approach using other [inference frameworks](https://distilabel.argilla.io/latest/components-gallery/llms/) such as Ollama or vLLM.
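If you do want to swap in or add another hosted model, a light way to do it is to mirror the `load_mistral`/`load_llama2` helper pattern from `preference_gen.py` (shown in full further down in this repository). This sketch uses the same distilabel API as that script; the checkpoint name and `prompt_format` are placeholders you would replace for your chosen model.

```python
# Sketch of adding another hosted model to the generation pool, mirroring the
# load_llama2 / load_mistral helpers in preference_gen.py (same distilabel API as
# pinned in requirements.txt). The checkpoint and prompt_format are placeholders.
import os

from distilabel.llm import LLM, InferenceEndpointsLLM, ProcessLLM
from distilabel.tasks import Task, TextGenerationTask

HF_TOKEN = os.getenv("HF_TOKEN")
task = TextGenerationTask(system_prompt="You are a poet specialising in creating Haiku.")


def load_other_model(task: Task) -> LLM:
    checkpoint = "your-org/your-instruct-model"  # placeholder model ID
    return InferenceEndpointsLLM(
        checkpoint,
        token=HF_TOKEN,
        task=task,
        max_new_tokens=512,
        prompt_format="llama2",  # placeholder: use the format your model expects
    )


other_model = ProcessLLM(task=task, load_llm_fn=load_other_model)
# llms = [mistral, llama2, other_model]  # then build the LLMPool as in preference_gen.py
```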
89 | 90 | #### The dataset produced 91 | 92 | A single row from the dataset produced by this code looks like this: 93 | 94 | ```python 95 | { 96 | "input": "Can you compose a haiku about the serenity of mountain peaks?", 97 | "generation_model": [ 98 | "mistralai/Mistral-7B-Instruct-v0.2", 99 | "meta-llama/Llama-2-70b-chat-hf", 100 | "NousResearch/Nous-Hermes-2-Yi-34B", 101 | ], 102 | "generation_prompt": [ 103 | "[INST] <>\nYou are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n<>\n\nCan you compose a haiku about the serenity of mountain peaks? [/INST]", 104 | "[INST] <>\nYou are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n<>\n\nCan you compose a haiku about the serenity of mountain peaks? [/INST]", 105 | "<|im_start|>system\nYou are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. \n\n<|im_end|>\n<|im_start|>user\nCan you compose a haiku about the serenity of mountain peaks?<|im_end|>\n<|im_start|>assistant\n", 106 | ], 107 | "raw_generation_responses": [ 108 | " Peaceful summit rests,\nSky's reflection in still lake,\nSilence whispers on.", 109 | " Snow-capped peaks rise high\nSilent, majestic, and serene\nNature's peaceful throne", 110 | "Mountain peaks, serene\nPeaceful silence, whispers breeze\nNature's tranquil song", 111 | ], 112 | "generations": [ 113 | " Peaceful summit rests,\nSky's reflection in still lake,\nSilence whispers on.", 114 | " Snow-capped peaks rise high\nSilent, majestic, and serene\nNature's peaceful throne", 115 | "Mountain peaks, serene\nPeaceful silence, whispers breeze\nNature's tranquil song", 116 | ], 117 | } 118 | ``` 119 | 120 | As you can hopefully see, we have a single prompt and three haiku responses. We also have the model that generated each haiku response. This kind of data could be used to generate both a DPO and KTO dataset. We will focus on KTO here. 121 | 122 | ### I'm GPU-poor, can I still get involved? 123 | 124 | Yes! The example scripts in this repository use Hugging Face Inference Endpoints for the inference component. This means you can run the scripts on your local machine without needing a GPU. We can provide you with GPU grants to run the `distilabel` script if you need them. Please reach out to us on the Hugging Face Discord server if you need a GPU grant. **Note**: We will want to ensure that you have a plan for how you will use the GPU grant before providing it, in particular, we'll want to see that you have set up an Argilla Space for your project already and have already done some work to identify the language you want to work on and the models you want to use. 125 | 126 | ## 3. 
Create a preference dataset annotation Space in Argilla hosted on Spaces with HF authentication 127 | 128 | Hugging Face Spaces offer a simple way to host ML demo apps directly on your profile or your organization’s profile. [Argilla](https://argilla.io/) is a powerful data annotation tool that is integrated strongly with Hugging Face Spaces and other parts of the Hugging Face ecosystem. 129 | 130 | ![Argilla Space](assets/space.png) 131 | 132 | The [create_preference_task.ipynb](01_create_preference_task.ipynb) notebook shows how you could create a preference dataset annotation Argilla Space that anyone with a Hugging Face account can contribute to. This is a great way to collect human feedback on your synthetic data. 133 | 134 | This will create a task that looks like this: 135 | 136 | ![Task](assets/task.png) 137 | 138 | ## Next steps 139 | 140 | The current notebooks and code currently only show how to generate the synthetic data and create a preference dataset annotation Space. The next steps would be to collect human feedback on the synthetic data and then use this to train a model. We will cover this in a future notebook. -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/access.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/app-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/app-creation.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/b822ac33-a10e-4da7-a36a-682b96d1fe0e.webp -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/datasets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/datasets.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/dpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/dpo.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/secrets.png -------------------------------------------------------------------------------- 
/cookbook-efforts/kto-preference/assets/space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/space.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/storage.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/task.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/assets/viewer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/data-is-better-together/f7f0e8e2a73f1b289269b9d1b897659cf478c15f/cookbook-efforts/kto-preference/assets/viewer.png -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/preference_gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | from datasets import load_dataset 5 | from distilabel.llm import LLM, InferenceEndpointsLLM, LLMPool, ProcessLLM 6 | from distilabel.pipeline import Pipeline 7 | from distilabel.tasks import Task, TextGenerationTask 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | # You need to set the HF_TOKEN environment variable to your Hugging Face API token 13 | HF_TOKEN = os.getenv("HF_TOKEN") 14 | assert HF_TOKEN is not None, "Please set HF_TOKEN to your Hugging Face API token" 15 | HF_USER_NAME = None 16 | assert HF_USER_NAME, "Please set HF_USER_NAME to your Hugging Face username" 17 | 18 | # if you want to sample from the dataset, set this to the number of samples you want 19 | # if the size of your sample is larger than the dataset the full dataset will be used 20 | SAMPLE_SIZE = None 21 | 22 | 23 | ## Load the dataset of prompts 24 | def prepare_data(): 25 | prompts = load_dataset("davanstrien/haiku_prompts", split="train") 26 | print(f"Loaded {len(prompts)} prompts") 27 | return prompts.rename_column("instructions", "input") 28 | 29 | 30 | dataset = prepare_data() 31 | 32 | ## Define the task 33 | 34 | task = TextGenerationTask( 35 | system_prompt="""You are a poet specialising in creating Haiku. \nYour haiku consist of three lines, with five syllables in the first line, seven in the second, and five in the third.\nBeyond being technically correct, your haiku should also be beautiful and meaningful. \nYou respond only with a haiku. You do not add anything else to your responses. 
\n\n""", 36 | ) 37 | 38 | print(task.system_prompt) 39 | 40 | 41 | # load llms 42 | def load_llama2(task: Task) -> LLM: 43 | return InferenceEndpointsLLM( 44 | "meta-llama/Llama-2-70b-chat-hf", 45 | token=HF_TOKEN, 46 | task=task, 47 | max_new_tokens=512, 48 | prompt_format="llama2", 49 | ) 50 | 51 | 52 | def load_mistral(task: Task) -> LLM: 53 | checkpoint = "mistralai/Mistral-7B-Instruct-v0.2" 54 | return InferenceEndpointsLLM( 55 | checkpoint, 56 | token=HF_TOKEN, 57 | task=task, 58 | max_new_tokens=512, 59 | prompt_format="llama2", 60 | ) 61 | 62 | 63 | # uncomment to use nous-hermes-2-yi-34b-aug 64 | 65 | # def load_nous_yi(task: Task) -> LLM: 66 | # checkpoint = "nous-hermes-2-yi-34b-aug" 67 | # return InferenceEndpointsLLM( 68 | # checkpoint, 69 | # token=HF_TOKEN, 70 | # task=task, 71 | # max_new_tokens=488, 72 | # prompt_format="chatml", 73 | # ) 74 | 75 | 76 | mistral = ProcessLLM(task=task, load_llm_fn=load_mistral) 77 | llama2 = ProcessLLM(task=task, load_llm_fn=load_llama2) 78 | # uncomment to use nous-hermes-2-yi-34b-aug 79 | # nous_yi = ProcessLLM(task=task, load_llm_fn=load_nous_yi) 80 | 81 | llms = [ 82 | mistral, 83 | llama2, 84 | ] # nous_yi] # uncomment to use nous-hermes-2-yi-34b-aug 85 | 86 | 87 | pool = LLMPool(llms=llms) 88 | 89 | 90 | pipeline = Pipeline(generator=pool) 91 | 92 | if SAMPLE_SIZE is not None: 93 | sample_idx = random.sample(range(len(dataset)), min(SAMPLE_SIZE, len(dataset))) 94 | dataset = dataset.select(sample_idx) 95 | print(f"Using {len(dataset)} prompts") 96 | 97 | print("Generating haiku...") 98 | haiku = pipeline.generate( 99 | dataset, 100 | num_generations=3, 101 | batch_size=1, 102 | display_progress_bar=True, 103 | shuffle_before_labelling=False, 104 | ) 105 | 106 | print(haiku) 107 | print("Pushing to hub...") 108 | haiku.push_to_hub(f"{HF_USER_NAME}/haiku_dpo", "aesthetic-preference", token=HF_TOKEN) 109 | -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/requirements.in: -------------------------------------------------------------------------------- 1 | argilla 2 | datasets 3 | distilabel[hf-inference-endpoints] 4 | huggingface_hub 5 | ipywidgets 6 | python-dotenv -------------------------------------------------------------------------------- /cookbook-efforts/kto-preference/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile requirements.in -o requirements.txt 3 | aiohttp==3.9.3 4 | # via 5 | # datasets 6 | # fsspec 7 | aiosignal==1.3.1 8 | # via aiohttp 9 | annotated-types==0.6.0 10 | # via pydantic 11 | anyio==4.3.0 12 | # via httpx 13 | argilla==1.25.0 14 | asttokens==2.4.1 15 | # via stack-data 16 | attrs==23.2.0 17 | # via aiohttp 18 | backoff==2.2.1 19 | # via argilla 20 | certifi==2024.2.2 21 | # via 22 | # httpcore 23 | # httpx 24 | # requests 25 | charset-normalizer==3.3.2 26 | # via requests 27 | click==8.1.7 28 | # via 29 | # nltk 30 | # typer 31 | comm==0.2.2 32 | # via ipywidgets 33 | datasets==2.18.0 34 | # via distilabel 35 | decorator==5.1.1 36 | # via ipython 37 | deprecated==1.2.14 38 | # via argilla 39 | dill==0.3.8 40 | # via 41 | # datasets 42 | # multiprocess 43 | distilabel==0.6.0 44 | executing==2.0.1 45 | # via stack-data 46 | filelock==3.13.1 47 | # via 48 | # datasets 49 | # huggingface-hub 50 | frozenlist==1.4.1 51 | # via 52 | # aiohttp 53 | # aiosignal 54 | fsspec==2024.2.0 55 | # via 56 | # datasets 57 | # 
huggingface-hub 58 | h11==0.14.0 59 | # via httpcore 60 | httpcore==1.0.4 61 | # via httpx 62 | httpx==0.26.0 63 | # via argilla 64 | huggingface-hub==0.21.4 65 | # via 66 | # datasets 67 | # distilabel 68 | idna==3.6 69 | # via 70 | # anyio 71 | # httpx 72 | # requests 73 | # yarl 74 | ipython==8.22.2 75 | # via ipywidgets 76 | ipywidgets==8.1.2 77 | jedi==0.19.1 78 | # via ipython 79 | jinja2==3.1.3 80 | # via distilabel 81 | joblib==1.3.2 82 | # via nltk 83 | jupyterlab-widgets==3.0.10 84 | # via ipywidgets 85 | markdown-it-py==3.0.0 86 | # via rich 87 | markupsafe==2.1.5 88 | # via jinja2 89 | matplotlib-inline==0.1.6 90 | # via ipython 91 | mdurl==0.1.2 92 | # via markdown-it-py 93 | monotonic==1.6 94 | # via argilla 95 | multidict==6.0.5 96 | # via 97 | # aiohttp 98 | # yarl 99 | multiprocess==0.70.16 100 | # via 101 | # datasets 102 | # distilabel 103 | nltk==3.8.1 104 | # via argilla 105 | numpy==1.23.5 106 | # via 107 | # argilla 108 | # datasets 109 | # pandas 110 | # pyarrow 111 | packaging==24.0 112 | # via 113 | # argilla 114 | # datasets 115 | # huggingface-hub 116 | pandas==2.2.1 117 | # via 118 | # argilla 119 | # datasets 120 | parso==0.8.3 121 | # via jedi 122 | pexpect==4.9.0 123 | # via ipython 124 | prompt-toolkit==3.0.43 125 | # via ipython 126 | ptyprocess==0.7.0 127 | # via pexpect 128 | pure-eval==0.2.2 129 | # via stack-data 130 | pyarrow==15.0.1 131 | # via datasets 132 | pyarrow-hotfix==0.6 133 | # via datasets 134 | pydantic==2.6.4 135 | # via argilla 136 | pydantic-core==2.16.3 137 | # via pydantic 138 | pygments==2.17.2 139 | # via 140 | # ipython 141 | # rich 142 | python-dateutil==2.9.0.post0 143 | # via pandas 144 | python-dotenv==1.0.1 145 | pytz==2024.1 146 | # via pandas 147 | pyyaml==6.0.1 148 | # via 149 | # datasets 150 | # huggingface-hub 151 | regex==2023.12.25 152 | # via nltk 153 | requests==2.31.0 154 | # via 155 | # datasets 156 | # huggingface-hub 157 | rich==13.7.1 158 | # via 159 | # argilla 160 | # distilabel 161 | six==1.16.0 162 | # via 163 | # asttokens 164 | # python-dateutil 165 | sniffio==1.3.1 166 | # via 167 | # anyio 168 | # httpx 169 | stack-data==0.6.3 170 | # via ipython 171 | tenacity==8.2.3 172 | # via distilabel 173 | tqdm==4.66.2 174 | # via 175 | # argilla 176 | # datasets 177 | # huggingface-hub 178 | # nltk 179 | traitlets==5.14.2 180 | # via 181 | # comm 182 | # ipython 183 | # ipywidgets 184 | # matplotlib-inline 185 | typer==0.9.0 186 | # via argilla 187 | typing-extensions==4.10.0 188 | # via 189 | # huggingface-hub 190 | # pydantic 191 | # pydantic-core 192 | # typer 193 | tzdata==2024.1 194 | # via pandas 195 | urllib3==2.2.1 196 | # via requests 197 | wcwidth==0.2.13 198 | # via prompt-toolkit 199 | widgetsnbextension==4.0.10 200 | # via ipywidgets 201 | wrapt==1.14.1 202 | # via 203 | # argilla 204 | # deprecated 205 | xxhash==3.4.1 206 | # via datasets 207 | yarl==1.9.4 208 | # via aiohttp 209 | --------------------------------------------------------------------------------