├── .dockerignore ├── .env.local.template ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── assets ├── argilla.png ├── flow.png ├── logo.png ├── logo.svg ├── ui-full.png └── ui.png ├── docker-compose.yml ├── docker ├── .env.docker.template ├── Dockerfile ├── README.md ├── argilla │ └── compose.yml └── ollama │ ├── compose.yml │ └── entrypoint.sh ├── examples ├── argilla-deployment.py ├── blog_private_synthetic_data_generation.md ├── fine-tune-deepseek-reasoning-sft.ipynb ├── fine-tune-modernbert-classifier.ipynb ├── fine-tune-modernbert-rag.ipynb ├── fine-tune-smollm2-on-synthetic-data.ipynb ├── hf-dedicated-or-tgi-deployment.py ├── hf-serverless-deployment-deepseek.py ├── hf-serverless-deployment.py ├── hf-serverless-different-model-for-completion.py ├── ollama-deployment.py ├── ollama-different-model-for-completion.py ├── openai-deployment.py └── vllm-deployment.py ├── packages.txt ├── pdm.lock ├── pyproject.toml ├── requirements.txt ├── src └── synthetic_dataset_generator │ ├── __init__.py │ ├── __main__.py │ ├── _distiset.py │ ├── _inference_endpoints.py │ ├── _tabbedinterface.py │ ├── app.py │ ├── apps │ ├── __init__.py │ ├── about.py │ ├── base.py │ ├── chat.py │ ├── eval.py │ ├── rag.py │ └── textcat.py │ ├── constants.py │ ├── pipelines │ ├── __init__.py │ ├── base.py │ ├── chat.py │ ├── embeddings.py │ ├── eval.py │ ├── rag.py │ └── textcat.py │ └── utils.py └── tests └── __init__.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | .gitignore 4 | 5 | # Python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | *.so 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # Virtual environments 29 | .env* 30 | !.env.example 31 | .venv 32 | env/ 33 | venv/ 34 | ENV/ 35 | 36 | # IDE 37 | .idea/ 38 | .vscode/ 39 | *.swp 40 | *.swo 41 | 42 | # Testing 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Project specific 54 | nltk_data/ 55 | .pdm-python 56 | .pdm.toml 57 | __pypackages__/ -------------------------------------------------------------------------------- /.env.local.template: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # LOCAL/API CONFIGURATION 3 | # ============================================================================= 4 | 5 | # ----------------------------------------------------------------------------- 6 | # REQUIRED CONFIGURATION 7 | # ----------------------------------------------------------------------------- 8 | # Hugging Face token (required for all setups) 9 | HF_TOKEN=hf_... 10 | 11 | # Generation Settings 12 | MAX_NUM_TOKENS=2048 13 | MAX_NUM_ROWS=1000 14 | DEFAULT_BATCH_SIZE=5 15 | 16 | # Required for chat data generation with Llama or Qwen models 17 | # Options: "llama3", "qwen2", or custom template string 18 | MAGPIE_PRE_QUERY_TEMPLATE=llama3 19 | 20 | # ----------------------------------------------------------------------------- 21 | # A. CLOUD API SERVICES 22 | # ----------------------------------------------------------------------------- 23 | 24 | # 1. 
HUGGING FACE INFERENCE API (Default, Recommended) 25 | MODEL=meta-llama/Llama-3.1-8B-Instruct 26 | # MODEL=Qwen/Qwen2.5-1.5B-Instruct 27 | 28 | # 2. OPENAI API 29 | # OPENAI_BASE_URL=https://api.openai.com/v1/ 30 | # MODEL=gpt-4 31 | # API_KEY=sk-... 32 | 33 | # 3. HUGGING FACE SPACE FOR ARGILLA (optional) 34 | # ARGILLA_API_URL=https://your-space.hf.space/ 35 | # ARGILLA_API_KEY=your_key 36 | 37 | # ----------------------------------------------------------------------------- 38 | # B. LOCAL SERVICES (Requires Installation) 39 | # ----------------------------------------------------------------------------- 40 | 41 | # 1. LOCAL OLLAMA 42 | # OLLAMA_BASE_URL=http://127.0.0.1:11434/ 43 | # MODEL=llama3.2:1b 44 | # TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct 45 | 46 | # 2. LOCAL VLLM 47 | # VLLM_BASE_URL=http://127.0.0.1:8000/ 48 | # MODEL=Qwen/Qwen2.5-1.5B-Instruct 49 | # TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct 50 | 51 | # 3. LOCAL TGI 52 | # HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/ 53 | # MODEL=meta-llama/Llama-3.1-8B-Instruct 54 | # TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct 55 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tar filter=lfs diff=lfs merge=lfs -text 29 | *.tflite filter=lfs diff=lfs merge=lfs -text 30 | *.tgz filter=lfs diff=lfs merge=lfs -text 31 | *.wasm filter=lfs diff=lfs merge=lfs -text 32 | *.xz filter=lfs diff=lfs merge=lfs -text 33 | *.zip filter=lfs diff=lfs merge=lfs -text 34 | *.zst filter=lfs diff=lfs merge=lfs -text 35 | *tfevents* filter=lfs diff=lfs merge=lfs -text 36 | assets/flow.png filter=lfs diff=lfs merge=lfs -text 37 | *.sh text eol=lf 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm-project.org/#use-with-ide 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | .python-version 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | .DS_Store 165 | 166 | # nltk 167 | nltk_data/ 168 | 169 | # examples 170 | models/ 171 | 172 | # Elasticsearch data 173 | elasticsearch_data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Synthetic Data Generator 3 | short_description: Build datasets using natural language 4 | emoji: 🧬 5 | colorFrom: yellow 6 | colorTo: pink 7 | sdk: gradio 8 | sdk_version: 5.8.0 9 | app_file: app.py 10 | pinned: true 11 | license: apache-2.0 12 | hf_oauth: true 13 | #header: mini 14 | hf_oauth_scopes: 15 | - read-repos 16 | - write-repos 17 | - manage-repos 18 | - inference-api 19 | --- 20 | 21 | > [!IMPORTANT] 22 | The original authors have moved on to other projects. While the code might still be functional for its original purpose, please be aware that the original team does not plan to develop new features, bug fixes, or updates. If you'd like to become a maintainer, please open an issue to discuss. 23 | > 24 | > 25 |
26 | 27 | 28 | Synthetic Data Generator Logo 29 | 30 | Build datasets using natural language
31 | 32 | ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png) 33 | 34 | ## Introduction 35 | 36 | Synthetic Data Generator is a tool that allows you to create high-quality datasets for training and fine-tuning language models. It leverages the power of distilabel and LLMs to generate synthetic data tailored to your specific needs. [The announcement blog](https://huggingface.co/blog/synthetic-data-generator) goes over a practical example of how to use it, but you can also watch the [video](https://www.youtube.com/watch?v=nXjVtnGeEss) to see it in action. 37 | 38 | Supported Tasks: 39 | 40 | - Text Classification 41 | - Chat Data for Supervised Fine-Tuning 42 | - Retrieval Augmented Generation 43 | 44 | This tool simplifies the process of creating custom datasets, enabling you to: 45 | 46 | - Describe the characteristics of your desired application 47 | - Iterate on sample datasets 48 | - Produce full-scale datasets 49 | - Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or [Argilla](https://docs.argilla.io/) 50 | 51 | By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process. 52 | 53 |


64 | 65 | ## Installation 66 | 67 | You can simply install the package with: 68 | 69 | ```bash 70 | pip install synthetic-dataset-generator 71 | ``` 72 | 73 | ### Quickstart 74 | 75 | ```python 76 | from synthetic_dataset_generator import launch 77 | 78 | launch() 79 | ``` 80 | 81 | ### Environment Variables 82 | 83 | - `HF_TOKEN`: Your [Hugging Face token](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained) to push your datasets to the Hugging Face Hub and generate free completions from Hugging Face Inference Endpoints. You can find some configuration examples in the [examples](examples/) folder. 84 | 85 | You can set the following environment variables to customize the generation process. 86 | 87 | - `MAX_NUM_TOKENS`: The maximum number of tokens to generate, defaults to `2048`. 88 | - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`. 89 | - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`. 90 | 91 | Optionally, you can use different API providers and models. 92 | 93 | - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`, `llama3.1`. 94 | - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the `HF_TOKEN` environment variable. 95 | - `OPENAI_BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api.openai.com/v1/`. 96 | - `OLLAMA_BASE_URL`: The base URL for any Ollama compatible API, e.g. `http://127.0.0.1:11434/`. 97 | - `HUGGINGFACE_BASE_URL`: The base URL for any Hugging Face compatible API, e.g. TGI server or Dedicated Inference Endpoints. If you want to use serverless inference, only set the `MODEL`. 98 | - `VLLM_BASE_URL`: The base URL for any VLLM compatible API, e.g. `http://localhost:8000/`. 99 | 100 | To use a specific model exclusively for generating completions, set the corresponding environment variables by appending `_COMPLETION` to the ones mentioned earlier. For example, you can use `MODEL_COMPLETION` and `OPENAI_BASE_URL_COMPLETION`. 101 | 102 | SFT and Chat Data generation is not supported with OpenAI Endpoints. Additionally, you need to configure it per model family based on their prompt templates using the right `TOKENIZER_ID` and `MAGPIE_PRE_QUERY_TEMPLATE` environment variables. 103 | 104 | - `TOKENIZER_ID`: The tokenizer ID to use for the magpie pipeline, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`. 105 | - `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie, which is only supported with Hugging Face Inference Endpoints. `llama3` and `qwen2` are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"`, respectively. For other models, you can pass a custom pre-query template string. 106 | 107 | Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables: 108 | 109 | - `ARGILLA_API_KEY`: Your Argilla API key to push your datasets to Argilla. 110 | - `ARGILLA_API_URL`: Your Argilla API URL to push your datasets to Argilla. 111 | 112 | To save the generated datasets to a local directory instead of pushing them to the Hugging Face Hub, set the following environment variable: 113 | 114 | - `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to. 
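
The variables above can also be set from Python before launching the app, in the same spirit as the scripts in the [examples](examples/) folder. The snippet below is a minimal sketch; the token and model values are placeholders to replace with your own.

```python
import os

# Placeholder values: substitute your own token and preferred model/provider
os.environ["HF_TOKEN"] = "hf_..."  # required for pushing datasets and serverless inference
os.environ["MODEL"] = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # generation model
os.environ["MAX_NUM_ROWS"] = "1000"  # upper bound on generated rows
os.environ["MAX_NUM_TOKENS"] = "2048"  # upper bound on tokens per generation

from synthetic_dataset_generator import launch

launch()
```
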
115 | 116 | You can use our environment template as a starting point: 117 | 118 | ```bash 119 | cp .env.local.template .env 120 | ``` 121 | 122 | ### Argilla integration 123 | 124 | Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/). 125 | 126 | ![Argilla integration](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/argilla.png) 127 | 128 | ## Custom synthetic data generation? 129 | 130 | Each pipeline is based on distilabel, so you can easily change the LLM or the pipeline steps. 131 | 132 | Check out the [distilabel library](https://github.com/argilla-io/distilabel) for more information. 133 | 134 | ## Development 135 | 136 | Install the dependencies: 137 | 138 | ```bash 139 | # Create a virtual environment 140 | python -m venv .venv 141 | source .venv/bin/activate 142 | 143 | # Install the dependencies 144 | pip install -e . # pdm install 145 | ``` 146 | 147 | Run the app: 148 | 149 | ```bash 150 | python app.py 151 | ``` 152 | 153 | ## 🐳 Docker Setup 154 | 155 | The containerized tool uses Ollama for local LLM inference and Argilla for data curation. Here's the architecture: 156 | 157 | ![Container Structure](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png) 158 | 159 | Quick setup with all services (App + Ollama + Argilla): 160 | 161 | ```bash 162 | # Copy environment template 163 | cp docker/.env.docker.template .env # Add your HF_TOKEN in .env 164 | 165 | # Build all services (this may take a few minutes) 166 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build 167 | 168 | # Start all services 169 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 170 | ``` 171 | 172 | > For more detailed Docker configurations and setups, check [docker/README.md](docker/README.md) 173 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from synthetic_dataset_generator import launch 2 | 3 | if __name__ == "__main__": 4 | launch() 5 | -------------------------------------------------------------------------------- /assets/argilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/argilla.png -------------------------------------------------------------------------------- /assets/flow.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b0465f5f3ed2a87b14cc609a1f25a1e7b0bfeb1cc8cab534a6ec79a9a8651996 3 | size 1810372 4 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/logo.png -------------------------------------------------------------------------------- /assets/ui-full.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/ui-full.png -------------------------------------------------------------------------------- /assets/ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/ui.png -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | build: 4 | context: . 5 | dockerfile: docker/Dockerfile 6 | image: synthetic-data-generator:app 7 | ports: 8 | - "7860:7860" 9 | env_file: 10 | - .env 11 | networks: 12 | - app-network 13 | 14 | networks: 15 | app-network: 16 | name: synthetic-data-network 17 | driver: bridge -------------------------------------------------------------------------------- /docker/.env.docker.template: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # DOCKER CONFIGURATION ONLY - FULL SETUP (APP + OLLAMA + ARGILLA) 3 | # ============================================================================= 4 | 5 | # Note: Before building: 6 | # 1. Copy this template to the root directory: cp docker/.env.docker.template .env 7 | # 2. Comment/uncomment the sections you want to use (OLLAMA and/or ARGILLA) 8 | # 3. Then build and run with the appropriate docker compose command 9 | 10 | # Hugging Face token with read/write permissions 11 | HF_TOKEN=your_token_here 12 | 13 | # ----------------------------------------------------------------------------- 14 | # GENERATION SETTINGS 15 | # ----------------------------------------------------------------------------- 16 | MAX_NUM_TOKENS=2048 17 | MAX_NUM_ROWS=1000 18 | DEFAULT_BATCH_SIZE=5 19 | 20 | # ----------------------------------------------------------------------------- 21 | # OLLAMA DOCKER CONFIGURATION 22 | # ----------------------------------------------------------------------------- 23 | OLLAMA_BASE_URL=http://ollama:11434 24 | OLLAMA_HARDWARE=latest # latest (for CPU/NVIDIA), rocm (for AMD) 25 | 26 | # LLAMA 3.2 27 | MODEL=llama3.2:1b 28 | TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct 29 | MAGPIE_PRE_QUERY_TEMPLATE=llama3 30 | 31 | # DEEPSEEK R1 32 | #MODEL=deepseek-r1:1.5b # must match ollama tags https://ollama.com/library/deepseek-r1:1.5b 33 | #TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 34 | #MAGPIE_PRE_QUERY_TEMPLATE= "<|begin▁of▁sentence|>User: " 35 | 36 | # ----------------------------------------------------------------------------- 37 | # ARGILLA DOCKER CONFIGURATION (persistent data) 38 | # ----------------------------------------------------------------------------- 39 | ARGILLA_API_URL=http://argilla:6900 40 | ARGILLA_USERNAME=admin 41 | ARGILLA_PASSWORD=admin1234 42 | ARGILLA_API_KEY=admin.1234 43 | ARGILLA_REINDEX_DATASET=1 -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python slim image as base 2 | FROM python:3.10-slim 3 | 4 | # Set environment variables 5 | ENV PYTHONUNBUFFERED=1 \ 6 | PYTHONDONTWRITEBYTECODE=1 \ 7 | PIP_NO_CACHE_DIR=1 8 | 9 | # Create and set working directory 10 | WORKDIR /app 11 | 12 | # Create non-root user first 13 | RUN 
useradd -m -u 1000 appuser 14 | 15 | # Install system dependencies including build tools 16 | RUN apt-get update && apt-get install -y --no-install-recommends \ 17 | curl \ 18 | build-essential \ 19 | cmake \ 20 | libgl1-mesa-glx \ 21 | libglib2.0-0 \ 22 | libsm6 \ 23 | libxext6 \ 24 | libxrender-dev \ 25 | && rm -rf /var/lib/apt/lists/* 26 | 27 | # Install pdm 28 | RUN pip install --no-cache-dir pdm 29 | 30 | # Copy project files and set permissions 31 | COPY . . 32 | RUN chown -R appuser:appuser /app && \ 33 | chmod -R 755 /app 34 | 35 | # Switch to non-root user 36 | USER appuser 37 | 38 | # Install dependencies in a virtual environment 39 | RUN pdm install --prod --frozen-lockfile 40 | 41 | # Expose Gradio port 42 | EXPOSE 7860 43 | 44 | # Start command using pdm run to use the virtual environment 45 | CMD ["pdm", "run", "python", "-m", "synthetic_dataset_generator"] -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker Configuration Guide 2 | 3 | Each service runs in its own container, communicating through internal networks. The core app connects to Ollama for model inference and Argilla for data review: 4 | 5 | ![Container Structure](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png) 6 | 7 | The application can be run with different configurations using Docker Compose: 8 | 9 | - `docker-compose.yml`: Core application 10 | - `docker/ollama/compose.yml`: Ollama service for local LLM inference 11 | - `docker/argilla/compose.yml`: Argilla service for data curation 12 | 13 | ## Ollama Integration 14 | 15 | The `MODEL` variable in your `.env` file determines which model Ollama will download and use. For example: 16 | ```env 17 | MODEL=llama3.2:1b 18 | ``` 19 | 20 | ## Setup Options 21 | 22 | ### Full Setup (App + Ollama + Argilla) 23 | ```bash 24 | # Keep all sections uncommented in .env 25 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build 26 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 27 | ``` 28 | 29 | ### App + Ollama 30 | ```bash 31 | # Comment out ARGILLA section in .env 32 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml build 33 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d 34 | ``` 35 | 36 | ### App + Argilla 37 | ```bash 38 | # Comment out OLLAMA section in .env 39 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml build 40 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml up -d 41 | ``` 42 | 43 | ### App Only 44 | ```bash 45 | # Comment out both OLLAMA and ARGILLA sections in .env 46 | docker compose -f docker-compose.yml build 47 | docker compose -f docker-compose.yml up -d 48 | ``` 49 | 50 | ## Managing Services 51 | 52 | Services are built separately but are linked together. If you already have some services built and want to add another: 53 | 54 | 1. You don't need to rebuild existing services 55 | 2. Just build the new service 56 | 3. 
Stop everything with `down` and start again with `up` 57 | 58 | For example, if you have App + Ollama and want to add Argilla: 59 | ```bash 60 | docker compose -f docker/argilla/compose.yml build # only build Argilla 61 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down 62 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 63 | ``` 64 | 65 | Similarly, if you have built all services but want to run only some of them: 66 | > **Important**: When running specific services, remember to comment out unused services in `.env` first 67 | 68 | ```bash 69 | # No need to build again, just start the services you need 70 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d # start only App + Ollama 71 | ``` 72 | 73 | ## Service URLs 74 | 75 | Once running, access the services at: 76 | - App: http://localhost:7860 77 | - Argilla: http://localhost:6900 (if enabled) 78 | - Ollama: http://localhost:11434 (if enabled) 79 | 80 | > Note: Services will be available after a few seconds while they initialize. Ollama models and Argilla datasets are persisted and available after restarts -------------------------------------------------------------------------------- /docker/argilla/compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | extends: 4 | file: docker-compose.yml 5 | service: app 6 | depends_on: 7 | argilla: 8 | condition: service_healthy 9 | required: false 10 | environment: 11 | - ARGILLA_API_URL=http://argilla:6900 12 | 13 | elasticsearch: 14 | image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0 15 | environment: 16 | - ES_JAVA_OPTS=-Xms512m -Xmx512m 17 | - node.name=elasticsearch 18 | - cluster.name=es-argilla-local 19 | - discovery.type=single-node 20 | - cluster.routing.allocation.disk.threshold_enabled=false 21 | - xpack.security.enabled=false 22 | volumes: 23 | - es_data:/usr/share/elasticsearch/data 24 | networks: 25 | - app-network 26 | ports: 27 | - "9200:9200" 28 | - "9300:9300" 29 | ulimits: 30 | memlock: 31 | soft: -1 32 | hard: -1 33 | nofile: 34 | soft: 65536 35 | hard: 65536 36 | healthcheck: 37 | test: ["CMD", "curl", "-f", "http://localhost:9200"] 38 | interval: 30s 39 | timeout: 10s 40 | retries: 3 41 | 42 | postgres: 43 | image: postgres:14 44 | environment: 45 | POSTGRES_USER: postgres 46 | POSTGRES_PASSWORD: postgres 47 | POSTGRES_DB: argilla 48 | networks: 49 | - app-network 50 | volumes: 51 | - postgres_data:/var/lib/postgresql/data 52 | 53 | redis: 54 | image: redis 55 | networks: 56 | - app-network 57 | 58 | argilla: 59 | image: argilla/argilla-server:latest 60 | ports: 61 | - "6900:6900" 62 | healthcheck: 63 | test: ["CMD", "curl", "-f", "http://localhost:6900/api/ready"] 64 | interval: 30s 65 | timeout: 10s 66 | retries: 3 67 | env_file: 68 | - .env 69 | environment: 70 | - ARGILLA_HOME_PATH=/var/lib/argilla 71 | - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200 72 | - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla 73 | - ARGILLA_REDIS_URL=redis://redis:6379/0 74 | - USERNAME=${ARGILLA_USERNAME} 75 | - PASSWORD=${ARGILLA_PASSWORD} 76 | - API_KEY=${ARGILLA_API_KEY} 77 | - WORKSPACE=default 78 | volumes: 79 | - argilla_data:/argilla 80 | networks: 81 | - app-network 82 | depends_on: 83 | elasticsearch: 84 | condition: service_healthy 85 | postgres: 86 | condition: service_started 87 | redis: 88 | condition: service_started 89 | 90 | worker: 91 | 
image: argilla/argilla-server:latest 92 | env_file: 93 | - .env 94 | environment: 95 | - ARGILLA_HOME_PATH=/var/lib/argilla 96 | - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200 97 | - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla 98 | - ARGILLA_REDIS_URL=redis://redis:6379/0 99 | - BACKGROUND_NUM_WORKERS=2 100 | - USERNAME=${ARGILLA_USERNAME} 101 | - PASSWORD=${ARGILLA_PASSWORD} 102 | - API_KEY=${ARGILLA_API_KEY} 103 | - WORKSPACE=default 104 | networks: 105 | - app-network 106 | depends_on: 107 | - postgres 108 | - elasticsearch 109 | - redis 110 | command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}' 111 | 112 | volumes: 113 | es_data: 114 | name: synthetic-data-es 115 | argilla_data: 116 | name: synthetic-data-argilla 117 | postgres_data: 118 | name: synthetic-data-postgres -------------------------------------------------------------------------------- /docker/ollama/compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | extends: 4 | file: docker-compose.yml 5 | service: app 6 | depends_on: 7 | ollama: 8 | condition: service_healthy 9 | required: true 10 | environment: 11 | - OLLAMA_BASE_URL=http://ollama:11434 12 | 13 | ollama: 14 | image: ollama/ollama:${OLLAMA_HARDWARE:-latest} 15 | ports: 16 | - "11434:11434" 17 | env_file: 18 | - .env 19 | environment: 20 | - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-} 21 | volumes: 22 | - ollama_data:/root/.ollama 23 | - ./docker/ollama/entrypoint.sh:/entrypoint.sh 24 | networks: 25 | - app-network 26 | deploy: 27 | resources: 28 | reservations: 29 | devices: 30 | - driver: nvidia 31 | count: all 32 | capabilities: [gpu] 33 | tty: true 34 | entrypoint: ["/usr/bin/bash", "/entrypoint.sh"] 35 | healthcheck: 36 | test: 37 | - "CMD-SHELL" 38 | - | 39 | test -f /tmp/ollama_ready && \ 40 | bash -c '/dev/null && ollama list | grep -q "$MODEL_NAME"; then 26 | echo "🟢 Model download complete!" 27 | touch /tmp/ollama_ready 28 | else 29 | echo "❌ Error downloading model ($MODEL_NAME)" 30 | fi 31 | fi 32 | fi 33 | 34 | # Wait for Ollama process to finish 35 | wait $pid -------------------------------------------------------------------------------- /examples/argilla-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL 12 | os.environ["HF_TOKEN"] = "hf_..." 13 | os.environ["ARGILLA_API_URL"] = ( 14 | "https://[your-owner-name]-[your_space_name].hf.space" # argilla base url 15 | ) 16 | os.environ["ARGILLA_API_KEY"] = "my_api_key" # argilla api key 17 | 18 | launch() 19 | -------------------------------------------------------------------------------- /examples/blog_private_synthetic_data_generation.md: -------------------------------------------------------------------------------- 1 | # Private Synthetic Data Generation Made Easy: Out-of-the-Box with Docker, Argilla & Ollama 2 | 3 | > "Empowering organizations with a turnkey solution for synthetic dataset creation in private environments." 4 | 5 | The increasing adoption of AI solutions across industries has created an unprecedented demand for high-quality training data. 
As organizations scale their AI initiatives, they face the dual challenge of generating substantial, domain-specific datasets while ensuring data privacy and security. Traditional approaches often involve compromises: either using public datasets that may not fully align with specific needs, or investing heavily in custom data generation infrastructure. 6 | 7 | The complexity of this challenge is amplified by regulatory requirements, resource constraints, and the need for specialized expertise. Organizations must navigate GDPR, CCPA, and industry-specific regulations while maintaining efficient data generation pipelines. This has created a pressing need for solutions that can operate entirely within private infrastructure while maintaining enterprise-grade capabilities. 8 | 9 | ## The Challenge 10 | 11 | The development of AI models requires extensive training data, yet organizations face significant obstacles in data generation and management. Privacy regulations and security requirements often prevent the use of public datasets or cloud-based generation services. Additionally, existing solutions typically demand complex infrastructure setups and significant technical expertise, increasing both implementation time and costs. 12 | 13 | Modern enterprises require a solution that addresses several critical aspects: 14 | 1. Data Privacy: Complete control over data generation and storage 15 | 2. Infrastructure Flexibility: Deployment options that fit existing systems 16 | 3. Quality Assurance: Tools for data validation and curation 17 | 4. Scalability: Ability to grow with increasing data needs 18 | 5. Cost Efficiency: Reduction in infrastructure and maintenance costs 19 | 20 | ## The Solution 21 | 22 | This out-of-the-box Synthetic Dataset Generator approach leverages the power of three technologies to create a seamless, private data generation pipeline. At its core is the [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator), a tool designed for dataset creation. [Ollama](https://ollama.ai/) ensures secure local LLM inference with [Distilabel](https://github.com/argilla-io/distilabel) integration, while [Argilla's](https://argilla.io/) data curation capabilities complete the workflow, all operating within your secure infrastructure. 23 | 24 | This architecture delivers key technical advantages: 25 | - Full data sovereignty with containerized local deployment 26 | - End-to-end pipeline from generation to validation 27 | - Modular design for system integration 28 | 29 | Here's how it all fits together: 30 | 31 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png) 32 | 33 | Let's explore how these components work together in a practical workflow. 34 | 35 | ## 1. Installation & Setup 36 | 37 | 38 | 39 | ### 1.1 Clone Repository 40 | ```bash 41 | git clone https://github.com/argilla-io/synthetic-data-generator 42 | cd synthetic-data-generator 43 | ``` 44 | 45 | ### 1.2 Environment Setup 46 | ```bash 47 | # Copy environment template 48 | cp docker/.env.docker.template .env 49 | 50 | # Model configuration in .env (if using Ollama) 51 | MODEL="deepseek-r1:1.5b" # Must match Ollama model name 52 | ``` 53 | 54 | ### 1.3 Build & Deploy Services 55 | > Pro tip: Even if you're planning to use just one component initially, we recommend building all services to enable future functionality without rebuilding. 
For detailed deployment options, check the [Docker documentation](https://github.com/argilla-io/synthetic-data-generator/blob/main/docker/README.md). 56 | 57 | > Note: Ollama runs on CPU/GPU for Linux/Windows in Docker. For macOS, only CPU is supported in Docker - for GPU support, install Ollama separately ([details](https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image)). 58 | 59 | ```bash 60 | # Build all services 61 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build 62 | # Start all services 63 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 64 | ``` 65 | 66 | To view logs, either: 67 | - Use Docker Desktop's interface 68 | - Remove the `-d` flag when running the above command 69 | - Or execute the following for specific service logs: 70 | ```bash 71 | # Core App logs 72 | docker compose logs -f app 73 | # Ollama logs 74 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml logs -f ollama 75 | # Argilla logs 76 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml logs -f argilla 77 | ``` 78 | 79 | ## 2. Dataset Generation 80 | 81 | The tool currently supports **Text Classification**, **Chat**, and **RAG** datasets. These tasks will determine the type of dataset you will generate: classification requires categories, chat data requires a conversation format, and RAG requires question-answer pairs with relevant context, offering options for both retrieval and reranking data generation to enhance different aspects of information retrieval systems. 82 | 83 | For a detailed overview of the generation process, check out the [introduction to the Synthetic Data Generator](https://huggingface.co/blog/synthetic-data-generator). 84 | 85 | 86 | ### 2.1. **Dataset Description** 87 | 88 | Let's walk through creating a **RAG dataset**. 89 | ```text 90 | A dataset to retrieve information from information security policies 91 | ``` 92 | 93 | System initializes and processes the prompt: 94 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/sxH8JChF-HnGMOilymYpA.png) 95 | 96 | 97 | ### 2.2. **Task Configuration & Sample Generation** 98 | System analyzes and generates the system prompt and optimal parameters automatically. Then, samples are generated for validation (modify system prompt or parameters manually if needed, then click save to generate sample data): 99 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/mYVlGNnz6YNrPJutxmBtR.png) 100 | 101 | 102 | ### 2.3. **Full Dataset Generation** 103 | After validating the sample data quality, proceed with full dataset generation. 
Configure the following parameters: 104 | 105 | - **Repository Owner**: Your Hugging Face username for dataset hosting 106 | - **Dataset Name**: A descriptive name following standard naming conventions 107 | - **Number of Examples**: Define dataset size (recommended: 100-1000 for initial deployments) 108 | - **Temperature**: Controls generation creativity (default 0.7 balances coherence and diversity) 109 | - **Privacy Settings**: Optional dataset privacy configuration for Hugging Face Hub 110 | 111 | The temperature parameter significantly impacts output quality: 112 | - 0.5-0.7: Optimal for technical documentation and factual content 113 | - 0.7-0.8: Balanced for general purpose datasets 114 | - 0.8-1.0: Increased creativity, suitable for conversational data 115 | 116 | 117 | The system initiates the generation pipeline, leveraging Distilabel for structured output: 118 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/PWNT_bLHwFjeoFX7AhA-z.png) 119 | 120 | 121 | Upon completion, the dataset is pushed to Hugging Face Hub: 122 | ![Generation Complete](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/ohd4S-RyNI406uLPf4bnZ.png) 123 | 124 | Access your generated dataset through the Hugging Face Hub interface: 125 | 126 | 132 | 133 | 134 | 135 | ## 3. Data Curation with Argilla 136 | 137 | The integration with Argilla provides enterprise-grade dataset curation capabilities through a comprehensive review system. This phase is crucial for ensuring data quality and maintaining high standards in your training datasets. 138 | 139 | ### Environment Configuration 140 | Before accessing Argilla's features, ensure proper configuration in your `.env` file. 141 | 142 | 143 | ### Curation Workflow 144 | 145 | 1. **Dataset Integration** 146 | Upon generation completion, the dataset is automatically ingested into Argilla. The system maintains data integrity and version control throughout the process. All datasets and progress persist across Docker restarts unless you explicitly remove the Argilla services and volumes. 147 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/0gF6iLywhKafEo3z94cd-.png) 148 | 149 | 150 | 2. **Quality Assurance Process** 151 | Argilla's interface provides comprehensive tools for dataset validation: 152 | - Semantic analysis of generated content 153 | - Consistency checking across entries 154 | - Metadata validation and enrichment 155 | - Collaborative review capabilities 156 | 157 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/h9kJ-4lA0LcFC8g6g_vwF.png) 158 | 159 | 160 | 161 | 3. **Dataset Publication** 162 | After thorough review, export your curated dataset to Hugging Face Hub: 163 | 164 | > Note: Consider using a new repository name to preserve both raw and curated datasets separately. 165 | 166 | - Configure repository settings 167 | - Set visibility and access controls 168 | - Add dataset cards and documentation 169 | 170 | ![Export Configuration](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/CPwtVr_Jw6mndNCOU2a5T.png) 171 | 172 | 173 | The curated dataset maintains full provenance tracking and quality metrics: 174 | 180 | 181 | # 🎉 You're Done! 182 | Congratulations! You've successfully completed the end-to-end dataset generation and curation process. Your curated dataset is now ready for model training. 
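
As a quick sanity check before training, you can pull the published dataset back from the Hub and inspect a few records. This is a small sketch using the `datasets` library; the repository id is a placeholder for whatever name you chose during the export step.

```python
from datasets import load_dataset

# Placeholder repo id: use the name you picked when pushing the curated dataset
dataset = load_dataset("your-username/infosec-rag-curated", split="train")

print(dataset)     # row count and column names
print(dataset[0])  # inspect the first record
```
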
183 | 184 | ## Experience the Solution 185 | 186 | For a hands-on preview of the Synthetic Dataset Generator's capabilities, explore the hosted space. This allows you to evaluate the interface and functionality before deploying your own instance: 187 | 188 | 196 | 197 | Create your own deployment by duplicating this Space. 198 | 199 | ## What's Next? 200 | 201 | After successfully generating your first dataset, several advanced implementation paths are available: 202 | 203 | Extend your dataset generation capabilities: 204 | - [Fine-tune models on synthetic data](https://huggingface.co/blog/davidberenstein1957/fine-tune-a-smollm-on-synthetic-data-of-llm) for domain-specific tasks 205 | - [Create specialized reasoning datasets](https://huggingface.co/blog/sdiazlor/fine-tune-deepseek-with-a-synthetic-reasoning-data) for advanced model training 206 | 207 | ## Conclusion 208 | 209 | The Synthetic Dataset Generator represents a significant advancement in private data generation technology, addressing the growing need for high-quality training data while maintaining security and control. By leveraging containerized architecture and local LLM inference, organizations can now generate custom datasets without compromising on data privacy or quality. 210 | 211 | The solution's modular design enables seamless integration with existing ML pipelines while providing enterprise-grade features like persistent storage, comprehensive monitoring, and scalable infrastructure. Through collaborative validation workflows and structured quality control processes, teams can efficiently create and curate datasets tailored to their specific needs. 212 | 213 | This combination of security, efficiency, and flexibility makes the Synthetic Dataset Generator an essential tool for organizations looking to accelerate their AI development while maintaining complete control over their data generation pipeline. 214 | 215 | ## References & Documentation 216 | 217 | 218 | - [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator): Open-source tool for dataset generation using natural language 219 | - [Distilabel Framework](https://github.com/argilla-io/distilabel): Advanced dataset generation framework 220 | - [Docker Best Practices](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/): Container optimization guidelines 221 | - [Argilla Documentation](https://docs.argilla.io): Data curation platform documentation 222 | - [Ollama Integration](https://github.com/jmorganca/ollama): Local LLM deployment guide -------------------------------------------------------------------------------- /examples/fine-tune-modernbert-classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tune ModernBERT for text classification using synthetic data\n", 8 | "\n", 9 | "LLMs are great general purpose models, but they are not always the best choice for a specific task. Therefore, smaller and more specialized models are important for sustainable, efficient, and cheaper AI.\n", 10 | "A lack of domain sepcific datasets is a common problem for smaller and more specialized models. This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. 
We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n", 11 | "\n", 12 | "In this example, we will fine-tune a ModernBERT model on a synthetic dataset generated from the synthetic-data-generator. This demonstrates the effectiveness of synthetic data and the novel ModernBERT model, which is a new and improved version of BERT models, with an 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n", 13 | "\n", 14 | "## Install the dependencies" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Install Pytorch & other libraries\n", 24 | "%pip install \"torch==2.5.0\" \"torchvision==0.20.0\" \n", 25 | "%pip install \"setuptools<71.0.0\" scikit-learn \n", 26 | " \n", 27 | "# Install Hugging Face libraries\n", 28 | "%pip install --upgrade \\\n", 29 | " \"datasets==3.1.0\" \\\n", 30 | " \"accelerate==1.2.1\" \\\n", 31 | " \"hf-transfer==0.1.8\"\n", 32 | " \n", 33 | "# ModernBERT is not yet available in an official release, so we need to install it from github\n", 34 | "%pip install \"git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1\" --upgrade" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## The problem\n", 42 | "\n", 43 | "The [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier), is a model that can classify the domain of a text which can help with curating data. This model is cool but is based on the Deberta V3 Base, which is an outdated architecture that requires custom code to run, has a context length of 512 tokens, and is not as fast as the ModernBERT model. The labels for the model are:\n", 44 | "\n", 45 | "```\n", 46 | "'Adult', 'Arts_and_Entertainment', 'Autos_and_Vehicles', 'Beauty_and_Fitness', 'Books_and_Literature', 'Business_and_Industrial', 'Computers_and_Electronics', 'Finance', 'Food_and_Drink', 'Games', 'Health', 'Hobbies_and_Leisure', 'Home_and_Garden', 'Internet_and_Telecom', 'Jobs_and_Education', 'Law_and_Government', 'News', 'Online_Communities', 'People_and_Society', 'Pets_and_Animals', 'Real_Estate', 'Science', 'Sensitive_Subjects', 'Shopping', 'Sports', 'Travel_and_Transportation'\n", 47 | "```\n", 48 | "\n", 49 | "The data on which the model was trained is not available, so we cannot use it for our purposes. We can however generate a synthetic data to solve this problem." 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "vscode": { 56 | "languageId": "plaintext" 57 | } 58 | }, 59 | "source": [ 60 | "## Let's generate some data\n", 61 | "\n", 62 | "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps 1) we come up with a dataset description, 2) iterate on the task configuration, and 3) generate and push the data to Hugging Face. A more detailed flow can be found in [this blogpost](https://huggingface.co/blog/synthetic-data-generator). \n", 63 | "\n", 64 | "\n", 70 | "\n", 71 | "For this example, we will generate 1000 examples with a temperature of 1. 
After some iteration, we come up with the following system prompt:\n", 72 | "\n", 73 | "```\n", 74 | "Long texts (at least 2000 words) from various media sources like Wikipedia, Reddit, Common Crawl, websites, commercials, online forums, books, newspapers and folders that cover multiple topics. Classify the text based on its main subject matter into one of the following categories\n", 75 | "```\n", 76 | "\n", 77 | "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few minutes and we end up with a dataset with 1000 examples. The labels are nicely distributed across the categories, varied in length, and the texts look diverse and interesting.\n", 78 | "\n", 79 | "\n", 85 | "\n", 86 | "The data is pushed to Argilla too, so we recommend inspecting and validating the labels before finetuning the model." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Finetuning the ModernBERT model\n", 94 | "\n", 95 | "We mostly rely on the blog from [Philipp Schmid](https://www.philschmid.de/fine-tune-modern-bert-in-2025). I will use basic consumer hardware, my Apple M1 Max with 32GB of shared memory. We will use the `datasets` library to load the data and the `transformers` library to finetune the model." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 1, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stderr", 105 | "output_type": "stream", 106 | "text": [ 107 | "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 108 | " from .autonotebook import tqdm as notebook_tqdm\n" 109 | ] 110 | }, 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "{'text': 'Recently, there has been an increase in property values within the suburban areas of several cities due to improvements in infrastructure and lifestyle amenities such as parks, retail stores, and educational institutions nearby. Additionally, new housing developments are emerging, catering to different family needs with varying sizes and price ranges. These changes have influenced investment decisions for many looking to buy or sell properties.',\n", 115 | " 'label': 14}" 116 | ] 117 | }, 118 | "execution_count": 1, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "from datasets import load_dataset\n", 125 | "from datasets.arrow_dataset import Dataset\n", 126 | "from datasets.dataset_dict import DatasetDict, IterableDatasetDict\n", 127 | "from datasets.iterable_dataset import IterableDataset\n", 128 | " \n", 129 | "# Dataset id from huggingface.co/dataset\n", 130 | "dataset_id = \"argilla/synthetic-domain-text-classification\"\n", 131 | " \n", 132 | "# Load raw dataset\n", 133 | "train_dataset = load_dataset(dataset_id, split='train')\n", 134 | "\n", 135 | "split_dataset = train_dataset.train_test_split(test_size=0.1)\n", 136 | "split_dataset['train'][0]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "First, we need to tokenize the data. We will use the `AutoTokenizer` class from the `transformers` library to load the tokenizer." 
144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 2, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stderr", 153 | "output_type": "stream", 154 | "text": [ 155 | "Map: 100%|██████████| 900/900 [00:00<00:00, 4787.61 examples/s]\n", 156 | "Map: 100%|██████████| 100/100 [00:00<00:00, 4163.70 examples/s]\n" 157 | ] 158 | }, 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "dict_keys(['labels', 'input_ids', 'attention_mask'])" 163 | ] 164 | }, 165 | "execution_count": 2, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "from transformers import AutoTokenizer\n", 172 | " \n", 173 | "# Model id to load the tokenizer\n", 174 | "model_id = \"answerdotai/ModernBERT-base\"\n", 175 | "\n", 176 | "# Load Tokenizer\n", 177 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 178 | " \n", 179 | "# Tokenize helper function\n", 180 | "def tokenize(batch):\n", 181 | " return tokenizer(batch['text'], padding=True, truncation=True, return_tensors=\"pt\")\n", 182 | " \n", 183 | "# Tokenize dataset\n", 184 | "if \"label\" in split_dataset[\"train\"].features.keys():\n", 185 | " split_dataset = split_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n", 186 | "tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n", 187 | " \n", 188 | "tokenized_dataset[\"train\"].features.keys()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "Now, we need to prepare the model. We will use the `AutoModelForSequenceClassification` class from the `transformers` library to load the model." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 3, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stderr", 205 | "output_type": "stream", 206 | "text": [ 207 | "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", 208 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "from transformers import AutoModelForSequenceClassification\n", 214 | " \n", 215 | "# Model id to load the tokenizer\n", 216 | "model_id = \"answerdotai/ModernBERT-base\"\n", 217 | " \n", 218 | "# Prepare model labels - useful for inference\n", 219 | "labels = tokenized_dataset[\"train\"].features[\"labels\"].names\n", 220 | "num_labels = len(labels)\n", 221 | "label2id, id2label = dict(), dict()\n", 222 | "for i, label in enumerate(labels):\n", 223 | " label2id[label] = str(i)\n", 224 | " id2label[str(i)] = label\n", 225 | " \n", 226 | "# Download the model from huggingface.co/models\n", 227 | "model = AutoModelForSequenceClassification.from_pretrained(\n", 228 | " model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,\n", 229 | ")" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "We will use a simple F1 score as the evaluation metric." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 4, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "import numpy as np\n", 246 | "from sklearn.metrics import f1_score\n", 247 | " \n", 248 | "# Metric helper method\n", 249 | "def compute_metrics(eval_pred):\n", 250 | " predictions, labels = eval_pred\n", 251 | " predictions = np.argmax(predictions, axis=1)\n", 252 | " score = f1_score(\n", 253 | " labels, predictions, labels=labels, pos_label=1, average=\"weighted\"\n", 254 | " )\n", 255 | " return {\"f1\": float(score) if score == 1 else score}" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Finally, we need to define the training arguments. We will use the `TrainingArguments` class from the `transformers` library to define the training arguments." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 6, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/transformers/training_args.py:2241: UserWarning: `use_mps_device` is deprecated and will be removed in version 5.0 of 🤗 Transformers. `mps` device will be used by default if available similar to the way `cuda` device is used.Therefore, no action from user is required. \n", 275 | " warnings.warn(\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "from huggingface_hub import HfFolder\n", 281 | "from transformers import Trainer, TrainingArguments\n", 282 | " \n", 283 | "# Define training args\n", 284 | "training_args = TrainingArguments(\n", 285 | " output_dir= \"ModernBERT-domain-classifier\",\n", 286 | " per_device_train_batch_size=32,\n", 287 | " per_device_eval_batch_size=16,\n", 288 | " learning_rate=5e-5,\n", 289 | "\t\tnum_train_epochs=5,\n", 290 | " bf16=True, # bfloat16 training \n", 291 | " optim=\"adamw_torch_fused\", # improved optimizer \n", 292 | " # logging & evaluation strategies\n", 293 | " logging_strategy=\"steps\",\n", 294 | " logging_steps=100,\n", 295 | " eval_strategy=\"epoch\",\n", 296 | " save_strategy=\"epoch\",\n", 297 | " save_total_limit=2,\n", 298 | " load_best_model_at_end=True,\n", 299 | " use_mps_device=True,\n", 300 | " metric_for_best_model=\"f1\",\n", 301 | " # push to hub parameters\n", 302 | " push_to_hub=True,\n", 303 | " hub_strategy=\"every_save\",\n", 304 | " hub_token=HfFolder.get_token(),\n", 305 | ")\n", 306 | " \n", 307 | "# Create a Trainer instance\n", 308 | "trainer = Trainer(\n", 309 | " model=model,\n", 310 | " args=training_args,\n", 311 | " train_dataset=tokenized_dataset[\"train\"],\n", 312 | " eval_dataset=tokenized_dataset[\"test\"],\n", 313 | " compute_metrics=compute_metrics,\n", 314 | ")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 7, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stderr", 324 | "output_type": "stream", 325 | "text": [ 326 | " \n", 327 | " 20%|██ | 29/145 [11:32<33:16, 17.21s/it]" 328 | ] 329 | }, 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "{'eval_loss': 0.729780912399292, 'eval_f1': 0.7743598318036522, 'eval_runtime': 3.5337, 'eval_samples_per_second': 28.299, 'eval_steps_per_second': 1.981, 'epoch': 1.0}\n" 335 | ] 336 | }, 337 | { 338 | "name": "stderr", 339 | "output_type": "stream", 340 | "text": [ 341 | " \n", 342 | " 40%|████ | 58/145 [22:57<25:56, 
17.89s/it]" 343 | ] 344 | }, 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "{'eval_loss': 0.4369044005870819, 'eval_f1': 0.8310764765820946, 'eval_runtime': 3.3266, 'eval_samples_per_second': 30.061, 'eval_steps_per_second': 2.104, 'epoch': 2.0}\n" 350 | ] 351 | }, 352 | { 353 | "name": "stderr", 354 | "output_type": "stream", 355 | "text": [ 356 | " \n", 357 | " 60%|██████ | 87/145 [35:16<17:06, 17.70s/it]" 358 | ] 359 | }, 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "{'eval_loss': 0.6091340184211731, 'eval_f1': 0.8399274488570763, 'eval_runtime': 3.2772, 'eval_samples_per_second': 30.514, 'eval_steps_per_second': 2.136, 'epoch': 3.0}\n" 365 | ] 366 | }, 367 | { 368 | "name": "stderr", 369 | "output_type": "stream", 370 | "text": [ 371 | " 69%|██████▉ | 100/145 [41:03<18:02, 24.06s/it]" 372 | ] 373 | }, 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "{'loss': 0.7663, 'grad_norm': 7.232136249542236, 'learning_rate': 1.5517241379310346e-05, 'epoch': 3.45}\n" 379 | ] 380 | }, 381 | { 382 | "name": "stderr", 383 | "output_type": "stream", 384 | "text": [ 385 | " \n", 386 | " 80%|████████ | 116/145 [47:23<08:50, 18.30s/it]" 387 | ] 388 | }, 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "{'eval_loss': 0.43516409397125244, 'eval_f1': 0.8797674004703547, 'eval_runtime': 3.2975, 'eval_samples_per_second': 30.326, 'eval_steps_per_second': 2.123, 'epoch': 4.0}\n" 394 | ] 395 | }, 396 | { 397 | "name": "stderr", 398 | "output_type": "stream", 399 | "text": [ 400 | " \n", 401 | "100%|██████████| 145/145 [1:00:40<00:00, 19.18s/it]" 402 | ] 403 | }, 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "{'eval_loss': 0.39272159337997437, 'eval_f1': 0.8914389523348718, 'eval_runtime': 3.5564, 'eval_samples_per_second': 28.118, 'eval_steps_per_second': 1.968, 'epoch': 5.0}\n" 409 | ] 410 | }, 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "100%|██████████| 145/145 [1:00:42<00:00, 25.12s/it]\n" 416 | ] 417 | }, 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "{'train_runtime': 3642.7783, 'train_samples_per_second': 1.235, 'train_steps_per_second': 0.04, 'train_loss': 0.535627057634551, 'epoch': 5.0}\n" 423 | ] 424 | }, 425 | { 426 | "name": "stderr", 427 | "output_type": "stream", 428 | "text": [ 429 | "events.out.tfevents.1735555878.Davids-MacBook-Pro.local.23438.0: 100%|██████████| 9.32k/9.32k [00:00<00:00, 55.0kB/s]\n" 430 | ] 431 | }, 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "CommitInfo(commit_url='https://huggingface.co/davidberenstein1957/domain-classifier/commit/915f4b03c230cc8f376f13729728f14347400041', commit_message='End of training', commit_description='', oid='915f4b03c230cc8f376f13729728f14347400041', pr_url=None, repo_url=RepoUrl('https://huggingface.co/davidberenstein1957/domain-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='davidberenstein1957/domain-classifier'), pr_revision=None, pr_num=None)" 436 | ] 437 | }, 438 | "execution_count": 7, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "trainer.train()\n", 445 | "# Save processor and create model card\n", 446 | "tokenizer.save_pretrained(\"ModernBERT-domain-classifier\")\n", 447 | "trainer.create_model_card()\n", 448 | "trainer.push_to_hub()" 449 | ] 450 | }, 451 | { 452 | "cell_type": 
"markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "We get an F1 score of 0.89 on the test set, which is pretty good for the small dataset and time spent." 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "## Run inference\n", 463 | "\n", 464 | "We can now load the model and run inference." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 11, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stderr", 474 | "output_type": "stream", 475 | "text": [ 476 | "Device set to use mps:0\n" 477 | ] 478 | }, 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "[{'label': 'health', 'score': 0.6779336333274841}]" 483 | ] 484 | }, 485 | "execution_count": 11, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "from transformers import pipeline\n", 492 | " \n", 493 | "# load model from huggingface.co/models using our repository id\n", 494 | "classifier = pipeline(\n", 495 | " task=\"text-classification\", \n", 496 | " model=\"argilla/ModernBERT-domain-classifier\", \n", 497 | " device=0,\n", 498 | ")\n", 499 | " \n", 500 | "sample = \"Smoking is bad for your health.\"\n", 501 | " \n", 502 | "classifier(sample)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "## Conclusion\n", 510 | "\n", 511 | "We have shown that we can generate a synthetic dataset from an LLM and finetune a ModernBERT model on it. This the effectiveness of synthetic data and the novel ModernBERT model, which is new and improved version of BERT models, with 8192 token context length, significantly better downstream performance, and much faster processing speeds. \n", 512 | "\n", 513 | "Pretty cool for 20 minutes of generating data, and an hour of fine-tuning on consumer hardware." 514 | ] 515 | } 516 | ], 517 | "metadata": { 518 | "kernelspec": { 519 | "display_name": ".venv", 520 | "language": "python", 521 | "name": "python3" 522 | }, 523 | "language_info": { 524 | "codemirror_mode": { 525 | "name": "ipython", 526 | "version": 3 527 | }, 528 | "file_extension": ".py", 529 | "mimetype": "text/x-python", 530 | "name": "python", 531 | "nbconvert_exporter": "python", 532 | "pygments_lexer": "ipython3", 533 | "version": "3.11.11" 534 | } 535 | }, 536 | "nbformat": 4, 537 | "nbformat_minor": 2 538 | } 539 | -------------------------------------------------------------------------------- /examples/fine-tune-smollm2-on-synthetic-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tune a SmolLM on domain-specific synthetic data from a LLM\n", 8 | "\n", 9 | "Yes, smoll models can beat GPT4-like models on domain-specific tasks but don't expect miracles. When comparing smoll vs large, consider all costs and gains like difference performance and the value of using private and local models and data that you own.\n", 10 | "\n", 11 | "The [Hugging Face SmolLM models](https://github.com/huggingface/smollm) are blazingly fast and remarkably powerful. With its 135M, 360M and 1.7B parameter models, it is a great choice for a small and fast model. The great thing about SmolLM is that it is a general-purpose model that can be fine-tuned on domain-specific data.\n", 12 | "\n", 13 | "A lack of domain-specific datasets is a common problem for smaller and more specialized models. 
This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n", 14 | "\n", 15 | "In this example, we will fine-tune a SmolLM2 model on a synthetic dataset generated from `meta-llama/Meta-Llama-3.1-8B-Instruct` with the `synthetic-data-generator`.\n", 16 | "\n", 17 | "## Install the dependencies\n", 18 | "\n", 19 | "We will install some basic dependencies for the fine-tuning with `trl` but we will use the Synthetic Data Generator UI to generate the synthetic dataset." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!pip install transformers datasets trl torch" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## The problem\n", 36 | "\n", 37 | "Reasoning data has proven to make a fundamental difference in the performance of generative models. Reasoning is amazing, but it also means the model becomes more \"chatty\" during the token generation process, causing the model to become slower and more expensive. For this reason, we want to create a model that can reason without being too chatty. Therefore, we will generate a concise reasoning dataset and fine-tune a SmolLM2 model on it.\n", 38 | "\n", 39 | "## Let's generate some data\n", 40 | "\n", 41 | "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps: 1) we come up with a dataset description, 2) iterate on the task configuration, and 3) generate and push the data to Hugging Face. A more detailed flow can be found in [this blog post](https://huggingface.co/blog/synthetic-data-generator). \n", 42 | "\n", 43 | "\n", 49 | "\n", 50 | "For this example, we will generate 5000 chat data examples for a single turn in the conversation. All examples have been generated with a temperature of 1. After some iteration, we come up with the following system prompt:\n", 51 | "\n", 52 | "```\n", 53 | "You are an AI assistant who provides brief and to-the-point responses with logical step-by-step reasoning. Your purpose is to offer straightforward explanations and answers so that you can get to the heart of the issue. Respond with extremely concise, direct justifications and evidence-based conclusions. User questions are direct and concise.\n", 54 | "```\n", 55 | "\n", 56 | "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few hours and we end up with a dataset with 5000 examples, which is the maximum number of examples we can generate in a single run. You can scale this by deploying a private instance of the Synthetic Data Generator. \n", 57 | "\n", 58 | "\n", 64 | "\n", 65 | "The data is pushed to Argilla too, so we recommend inspecting and validating the data before finetuning the actual model. We applied some basic filters and transformations to the data to make it more suitable for fine-tuning.\n", 66 | "\n", 67 | "## Fine-tune the model\n", 68 | "\n", 69 | "We will use TRL to fine-tune the model. 
It is part of the Hugging Face ecosystem and works seamlessly on top of datasets generated by the synthetic data generator without needing to do any data transformations.\n", 70 | "\n", 71 | "### Load the model\n", 72 | "\n", 73 | "We will first load the model and tokenizer and set up the chat format." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Import necessary libraries\n", 83 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 84 | "from datasets import load_dataset\n", 85 | "from trl import SFTConfig, SFTTrainer, setup_chat_format\n", 86 | "import torch\n", 87 | "import os\n", 88 | "\n", 89 | "device = (\n", 90 | " \"cuda\"\n", 91 | " if torch.cuda.is_available()\n", 92 | " else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n", 93 | ")\n", 94 | "\n", 95 | "# Load the model and tokenizer\n", 96 | "model_name = \"HuggingFaceTB/SmolLM2-360M\"\n", 97 | "model = AutoModelForCausalLM.from_pretrained(\n", 98 | " pretrained_model_name_or_path=model_name\n", 99 | ")\n", 100 | "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)\n", 101 | "\n", 102 | "# Set up the chat format\n", 103 | "model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "### Test the base model\n", 111 | "\n", 112 | "We will first test the base model to see how it performs on the task. During this step we will also generate a prompt for the model to respond to, to see how it performs on the task." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 2, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "Device set to use mps:0\n" 125 | ] 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "[{'generated_text': 'What is the primary function of mitochondria within a cell?\\n\\nMitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe mitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe'}]" 131 | ] 132 | }, 133 | "execution_count": 2, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "from transformers import pipeline\n", 140 | "\n", 141 | "prompt = \"What is the primary function of mitochondria within a cell?\"\n", 142 | "\n", 143 | "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=device)\n", 144 | "pipe(prompt, max_new_tokens=100)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Load the dataset\n", 152 | "\n", 153 | "For fine-tuning, we need to load the dataset and tokenize it. We will use the `synthetic-concise-reasoning-sft-filtered` dataset that we generated in the previous step." 
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 2, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stderr", 163 | "output_type": "stream", 164 | "text": [ 165 | "Map: 100%|██████████| 4133/4133 [00:00<00:00, 18478.53 examples/s]\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "from datasets import load_dataset\n", 171 | "\n", 172 | "ds = load_dataset(\"argilla/synthetic-concise-reasoning-sft-filtered\")\n", 173 | "def tokenize_function(examples):\n", 174 | " examples[\"text\"] = tokenizer.apply_chat_template([{\"role\": \"user\", \"content\": examples[\"prompt\"].strip()}, {\"role\": \"assistant\", \"content\": examples[\"completion\"].strip()}], tokenize=False)\n", 175 | " return examples\n", 176 | "ds = ds.map(tokenize_function)\n", 177 | "ds = ds.shuffle()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Fine-tune the model\n", 185 | "\n", 186 | "We will now fine-tune the model. We will use the `SFTTrainer` from the `trl` library to fine-tune the model. We will use a batch size of 4 and a learning rate of 5e-5. We will also use the `use_mps_device` flag to use the MPS device if available." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "os.environ[\"PYTORCH_MPS_HIGH_WATERMARK_RATIO\"] = \"0.0\"\n", 196 | "\n", 197 | "# Configure the SFTTrainer\n", 198 | "sft_config = SFTConfig(\n", 199 | " output_dir=\"./sft_output\",\n", 200 | " num_train_epochs=1,\n", 201 | " per_device_train_batch_size=4, # Set according to your GPU memory capacity\n", 202 | " learning_rate=5e-5, # Common starting point for fine-tuning\n", 203 | " logging_steps=100, # Frequency of logging training metrics\n", 204 | " use_mps_device= True if device == \"mps\" else False,\n", 205 | " hub_model_id=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\", # Set a unique name for your model\n", 206 | " push_to_hub=True,\n", 207 | ")\n", 208 | "\n", 209 | "# Initialize the SFTTrainer\n", 210 | "trainer = SFTTrainer(\n", 211 | " model=model,\n", 212 | " args=sft_config,\n", 213 | " train_dataset=ds[\"train\"],\n", 214 | " tokenizer=tokenizer,\n", 215 | ")\n", 216 | "trainer.train()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "```\n", 224 | "# {'loss': 1.4498, 'grad_norm': 2.3919131755828857, 'learning_rate': 4e-05, 'epoch': 0.1}\n", 225 | "# {'loss': 1.362, 'grad_norm': 1.6650595664978027, 'learning_rate': 3e-05, 'epoch': 0.19}\n", 226 | "# {'loss': 1.3778, 'grad_norm': 1.4778285026550293, 'learning_rate': 2e-05, 'epoch': 0.29}\n", 227 | "# {'loss': 1.3735, 'grad_norm': 2.1424977779388428, 'learning_rate': 1e-05, 'epoch': 0.39}\n", 228 | "# {'loss': 1.3512, 'grad_norm': 2.3498542308807373, 'learning_rate': 0.0, 'epoch': 0.48}\n", 229 | "# {'train_runtime': 1911.514, 'train_samples_per_second': 1.046, 'train_steps_per_second': 0.262, 'train_loss': 1.3828572998046875, 'epoch': 0.48}\n", 230 | "```\n", 231 | "\n", 232 | "For the example, we did not use a specific validation set but we can see the loss is decreasing, so we assume the model is generalsing well to the training data. 
To get a better understanding of the model's performance, let's test it again with the same prompt.\n", 233 | "\n", 234 | "### Run inference\n", 235 | "\n", 236 | "We can now run inference with [the fine-tuned model](https://huggingface.co/argilla/SmolLM2-360M-synthetic-concise-reasoning/blob/main/README.md)." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 12, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stderr", 246 | "output_type": "stream", 247 | "text": [ 248 | "Device set to use mps\n" 249 | ] 250 | }, 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "'The primary function of mitochondria is to generate energy for the cell. They are organelles found in eukaryotic cells that convert nutrients into ATP (adenosine triphosphate), which is the primary source of energy for cellular processes.\\nMitochondria are responsible for:\\n\\nEnergy production: Mitochondria produce ATP through a process called oxidative phosphorylation, which involves the transfer of electrons from food molecules to oxygen.\\nEnergy storage: Mitochondria store energy in the form of adenosine triphosphate (ATP), which is used by the cell for various cellular processes.\\nCellular respiration: Mitochondria also participate in cellular respiration, a'" 255 | ] 256 | }, 257 | "execution_count": 12, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "prompt = \"What is the primary function of mitochondria within a cell?\"\n", 264 | "\n", 265 | "generator = pipeline(\n", 266 | " \"text-generation\",\n", 267 | " model=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\",\n", 268 | " device=\"mps\",\n", 269 | ")\n", 270 | "generator(\n", 271 | " [{\"role\": \"user\", \"content\": prompt}], max_new_tokens=128, return_full_text=False\n", 272 | ")[0][\"generated_text\"]" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "## Conclusion\n", 280 | "\n", 281 | "We have fine-tuned a SmolLM2 model on a synthetic dataset generated from a large language model. We have seen that the model performs well on the task and that the synthetic data is a great way to generate diverse and representative data for supervised fine-tuning. 
\n", 282 | "\n", 283 | "In practice, you would likely want to spend more time on the data quality and fine-tuning the model but the flow shows the Synthetic Data Generator is a great tool to generate synthetic data for any task.\n", 284 | "\n", 285 | "Overall, I think it is pretty cool for a couple of hours of generation and fine-tuning on consumer hardware.\n" 286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": ".venv", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.11.9" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /examples/hf-dedicated-or-tgi-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 12 | os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/" # dedicated endpoint/TGI 13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template 14 | os.environ["TOKENIZER_ID"] = ( 15 | "meta-llama/Llama-3.1-8B-Instruct" # tokenizer for model hosted on endpoint 16 | ) 17 | os.environ["MODEL"] = None # model is linked to endpoint 18 | 19 | launch() 20 | -------------------------------------------------------------------------------- /examples/hf-serverless-deployment-deepseek.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 12 | os.environ["MODEL"] = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # use model for instructions 13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|begin▁of▁sentence|>User: " # use the custom template for the model 14 | 15 | 16 | launch() 17 | -------------------------------------------------------------------------------- /examples/hf-serverless-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." 
# push the data to huggingface 12 | os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for generation 13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model 14 | 15 | launch() 16 | -------------------------------------------------------------------------------- /examples/hf-serverless-different-model-for-completion.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 12 | os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for instruction generation 13 | os.environ["MODEL_COMPLETION"] = "meta-llama/Llama-3.1-70B-Instruct" # use model for completion generation 14 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model 15 | 16 | launch() 17 | -------------------------------------------------------------------------------- /examples/ollama-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | # ollama serve 8 | # ollama run qwen2.5:32b-instruct-q5_K_S 9 | import os 10 | 11 | from synthetic_dataset_generator import launch 12 | 13 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 14 | os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/" # ollama base url 15 | os.environ["MODEL"] = "qwen2.5:32b-instruct-q5_K_S" # model id 16 | os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-32B-Instruct" # tokenizer id 17 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2" 18 | os.environ["MAX_NUM_ROWS"] = "10000" 19 | os.environ["DEFAULT_BATCH_SIZE"] = "2" 20 | os.environ["MAX_NUM_TOKENS"] = "1024" 21 | 22 | launch() 23 | -------------------------------------------------------------------------------- /examples/ollama-different-model-for-completion.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | # ollama serve 8 | # ollama run llama3.2 9 | # ollama run llama3.2:1b 10 | import os 11 | 12 | from synthetic_dataset_generator import launch 13 | 14 | os.environ["OLLAMA_BASE_URL"] = ( 15 | "http://127.0.0.1:11434/" # in this case, the same base url for both models 16 | ) 17 | 18 | os.environ["MODEL"] = "llama3.2" # model for instruction generation 19 | os.environ["MODEL_COMPLETION"] = "llama3.2:1b" # model for completion generation 20 | 21 | os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-3B-Instruct" # tokenizer for instruction generation 22 | os.environ["TOKENIZER_ID_COMPLETION"] = "meta-llama/Llama-3.2-1B-Instruct" # tokenizer for completion generation 23 | 24 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template required for instruction generation 25 | 26 | launch() 27 | -------------------------------------------------------------------------------- /examples/openai-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | 8 | import os 9 | 10 | from 
synthetic_dataset_generator import launch 11 | 12 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 13 | os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/" # openai base url 14 | os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY") # openai api key 15 | os.environ["MODEL"] = "gpt-4o" # model id 16 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = None # chat data not supported with OpenAI 17 | 18 | launch() 19 | -------------------------------------------------------------------------------- /examples/vllm-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | # vllm serve Qwen/Qwen2.5-1.5B-Instruct 8 | import os 9 | 10 | from synthetic_dataset_generator import launch 11 | 12 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 13 | os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/" # vllm base url 14 | os.environ["MODEL"] = "Qwen/Qwen2.5-1.5B-Instruct" # model id 15 | os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-1.5B-Instruct" # tokenizer id 16 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2" 17 | os.environ["MAX_NUM_ROWS"] = "10000" 18 | os.environ["DEFAULT_BATCH_SIZE"] = "2" 19 | os.environ["MAX_NUM_TOKENS"] = "1024" 20 | 21 | launch() 22 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | poppler-utils 2 | tesseract-ocr -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "synthetic-dataset-generator" 3 | version = "0.2.0" 4 | description = "Build datasets using natural language" 5 | authors = [ 6 | {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"}, 7 | ] 8 | keywords = [ 9 | "gradio", 10 | "synthetic-data", 11 | "huggingface", 12 | "argilla", 13 | "generative-ai", 14 | "ai", 15 | ] 16 | requires-python = "<3.13,>=3.10" 17 | readme = "README.md" 18 | license = {text = "Apache 2"} 19 | 20 | dependencies = [ 21 | "argilla>=2.4.0,<3.0.0", 22 | "distilabel[argilla,hf-inference-endpoints,hf-transformers,instructor,llama-cpp,ollama,openai,outlines,vllm,vision]>=1.5.0,<2.00", 23 | "gradio[oauth]>=5.4.0,<6.0.0", 24 | "gradio-huggingfacehub-search>=0.0.12,<1.0.0", 25 | "huggingface-hub>=0.26.0,<0.28.0", 26 | "model2vec>=0.2.4,<1.0.0", 27 | "nltk>=3.9.1,<4.0.0", 28 | "pydantic>=2.10.5,<3.0.0", 29 | "sentence-transformers>=3.2.0,<4.0.0", 30 | "transformers>=4.44.2,<5.0.0", 31 | "unstructured[md,pdf,docx]>=0.16.3,<1.0.0", 32 | "setuptools", 33 | ] 34 | 35 | [build-system] 36 | requires = ["pdm-backend"] 37 | build-backend = "pdm.backend" 38 | 39 | [tool.pdm] 40 | distribution = true 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from gradio import TabbedInterface 3 | 4 | from synthetic_dataset_generator import ( # noqa 5 | 
_distiset, 6 | _inference_endpoints, 7 | ) 8 | 9 | def launch(*args, **kwargs): 10 | """Launch the synthetic dataset generator. 11 | Based on the `TabbedInterface` from Gradio. 12 | Parameters: https://www.gradio.app/docs/gradio/tabbedinterface 13 | """ 14 | from synthetic_dataset_generator.app import demo 15 | return demo.launch(*args, server_name="0.0.0.0", **kwargs) 16 | 17 | 18 | launch.__doc__ = TabbedInterface.launch.__doc__ 19 | launch.__signature__ = inspect.signature(TabbedInterface.launch) 20 | launch.__annotations__ = TabbedInterface.launch.__annotations__ 21 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from synthetic_dataset_generator import launch 3 | 4 | launch() 5 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/_distiset.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import distilabel 4 | import distilabel.distiset 5 | import gradio as gr 6 | from distilabel.utils.card.dataset_card import ( 7 | DistilabelDatasetCard, 8 | size_categories_parser, 9 | ) 10 | from huggingface_hub import DatasetCardData, HfApi 11 | 12 | 13 | class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset): 14 | def _generate_card( 15 | self, 16 | repo_id: str, 17 | token: str, 18 | include_script: bool = False, 19 | filename_py: Optional[str] = None, 20 | ) -> None: 21 | """Generates a dataset card and pushes it to the Hugging Face Hub, and 22 | if the `pipeline.yaml` path is available in the `Distiset`, uploads that 23 | to the same repository. 24 | 25 | Args: 26 | repo_id: The ID of the repository to push to, from the `push_to_hub` method. 27 | token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method. 28 | include_script: Whether to upload the script to the hugging face repository. 29 | filename_py: The name of the script. If `include_script` is True, the script will 30 | be uploaded to the repository using this name, otherwise it won't be used. 31 | """ 32 | card = self._get_card( 33 | repo_id=repo_id, 34 | token=token, 35 | include_script=include_script, 36 | filename_py=filename_py, 37 | ) 38 | 39 | card.push_to_hub( 40 | repo_id, 41 | repo_type="dataset", 42 | token=token, 43 | ) 44 | if self.pipeline_path: 45 | # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well. 46 | HfApi().upload_file( 47 | path_or_fileobj=self.pipeline_path, 48 | path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME, 49 | repo_id=repo_id, 50 | repo_type="dataset", 51 | token=token, 52 | ) 53 | 54 | def _get_card( 55 | self, 56 | repo_id: str, 57 | token: Optional[str] = None, 58 | include_script: bool = False, 59 | filename_py: Optional[str] = None, 60 | ) -> DistilabelDatasetCard: 61 | """Generates the dataset card for the `Distiset`. 62 | 63 | Note: 64 | If `repo_id` and `token` are provided, it will extract the metadata from the README.md file 65 | on the hub. 66 | 67 | Args: 68 | repo_id: Name of the repository to push to, or the path for the distiset if saved to disk. 69 | token: The token to authenticate with the Hugging Face Hub. 70 | We assume that if it's provided, the dataset will be in the Hugging Face Hub, 71 | so the README metadata will be extracted from there. 
72 | include_script: Whether to upload the script to the hugging face repository. 73 | filename_py: The name of the script. If `include_script` is True, the script will 74 | be uploaded to the repository using this name, otherwise it won't be used. 75 | 76 | Returns: 77 | The dataset card for the `Distiset`. 78 | """ 79 | sample_records = {} 80 | for name, dataset in self.items(): 81 | sample_records[name] = ( 82 | dataset[0] if not isinstance(dataset, dict) else dataset["train"][0] 83 | ) 84 | 85 | columns = self["default"].column_names 86 | columns = self["default"].column_names 87 | 88 | if ("label" in columns and "text" in columns) or ( 89 | "labels" in columns and "text" in columns 90 | ): 91 | task_categories = ["text-classification"] 92 | elif ("prompt" in columns and "completion" in columns) or ( 93 | "messages" in columns 94 | ): 95 | task_categories: list[str] = [ 96 | "text-generation", 97 | "text2text-generation", 98 | "question-answering", 99 | ] 100 | elif "context" in columns and "question" in columns and "response" in columns: 101 | task_categories: list[str] = [ 102 | "text-generation", 103 | "text2text-generation", 104 | "text-retrieval", 105 | "question-answering" 106 | ] 107 | if ( 108 | "positive_retrieval" in columns and "negative_retrieval" in columns 109 | ) or ("positive_reranking" in columns and "negative_reranking" in columns): 110 | task_categories.append("sentence-similarity") 111 | else: 112 | task_categories: list[str] = [] 113 | gr.Info( 114 | f"No task categories found for dataset with columns: {columns}. " 115 | "Please notify the distilabel team if you think this is an error." 116 | ) 117 | 118 | readme_metadata = {} 119 | if repo_id and token: 120 | readme_metadata = self._extract_readme_metadata(repo_id, token) 121 | 122 | metadata = { 123 | **readme_metadata, 124 | "size_categories": size_categories_parser( 125 | max(len(dataset) for dataset in self.values()) 126 | ), 127 | "task_categories": task_categories, 128 | "tags": [ 129 | "synthetic", 130 | "distilabel", 131 | "rlaif", 132 | "datacraft", 133 | ], 134 | } 135 | 136 | card = DistilabelDatasetCard.from_template( 137 | card_data=DatasetCardData(**metadata), 138 | repo_id=repo_id, 139 | sample_records=sample_records, 140 | include_script=include_script, 141 | filename_py=filename_py, 142 | references=self.citations, 143 | ) 144 | 145 | return card 146 | 147 | 148 | distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag 149 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/_inference_endpoints.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import distilabel 4 | import distilabel.distiset 5 | from distilabel.models import InferenceEndpointsLLM 6 | from pydantic import ( 7 | ValidationError, 8 | model_validator, 9 | ) 10 | 11 | 12 | class CustomInferenceEndpointsLLM(InferenceEndpointsLLM): 13 | @model_validator(mode="after") # type: ignore 14 | def only_one_of_model_id_endpoint_name_or_base_url_provided( 15 | self, 16 | ) -> "InferenceEndpointsLLM": 17 | """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also 18 | provided, a warning will be shown informing the user that the provided `base_url` will be ignored in 19 | favour of the dynamically calculated one..""" 20 | 21 | if self.base_url and (self.model_id or self.endpoint_name): 22 | warnings.warn( # type: ignore 23 | f"Since the `base_url={self.base_url}` is available 
and either one of `model_id`" 24 | " or `endpoint_name` is also provided, the `base_url` will either be ignored" 25 | " or overwritten with the one generated from either of those args, for serverless" 26 | " or dedicated inference endpoints, respectively." 27 | ) 28 | 29 | if self.use_magpie_template and self.tokenizer_id is None: 30 | raise ValueError( 31 | "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please," 32 | " set a `tokenizer_id` and try again." 33 | ) 34 | 35 | if ( 36 | self.model_id 37 | and self.tokenizer_id is None 38 | and self.structured_output is not None 39 | ): 40 | self.tokenizer_id = self.model_id 41 | 42 | if self.base_url and not (self.model_id or self.endpoint_name): 43 | return self 44 | 45 | if self.model_id and not self.endpoint_name: 46 | return self 47 | 48 | if self.endpoint_name and not self.model_id: 49 | return self 50 | 51 | raise ValidationError( 52 | f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is" 53 | f" provided too, it will be overwritten instead. Found `model_id`={self.model_id}," 54 | f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}." 55 | ) 56 | 57 | 58 | distilabel.models.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM 59 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/_tabbedinterface.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines two useful high-level abstractions to build Gradio apps: Interface and TabbedInterface. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from collections.abc import Sequence 8 | 9 | import gradio as gr 10 | from gradio.blocks import Blocks 11 | from gradio.layouts import Tab, Tabs 12 | from gradio.themes import ThemeClass as Theme 13 | from gradio_client.documentation import document 14 | 15 | 16 | @document() 17 | class TabbedInterface(Blocks): 18 | """ 19 | A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets 20 | rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab. 21 | Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded. 22 | 23 | Demos: tabbed_interface_lite 24 | """ 25 | 26 | def __init__( 27 | self, 28 | interface_list: Sequence[Blocks], 29 | tab_names: list[str] | None = None, 30 | title: str | None = None, 31 | theme: Theme | str | None = None, 32 | analytics_enabled: bool | None = None, 33 | css: str | None = None, 34 | js: str | None = None, 35 | head: str | None = None, 36 | ): 37 | """ 38 | Parameters: 39 | interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs. 40 | tab_names: A list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc. 41 | title: The tab title to display when this demo is opened in a browser window. 42 | theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme. 43 | analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True. 44 | css: Custom css as a string or path to a css file. This css will be included in the demo webpage. 45 | js: Custom js as a string or path to a js file. 
The custom js should in the form of a single js function. This function will automatically be executed when the page loads. For more flexibility, use the head parameter to insert js inside