├── .dockerignore
├── .env.local.template
├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── assets
│   ├── argilla.png
│   ├── flow.png
│   ├── logo.png
│   ├── logo.svg
│   ├── ui-full.png
│   └── ui.png
├── docker-compose.yml
├── docker
│   ├── .env.docker.template
│   ├── Dockerfile
│   ├── README.md
│   ├── argilla
│   │   └── compose.yml
│   └── ollama
│       ├── compose.yml
│       └── entrypoint.sh
├── examples
│   ├── argilla-deployment.py
│   ├── blog_private_synthetic_data_generation.md
│   ├── fine-tune-deepseek-reasoning-sft.ipynb
│   ├── fine-tune-modernbert-classifier.ipynb
│   ├── fine-tune-modernbert-rag.ipynb
│   ├── fine-tune-smollm2-on-synthetic-data.ipynb
│   ├── hf-dedicated-or-tgi-deployment.py
│   ├── hf-serverless-deployment-deepseek.py
│   ├── hf-serverless-deployment.py
│   ├── hf-serverless-different-model-for-completion.py
│   ├── ollama-deployment.py
│   ├── ollama-different-model-for-completion.py
│   ├── openai-deployment.py
│   └── vllm-deployment.py
├── packages.txt
├── pdm.lock
├── pyproject.toml
├── requirements.txt
├── src
│   └── synthetic_dataset_generator
│       ├── __init__.py
│       ├── __main__.py
│       ├── _distiset.py
│       ├── _inference_endpoints.py
│       ├── _tabbedinterface.py
│       ├── app.py
│       ├── apps
│       │   ├── __init__.py
│       │   ├── about.py
│       │   ├── base.py
│       │   ├── chat.py
│       │   ├── eval.py
│       │   ├── rag.py
│       │   └── textcat.py
│       ├── constants.py
│       ├── pipelines
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── chat.py
│       │   ├── embeddings.py
│       │   ├── eval.py
│       │   ├── rag.py
│       │   └── textcat.py
│       └── utils.py
└── tests
    └── __init__.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Version control
2 | .git
3 | .gitignore
4 |
5 | # Python
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | *.so
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # Virtual environments
29 | .env*
30 | !.env.example
31 | .venv
32 | env/
33 | venv/
34 | ENV/
35 |
36 | # IDE
37 | .idea/
38 | .vscode/
39 | *.swp
40 | *.swo
41 |
42 | # Testing
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Project specific
54 | nltk_data/
55 | .pdm-python
56 | .pdm.toml
57 | __pypackages__/
--------------------------------------------------------------------------------
/.env.local.template:
--------------------------------------------------------------------------------
1 | # =============================================================================
2 | # LOCAL/API CONFIGURATION
3 | # =============================================================================
4 |
5 | # -----------------------------------------------------------------------------
6 | # REQUIRED CONFIGURATION
7 | # -----------------------------------------------------------------------------
8 | # Hugging Face token (required for all setups)
9 | HF_TOKEN=hf_...
10 |
11 | # Generation Settings
12 | MAX_NUM_TOKENS=2048
13 | MAX_NUM_ROWS=1000
14 | DEFAULT_BATCH_SIZE=5
15 |
16 | # Required for chat data generation with Llama or Qwen models
17 | # Options: "llama3", "qwen2", or custom template string
18 | MAGPIE_PRE_QUERY_TEMPLATE=llama3
19 |
20 | # -----------------------------------------------------------------------------
21 | # A. CLOUD API SERVICES
22 | # -----------------------------------------------------------------------------
23 |
24 | # 1. HUGGING FACE INFERENCE API (Default, Recommended)
25 | MODEL=meta-llama/Llama-3.1-8B-Instruct
26 | # MODEL=Qwen/Qwen2.5-1.5B-Instruct
27 |
28 | # 2. OPENAI API
29 | # OPENAI_BASE_URL=https://api.openai.com/v1/
30 | # MODEL=gpt-4
31 | # API_KEY=sk-...
32 |
33 | # 3. HUGGING FACE SPACE FOR ARGILLA (optional)
34 | # ARGILLA_API_URL=https://your-space.hf.space/
35 | # ARGILLA_API_KEY=your_key
36 |
37 | # -----------------------------------------------------------------------------
38 | # B. LOCAL SERVICES (Requires Installation)
39 | # -----------------------------------------------------------------------------
40 |
41 | # 1. LOCAL OLLAMA
42 | # OLLAMA_BASE_URL=http://127.0.0.1:11434/
43 | # MODEL=llama3.2:1b
44 | # TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
45 |
46 | # 2. LOCAL VLLM
47 | # VLLM_BASE_URL=http://127.0.0.1:8000/
48 | # MODEL=Qwen/Qwen2.5-1.5B-Instruct
49 | # TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct
50 |
51 | # 3. LOCAL TGI
52 | # HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/
53 | # MODEL=meta-llama/Llama-3.1-8B-Instruct
54 | # TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct
55 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tar filter=lfs diff=lfs merge=lfs -text
29 | *.tflite filter=lfs diff=lfs merge=lfs -text
30 | *.tgz filter=lfs diff=lfs merge=lfs -text
31 | *.wasm filter=lfs diff=lfs merge=lfs -text
32 | *.xz filter=lfs diff=lfs merge=lfs -text
33 | *.zip filter=lfs diff=lfs merge=lfs -text
34 | *.zst filter=lfs diff=lfs merge=lfs -text
35 | *tfevents* filter=lfs diff=lfs merge=lfs -text
36 | assets/flow.png filter=lfs diff=lfs merge=lfs -text
37 | *.sh text eol=lf
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm-project.org/#use-with-ide
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | .python-version
133 |
134 | # Spyder project settings
135 | .spyderproject
136 | .spyproject
137 |
138 | # Rope project settings
139 | .ropeproject
140 |
141 | # mkdocs documentation
142 | /site
143 |
144 | # mypy
145 | .mypy_cache/
146 | .dmypy.json
147 | dmypy.json
148 |
149 | # Pyre type checker
150 | .pyre/
151 |
152 | # pytype static type analyzer
153 | .pytype/
154 |
155 | # Cython debug symbols
156 | cython_debug/
157 |
158 | # PyCharm
159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161 | # and can be added to the global gitignore or merged into this file. For a more nuclear
162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163 | #.idea/
164 | .DS_Store
165 |
166 | # nltk
167 | nltk_data/
168 |
169 | # examples
170 | models/
171 |
172 | # Elasticsearch data
173 | elasticsearch_data/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Synthetic Data Generator
3 | short_description: Build datasets using natural language
4 | emoji: 🧬
5 | colorFrom: yellow
6 | colorTo: pink
7 | sdk: gradio
8 | sdk_version: 5.8.0
9 | app_file: app.py
10 | pinned: true
11 | license: apache-2.0
12 | hf_oauth: true
13 | #header: mini
14 | hf_oauth_scopes:
15 | - read-repos
16 | - write-repos
17 | - manage-repos
18 | - inference-api
19 | ---
20 |
21 | > [!IMPORTANT]
22 | > The original authors have moved on to other projects. While the code might still be functional for its original purpose, please be aware that the original team does not plan to develop new features, bug fixes, or updates. If you'd like to become a maintainer, please open an issue to discuss.
23 | >
24 | >
25 |
26 |
27 |
28 |
29 |
30 | Build datasets using natural language
31 |
32 | 
33 |
34 | ## Introduction
35 |
36 | Synthetic Data Generator is a tool that allows you to create high-quality datasets for training and fine-tuning language models. It leverages the power of distilabel and LLMs to generate synthetic data tailored to your specific needs. [The announcement blog](https://huggingface.co/blog/synthetic-data-generator) goes over a practical example of how to use it, but you can also watch the [video](https://www.youtube.com/watch?v=nXjVtnGeEss) to see it in action.
37 |
38 | Supported Tasks:
39 |
40 | - Text Classification
41 | - Chat Data for Supervised Fine-Tuning
42 | - Retrieval Augmented Generation
43 |
44 | This tool simplifies the process of creating custom datasets, enabling you to:
45 |
46 | - Describe the characteristics of your desired application
47 | - Iterate on sample datasets
48 | - Produce full-scale datasets
49 | - Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or [Argilla](https://docs.argilla.io/)
50 |
51 | By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process.
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 | ## Installation
66 |
67 | You can simply install the package with:
68 |
69 | ```bash
70 | pip install synthetic-dataset-generator
71 | ```
72 |
73 | ### Quickstart
74 |
75 | ```python
76 | from synthetic_dataset_generator import launch
77 |
78 | launch()
79 | ```
80 |
81 | ### Environment Variables
82 |
83 | - `HF_TOKEN`: Your [Hugging Face token](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained) to push your datasets to the Hugging Face Hub and generate free completions from Hugging Face Inference Endpoints. You can find some configuration examples in the [examples](examples/) folder.
84 |
85 | You can set the following environment variables to customize the generation process.
86 |
87 | - `MAX_NUM_TOKENS`: The maximum number of tokens to generate, defaults to `2048`.
88 | - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`.
89 | - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`.
90 |
91 | Optionally, you can use different API providers and models.
92 |
93 | - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`, `llama3.1`.
94 | - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the `HF_TOKEN` environment variable.
95 | - `OPENAI_BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api.openai.com/v1/`.
96 | - `OLLAMA_BASE_URL`: The base URL for any Ollama compatible API, e.g. `http://127.0.0.1:11434/`.
97 | - `HUGGINGFACE_BASE_URL`: The base URL for any Hugging Face compatible API, e.g. TGI server or Dedicated Inference Endpoints. If you want to use serverless inference, only set the `MODEL`.
98 | - `VLLM_BASE_URL`: The base URL for any VLLM compatible API, e.g. `http://localhost:8000/`.
99 |
100 | To use a specific model exclusively for generating completions, set the corresponding environment variables by appending `_COMPLETION` to the ones mentioned earlier. For example, you can use `MODEL_COMPLETION` and `OPENAI_BASE_URL_COMPLETION`.
101 |
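The snippet below is a minimal sketch of that setup (the model IDs are only illustrative, and any of the base-URL variables above can be swapped in), mirroring the scripts in the [examples](examples/) folder:

```python
import os

from synthetic_dataset_generator import launch

os.environ["HF_TOKEN"] = "hf_..."  # required
# Model used for generating instructions/prompts
os.environ["MODEL"] = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# A different model used only for generating completions (illustrative choice)
os.environ["MODEL_COMPLETION"] = "Qwen/Qwen2.5-1.5B-Instruct"

launch()
```
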
102 | SFT and chat data generation is not supported with OpenAI endpoints. Additionally, it needs to be configured per model family, based on each family's prompt template, using the right `TOKENIZER_ID` and `MAGPIE_PRE_QUERY_TEMPLATE` environment variables, as shown in the sketch after the list below.
103 |
104 | - `TOKENIZER_ID`: The tokenizer ID to use for the magpie pipeline, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`.
105 | - `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie, which is only supported with Hugging Face Inference Endpoints. `llama3` and `qwen2` are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"`, respectively. For other models, you can pass a custom pre-query template string.
106 |
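As a rough sketch (the model and tokenizer IDs are only illustrative), a chat-data configuration for a Llama 3.1 model on Hugging Face Inference Endpoints could look like this:

```python
import os

from synthetic_dataset_generator import launch

os.environ["HF_TOKEN"] = "hf_..."
os.environ["MODEL"] = "meta-llama/Meta-Llama-3.1-8B-Instruct"
os.environ["TOKENIZER_ID"] = "meta-llama/Meta-Llama-3.1-8B-Instruct"
os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"

launch()
```
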
107 | Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
108 |
109 | - `ARGILLA_API_KEY`: Your Argilla API key to push your datasets to Argilla.
110 | - `ARGILLA_API_URL`: Your Argilla API URL to push your datasets to Argilla.
111 |
112 | To save the generated datasets to a local directory instead of pushing them to the Hugging Face Hub, set the following environment variable:
113 |
114 | - `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to.
115 |
116 | You can use our environment template as a starting point:
117 |
118 | ```bash
119 | cp .env.local.template .env
120 | ```
121 |
122 | ### Argilla integration
123 |
124 | Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/).
125 |
126 | 
127 |
128 | ## Custom synthetic data generation?
129 |
130 | Each pipeline is based on distilabel, so you can easily change the LLM or the pipeline steps.
131 |
132 | Check out the [distilabel library](https://github.com/argilla-io/distilabel) for more information.
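
As an illustration only (not the exact pipeline used by this project; the model ID, seed data, and repository name below are placeholders), a customized distilabel pipeline could look roughly like this:

```python
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration

with Pipeline(name="custom-synthetic-data") as pipeline:
    # Seed instructions to expand with the LLM of your choice
    load_data = LoadDataFromDicts(
        data=[{"instruction": "Summarize an information security policy."}]
    )
    generate = TextGeneration(
        llm=InferenceEndpointsLLM(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct")
    )
    load_data >> generate

if __name__ == "__main__":
    distiset = pipeline.run()
    distiset.push_to_hub("your-username/your-dataset")  # or inspect it locally first
```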
133 |
134 | ## Development
135 |
136 | Install the dependencies:
137 |
138 | ```bash
139 | # Create a virtual environment
140 | python -m venv .venv
141 | source .venv/bin/activate
142 |
143 | # Install the dependencies
144 | pip install -e . # pdm install
145 | ```
146 |
147 | Run the app:
148 |
149 | ```bash
150 | python app.py
151 | ```
152 |
153 | ## 🐳 Docker Setup
154 |
155 | The containerized tool uses Ollama for local LLM inference and Argilla for data curation. Here's the architecture:
156 |
157 | 
158 |
159 | Quick setup with all services (App + Ollama + Argilla):
160 |
161 | ```bash
162 | # Copy environment template
163 | cp docker/.env.docker.template .env # Add your HF_TOKEN in .env
164 |
165 | # Build all services (this may take a few minutes)
166 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
167 |
168 | # Start all services
169 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
170 | ```
171 |
172 | > For more detailed Docker configurations and setups, check [docker/README.md](docker/README.md)
173 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from synthetic_dataset_generator import launch
2 |
3 | if __name__ == "__main__":
4 | launch()
5 |
--------------------------------------------------------------------------------
/assets/argilla.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/argilla.png
--------------------------------------------------------------------------------
/assets/flow.png:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b0465f5f3ed2a87b14cc609a1f25a1e7b0bfeb1cc8cab534a6ec79a9a8651996
3 | size 1810372
4 |
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/logo.png
--------------------------------------------------------------------------------
/assets/ui-full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/ui-full.png
--------------------------------------------------------------------------------
/assets/ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/ui.png
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | app:
3 | build:
4 | context: .
5 | dockerfile: docker/Dockerfile
6 | image: synthetic-data-generator:app
7 | ports:
8 | - "7860:7860"
9 | env_file:
10 | - .env
11 | networks:
12 | - app-network
13 |
14 | networks:
15 | app-network:
16 | name: synthetic-data-network
17 | driver: bridge
--------------------------------------------------------------------------------
/docker/.env.docker.template:
--------------------------------------------------------------------------------
1 | # =============================================================================
2 | # DOCKER CONFIGURATION ONLY - FULL SETUP (APP + OLLAMA + ARGILLA)
3 | # =============================================================================
4 |
5 | # Note: Before building:
6 | # 1. Copy this template to the root directory: cp docker/.env.docker.template .env
7 | # 2. Comment/uncomment the sections you want to use (OLLAMA and/or ARGILLA)
8 | # 3. Then build and run with the appropriate docker compose command
9 |
10 | # Hugging Face token with read/write permissions
11 | HF_TOKEN=your_token_here
12 |
13 | # -----------------------------------------------------------------------------
14 | # GENERATION SETTINGS
15 | # -----------------------------------------------------------------------------
16 | MAX_NUM_TOKENS=2048
17 | MAX_NUM_ROWS=1000
18 | DEFAULT_BATCH_SIZE=5
19 |
20 | # -----------------------------------------------------------------------------
21 | # OLLAMA DOCKER CONFIGURATION
22 | # -----------------------------------------------------------------------------
23 | OLLAMA_BASE_URL=http://ollama:11434
24 | OLLAMA_HARDWARE=latest # latest (for CPU/NVIDIA), rocm (for AMD)
25 |
26 | # LLAMA 3.2
27 | MODEL=llama3.2:1b
28 | TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
29 | MAGPIE_PRE_QUERY_TEMPLATE=llama3
30 |
31 | # DEEPSEEK R1
32 | #MODEL=deepseek-r1:1.5b # must match ollama tags https://ollama.com/library/deepseek-r1:1.5b
33 | #TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
34 | #MAGPIE_PRE_QUERY_TEMPLATE="<|begin▁of▁sentence|>User: "
35 |
36 | # -----------------------------------------------------------------------------
37 | # ARGILLA DOCKER CONFIGURATION (persistent data)
38 | # -----------------------------------------------------------------------------
39 | ARGILLA_API_URL=http://argilla:6900
40 | ARGILLA_USERNAME=admin
41 | ARGILLA_PASSWORD=admin1234
42 | ARGILLA_API_KEY=admin.1234
43 | ARGILLA_REINDEX_DATASET=1
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use Python slim image as base
2 | FROM python:3.10-slim
3 |
4 | # Set environment variables
5 | ENV PYTHONUNBUFFERED=1 \
6 | PYTHONDONTWRITEBYTECODE=1 \
7 | PIP_NO_CACHE_DIR=1
8 |
9 | # Create and set working directory
10 | WORKDIR /app
11 |
12 | # Create non-root user first
13 | RUN useradd -m -u 1000 appuser
14 |
15 | # Install system dependencies including build tools
16 | RUN apt-get update && apt-get install -y --no-install-recommends \
17 | curl \
18 | build-essential \
19 | cmake \
20 | libgl1-mesa-glx \
21 | libglib2.0-0 \
22 | libsm6 \
23 | libxext6 \
24 | libxrender-dev \
25 | && rm -rf /var/lib/apt/lists/*
26 |
27 | # Install pdm
28 | RUN pip install --no-cache-dir pdm
29 |
30 | # Copy project files and set permissions
31 | COPY . .
32 | RUN chown -R appuser:appuser /app && \
33 | chmod -R 755 /app
34 |
35 | # Switch to non-root user
36 | USER appuser
37 |
38 | # Install dependencies in a virtual environment
39 | RUN pdm install --prod --frozen-lockfile
40 |
41 | # Expose Gradio port
42 | EXPOSE 7860
43 |
44 | # Start command using pdm run to use the virtual environment
45 | CMD ["pdm", "run", "python", "-m", "synthetic_dataset_generator"]
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Docker Configuration Guide
2 |
3 | Each service runs in its own container, communicating through internal networks. The core app connects to Ollama for model inference and Argilla for data review:
4 |
5 | 
6 |
7 | The application can be run with different configurations using Docker Compose:
8 |
9 | - `docker-compose.yml`: Core application
10 | - `docker/ollama/compose.yml`: Ollama service for local LLM inference
11 | - `docker/argilla/compose.yml`: Argilla service for data curation
12 |
13 | ## Ollama Integration
14 |
15 | The `MODEL` variable in your `.env` file determines which model Ollama will download and use. For example:
16 | ```env
17 | MODEL=llama3.2:1b
18 | ```
19 |
20 | ## Setup Options
21 |
22 | ### Full Setup (App + Ollama + Argilla)
23 | ```bash
24 | # Keep all sections uncommented in .env
25 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
26 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
27 | ```
28 |
29 | ### App + Ollama
30 | ```bash
31 | # Comment out ARGILLA section in .env
32 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml build
33 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d
34 | ```
35 |
36 | ### App + Argilla
37 | ```bash
38 | # Comment out OLLAMA section in .env
39 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml build
40 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml up -d
41 | ```
42 |
43 | ### App Only
44 | ```bash
45 | # Comment out both OLLAMA and ARGILLA sections in .env
46 | docker compose -f docker-compose.yml build
47 | docker compose -f docker-compose.yml up -d
48 | ```
49 |
50 | ## Managing Services
51 |
52 | Services are built separately but are linked together. If you already have some services built and want to add another:
53 |
54 | 1. You don't need to rebuild existing services
55 | 2. Just build the new service
56 | 3. Stop everything with `down` and start again with `up`
57 |
58 | For example, if you have App + Ollama and want to add Argilla:
59 | ```bash
60 | docker compose -f docker/argilla/compose.yml build # only build Argilla
61 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down
62 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
63 | ```
64 |
65 | Similarly, if you have built all services but want to run only some of them:
66 | > **Important**: When running specific services, remember to comment out unused services in `.env` first
67 |
68 | ```bash
69 | # No need to build again, just start the services you need
70 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d # start only App + Ollama
71 | ```
72 |
73 | ## Service URLs
74 |
75 | Once running, access the services at:
76 | - App: http://localhost:7860
77 | - Argilla: http://localhost:6900 (if enabled)
78 | - Ollama: http://localhost:11434 (if enabled)
79 |
80 | > Note: Services will be available after a few seconds once they finish initializing. Ollama models and Argilla datasets are persisted and remain available after restarts.
--------------------------------------------------------------------------------
/docker/argilla/compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | app:
3 | extends:
4 | file: docker-compose.yml
5 | service: app
6 | depends_on:
7 | argilla:
8 | condition: service_healthy
9 | required: false
10 | environment:
11 | - ARGILLA_API_URL=http://argilla:6900
12 |
13 | elasticsearch:
14 | image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0
15 | environment:
16 | - ES_JAVA_OPTS=-Xms512m -Xmx512m
17 | - node.name=elasticsearch
18 | - cluster.name=es-argilla-local
19 | - discovery.type=single-node
20 | - cluster.routing.allocation.disk.threshold_enabled=false
21 | - xpack.security.enabled=false
22 | volumes:
23 | - es_data:/usr/share/elasticsearch/data
24 | networks:
25 | - app-network
26 | ports:
27 | - "9200:9200"
28 | - "9300:9300"
29 | ulimits:
30 | memlock:
31 | soft: -1
32 | hard: -1
33 | nofile:
34 | soft: 65536
35 | hard: 65536
36 | healthcheck:
37 | test: ["CMD", "curl", "-f", "http://localhost:9200"]
38 | interval: 30s
39 | timeout: 10s
40 | retries: 3
41 |
42 | postgres:
43 | image: postgres:14
44 | environment:
45 | POSTGRES_USER: postgres
46 | POSTGRES_PASSWORD: postgres
47 | POSTGRES_DB: argilla
48 | networks:
49 | - app-network
50 | volumes:
51 | - postgres_data:/var/lib/postgresql/data
52 |
53 | redis:
54 | image: redis
55 | networks:
56 | - app-network
57 |
58 | argilla:
59 | image: argilla/argilla-server:latest
60 | ports:
61 | - "6900:6900"
62 | healthcheck:
63 | test: ["CMD", "curl", "-f", "http://localhost:6900/api/ready"]
64 | interval: 30s
65 | timeout: 10s
66 | retries: 3
67 | env_file:
68 | - .env
69 | environment:
70 | - ARGILLA_HOME_PATH=/var/lib/argilla
71 | - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
72 | - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
73 | - ARGILLA_REDIS_URL=redis://redis:6379/0
74 | - USERNAME=${ARGILLA_USERNAME}
75 | - PASSWORD=${ARGILLA_PASSWORD}
76 | - API_KEY=${ARGILLA_API_KEY}
77 | - WORKSPACE=default
78 | volumes:
79 | - argilla_data:/argilla
80 | networks:
81 | - app-network
82 | depends_on:
83 | elasticsearch:
84 | condition: service_healthy
85 | postgres:
86 | condition: service_started
87 | redis:
88 | condition: service_started
89 |
90 | worker:
91 | image: argilla/argilla-server:latest
92 | env_file:
93 | - .env
94 | environment:
95 | - ARGILLA_HOME_PATH=/var/lib/argilla
96 | - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
97 | - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
98 | - ARGILLA_REDIS_URL=redis://redis:6379/0
99 | - BACKGROUND_NUM_WORKERS=2
100 | - USERNAME=${ARGILLA_USERNAME}
101 | - PASSWORD=${ARGILLA_PASSWORD}
102 | - API_KEY=${ARGILLA_API_KEY}
103 | - WORKSPACE=default
104 | networks:
105 | - app-network
106 | depends_on:
107 | - postgres
108 | - elasticsearch
109 | - redis
110 | command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}'
111 |
112 | volumes:
113 | es_data:
114 | name: synthetic-data-es
115 | argilla_data:
116 | name: synthetic-data-argilla
117 | postgres_data:
118 | name: synthetic-data-postgres
--------------------------------------------------------------------------------
/docker/ollama/compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | app:
3 | extends:
4 | file: docker-compose.yml
5 | service: app
6 | depends_on:
7 | ollama:
8 | condition: service_healthy
9 | required: true
10 | environment:
11 | - OLLAMA_BASE_URL=http://ollama:11434
12 |
13 | ollama:
14 | image: ollama/ollama:${OLLAMA_HARDWARE:-latest}
15 | ports:
16 | - "11434:11434"
17 | env_file:
18 | - .env
19 | environment:
20 | - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-}
21 | volumes:
22 | - ollama_data:/root/.ollama
23 | - ./docker/ollama/entrypoint.sh:/entrypoint.sh
24 | networks:
25 | - app-network
26 | deploy:
27 | resources:
28 | reservations:
29 | devices:
30 | - driver: nvidia
31 | count: all
32 | capabilities: [gpu]
33 | tty: true
34 | entrypoint: ["/usr/bin/bash", "/entrypoint.sh"]
35 | healthcheck:
36 | test:
37 | - "CMD-SHELL"
38 | - |
39 | test -f /tmp/ollama_ready && \
40 | bash -c 'ollama list | grep -q "$MODEL_NAME"'
--------------------------------------------------------------------------------
/docker/ollama/entrypoint.sh:
--------------------------------------------------------------------------------
25 | if ollama pull "$MODEL_NAME" 2>/dev/null && ollama list | grep -q "$MODEL_NAME"; then
26 | echo "🟢 Model download complete!"
27 | touch /tmp/ollama_ready
28 | else
29 | echo "❌ Error downloading model ($MODEL_NAME)"
30 | fi
31 | fi
32 | fi
33 |
34 | # Wait for Ollama process to finish
35 | wait $pid
--------------------------------------------------------------------------------
/examples/argilla-deployment.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | import os
8 |
9 | from synthetic_dataset_generator import launch
10 |
11 | # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
12 | os.environ["HF_TOKEN"] = "hf_..."
13 | os.environ["ARGILLA_API_URL"] = (
14 | "https://[your-owner-name]-[your_space_name].hf.space" # argilla base url
15 | )
16 | os.environ["ARGILLA_API_KEY"] = "my_api_key" # argilla api key
17 |
18 | launch()
19 |
--------------------------------------------------------------------------------
/examples/blog_private_synthetic_data_generation.md:
--------------------------------------------------------------------------------
1 | # Private Synthetic Data Generation Made Easy: Out-of-the-Box with Docker, Argilla & Ollama
2 |
3 | > "Empowering organizations with a turnkey solution for synthetic dataset creation in private environments."
4 |
5 | The increasing adoption of AI solutions across industries has created an unprecedented demand for high-quality training data. As organizations scale their AI initiatives, they face the dual challenge of generating substantial, domain-specific datasets while ensuring data privacy and security. Traditional approaches often involve compromises: either using public datasets that may not fully align with specific needs, or investing heavily in custom data generation infrastructure.
6 |
7 | The complexity of this challenge is amplified by regulatory requirements, resource constraints, and the need for specialized expertise. Organizations must navigate GDPR, CCPA, and industry-specific regulations while maintaining efficient data generation pipelines. This has created a pressing need for solutions that can operate entirely within private infrastructure while maintaining enterprise-grade capabilities.
8 |
9 | ## The Challenge
10 |
11 | The development of AI models requires extensive training data, yet organizations face significant obstacles in data generation and management. Privacy regulations and security requirements often prevent the use of public datasets or cloud-based generation services. Additionally, existing solutions typically demand complex infrastructure setups and significant technical expertise, increasing both implementation time and costs.
12 |
13 | Modern enterprises require a solution that addresses several critical aspects:
14 | 1. Data Privacy: Complete control over data generation and storage
15 | 2. Infrastructure Flexibility: Deployment options that fit existing systems
16 | 3. Quality Assurance: Tools for data validation and curation
17 | 4. Scalability: Ability to grow with increasing data needs
18 | 5. Cost Efficiency: Reduction in infrastructure and maintenance costs
19 |
20 | ## The Solution
21 |
22 | This out-of-the-box Synthetic Dataset Generator approach leverages the power of three technologies to create a seamless, private data generation pipeline. At its core is the [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator), a tool designed for dataset creation. [Ollama](https://ollama.ai/) ensures secure local LLM inference with [Distilabel](https://github.com/argilla-io/distilabel) integration, while [Argilla's](https://argilla.io/) data curation capabilities complete the workflow, all operating within your secure infrastructure.
23 |
24 | This architecture delivers key technical advantages:
25 | - Full data sovereignty with containerized local deployment
26 | - End-to-end pipeline from generation to validation
27 | - Modular design for system integration
28 |
29 | Here's how it all fits together:
30 |
31 | 
32 |
33 | Let's explore how these components work together in a practical workflow.
34 |
35 | ## 1. Installation & Setup
36 |
37 |
38 |
39 | ### 1.1 Clone Repository
40 | ```bash
41 | git clone https://github.com/argilla-io/synthetic-data-generator
42 | cd synthetic-data-generator
43 | ```
44 |
45 | ### 1.2 Environment Setup
46 | ```bash
47 | # Copy environment template
48 | cp docker/.env.docker.template .env
49 |
50 | # Model configuration in .env (if using Ollama)
51 | MODEL="deepseek-r1:1.5b" # Must match Ollama model name
52 | ```
53 |
54 | ### 1.3 Build & Deploy Services
55 | > Pro tip: Even if you're planning to use just one component initially, we recommend building all services to enable future functionality without rebuilding. For detailed deployment options, check the [Docker documentation](https://github.com/argilla-io/synthetic-data-generator/blob/main/docker/README.md).
56 |
57 | > Note: Ollama runs on CPU/GPU for Linux/Windows in Docker. For macOS, only CPU is supported in Docker - for GPU support, install Ollama separately ([details](https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image)).
58 |
59 | ```bash
60 | # Build all services
61 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
62 | # Start all services
63 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
64 | ```
65 |
66 | To view logs, either:
67 | - Use Docker Desktop's interface
68 | - Remove the `-d` flag when running the above command
69 | - Or execute the following for specific service logs:
70 | ```bash
71 | # Core App logs
72 | docker compose logs -f app
73 | # Ollama logs
74 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml logs -f ollama
75 | # Argilla logs
76 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml logs -f argilla
77 | ```
78 |
79 | ## 2. Dataset Generation
80 |
81 | The tool currently supports **Text Classification**, **Chat**, and **RAG** datasets. The chosen task determines the type of dataset you will generate: classification requires categories, chat data requires a conversation format, and RAG requires question-answer pairs with relevant context. For RAG, you can generate both retrieval and reranking data to improve different aspects of an information retrieval system.
82 |
83 | For a detailed overview of the generation process, check out the [introduction to the Synthetic Data Generator](https://huggingface.co/blog/synthetic-data-generator).
84 |
85 |
86 | ### 2.1. **Dataset Description**
87 |
88 | Let's walk through creating a **RAG dataset**.
89 | ```text
90 | A dataset to retrieve information from information security policies
91 | ```
92 |
93 | The system initializes and processes the prompt:
94 | 
95 |
96 |
97 | ### 2.2. **Task Configuration & Sample Generation**
98 | The system analyzes the description and automatically generates the system prompt and optimal parameters. Then, samples are generated for validation (modify the system prompt or parameters manually if needed, then click save to generate sample data):
99 | 
100 |
101 |
102 | ### 2.3. **Full Dataset Generation**
103 | After validating the sample data quality, proceed with full dataset generation. Configure the following parameters:
104 |
105 | - **Repository Owner**: Your Hugging Face username for dataset hosting
106 | - **Dataset Name**: A descriptive name following standard naming conventions
107 | - **Number of Examples**: Define dataset size (recommended: 100-1000 for initial deployments)
108 | - **Temperature**: Controls generation creativity (default 0.7 balances coherence and diversity)
109 | - **Privacy Settings**: Optional dataset privacy configuration for Hugging Face Hub
110 |
111 | The temperature parameter significantly impacts output quality:
112 | - 0.5-0.7: Optimal for technical documentation and factual content
113 | - 0.7-0.8: Balanced for general purpose datasets
114 | - 0.8-1.0: Increased creativity, suitable for conversational data
115 |
116 |
117 | The system initiates the generation pipeline, leveraging Distilabel for structured output:
118 | 
119 |
120 |
121 | Upon completion, the dataset is pushed to Hugging Face Hub:
122 | 
123 |
124 | Access your generated dataset through the Hugging Face Hub interface:
125 |
126 |
132 |
133 |
134 |
135 | ## 3. Data Curation with Argilla
136 |
137 | The integration with Argilla provides enterprise-grade dataset curation capabilities through a comprehensive review system. This phase is crucial for ensuring data quality and maintaining high standards in your training datasets.
138 |
139 | ### Environment Configuration
140 | Before accessing Argilla's features, make sure the ARGILLA section of your `.env` file is uncommented and configured (API URL, username, password, and API key).
141 |
142 |
143 | ### Curation Workflow
144 |
145 | 1. **Dataset Integration**
146 | Upon generation completion, the dataset is automatically ingested into Argilla. The system maintains data integrity and version control throughout the process. All datasets and progress persist across Docker restarts unless you explicitly remove the Argilla services and volumes.
147 | 
148 |
149 |
150 | 2. **Quality Assurance Process**
151 | Argilla's interface provides comprehensive tools for dataset validation:
152 | - Semantic analysis of generated content
153 | - Consistency checking across entries
154 | - Metadata validation and enrichment
155 | - Collaborative review capabilities
156 |
157 | 
158 |
159 |
160 |
161 | 3. **Dataset Publication**
162 | After thorough review, export your curated dataset to Hugging Face Hub:
163 |
164 | > Note: Consider using a new repository name to preserve both raw and curated datasets separately.
165 |
166 | - Configure repository settings
167 | - Set visibility and access controls
168 | - Add dataset cards and documentation
169 |
170 | 
171 |
172 |
173 | The curated dataset maintains full provenance tracking and quality metrics:
174 |
180 |
181 | # 🎉 You're Done!
182 | Congratulations! You've successfully completed the end-to-end dataset generation and curation process. Your curated dataset is now ready for model training.
183 |
184 | ## Experience the Solution
185 |
186 | For a hands-on preview of the Synthetic Dataset Generator's capabilities, explore the hosted space. This allows you to evaluate the interface and functionality before deploying your own instance:
187 |
188 |
196 |
197 | Create your own deployment by duplicating this Space.
198 |
199 | ## What's Next?
200 |
201 | After successfully generating your first dataset, several advanced implementation paths are available:
202 |
203 | Extend your dataset generation capabilities:
204 | - [Fine-tune models on synthetic data](https://huggingface.co/blog/davidberenstein1957/fine-tune-a-smollm-on-synthetic-data-of-llm) for domain-specific tasks
205 | - [Create specialized reasoning datasets](https://huggingface.co/blog/sdiazlor/fine-tune-deepseek-with-a-synthetic-reasoning-data) for advanced model training
206 |
207 | ## Conclusion
208 |
209 | The Synthetic Dataset Generator represents a significant advancement in private data generation technology, addressing the growing need for high-quality training data while maintaining security and control. By leveraging containerized architecture and local LLM inference, organizations can now generate custom datasets without compromising on data privacy or quality.
210 |
211 | The solution's modular design enables seamless integration with existing ML pipelines while providing enterprise-grade features like persistent storage, comprehensive monitoring, and scalable infrastructure. Through collaborative validation workflows and structured quality control processes, teams can efficiently create and curate datasets tailored to their specific needs.
212 |
213 | This combination of security, efficiency, and flexibility makes the Synthetic Dataset Generator an essential tool for organizations looking to accelerate their AI development while maintaining complete control over their data generation pipeline.
214 |
215 | ## References & Documentation
216 |
217 |
218 | - [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator): Open-source tool for dataset generation using natural language
219 | - [Distilabel Framework](https://github.com/argilla-io/distilabel): Advanced dataset generation framework
220 | - [Docker Best Practices](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/): Container optimization guidelines
221 | - [Argilla Documentation](https://docs.argilla.io): Data curation platform documentation
222 | - [Ollama Integration](https://github.com/jmorganca/ollama): Local LLM deployment guide
--------------------------------------------------------------------------------
/examples/fine-tune-modernbert-classifier.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tune ModernBERT for text classification using synthetic data\n",
8 | "\n",
9 | "LLMs are great general purpose models, but they are not always the best choice for a specific task. Therefore, smaller and more specialized models are important for sustainable, efficient, and cheaper AI.\n",
10 | "A lack of domain sepcific datasets is a common problem for smaller and more specialized models. This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n",
11 | "\n",
12 | "In this example, we will fine-tune a ModernBERT model on a synthetic dataset generated from the synthetic-data-generator. This demonstrates the effectiveness of synthetic data and the novel ModernBERT model, which is a new and improved version of BERT models, with an 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n",
13 | "\n",
14 | "## Install the dependencies"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# Install Pytorch & other libraries\n",
24 | "%pip install \"torch==2.5.0\" \"torchvision==0.20.0\" \n",
25 | "%pip install \"setuptools<71.0.0\" scikit-learn \n",
26 | " \n",
27 | "# Install Hugging Face libraries\n",
28 | "%pip install --upgrade \\\n",
29 | " \"datasets==3.1.0\" \\\n",
30 | " \"accelerate==1.2.1\" \\\n",
31 | " \"hf-transfer==0.1.8\"\n",
32 | " \n",
33 | "# ModernBERT is not yet available in an official release, so we need to install it from github\n",
34 | "%pip install \"git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1\" --upgrade"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## The problem\n",
42 | "\n",
43 | "The [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) is a model that can classify the domain of a text, which can help with curating data. This model is cool but is based on DeBERTa V3 Base, an older architecture that requires custom code to run, has a context length of 512 tokens, and is not as fast as the ModernBERT model. The labels for the model are:\n",
44 | "\n",
45 | "```\n",
46 | "'Adult', 'Arts_and_Entertainment', 'Autos_and_Vehicles', 'Beauty_and_Fitness', 'Books_and_Literature', 'Business_and_Industrial', 'Computers_and_Electronics', 'Finance', 'Food_and_Drink', 'Games', 'Health', 'Hobbies_and_Leisure', 'Home_and_Garden', 'Internet_and_Telecom', 'Jobs_and_Education', 'Law_and_Government', 'News', 'Online_Communities', 'People_and_Society', 'Pets_and_Animals', 'Real_Estate', 'Science', 'Sensitive_Subjects', 'Shopping', 'Sports', 'Travel_and_Transportation'\n",
47 | "```\n",
48 | "\n",
49 | "The data on which the model was trained is not available, so we cannot use it for our purposes. We can, however, generate a synthetic dataset to solve this problem."
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {
55 | "vscode": {
56 | "languageId": "plaintext"
57 | }
58 | },
59 | "source": [
60 | "## Let's generate some data\n",
61 | "\n",
62 | "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps: 1) we come up with a dataset description, 2) we iterate on the task configuration, and 3) we generate and push the data to Hugging Face. A more detailed flow can be found in [this blog post](https://huggingface.co/blog/synthetic-data-generator).\n",
63 | "\n",
64 | "\n",
70 | "\n",
71 | "For this example, we will generate 1000 examples with a temperature of 1. After some iteration, we come up with the following system prompt:\n",
72 | "\n",
73 | "```\n",
74 | "Long texts (at least 2000 words) from various media sources like Wikipedia, Reddit, Common Crawl, websites, commercials, online forums, books, newspapers and folders that cover multiple topics. Classify the text based on its main subject matter into one of the following categories\n",
75 | "```\n",
76 | "\n",
77 | "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few minutes and we end up with a dataset with 1000 examples. The labels are nicely distributed across the categories, varied in length, and the texts look diverse and interesting.\n",
78 | "\n",
79 | "\n",
85 | "\n",
86 | "The data is pushed to Argilla too, so we recommend inspecting and validating the labels before finetuning the model.\n",
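87 | "\n",
88 | "As a minimal sketch of such a review step (assuming a running Argilla server and the `argilla` v2 SDK; the URL, API key, and dataset name below are placeholders), you could pull a few records like this:\n",
89 | "\n",
90 | "```python\n",
91 | "import argilla as rg\n",
92 | "\n",
93 | "# Connect to the Argilla instance the generator pushed to (placeholder credentials)\n",
94 | "client = rg.Argilla(api_url=\"https://your-argilla-server\", api_key=\"your-api-key\")\n",
95 | "\n",
96 | "# Fetch the generated dataset and print a handful of records for manual review\n",
97 | "dataset = client.datasets(\"synthetic-domain-text-classification\")\n",
98 | "for i, record in enumerate(dataset.records):\n",
99 | "    print(record.fields)\n",
100 | "    if i == 4:\n",
101 | "        break\n",
102 | "```"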
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "## Finetuning the ModernBERT model\n",
94 | "\n",
95 | "We mostly rely on the blog from [Philipp Schmid](https://www.philschmid.de/fine-tune-modern-bert-in-2025). I will use basic consumer hardware: my Apple M1 Max with 32GB of shared memory. We will use the `datasets` library to load the data and the `transformers` library to finetune the model."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 1,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stderr",
105 | "output_type": "stream",
106 | "text": [
107 | "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
108 | " from .autonotebook import tqdm as notebook_tqdm\n"
109 | ]
110 | },
111 | {
112 | "data": {
113 | "text/plain": [
114 | "{'text': 'Recently, there has been an increase in property values within the suburban areas of several cities due to improvements in infrastructure and lifestyle amenities such as parks, retail stores, and educational institutions nearby. Additionally, new housing developments are emerging, catering to different family needs with varying sizes and price ranges. These changes have influenced investment decisions for many looking to buy or sell properties.',\n",
115 | " 'label': 14}"
116 | ]
117 | },
118 | "execution_count": 1,
119 | "metadata": {},
120 | "output_type": "execute_result"
121 | }
122 | ],
123 | "source": [
124 | "from datasets import load_dataset\n",
125 | "from datasets.arrow_dataset import Dataset\n",
126 | "from datasets.dataset_dict import DatasetDict, IterableDatasetDict\n",
127 | "from datasets.iterable_dataset import IterableDataset\n",
128 | " \n",
129 | "# Dataset id from huggingface.co/dataset\n",
130 | "dataset_id = \"argilla/synthetic-domain-text-classification\"\n",
131 | " \n",
132 | "# Load raw dataset\n",
133 | "train_dataset = load_dataset(dataset_id, split='train')\n",
134 | "\n",
135 | "split_dataset = train_dataset.train_test_split(test_size=0.1)\n",
136 | "split_dataset['train'][0]"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "First, we need to tokenize the data. We will use the `AutoTokenizer` class from the `transformers` library to load the tokenizer."
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 2,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stderr",
153 | "output_type": "stream",
154 | "text": [
155 | "Map: 100%|██████████| 900/900 [00:00<00:00, 4787.61 examples/s]\n",
156 | "Map: 100%|██████████| 100/100 [00:00<00:00, 4163.70 examples/s]\n"
157 | ]
158 | },
159 | {
160 | "data": {
161 | "text/plain": [
162 | "dict_keys(['labels', 'input_ids', 'attention_mask'])"
163 | ]
164 | },
165 | "execution_count": 2,
166 | "metadata": {},
167 | "output_type": "execute_result"
168 | }
169 | ],
170 | "source": [
171 | "from transformers import AutoTokenizer\n",
172 | " \n",
173 | "# Model id to load the tokenizer\n",
174 | "model_id = \"answerdotai/ModernBERT-base\"\n",
175 | "\n",
176 | "# Load Tokenizer\n",
177 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
178 | " \n",
179 | "# Tokenize helper function\n",
180 | "def tokenize(batch):\n",
181 | " return tokenizer(batch['text'], padding=True, truncation=True, return_tensors=\"pt\")\n",
182 | " \n",
183 | "# Tokenize dataset\n",
184 | "if \"label\" in split_dataset[\"train\"].features.keys():\n",
185 | " split_dataset = split_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n",
186 | "tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
187 | " \n",
188 | "tokenized_dataset[\"train\"].features.keys()"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "Now, we need to prepare the model. We will use the `AutoModelForSequenceClassification` class from the `transformers` library to load the model."
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 3,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "name": "stderr",
205 | "output_type": "stream",
206 | "text": [
207 | "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
208 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "from transformers import AutoModelForSequenceClassification\n",
214 | " \n",
215 | "# Model id of the base model to fine-tune\n",
216 | "model_id = \"answerdotai/ModernBERT-base\"\n",
217 | " \n",
218 | "# Prepare model labels - useful for inference\n",
219 | "labels = tokenized_dataset[\"train\"].features[\"labels\"].names\n",
220 | "num_labels = len(labels)\n",
221 | "label2id, id2label = dict(), dict()\n",
222 | "for i, label in enumerate(labels):\n",
223 | " label2id[label] = str(i)\n",
224 | " id2label[str(i)] = label\n",
225 | " \n",
226 | "# Download the model from huggingface.co/models\n",
227 | "model = AutoModelForSequenceClassification.from_pretrained(\n",
228 | " model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,\n",
229 | ")"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "We will use a simple F1 score as the evaluation metric."
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 4,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "import numpy as np\n",
246 | "from sklearn.metrics import f1_score\n",
247 | " \n",
248 | "# Metric helper method\n",
249 | "def compute_metrics(eval_pred):\n",
250 | " predictions, labels = eval_pred\n",
251 | " predictions = np.argmax(predictions, axis=1)\n",
252 | " score = f1_score(\n",
253 | " labels, predictions, labels=labels, pos_label=1, average=\"weighted\"\n",
254 | " )\n",
255 | " return {\"f1\": float(score) if score == 1 else score}"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "Finally, we need to define the training arguments. We will use the `TrainingArguments` class from the `transformers` library to define the training arguments."
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 6,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "name": "stderr",
272 | "output_type": "stream",
273 | "text": [
274 | "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/transformers/training_args.py:2241: UserWarning: `use_mps_device` is deprecated and will be removed in version 5.0 of 🤗 Transformers. `mps` device will be used by default if available similar to the way `cuda` device is used.Therefore, no action from user is required. \n",
275 | " warnings.warn(\n"
276 | ]
277 | }
278 | ],
279 | "source": [
280 | "from huggingface_hub import HfFolder\n",
281 | "from transformers import Trainer, TrainingArguments\n",
282 | " \n",
283 | "# Define training args\n",
284 | "training_args = TrainingArguments(\n",
285 | " output_dir= \"ModernBERT-domain-classifier\",\n",
286 | " per_device_train_batch_size=32,\n",
287 | " per_device_eval_batch_size=16,\n",
288 | " learning_rate=5e-5,\n",
289 | "    num_train_epochs=5,\n",
290 | " bf16=True, # bfloat16 training \n",
291 | " optim=\"adamw_torch_fused\", # improved optimizer \n",
292 | " # logging & evaluation strategies\n",
293 | " logging_strategy=\"steps\",\n",
294 | " logging_steps=100,\n",
295 | " eval_strategy=\"epoch\",\n",
296 | " save_strategy=\"epoch\",\n",
297 | " save_total_limit=2,\n",
298 | " load_best_model_at_end=True,\n",
299 | " use_mps_device=True,\n",
300 | " metric_for_best_model=\"f1\",\n",
301 | " # push to hub parameters\n",
302 | " push_to_hub=True,\n",
303 | " hub_strategy=\"every_save\",\n",
304 | " hub_token=HfFolder.get_token(),\n",
305 | ")\n",
306 | " \n",
307 | "# Create a Trainer instance\n",
308 | "trainer = Trainer(\n",
309 | " model=model,\n",
310 | " args=training_args,\n",
311 | " train_dataset=tokenized_dataset[\"train\"],\n",
312 | " eval_dataset=tokenized_dataset[\"test\"],\n",
313 | " compute_metrics=compute_metrics,\n",
314 | ")"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 7,
320 | "metadata": {},
321 | "outputs": [
322 | {
323 | "name": "stderr",
324 | "output_type": "stream",
325 | "text": [
326 | " \n",
327 | " 20%|██ | 29/145 [11:32<33:16, 17.21s/it]"
328 | ]
329 | },
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "{'eval_loss': 0.729780912399292, 'eval_f1': 0.7743598318036522, 'eval_runtime': 3.5337, 'eval_samples_per_second': 28.299, 'eval_steps_per_second': 1.981, 'epoch': 1.0}\n"
335 | ]
336 | },
337 | {
338 | "name": "stderr",
339 | "output_type": "stream",
340 | "text": [
341 | " \n",
342 | " 40%|████ | 58/145 [22:57<25:56, 17.89s/it]"
343 | ]
344 | },
345 | {
346 | "name": "stdout",
347 | "output_type": "stream",
348 | "text": [
349 | "{'eval_loss': 0.4369044005870819, 'eval_f1': 0.8310764765820946, 'eval_runtime': 3.3266, 'eval_samples_per_second': 30.061, 'eval_steps_per_second': 2.104, 'epoch': 2.0}\n"
350 | ]
351 | },
352 | {
353 | "name": "stderr",
354 | "output_type": "stream",
355 | "text": [
356 | " \n",
357 | " 60%|██████ | 87/145 [35:16<17:06, 17.70s/it]"
358 | ]
359 | },
360 | {
361 | "name": "stdout",
362 | "output_type": "stream",
363 | "text": [
364 | "{'eval_loss': 0.6091340184211731, 'eval_f1': 0.8399274488570763, 'eval_runtime': 3.2772, 'eval_samples_per_second': 30.514, 'eval_steps_per_second': 2.136, 'epoch': 3.0}\n"
365 | ]
366 | },
367 | {
368 | "name": "stderr",
369 | "output_type": "stream",
370 | "text": [
371 | " 69%|██████▉ | 100/145 [41:03<18:02, 24.06s/it]"
372 | ]
373 | },
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "{'loss': 0.7663, 'grad_norm': 7.232136249542236, 'learning_rate': 1.5517241379310346e-05, 'epoch': 3.45}\n"
379 | ]
380 | },
381 | {
382 | "name": "stderr",
383 | "output_type": "stream",
384 | "text": [
385 | " \n",
386 | " 80%|████████ | 116/145 [47:23<08:50, 18.30s/it]"
387 | ]
388 | },
389 | {
390 | "name": "stdout",
391 | "output_type": "stream",
392 | "text": [
393 | "{'eval_loss': 0.43516409397125244, 'eval_f1': 0.8797674004703547, 'eval_runtime': 3.2975, 'eval_samples_per_second': 30.326, 'eval_steps_per_second': 2.123, 'epoch': 4.0}\n"
394 | ]
395 | },
396 | {
397 | "name": "stderr",
398 | "output_type": "stream",
399 | "text": [
400 | " \n",
401 | "100%|██████████| 145/145 [1:00:40<00:00, 19.18s/it]"
402 | ]
403 | },
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | "{'eval_loss': 0.39272159337997437, 'eval_f1': 0.8914389523348718, 'eval_runtime': 3.5564, 'eval_samples_per_second': 28.118, 'eval_steps_per_second': 1.968, 'epoch': 5.0}\n"
409 | ]
410 | },
411 | {
412 | "name": "stderr",
413 | "output_type": "stream",
414 | "text": [
415 | "100%|██████████| 145/145 [1:00:42<00:00, 25.12s/it]\n"
416 | ]
417 | },
418 | {
419 | "name": "stdout",
420 | "output_type": "stream",
421 | "text": [
422 | "{'train_runtime': 3642.7783, 'train_samples_per_second': 1.235, 'train_steps_per_second': 0.04, 'train_loss': 0.535627057634551, 'epoch': 5.0}\n"
423 | ]
424 | },
425 | {
426 | "name": "stderr",
427 | "output_type": "stream",
428 | "text": [
429 | "events.out.tfevents.1735555878.Davids-MacBook-Pro.local.23438.0: 100%|██████████| 9.32k/9.32k [00:00<00:00, 55.0kB/s]\n"
430 | ]
431 | },
432 | {
433 | "data": {
434 | "text/plain": [
435 | "CommitInfo(commit_url='https://huggingface.co/davidberenstein1957/domain-classifier/commit/915f4b03c230cc8f376f13729728f14347400041', commit_message='End of training', commit_description='', oid='915f4b03c230cc8f376f13729728f14347400041', pr_url=None, repo_url=RepoUrl('https://huggingface.co/davidberenstein1957/domain-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='davidberenstein1957/domain-classifier'), pr_revision=None, pr_num=None)"
436 | ]
437 | },
438 | "execution_count": 7,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "trainer.train()\n",
445 | "# Save processor and create model card\n",
446 | "tokenizer.save_pretrained(\"ModernBERT-domain-classifier\")\n",
447 | "trainer.create_model_card()\n",
448 | "trainer.push_to_hub()"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "We get an F1 score of 0.89 on the test set, which is pretty good for the small dataset and time spent."
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "## Run inference\n",
463 | "\n",
464 | "We can now load the model and run inference."
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 11,
470 | "metadata": {},
471 | "outputs": [
472 | {
473 | "name": "stderr",
474 | "output_type": "stream",
475 | "text": [
476 | "Device set to use mps:0\n"
477 | ]
478 | },
479 | {
480 | "data": {
481 | "text/plain": [
482 | "[{'label': 'health', 'score': 0.6779336333274841}]"
483 | ]
484 | },
485 | "execution_count": 11,
486 | "metadata": {},
487 | "output_type": "execute_result"
488 | }
489 | ],
490 | "source": [
491 | "from transformers import pipeline\n",
492 | " \n",
493 | "# load model from huggingface.co/models using our repository id\n",
494 | "classifier = pipeline(\n",
495 | " task=\"text-classification\", \n",
496 | " model=\"argilla/ModernBERT-domain-classifier\", \n",
497 | " device=0,\n",
498 | ")\n",
499 | " \n",
500 | "sample = \"Smoking is bad for your health.\"\n",
501 | " \n",
502 | "classifier(sample)"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "## Conclusion\n",
510 | "\n",
511 | "We have shown that we can generate a synthetic dataset from an LLM and finetune a ModernBERT model on it. This demonstrates the effectiveness of synthetic data and the novel ModernBERT model, which is a new and improved version of BERT models, with an 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n",
512 | "\n",
513 | "Pretty cool for 20 minutes of generating data, and an hour of fine-tuning on consumer hardware."
514 | ]
515 | }
516 | ],
517 | "metadata": {
518 | "kernelspec": {
519 | "display_name": ".venv",
520 | "language": "python",
521 | "name": "python3"
522 | },
523 | "language_info": {
524 | "codemirror_mode": {
525 | "name": "ipython",
526 | "version": 3
527 | },
528 | "file_extension": ".py",
529 | "mimetype": "text/x-python",
530 | "name": "python",
531 | "nbconvert_exporter": "python",
532 | "pygments_lexer": "ipython3",
533 | "version": "3.11.11"
534 | }
535 | },
536 | "nbformat": 4,
537 | "nbformat_minor": 2
538 | }
539 |
--------------------------------------------------------------------------------
/examples/fine-tune-smollm2-on-synthetic-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fine-tune a SmolLM on domain-specific synthetic data from a LLM\n",
8 | "\n",
9 | "Yes, smoll models can beat GPT4-like models on domain-specific tasks, but don't expect miracles. When comparing smoll vs large, consider all costs and gains, like the difference in performance and the value of using private and local models and data that you own.\n",
10 | "\n",
11 | "The [Hugging Face SmolLM models](https://github.com/huggingface/smollm) are blazingly fast and remarkably powerful. With its 135M, 360M and 1.7B parameter models, it is a great choice for a small and fast model. The great thing about SmolLM is that it is a general-purpose model that can be fine-tuned on domain-specific data.\n",
12 | "\n",
13 | "A lack of domain-specific datasets is a common problem for smaller and more specialized models. This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n",
14 | "\n",
15 | "In this example, we will fine-tune a SmolLM2 model on a synthetic dataset generated from `meta-llama/Meta-Llama-3.1-8B-Instruct` with the `synthetic-data-generator`.\n",
16 | "\n",
17 | "## Install the dependencies\n",
18 | "\n",
19 | "We will install some basic dependencies for the fine-tuning with `trl` but we will use the Synthetic Data Generator UI to generate the synthetic dataset."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "!pip install transformers datasets trl torch"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## The problem\n",
36 | "\n",
37 | "Reasoning data has proven to drive a fundamental change in the performance of generative models. Reasoning is amazing, but it also means the model becomes more \"chatty\" during the token generation process, which makes it slower and more expensive. For this reason, we want to create a model that can reason without being too chatty. Therefore, we will generate a concise reasoning dataset and fine-tune a SmolLM2 model on it.\n",
38 | "\n",
39 | "## Let's generate some data\n",
40 | "\n",
41 | "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps: 1) we come up with a dataset description, 2) we iterate on the task configuration, and 3) we generate and push the data to Hugging Face. A more detailed flow can be found in [this blog post](https://huggingface.co/blog/synthetic-data-generator).\n",
42 | "\n",
43 | "\n",
49 | "\n",
50 | "For this example, we will generate 5000 chat data examples for a single turn in the conversation. All examples have been generated with a temperature of 1. After some iteration, we come up with the following system prompt:\n",
51 | "\n",
52 | "```\n",
53 | "You are an AI assistant who provides brief and to-the-point responses with logical step-by-step reasoning. Your purpose is to offer straightforward explanations and answers so that you can get to the heart of the issue. Respond with extremely concise, direct justifications and evidence-based conclusions. User questions are direct and concise.\n",
54 | "```\n",
55 | "\n",
56 | "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few hours and we end up with a dataset with 5000 examples, which is the maximum number of examples we can generate in a single run. You can scale this by deploying a private instance of the Synthetic Data Generator. \n",
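57 | "\n",
58 | "When running your own instance, you can raise this limit before launching the app. The snippet below mirrors the deployment scripts in the repository's `examples/` folder (the exact limit and token are placeholders):\n",
59 | "\n",
60 | "```python\n",
61 | "import os\n",
62 | "\n",
63 | "from synthetic_dataset_generator import launch\n",
64 | "\n",
65 | "os.environ[\"HF_TOKEN\"] = \"hf_...\"  # token used to push the generated data\n",
66 | "os.environ[\"MAX_NUM_ROWS\"] = \"10000\"  # raise the per-run generation limit\n",
67 | "\n",
68 | "launch()\n",
69 | "```\n",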
57 | "\n",
58 | "\n",
64 | "\n",
65 | "The data is pushed to Argilla too, so we recommend inspecting and validating the data before finetuning the actual model. We applied some basic filters and transformations to the data to make it more suitable for fine-tuning.\n",
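66 | "\n",
67 | "The exact filters are not reproduced here, but a minimal sketch of this kind of cleanup with the `datasets` library (the dataset id and length threshold below are placeholders) could look like this:\n",
68 | "\n",
69 | "```python\n",
70 | "from datasets import load_dataset\n",
71 | "\n",
72 | "raw = load_dataset(\"your-username/synthetic-concise-reasoning-sft\", split=\"train\")\n",
73 | "\n",
74 | "# Keep rows with non-empty prompt/completion and reasonably short completions\n",
75 | "filtered = raw.filter(\n",
76 | "    lambda ex: bool(ex[\"prompt\"].strip())\n",
77 | "    and bool(ex[\"completion\"].strip())\n",
78 | "    and len(ex[\"completion\"]) < 2000\n",
79 | ")\n",
80 | "```\n",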
66 | "\n",
67 | "## Fine-tune the model\n",
68 | "\n",
69 | "We will use TRL to fine-tune the model. It is part of the Hugging Face ecosystem and works seamlessly on top of datasets generated by the synthetic data generator without needing to do any data transformations.\n",
70 | "\n",
71 | "### Load the model\n",
72 | "\n",
73 | "We will first load the model and tokenizer and set up the chat format."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 5,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "# Import necessary libraries\n",
83 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
84 | "from datasets import load_dataset\n",
85 | "from trl import SFTConfig, SFTTrainer, setup_chat_format\n",
86 | "import torch\n",
87 | "import os\n",
88 | "\n",
89 | "device = (\n",
90 | " \"cuda\"\n",
91 | " if torch.cuda.is_available()\n",
92 | " else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
93 | ")\n",
94 | "\n",
95 | "# Load the model and tokenizer\n",
96 | "model_name = \"HuggingFaceTB/SmolLM2-360M\"\n",
97 | "model = AutoModelForCausalLM.from_pretrained(\n",
98 | " pretrained_model_name_or_path=model_name\n",
99 | ")\n",
100 | "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)\n",
101 | "\n",
102 | "# Set up the chat format\n",
103 | "model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "### Test the base model\n",
111 | "\n",
112 | "We will first test the base model to see how it performs on the task. During this step we will also generate a prompt for the model to respond to, to see how it performs on the task."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 2,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stderr",
122 | "output_type": "stream",
123 | "text": [
124 | "Device set to use mps:0\n"
125 | ]
126 | },
127 | {
128 | "data": {
129 | "text/plain": [
130 | "[{'generated_text': 'What is the primary function of mitochondria within a cell?\\n\\nMitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe mitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe'}]"
131 | ]
132 | },
133 | "execution_count": 2,
134 | "metadata": {},
135 | "output_type": "execute_result"
136 | }
137 | ],
138 | "source": [
139 | "from transformers import pipeline\n",
140 | "\n",
141 | "prompt = \"What is the primary function of mitochondria within a cell?\"\n",
142 | "\n",
143 | "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=device)\n",
144 | "pipe(prompt, max_new_tokens=100)"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "### Load the dataset\n",
152 | "\n",
153 | "For fine-tuning, we need to load the dataset and tokenize it. We will use the `synthetic-concise-reasoning-sft-filtered` dataset that we generated in the previous step."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 2,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "name": "stderr",
163 | "output_type": "stream",
164 | "text": [
165 | "Map: 100%|██████████| 4133/4133 [00:00<00:00, 18478.53 examples/s]\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "from datasets import load_dataset\n",
171 | "\n",
172 | "ds = load_dataset(\"argilla/synthetic-concise-reasoning-sft-filtered\")\n",
173 | "def tokenize_function(examples):\n",
174 | " examples[\"text\"] = tokenizer.apply_chat_template([{\"role\": \"user\", \"content\": examples[\"prompt\"].strip()}, {\"role\": \"assistant\", \"content\": examples[\"completion\"].strip()}], tokenize=False)\n",
175 | " return examples\n",
176 | "ds = ds.map(tokenize_function)\n",
177 | "ds = ds.shuffle()"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "### Fine-tune the model\n",
185 | "\n",
186 | "We will now fine-tune the model. We will use the `SFTTrainer` from the `trl` library to fine-tune the model. We will use a batch size of 4 and a learning rate of 5e-5. We will also use the `use_mps_device` flag to use the MPS device if available."
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "os.environ[\"PYTORCH_MPS_HIGH_WATERMARK_RATIO\"] = \"0.0\"\n",
196 | "\n",
197 | "# Configure the SFTTrainer\n",
198 | "sft_config = SFTConfig(\n",
199 | " output_dir=\"./sft_output\",\n",
200 | " num_train_epochs=1,\n",
201 | " per_device_train_batch_size=4, # Set according to your GPU memory capacity\n",
202 | " learning_rate=5e-5, # Common starting point for fine-tuning\n",
203 | " logging_steps=100, # Frequency of logging training metrics\n",
204 | " use_mps_device= True if device == \"mps\" else False,\n",
205 | " hub_model_id=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\", # Set a unique name for your model\n",
206 | " push_to_hub=True,\n",
207 | ")\n",
208 | "\n",
209 | "# Initialize the SFTTrainer\n",
210 | "trainer = SFTTrainer(\n",
211 | " model=model,\n",
212 | " args=sft_config,\n",
213 | " train_dataset=ds[\"train\"],\n",
214 | " tokenizer=tokenizer,\n",
215 | ")\n",
216 | "trainer.train()"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "```\n",
224 | "# {'loss': 1.4498, 'grad_norm': 2.3919131755828857, 'learning_rate': 4e-05, 'epoch': 0.1}\n",
225 | "# {'loss': 1.362, 'grad_norm': 1.6650595664978027, 'learning_rate': 3e-05, 'epoch': 0.19}\n",
226 | "# {'loss': 1.3778, 'grad_norm': 1.4778285026550293, 'learning_rate': 2e-05, 'epoch': 0.29}\n",
227 | "# {'loss': 1.3735, 'grad_norm': 2.1424977779388428, 'learning_rate': 1e-05, 'epoch': 0.39}\n",
228 | "# {'loss': 1.3512, 'grad_norm': 2.3498542308807373, 'learning_rate': 0.0, 'epoch': 0.48}\n",
229 | "# {'train_runtime': 1911.514, 'train_samples_per_second': 1.046, 'train_steps_per_second': 0.262, 'train_loss': 1.3828572998046875, 'epoch': 0.48}\n",
230 | "```\n",
231 | "\n",
232 | "For this example, we did not use a separate validation set, but we can see the loss is decreasing, so we assume the model is learning the training data well. To get a better understanding of the model's performance, let's test it again with the same prompt.\n",
233 | "\n",
234 | "### Run inference\n",
235 | "\n",
236 | "We can now run inference with [the fine-tuned model](https://huggingface.co/argilla/SmolLM2-360M-synthetic-concise-reasoning/blob/main/README.md)."
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 12,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "name": "stderr",
246 | "output_type": "stream",
247 | "text": [
248 | "Device set to use mps\n"
249 | ]
250 | },
251 | {
252 | "data": {
253 | "text/plain": [
254 | "'The primary function of mitochondria is to generate energy for the cell. They are organelles found in eukaryotic cells that convert nutrients into ATP (adenosine triphosphate), which is the primary source of energy for cellular processes.\\nMitochondria are responsible for:\\n\\nEnergy production: Mitochondria produce ATP through a process called oxidative phosphorylation, which involves the transfer of electrons from food molecules to oxygen.\\nEnergy storage: Mitochondria store energy in the form of adenosine triphosphate (ATP), which is used by the cell for various cellular processes.\\nCellular respiration: Mitochondria also participate in cellular respiration, a'"
255 | ]
256 | },
257 | "execution_count": 12,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "prompt = \"What is the primary function of mitochondria within a cell?\"\n",
264 | "\n",
265 | "generator = pipeline(\n",
266 | " \"text-generation\",\n",
267 | " model=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\",\n",
268 | " device=\"mps\",\n",
269 | ")\n",
270 | "generator(\n",
271 | " [{\"role\": \"user\", \"content\": prompt}], max_new_tokens=128, return_full_text=False\n",
272 | ")[0][\"generated_text\"]"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Conclusion\n",
280 | "\n",
281 | "We have fine-tuned a SmolLM2 model on a synthetic dataset generated from a large language model. We have seen that the model performs well on the task and that the synthetic data is a great way to generate diverse and representative data for supervised fine-tuning. \n",
282 | "\n",
283 | "In practice, you would likely want to spend more time on the data quality and fine-tuning the model but the flow shows the Synthetic Data Generator is a great tool to generate synthetic data for any task.\n",
284 | "\n",
285 | "Overall, I think it is pretty cool for a couple of hours of generation and fine-tuning on consumer hardware.\n"
286 | ]
287 | }
288 | ],
289 | "metadata": {
290 | "kernelspec": {
291 | "display_name": ".venv",
292 | "language": "python",
293 | "name": "python3"
294 | },
295 | "language_info": {
296 | "codemirror_mode": {
297 | "name": "ipython",
298 | "version": 3
299 | },
300 | "file_extension": ".py",
301 | "mimetype": "text/x-python",
302 | "name": "python",
303 | "nbconvert_exporter": "python",
304 | "pygments_lexer": "ipython3",
305 | "version": "3.11.9"
306 | }
307 | },
308 | "nbformat": 4,
309 | "nbformat_minor": 2
310 | }
311 |
--------------------------------------------------------------------------------
/examples/hf-dedicated-or-tgi-deployment.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | import os
8 |
9 | from synthetic_dataset_generator import launch
10 |
11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12 | os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/" # dedicated endpoint/TGI
13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template
14 | os.environ["TOKENIZER_ID"] = (
15 | "meta-llama/Llama-3.1-8B-Instruct" # tokenizer for model hosted on endpoint
16 | )
17 | os.environ["MODEL"] = None # model is linked to endpoint
18 |
19 | launch()
20 |
--------------------------------------------------------------------------------
/examples/hf-serverless-deployment-deepseek.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | import os
8 |
9 | from synthetic_dataset_generator import launch
10 |
11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12 | os.environ["MODEL"] = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # use model for instructions
13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|begin▁of▁sentence|>User: " # use the custom template for the model
14 |
15 |
16 | launch()
17 |
--------------------------------------------------------------------------------
/examples/hf-serverless-deployment.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | import os
8 |
9 | from synthetic_dataset_generator import launch
10 |
11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12 | os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for generation
13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model
14 |
15 | launch()
16 |
--------------------------------------------------------------------------------
/examples/hf-serverless-different-model-for-completion.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | import os
8 |
9 | from synthetic_dataset_generator import launch
10 |
11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12 | os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for instruction generation
13 | os.environ["MODEL_COMPLETION"] = "meta-llama/Llama-3.1-70B-Instruct" # use model for completion generation
14 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model
15 |
16 | launch()
17 |
--------------------------------------------------------------------------------
/examples/ollama-deployment.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | # ollama serve
8 | # ollama run qwen2.5:32b-instruct-q5_K_S
9 | import os
10 |
11 | from synthetic_dataset_generator import launch
12 |
13 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
14 | os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/" # ollama base url
15 | os.environ["MODEL"] = "qwen2.5:32b-instruct-q5_K_S" # model id
16 | os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-32B-Instruct" # tokenizer id
17 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2"
18 | os.environ["MAX_NUM_ROWS"] = "10000"
19 | os.environ["DEFAULT_BATCH_SIZE"] = "2"
20 | os.environ["MAX_NUM_TOKENS"] = "1024"
21 |
22 | launch()
23 |
--------------------------------------------------------------------------------
/examples/ollama-different-model-for-completion.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | # ollama serve
8 | # ollama run llama3.2
9 | # ollama run llama3.2:1b
10 | import os
11 |
12 | from synthetic_dataset_generator import launch
13 |
14 | os.environ["OLLAMA_BASE_URL"] = (
15 | "http://127.0.0.1:11434/" # in this case, the same base url for both models
16 | )
17 |
18 | os.environ["MODEL"] = "llama3.2" # model for instruction generation
19 | os.environ["MODEL_COMPLETION"] = "llama3.2:1b" # model for completion generation
20 |
21 | os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-3B-Instruct" # tokenizer for instruction generation
22 | os.environ["TOKENIZER_ID_COMPLETION"] = "meta-llama/Llama-3.2-1B-Instruct" # tokenizer for completion generation
23 |
24 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template required for instruction generation
25 |
26 | launch()
27 |
--------------------------------------------------------------------------------
/examples/openai-deployment.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 |
8 | import os
9 |
10 | from synthetic_dataset_generator import launch
11 |
12 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
13 | os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/" # openai base url
14 | os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY") # openai api key
15 | os.environ["MODEL"] = "gpt-4o" # model id
16 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = None # chat data not supported with OpenAI
17 |
18 | launch()
19 |
--------------------------------------------------------------------------------
/examples/vllm-deployment.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.11,<3.12"
3 | # dependencies = [
4 | # "synthetic-dataset-generator",
5 | # ]
6 | # ///
7 | # vllm serve Qwen/Qwen2.5-1.5B-Instruct
8 | import os
9 |
10 | from synthetic_dataset_generator import launch
11 |
12 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
13 | os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/" # vllm base url
14 | os.environ["MODEL"] = "Qwen/Qwen2.5-1.5B-Instruct" # model id
15 | os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-1.5B-Instruct" # tokenizer id
16 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2"
17 | os.environ["MAX_NUM_ROWS"] = "10000"
18 | os.environ["DEFAULT_BATCH_SIZE"] = "2"
19 | os.environ["MAX_NUM_TOKENS"] = "1024"
20 |
21 | launch()
22 |
--------------------------------------------------------------------------------
/packages.txt:
--------------------------------------------------------------------------------
1 | poppler-utils
2 | tesseract-ocr
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "synthetic-dataset-generator"
3 | version = "0.2.0"
4 | description = "Build datasets using natural language"
5 | authors = [
6 | {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
7 | ]
8 | keywords = [
9 | "gradio",
10 | "synthetic-data",
11 | "huggingface",
12 | "argilla",
13 | "generative-ai",
14 | "ai",
15 | ]
16 | requires-python = "<3.13,>=3.10"
17 | readme = "README.md"
18 | license = {text = "Apache 2"}
19 |
20 | dependencies = [
21 | "argilla>=2.4.0,<3.0.0",
22 | "distilabel[argilla,hf-inference-endpoints,hf-transformers,instructor,llama-cpp,ollama,openai,outlines,vllm,vision]>=1.5.0,<2.00",
23 | "gradio[oauth]>=5.4.0,<6.0.0",
24 | "gradio-huggingfacehub-search>=0.0.12,<1.0.0",
25 | "huggingface-hub>=0.26.0,<0.28.0",
26 | "model2vec>=0.2.4,<1.0.0",
27 | "nltk>=3.9.1,<4.0.0",
28 | "pydantic>=2.10.5,<3.0.0",
29 | "sentence-transformers>=3.2.0,<4.0.0",
30 | "transformers>=4.44.2,<5.0.0",
31 | "unstructured[md,pdf,docx]>=0.16.3,<1.0.0",
32 | "setuptools",
33 | ]
34 |
35 | [build-system]
36 | requires = ["pdm-backend"]
37 | build-backend = "pdm.backend"
38 |
39 | [tool.pdm]
40 | distribution = true
41 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
--------------------------------------------------------------------------------
/src/synthetic_dataset_generator/__init__.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from gradio import TabbedInterface
3 |
4 | from synthetic_dataset_generator import ( # noqa
5 | _distiset,
6 | _inference_endpoints,
7 | )
8 |
9 | def launch(*args, **kwargs):
10 | """Launch the synthetic dataset generator.
11 | Based on the `TabbedInterface` from Gradio.
12 | Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
13 | """
14 | from synthetic_dataset_generator.app import demo
15 | return demo.launch(*args, server_name="0.0.0.0", **kwargs)
16 |
17 |
18 | launch.__doc__ = TabbedInterface.launch.__doc__
19 | launch.__signature__ = inspect.signature(TabbedInterface.launch)
20 | launch.__annotations__ = TabbedInterface.launch.__annotations__
21 |
--------------------------------------------------------------------------------
/src/synthetic_dataset_generator/__main__.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 | from synthetic_dataset_generator import launch
3 |
4 | launch()
5 |
--------------------------------------------------------------------------------
/src/synthetic_dataset_generator/_distiset.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import distilabel
4 | import distilabel.distiset
5 | import gradio as gr
6 | from distilabel.utils.card.dataset_card import (
7 | DistilabelDatasetCard,
8 | size_categories_parser,
9 | )
10 | from huggingface_hub import DatasetCardData, HfApi
11 |
12 |
13 | class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
14 | def _generate_card(
15 | self,
16 | repo_id: str,
17 | token: str,
18 | include_script: bool = False,
19 | filename_py: Optional[str] = None,
20 | ) -> None:
21 | """Generates a dataset card and pushes it to the Hugging Face Hub, and
22 | if the `pipeline.yaml` path is available in the `Distiset`, uploads that
23 | to the same repository.
24 |
25 | Args:
26 | repo_id: The ID of the repository to push to, from the `push_to_hub` method.
27 | token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
28 | include_script: Whether to upload the script to the hugging face repository.
29 | filename_py: The name of the script. If `include_script` is True, the script will
30 | be uploaded to the repository using this name, otherwise it won't be used.
31 | """
32 | card = self._get_card(
33 | repo_id=repo_id,
34 | token=token,
35 | include_script=include_script,
36 | filename_py=filename_py,
37 | )
38 |
39 | card.push_to_hub(
40 | repo_id,
41 | repo_type="dataset",
42 | token=token,
43 | )
44 | if self.pipeline_path:
45 | # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
46 | HfApi().upload_file(
47 | path_or_fileobj=self.pipeline_path,
48 | path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
49 | repo_id=repo_id,
50 | repo_type="dataset",
51 | token=token,
52 | )
53 |
54 | def _get_card(
55 | self,
56 | repo_id: str,
57 | token: Optional[str] = None,
58 | include_script: bool = False,
59 | filename_py: Optional[str] = None,
60 | ) -> DistilabelDatasetCard:
61 | """Generates the dataset card for the `Distiset`.
62 |
63 | Note:
64 | If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
65 | on the hub.
66 |
67 | Args:
68 | repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
69 | token: The token to authenticate with the Hugging Face Hub.
70 | We assume that if it's provided, the dataset will be in the Hugging Face Hub,
71 | so the README metadata will be extracted from there.
72 | include_script: Whether to upload the script to the hugging face repository.
73 | filename_py: The name of the script. If `include_script` is True, the script will
74 | be uploaded to the repository using this name, otherwise it won't be used.
75 |
76 | Returns:
77 | The dataset card for the `Distiset`.
78 | """
79 | sample_records = {}
80 | for name, dataset in self.items():
81 | sample_records[name] = (
82 | dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
83 | )
84 |
85 | columns = self["default"].column_names
87 |
88 | if ("label" in columns and "text" in columns) or (
89 | "labels" in columns and "text" in columns
90 | ):
91 | task_categories = ["text-classification"]
92 | elif ("prompt" in columns and "completion" in columns) or (
93 | "messages" in columns
94 | ):
95 | task_categories: list[str] = [
96 | "text-generation",
97 | "text2text-generation",
98 | "question-answering",
99 | ]
100 | elif "context" in columns and "question" in columns and "response" in columns:
101 | task_categories: list[str] = [
102 | "text-generation",
103 | "text2text-generation",
104 | "text-retrieval",
105 | "question-answering"
106 | ]
107 | if (
108 | "positive_retrieval" in columns and "negative_retrieval" in columns
109 | ) or ("positive_reranking" in columns and "negative_reranking" in columns):
110 | task_categories.append("sentence-similarity")
111 | else:
112 | task_categories: list[str] = []
113 | gr.Info(
114 | f"No task categories found for dataset with columns: {columns}. "
115 | "Please notify the distilabel team if you think this is an error."
116 | )
117 |
118 | readme_metadata = {}
119 | if repo_id and token:
120 | readme_metadata = self._extract_readme_metadata(repo_id, token)
121 |
122 | metadata = {
123 | **readme_metadata,
124 | "size_categories": size_categories_parser(
125 | max(len(dataset) for dataset in self.values())
126 | ),
127 | "task_categories": task_categories,
128 | "tags": [
129 | "synthetic",
130 | "distilabel",
131 | "rlaif",
132 | "datacraft",
133 | ],
134 | }
135 |
136 | card = DistilabelDatasetCard.from_template(
137 | card_data=DatasetCardData(**metadata),
138 | repo_id=repo_id,
139 | sample_records=sample_records,
140 | include_script=include_script,
141 | filename_py=filename_py,
142 | references=self.citations,
143 | )
144 |
145 | return card
146 |
147 |
148 | distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
149 |
--------------------------------------------------------------------------------
/src/synthetic_dataset_generator/_inference_endpoints.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import distilabel
4 | import distilabel.distiset
5 | from distilabel.models import InferenceEndpointsLLM
6 | from pydantic import (
7 | ValidationError,
8 | model_validator,
9 | )
10 |
11 |
12 | class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
13 | @model_validator(mode="after") # type: ignore
14 | def only_one_of_model_id_endpoint_name_or_base_url_provided(
15 | self,
16 | ) -> "InferenceEndpointsLLM":
17 | """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also
18 | provided, a warning will be shown informing the user that the provided `base_url` will be ignored in
19 | favour of the dynamically calculated one."""
20 |
21 | if self.base_url and (self.model_id or self.endpoint_name):
22 | warnings.warn( # type: ignore
23 | f"Since the `base_url={self.base_url}` is available and either one of `model_id`"
24 | " or `endpoint_name` is also provided, the `base_url` will either be ignored"
25 | " or overwritten with the one generated from either of those args, for serverless"
26 | " or dedicated inference endpoints, respectively."
27 | )
28 |
29 | if self.use_magpie_template and self.tokenizer_id is None:
30 | raise ValueError(
31 | "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,"
32 | " set a `tokenizer_id` and try again."
33 | )
34 |
35 | if (
36 | self.model_id
37 | and self.tokenizer_id is None
38 | and self.structured_output is not None
39 | ):
40 | self.tokenizer_id = self.model_id
41 |
42 | if self.base_url and not (self.model_id or self.endpoint_name):
43 | return self
44 |
45 | if self.model_id and not self.endpoint_name:
46 | return self
47 |
48 | if self.endpoint_name and not self.model_id:
49 | return self
50 |
51 | raise ValidationError(
52 | f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is"
53 | f" provided too, it will be overwritten instead. Found `model_id`={self.model_id},"
54 | f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}."
55 | )
56 |
57 |
58 | distilabel.models.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
59 |
--------------------------------------------------------------------------------
/src/synthetic_dataset_generator/_tabbedinterface.py:
--------------------------------------------------------------------------------
1 | """
2 | This file defines two useful high-level abstractions to build Gradio apps: Interface and TabbedInterface.
3 | """
4 |
5 | from __future__ import annotations
6 |
7 | from collections.abc import Sequence
8 |
9 | import gradio as gr
10 | from gradio.blocks import Blocks
11 | from gradio.layouts import Tab, Tabs
12 | from gradio.themes import ThemeClass as Theme
13 | from gradio_client.documentation import document
14 |
15 |
16 | @document()
17 | class TabbedInterface(Blocks):
18 | """
19 | A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets
20 | rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab.
21 | Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded.
22 |
23 | Demos: tabbed_interface_lite
24 | """
25 |
26 | def __init__(
27 | self,
28 | interface_list: Sequence[Blocks],
29 | tab_names: list[str] | None = None,
30 | title: str | None = None,
31 | theme: Theme | str | None = None,
32 | analytics_enabled: bool | None = None,
33 | css: str | None = None,
34 | js: str | None = None,
35 | head: str | None = None,
36 | ):
37 | """
38 | Parameters:
39 | interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs.
40 | tab_names: A list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
41 | title: The tab title to display when this demo is opened in a browser window.
42 | theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme.
43 | analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
44 | css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
45 | js: Custom js as a string or path to a js file. The custom js should in the form of a single js function. This function will automatically be executed when the page loads. For more flexibility, use the head parameter to insert js inside