├── .dockerignore ├── .env.local.template ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── assets ├── argilla.png ├── flow.png ├── logo.png ├── logo.svg ├── ui-full.png └── ui.png ├── docker-compose.yml ├── docker ├── .env.docker.template ├── Dockerfile ├── README.md ├── argilla │ └── compose.yml └── ollama │ ├── compose.yml │ └── entrypoint.sh ├── examples ├── argilla-deployment.py ├── blog_private_synthetic_data_generation.md ├── fine-tune-deepseek-reasoning-sft.ipynb ├── fine-tune-modernbert-classifier.ipynb ├── fine-tune-modernbert-rag.ipynb ├── fine-tune-smollm2-on-synthetic-data.ipynb ├── hf-dedicated-or-tgi-deployment.py ├── hf-serverless-deployment-deepseek.py ├── hf-serverless-deployment.py ├── hf-serverless-different-model-for-completion.py ├── ollama-deployment.py ├── ollama-different-model-for-completion.py ├── openai-deployment.py └── vllm-deployment.py ├── packages.txt ├── pdm.lock ├── pyproject.toml ├── requirements.txt ├── src └── synthetic_dataset_generator │ ├── __init__.py │ ├── __main__.py │ ├── _distiset.py │ ├── _inference_endpoints.py │ ├── _tabbedinterface.py │ ├── app.py │ ├── apps │ ├── __init__.py │ ├── about.py │ ├── base.py │ ├── chat.py │ ├── eval.py │ ├── rag.py │ └── textcat.py │ ├── constants.py │ ├── pipelines │ ├── __init__.py │ ├── base.py │ ├── chat.py │ ├── embeddings.py │ ├── eval.py │ ├── rag.py │ └── textcat.py │ └── utils.py └── tests └── __init__.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | .gitignore 4 | 5 | # Python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | *.so 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # Virtual environments 29 | .env* 30 | !.env.example 31 | .venv 32 | env/ 33 | venv/ 34 | ENV/ 35 | 36 | # IDE 37 | .idea/ 38 | .vscode/ 39 | *.swp 40 | *.swo 41 | 42 | # Testing 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Project specific 54 | nltk_data/ 55 | .pdm-python 56 | .pdm.toml 57 | __pypackages__/ -------------------------------------------------------------------------------- /.env.local.template: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # LOCAL/API CONFIGURATION 3 | # ============================================================================= 4 | 5 | # ----------------------------------------------------------------------------- 6 | # REQUIRED CONFIGURATION 7 | # ----------------------------------------------------------------------------- 8 | # Hugging Face token (required for all setups) 9 | HF_TOKEN=hf_... 10 | 11 | # Generation Settings 12 | MAX_NUM_TOKENS=2048 13 | MAX_NUM_ROWS=1000 14 | DEFAULT_BATCH_SIZE=5 15 | 16 | # Required for chat data generation with Llama or Qwen models 17 | # Options: "llama3", "qwen2", or custom template string 18 | MAGPIE_PRE_QUERY_TEMPLATE=llama3 19 | 20 | # ----------------------------------------------------------------------------- 21 | # A. CLOUD API SERVICES 22 | # ----------------------------------------------------------------------------- 23 | 24 | # 1. 
HUGGING FACE INFERENCE API (Default, Recommended) 25 | MODEL=meta-llama/Llama-3.1-8B-Instruct 26 | # MODEL=Qwen/Qwen2.5-1.5B-Instruct 27 | 28 | # 2. OPENAI API 29 | # OPENAI_BASE_URL=https://api.openai.com/v1/ 30 | # MODEL=gpt-4 31 | # API_KEY=sk-... 32 | 33 | # 3. HUGGING FACE SPACE FOR ARGILLA (optional) 34 | # ARGILLA_API_URL=https://your-space.hf.space/ 35 | # ARGILLA_API_KEY=your_key 36 | 37 | # ----------------------------------------------------------------------------- 38 | # B. LOCAL SERVICES (Requires Installation) 39 | # ----------------------------------------------------------------------------- 40 | 41 | # 1. LOCAL OLLAMA 42 | # OLLAMA_BASE_URL=http://127.0.0.1:11434/ 43 | # MODEL=llama3.2:1b 44 | # TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct 45 | 46 | # 2. LOCAL VLLM 47 | # VLLM_BASE_URL=http://127.0.0.1:8000/ 48 | # MODEL=Qwen/Qwen2.5-1.5B-Instruct 49 | # TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct 50 | 51 | # 3. LOCAL TGI 52 | # HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/ 53 | # MODEL=meta-llama/Llama-3.1-8B-Instruct 54 | # TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct 55 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.pth filter=lfs diff=lfs merge=lfs -text 24 | *.rar filter=lfs diff=lfs merge=lfs -text 25 | *.safetensors filter=lfs diff=lfs merge=lfs -text 26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 27 | *.tar.* filter=lfs diff=lfs merge=lfs -text 28 | *.tar filter=lfs diff=lfs merge=lfs -text 29 | *.tflite filter=lfs diff=lfs merge=lfs -text 30 | *.tgz filter=lfs diff=lfs merge=lfs -text 31 | *.wasm filter=lfs diff=lfs merge=lfs -text 32 | *.xz filter=lfs diff=lfs merge=lfs -text 33 | *.zip filter=lfs diff=lfs merge=lfs -text 34 | *.zst filter=lfs diff=lfs merge=lfs -text 35 | *tfevents* filter=lfs diff=lfs merge=lfs -text 36 | assets/flow.png filter=lfs diff=lfs merge=lfs -text 37 | *.sh text eol=lf 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm-project.org/#use-with-ide 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | .python-version 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | .DS_Store 165 | 166 | # nltk 167 | nltk_data/ 168 | 169 | # examples 170 | models/ 171 | 172 | # Elasticsearch data 173 | elasticsearch_data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Synthetic Data Generator 3 | short_description: Build datasets using natural language 4 | emoji: 🧬 5 | colorFrom: yellow 6 | colorTo: pink 7 | sdk: gradio 8 | sdk_version: 5.8.0 9 | app_file: app.py 10 | pinned: true 11 | license: apache-2.0 12 | hf_oauth: true 13 | #header: mini 14 | hf_oauth_scopes: 15 | - read-repos 16 | - write-repos 17 | - manage-repos 18 | - inference-api 19 | --- 20 | 21 | > [!IMPORTANT] 22 | The original authors have moved on to other projects. While the code might still be functional for its original purpose, please be aware that the original team does not plan to develop new features, bug fixes, or updates. If you'd like to become a maintainer, please open an issue to discuss. 23 | > 24 | > 25 |
26 | 27 | 28 | Synthetic Data Generator Logo 29 | 30 | Build datasets using natural language
31 | 32 | ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png) 33 | 34 | ## Introduction 35 | 36 | Synthetic Data Generator is a tool that allows you to create high-quality datasets for training and fine-tuning language models. It leverages the power of distilabel and LLMs to generate synthetic data tailored to your specific needs. [The announcement blog](https://huggingface.co/blog/synthetic-data-generator) goes over a practical example of how to use it, but you can also watch the [video](https://www.youtube.com/watch?v=nXjVtnGeEss) to see it in action. 37 | 38 | Supported Tasks: 39 | 40 | - Text Classification 41 | - Chat Data for Supervised Fine-Tuning 42 | - Retrieval Augmented Generation 43 | 44 | This tool simplifies the process of creating custom datasets, enabling you to: 45 | 46 | - Describe the characteristics of your desired application 47 | - Iterate on sample datasets 48 | - Produce full-scale datasets 49 | - Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or [Argilla](https://docs.argilla.io/) 50 | 51 | By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process. 52 | 53 |


64 | 65 | ## Installation 66 | 67 | You can simply install the package with: 68 | 69 | ```bash 70 | pip install synthetic-dataset-generator 71 | ``` 72 | 73 | ### Quickstart 74 | 75 | ```python 76 | from synthetic_dataset_generator import launch 77 | 78 | launch() 79 | ``` 80 | 81 | ### Environment Variables 82 | 83 | - `HF_TOKEN`: Your [Hugging Face token](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained) to push your datasets to the Hugging Face Hub and generate free completions from Hugging Face Inference Endpoints. You can find some configuration examples in the [examples](examples/) folder. 84 | 85 | You can set the following environment variables to customize the generation process. 86 | 87 | - `MAX_NUM_TOKENS`: The maximum number of tokens to generate, defaults to `2048`. 88 | - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`. 89 | - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`. 90 | 91 | Optionally, you can use different API providers and models. 92 | 93 | - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`, `llama3.1`. 94 | - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the `HF_TOKEN` environment variable. 95 | - `OPENAI_BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api.openai.com/v1/`. 96 | - `OLLAMA_BASE_URL`: The base URL for any Ollama compatible API, e.g. `http://127.0.0.1:11434/`. 97 | - `HUGGINGFACE_BASE_URL`: The base URL for any Hugging Face compatible API, e.g. TGI server or Dedicated Inference Endpoints. If you want to use serverless inference, only set the `MODEL`. 98 | - `VLLM_BASE_URL`: The base URL for any VLLM compatible API, e.g. `http://localhost:8000/`. 99 | 100 | To use a specific model exclusively for generating completions, set the corresponding environment variables by appending `_COMPLETION` to the ones mentioned earlier. For example, you can use `MODEL_COMPLETION` and `OPENAI_BASE_URL_COMPLETION`. 101 | 102 | SFT and Chat Data generation is not supported with OpenAI Endpoints. Additionally, you need to configure it per model family based on their prompt templates using the right `TOKENIZER_ID` and `MAGPIE_PRE_QUERY_TEMPLATE` environment variables. 103 | 104 | - `TOKENIZER_ID`: The tokenizer ID to use for the magpie pipeline, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`. 105 | - `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie, which is only supported with Hugging Face Inference Endpoints. `llama3` and `qwen2` are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"`, respectively. For other models, you can pass a custom pre-query template string. 106 | 107 | Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables: 108 | 109 | - `ARGILLA_API_KEY`: Your Argilla API key to push your datasets to Argilla. 110 | - `ARGILLA_API_URL`: Your Argilla API URL to push your datasets to Argilla. 111 | 112 | To save the generated datasets to a local directory instead of pushing them to the Hugging Face Hub, set the following environment variable: 113 | 114 | - `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to. 
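
The variables above can also be set from Python before launching the app, in the same spirit as the scripts in the [examples](examples/) folder. The snippet below is a minimal sketch; the token and model values are placeholders to replace with your own.

```python
import os

# Placeholder values: substitute your own token and preferred model/provider
os.environ["HF_TOKEN"] = "hf_..."  # required for pushing datasets and serverless inference
os.environ["MODEL"] = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # generation model
os.environ["MAX_NUM_ROWS"] = "1000"  # upper bound on generated rows
os.environ["MAX_NUM_TOKENS"] = "2048"  # upper bound on tokens per generation

from synthetic_dataset_generator import launch

launch()
```
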
115 | 116 | You can use our environment template as a starting point: 117 | 118 | ```bash 119 | cp .env.local.template .env 120 | ``` 121 | 122 | ### Argilla integration 123 | 124 | Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/). 125 | 126 | ![Argilla integration](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/argilla.png) 127 | 128 | ## Custom synthetic data generation? 129 | 130 | Each pipeline is based on distilabel, so you can easily change the LLM or the pipeline steps. 131 | 132 | Check out the [distilabel library](https://github.com/argilla-io/distilabel) for more information. 133 | 134 | ## Development 135 | 136 | Install the dependencies: 137 | 138 | ```bash 139 | # Create a virtual environment 140 | python -m venv .venv 141 | source .venv/bin/activate 142 | 143 | # Install the dependencies 144 | pip install -e . # pdm install 145 | ``` 146 | 147 | Run the app: 148 | 149 | ```bash 150 | python app.py 151 | ``` 152 | 153 | ## 🐳 Docker Setup 154 | 155 | The containerized tool uses Ollama for local LLM inference and Argilla for data curation. Here's the architecture: 156 | 157 | ![Container Structure](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png) 158 | 159 | Quick setup with all services (App + Ollama + Argilla): 160 | 161 | ```bash 162 | # Copy environment template 163 | cp docker/.env.docker.template .env # Add your HF_TOKEN in .env 164 | 165 | # Build all services (this may take a few minutes) 166 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build 167 | 168 | # Start all services 169 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 170 | ``` 171 | 172 | > For more detailed Docker configurations and setups, check [docker/README.md](docker/README.md) 173 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from synthetic_dataset_generator import launch 2 | 3 | if __name__ == "__main__": 4 | launch() 5 | -------------------------------------------------------------------------------- /assets/argilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/argilla.png -------------------------------------------------------------------------------- /assets/flow.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b0465f5f3ed2a87b14cc609a1f25a1e7b0bfeb1cc8cab534a6ec79a9a8651996 3 | size 1810372 4 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/logo.png -------------------------------------------------------------------------------- /assets/ui-full.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/ui-full.png -------------------------------------------------------------------------------- /assets/ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/5a40c394b8aa9dc7ed21451f6c7db2bcdff7f13d/assets/ui.png -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | build: 4 | context: . 5 | dockerfile: docker/Dockerfile 6 | image: synthetic-data-generator:app 7 | ports: 8 | - "7860:7860" 9 | env_file: 10 | - .env 11 | networks: 12 | - app-network 13 | 14 | networks: 15 | app-network: 16 | name: synthetic-data-network 17 | driver: bridge -------------------------------------------------------------------------------- /docker/.env.docker.template: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # DOCKER CONFIGURATION ONLY - FULL SETUP (APP + OLLAMA + ARGILLA) 3 | # ============================================================================= 4 | 5 | # Note: Before building: 6 | # 1. Copy this template to the root directory: cp docker/.env.docker.template .env 7 | # 2. Comment/uncomment the sections you want to use (OLLAMA and/or ARGILLA) 8 | # 3. Then build and run with the appropriate docker compose command 9 | 10 | # Hugging Face token with read/write permissions 11 | HF_TOKEN=your_token_here 12 | 13 | # ----------------------------------------------------------------------------- 14 | # GENERATION SETTINGS 15 | # ----------------------------------------------------------------------------- 16 | MAX_NUM_TOKENS=2048 17 | MAX_NUM_ROWS=1000 18 | DEFAULT_BATCH_SIZE=5 19 | 20 | # ----------------------------------------------------------------------------- 21 | # OLLAMA DOCKER CONFIGURATION 22 | # ----------------------------------------------------------------------------- 23 | OLLAMA_BASE_URL=http://ollama:11434 24 | OLLAMA_HARDWARE=latest # latest (for CPU/NVIDIA), rocm (for AMD) 25 | 26 | # LLAMA 3.2 27 | MODEL=llama3.2:1b 28 | TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct 29 | MAGPIE_PRE_QUERY_TEMPLATE=llama3 30 | 31 | # DEEPSEEK R1 32 | #MODEL=deepseek-r1:1.5b # must match ollama tags https://ollama.com/library/deepseek-r1:1.5b 33 | #TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 34 | #MAGPIE_PRE_QUERY_TEMPLATE= "<|begin▁of▁sentence|>User: " 35 | 36 | # ----------------------------------------------------------------------------- 37 | # ARGILLA DOCKER CONFIGURATION (persistent data) 38 | # ----------------------------------------------------------------------------- 39 | ARGILLA_API_URL=http://argilla:6900 40 | ARGILLA_USERNAME=admin 41 | ARGILLA_PASSWORD=admin1234 42 | ARGILLA_API_KEY=admin.1234 43 | ARGILLA_REINDEX_DATASET=1 -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python slim image as base 2 | FROM python:3.10-slim 3 | 4 | # Set environment variables 5 | ENV PYTHONUNBUFFERED=1 \ 6 | PYTHONDONTWRITEBYTECODE=1 \ 7 | PIP_NO_CACHE_DIR=1 8 | 9 | # Create and set working directory 10 | WORKDIR /app 11 | 12 | # Create non-root user first 13 | RUN 
useradd -m -u 1000 appuser 14 | 15 | # Install system dependencies including build tools 16 | RUN apt-get update && apt-get install -y --no-install-recommends \ 17 | curl \ 18 | build-essential \ 19 | cmake \ 20 | libgl1-mesa-glx \ 21 | libglib2.0-0 \ 22 | libsm6 \ 23 | libxext6 \ 24 | libxrender-dev \ 25 | && rm -rf /var/lib/apt/lists/* 26 | 27 | # Install pdm 28 | RUN pip install --no-cache-dir pdm 29 | 30 | # Copy project files and set permissions 31 | COPY . . 32 | RUN chown -R appuser:appuser /app && \ 33 | chmod -R 755 /app 34 | 35 | # Switch to non-root user 36 | USER appuser 37 | 38 | # Install dependencies in a virtual environment 39 | RUN pdm install --prod --frozen-lockfile 40 | 41 | # Expose Gradio port 42 | EXPOSE 7860 43 | 44 | # Start command using pdm run to use the virtual environment 45 | CMD ["pdm", "run", "python", "-m", "synthetic_dataset_generator"] -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker Configuration Guide 2 | 3 | Each service runs in its own container, communicating through internal networks. The core app connects to Ollama for model inference and Argilla for data review: 4 | 5 | ![Container Structure](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png) 6 | 7 | The application can be run with different configurations using Docker Compose: 8 | 9 | - `docker-compose.yml`: Core application 10 | - `docker/ollama/compose.yml`: Ollama service for local LLM inference 11 | - `docker/argilla/compose.yml`: Argilla service for data curation 12 | 13 | ## Ollama Integration 14 | 15 | The `MODEL` variable in your `.env` file determines which model Ollama will download and use. For example: 16 | ```env 17 | MODEL=llama3.2:1b 18 | ``` 19 | 20 | ## Setup Options 21 | 22 | ### Full Setup (App + Ollama + Argilla) 23 | ```bash 24 | # Keep all sections uncommented in .env 25 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build 26 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 27 | ``` 28 | 29 | ### App + Ollama 30 | ```bash 31 | # Comment out ARGILLA section in .env 32 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml build 33 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d 34 | ``` 35 | 36 | ### App + Argilla 37 | ```bash 38 | # Comment out OLLAMA section in .env 39 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml build 40 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml up -d 41 | ``` 42 | 43 | ### App Only 44 | ```bash 45 | # Comment out both OLLAMA and ARGILLA sections in .env 46 | docker compose -f docker-compose.yml build 47 | docker compose -f docker-compose.yml up -d 48 | ``` 49 | 50 | ## Managing Services 51 | 52 | Services are built separately but are linked together. If you already have some services built and want to add another: 53 | 54 | 1. You don't need to rebuild existing services 55 | 2. Just build the new service 56 | 3. 
Stop everything with `down` and start again with `up` 57 | 58 | For example, if you have App + Ollama and want to add Argilla: 59 | ```bash 60 | docker compose -f docker/argilla/compose.yml build # only build Argilla 61 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down 62 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 63 | ``` 64 | 65 | Similarly, if you have built all services but want to run only some of them: 66 | > **Important**: When running specific services, remember to comment out unused services in `.env` first 67 | 68 | ```bash 69 | # No need to build again, just start the services you need 70 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d # start only App + Ollama 71 | ``` 72 | 73 | ## Service URLs 74 | 75 | Once running, access the services at: 76 | - App: http://localhost:7860 77 | - Argilla: http://localhost:6900 (if enabled) 78 | - Ollama: http://localhost:11434 (if enabled) 79 | 80 | > Note: Services will be available after a few seconds while they initialize. Ollama models and Argilla datasets are persisted and available after restarts -------------------------------------------------------------------------------- /docker/argilla/compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | extends: 4 | file: docker-compose.yml 5 | service: app 6 | depends_on: 7 | argilla: 8 | condition: service_healthy 9 | required: false 10 | environment: 11 | - ARGILLA_API_URL=http://argilla:6900 12 | 13 | elasticsearch: 14 | image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0 15 | environment: 16 | - ES_JAVA_OPTS=-Xms512m -Xmx512m 17 | - node.name=elasticsearch 18 | - cluster.name=es-argilla-local 19 | - discovery.type=single-node 20 | - cluster.routing.allocation.disk.threshold_enabled=false 21 | - xpack.security.enabled=false 22 | volumes: 23 | - es_data:/usr/share/elasticsearch/data 24 | networks: 25 | - app-network 26 | ports: 27 | - "9200:9200" 28 | - "9300:9300" 29 | ulimits: 30 | memlock: 31 | soft: -1 32 | hard: -1 33 | nofile: 34 | soft: 65536 35 | hard: 65536 36 | healthcheck: 37 | test: ["CMD", "curl", "-f", "http://localhost:9200"] 38 | interval: 30s 39 | timeout: 10s 40 | retries: 3 41 | 42 | postgres: 43 | image: postgres:14 44 | environment: 45 | POSTGRES_USER: postgres 46 | POSTGRES_PASSWORD: postgres 47 | POSTGRES_DB: argilla 48 | networks: 49 | - app-network 50 | volumes: 51 | - postgres_data:/var/lib/postgresql/data 52 | 53 | redis: 54 | image: redis 55 | networks: 56 | - app-network 57 | 58 | argilla: 59 | image: argilla/argilla-server:latest 60 | ports: 61 | - "6900:6900" 62 | healthcheck: 63 | test: ["CMD", "curl", "-f", "http://localhost:6900/api/ready"] 64 | interval: 30s 65 | timeout: 10s 66 | retries: 3 67 | env_file: 68 | - .env 69 | environment: 70 | - ARGILLA_HOME_PATH=/var/lib/argilla 71 | - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200 72 | - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla 73 | - ARGILLA_REDIS_URL=redis://redis:6379/0 74 | - USERNAME=${ARGILLA_USERNAME} 75 | - PASSWORD=${ARGILLA_PASSWORD} 76 | - API_KEY=${ARGILLA_API_KEY} 77 | - WORKSPACE=default 78 | volumes: 79 | - argilla_data:/argilla 80 | networks: 81 | - app-network 82 | depends_on: 83 | elasticsearch: 84 | condition: service_healthy 85 | postgres: 86 | condition: service_started 87 | redis: 88 | condition: service_started 89 | 90 | worker: 91 | 
image: argilla/argilla-server:latest 92 | env_file: 93 | - .env 94 | environment: 95 | - ARGILLA_HOME_PATH=/var/lib/argilla 96 | - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200 97 | - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla 98 | - ARGILLA_REDIS_URL=redis://redis:6379/0 99 | - BACKGROUND_NUM_WORKERS=2 100 | - USERNAME=${ARGILLA_USERNAME} 101 | - PASSWORD=${ARGILLA_PASSWORD} 102 | - API_KEY=${ARGILLA_API_KEY} 103 | - WORKSPACE=default 104 | networks: 105 | - app-network 106 | depends_on: 107 | - postgres 108 | - elasticsearch 109 | - redis 110 | command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}' 111 | 112 | volumes: 113 | es_data: 114 | name: synthetic-data-es 115 | argilla_data: 116 | name: synthetic-data-argilla 117 | postgres_data: 118 | name: synthetic-data-postgres -------------------------------------------------------------------------------- /docker/ollama/compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | extends: 4 | file: docker-compose.yml 5 | service: app 6 | depends_on: 7 | ollama: 8 | condition: service_healthy 9 | required: true 10 | environment: 11 | - OLLAMA_BASE_URL=http://ollama:11434 12 | 13 | ollama: 14 | image: ollama/ollama:${OLLAMA_HARDWARE:-latest} 15 | ports: 16 | - "11434:11434" 17 | env_file: 18 | - .env 19 | environment: 20 | - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-} 21 | volumes: 22 | - ollama_data:/root/.ollama 23 | - ./docker/ollama/entrypoint.sh:/entrypoint.sh 24 | networks: 25 | - app-network 26 | deploy: 27 | resources: 28 | reservations: 29 | devices: 30 | - driver: nvidia 31 | count: all 32 | capabilities: [gpu] 33 | tty: true 34 | entrypoint: ["/usr/bin/bash", "/entrypoint.sh"] 35 | healthcheck: 36 | test: 37 | - "CMD-SHELL" 38 | - | 39 | test -f /tmp/ollama_ready && \ 40 | bash -c '/dev/null && ollama list | grep -q "$MODEL_NAME"; then 26 | echo "🟢 Model download complete!" 27 | touch /tmp/ollama_ready 28 | else 29 | echo "❌ Error downloading model ($MODEL_NAME)" 30 | fi 31 | fi 32 | fi 33 | 34 | # Wait for Ollama process to finish 35 | wait $pid -------------------------------------------------------------------------------- /examples/argilla-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL 12 | os.environ["HF_TOKEN"] = "hf_..." 13 | os.environ["ARGILLA_API_URL"] = ( 14 | "https://[your-owner-name]-[your_space_name].hf.space" # argilla base url 15 | ) 16 | os.environ["ARGILLA_API_KEY"] = "my_api_key" # argilla api key 17 | 18 | launch() 19 | -------------------------------------------------------------------------------- /examples/blog_private_synthetic_data_generation.md: -------------------------------------------------------------------------------- 1 | # Private Synthetic Data Generation Made Easy: Out-of-the-Box with Docker, Argilla & Ollama 2 | 3 | > "Empowering organizations with a turnkey solution for synthetic dataset creation in private environments." 4 | 5 | The increasing adoption of AI solutions across industries has created an unprecedented demand for high-quality training data. 
As organizations scale their AI initiatives, they face the dual challenge of generating substantial, domain-specific datasets while ensuring data privacy and security. Traditional approaches often involve compromises: either using public datasets that may not fully align with specific needs, or investing heavily in custom data generation infrastructure. 6 | 7 | The complexity of this challenge is amplified by regulatory requirements, resource constraints, and the need for specialized expertise. Organizations must navigate GDPR, CCPA, and industry-specific regulations while maintaining efficient data generation pipelines. This has created a pressing need for solutions that can operate entirely within private infrastructure while maintaining enterprise-grade capabilities. 8 | 9 | ## The Challenge 10 | 11 | The development of AI models requires extensive training data, yet organizations face significant obstacles in data generation and management. Privacy regulations and security requirements often prevent the use of public datasets or cloud-based generation services. Additionally, existing solutions typically demand complex infrastructure setups and significant technical expertise, increasing both implementation time and costs. 12 | 13 | Modern enterprises require a solution that addresses several critical aspects: 14 | 1. Data Privacy: Complete control over data generation and storage 15 | 2. Infrastructure Flexibility: Deployment options that fit existing systems 16 | 3. Quality Assurance: Tools for data validation and curation 17 | 4. Scalability: Ability to grow with increasing data needs 18 | 5. Cost Efficiency: Reduction in infrastructure and maintenance costs 19 | 20 | ## The Solution 21 | 22 | This out-of-the-box Synthetic Dataset Generator approach leverages the power of three technologies to create a seamless, private data generation pipeline. At its core is the [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator), a tool designed for dataset creation. [Ollama](https://ollama.ai/) ensures secure local LLM inference with [Distilabel](https://github.com/argilla-io/distilabel) integration, while [Argilla's](https://argilla.io/) data curation capabilities complete the workflow, all operating within your secure infrastructure. 23 | 24 | This architecture delivers key technical advantages: 25 | - Full data sovereignty with containerized local deployment 26 | - End-to-end pipeline from generation to validation 27 | - Modular design for system integration 28 | 29 | Here's how it all fits together: 30 | 31 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/Uz-kDOBrV-_GahUrc1K_O.png) 32 | 33 | Let's explore how these components work together in a practical workflow. 34 | 35 | ## 1. Installation & Setup 36 | 37 | 38 | 39 | ### 1.1 Clone Repository 40 | ```bash 41 | git clone https://github.com/argilla-io/synthetic-data-generator 42 | cd synthetic-data-generator 43 | ``` 44 | 45 | ### 1.2 Environment Setup 46 | ```bash 47 | # Copy environment template 48 | cp docker/.env.docker.template .env 49 | 50 | # Model configuration in .env (if using Ollama) 51 | MODEL="deepseek-r1:1.5b" # Must match Ollama model name 52 | ``` 53 | 54 | ### 1.3 Build & Deploy Services 55 | > Pro tip: Even if you're planning to use just one component initially, we recommend building all services to enable future functionality without rebuilding. 
For detailed deployment options, check the [Docker documentation](https://github.com/argilla-io/synthetic-data-generator/blob/main/docker/README.md). 56 | 57 | > Note: Ollama runs on CPU/GPU for Linux/Windows in Docker. For macOS, only CPU is supported in Docker - for GPU support, install Ollama separately ([details](https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image)). 58 | 59 | ```bash 60 | # Build all services 61 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build 62 | # Start all services 63 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d 64 | ``` 65 | 66 | To view logs, either: 67 | - Use Docker Desktop's interface 68 | - Remove the `-d` flag when running the above command 69 | - Or execute the following for specific service logs: 70 | ```bash 71 | # Core App logs 72 | docker compose logs -f app 73 | # Ollama logs 74 | docker compose -f docker-compose.yml -f docker/ollama/compose.yml logs -f ollama 75 | # Argilla logs 76 | docker compose -f docker-compose.yml -f docker/argilla/compose.yml logs -f argilla 77 | ``` 78 | 79 | ## 2. Dataset Generation 80 | 81 | The tool currently supports **Text Classification**, **Chat**, and **RAG** datasets. These tasks will determine the type of dataset you will generate: classification requires categories, chat data requires a conversation format, and RAG requires question-answer pairs with relevant context, offering options for both retrieval and reranking data generation to enhance different aspects of information retrieval systems. 82 | 83 | For a detailed overview of the generation process, check out the [introduction to the Synthetic Data Generator](https://huggingface.co/blog/synthetic-data-generator). 84 | 85 | 86 | ### 2.1. **Dataset Description** 87 | 88 | Let's walk through creating a **RAG dataset**. 89 | ```text 90 | A dataset to retrieve information from information security policies 91 | ``` 92 | 93 | System initializes and processes the prompt: 94 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/sxH8JChF-HnGMOilymYpA.png) 95 | 96 | 97 | ### 2.2. **Task Configuration & Sample Generation** 98 | System analyzes and generates the system prompt and optimal parameters automatically. Then, samples are generated for validation (modify system prompt or parameters manually if needed, then click save to generate sample data): 99 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/mYVlGNnz6YNrPJutxmBtR.png) 100 | 101 | 102 | ### 2.3. **Full Dataset Generation** 103 | After validating the sample data quality, proceed with full dataset generation. 
Configure the following parameters: 104 | 105 | - **Repository Owner**: Your Hugging Face username for dataset hosting 106 | - **Dataset Name**: A descriptive name following standard naming conventions 107 | - **Number of Examples**: Define dataset size (recommended: 100-1000 for initial deployments) 108 | - **Temperature**: Controls generation creativity (default 0.7 balances coherence and diversity) 109 | - **Privacy Settings**: Optional dataset privacy configuration for Hugging Face Hub 110 | 111 | The temperature parameter significantly impacts output quality: 112 | - 0.5-0.7: Optimal for technical documentation and factual content 113 | - 0.7-0.8: Balanced for general purpose datasets 114 | - 0.8-1.0: Increased creativity, suitable for conversational data 115 | 116 | 117 | The system initiates the generation pipeline, leveraging Distilabel for structured output: 118 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/PWNT_bLHwFjeoFX7AhA-z.png) 119 | 120 | 121 | Upon completion, the dataset is pushed to Hugging Face Hub: 122 | ![Generation Complete](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/ohd4S-RyNI406uLPf4bnZ.png) 123 | 124 | Access your generated dataset through the Hugging Face Hub interface: 125 | 126 | 132 | 133 | 134 | 135 | ## 3. Data Curation with Argilla 136 | 137 | The integration with Argilla provides enterprise-grade dataset curation capabilities through a comprehensive review system. This phase is crucial for ensuring data quality and maintaining high standards in your training datasets. 138 | 139 | ### Environment Configuration 140 | Before accessing Argilla's features, ensure proper configuration in your `.env` file. 141 | 142 | 143 | ### Curation Workflow 144 | 145 | 1. **Dataset Integration** 146 | Upon generation completion, the dataset is automatically ingested into Argilla. The system maintains data integrity and version control throughout the process. All datasets and progress persist across Docker restarts unless you explicitly remove the Argilla services and volumes. 147 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/0gF6iLywhKafEo3z94cd-.png) 148 | 149 | 150 | 2. **Quality Assurance Process** 151 | Argilla's interface provides comprehensive tools for dataset validation: 152 | - Semantic analysis of generated content 153 | - Consistency checking across entries 154 | - Metadata validation and enrichment 155 | - Collaborative review capabilities 156 | 157 | ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/h9kJ-4lA0LcFC8g6g_vwF.png) 158 | 159 | 160 | 161 | 3. **Dataset Publication** 162 | After thorough review, export your curated dataset to Hugging Face Hub: 163 | 164 | > Note: Consider using a new repository name to preserve both raw and curated datasets separately. 165 | 166 | - Configure repository settings 167 | - Set visibility and access controls 168 | - Add dataset cards and documentation 169 | 170 | ![Export Configuration](https://cdn-uploads.huggingface.co/production/uploads/64461026e1fd8d65b27e6187/CPwtVr_Jw6mndNCOU2a5T.png) 171 | 172 | 173 | The curated dataset maintains full provenance tracking and quality metrics: 174 | 180 | 181 | # 🎉 You're Done! 182 | Congratulations! You've successfully completed the end-to-end dataset generation and curation process. Your curated dataset is now ready for model training. 
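
As a quick sanity check before training, you can pull the published dataset back from the Hub and inspect a few records. This is a small sketch using the `datasets` library; the repository id is a placeholder for whatever name you chose during the export step.

```python
from datasets import load_dataset

# Placeholder repo id: use the name you picked when pushing the curated dataset
dataset = load_dataset("your-username/infosec-rag-curated", split="train")

print(dataset)     # row count and column names
print(dataset[0])  # inspect the first record
```
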
183 | 184 | ## Experience the Solution 185 | 186 | For a hands-on preview of the Synthetic Dataset Generator's capabilities, explore the hosted space. This allows you to evaluate the interface and functionality before deploying your own instance: 187 | 188 | 196 | 197 | Create your own deployment by duplicating this Space. 198 | 199 | ## What's Next? 200 | 201 | After successfully generating your first dataset, several advanced implementation paths are available: 202 | 203 | Extend your dataset generation capabilities: 204 | - [Fine-tune models on synthetic data](https://huggingface.co/blog/davidberenstein1957/fine-tune-a-smollm-on-synthetic-data-of-llm) for domain-specific tasks 205 | - [Create specialized reasoning datasets](https://huggingface.co/blog/sdiazlor/fine-tune-deepseek-with-a-synthetic-reasoning-data) for advanced model training 206 | 207 | ## Conclusion 208 | 209 | The Synthetic Dataset Generator represents a significant advancement in private data generation technology, addressing the growing need for high-quality training data while maintaining security and control. By leveraging containerized architecture and local LLM inference, organizations can now generate custom datasets without compromising on data privacy or quality. 210 | 211 | The solution's modular design enables seamless integration with existing ML pipelines while providing enterprise-grade features like persistent storage, comprehensive monitoring, and scalable infrastructure. Through collaborative validation workflows and structured quality control processes, teams can efficiently create and curate datasets tailored to their specific needs. 212 | 213 | This combination of security, efficiency, and flexibility makes the Synthetic Dataset Generator an essential tool for organizations looking to accelerate their AI development while maintaining complete control over their data generation pipeline. 214 | 215 | ## References & Documentation 216 | 217 | 218 | - [Synthetic Dataset Generator](https://github.com/argilla-io/synthetic-data-generator): Open-source tool for dataset generation using natural language 219 | - [Distilabel Framework](https://github.com/argilla-io/distilabel): Advanced dataset generation framework 220 | - [Docker Best Practices](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/): Container optimization guidelines 221 | - [Argilla Documentation](https://docs.argilla.io): Data curation platform documentation 222 | - [Ollama Integration](https://github.com/jmorganca/ollama): Local LLM deployment guide -------------------------------------------------------------------------------- /examples/fine-tune-modernbert-classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tune ModernBERT for text classification using synthetic data\n", 8 | "\n", 9 | "LLMs are great general purpose models, but they are not always the best choice for a specific task. Therefore, smaller and more specialized models are important for sustainable, efficient, and cheaper AI.\n", 10 | "A lack of domain sepcific datasets is a common problem for smaller and more specialized models. This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. 
We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n", 11 | "\n", 12 | "In this example, we will fine-tune a ModernBERT model on a synthetic dataset generated from the synthetic-data-generator. This demonstrates the effectiveness of synthetic data and the novel ModernBERT model, which is a new and improved version of BERT models, with an 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n", 13 | "\n", 14 | "## Install the dependencies" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Install Pytorch & other libraries\n", 24 | "%pip install \"torch==2.5.0\" \"torchvision==0.20.0\" \n", 25 | "%pip install \"setuptools<71.0.0\" scikit-learn \n", 26 | " \n", 27 | "# Install Hugging Face libraries\n", 28 | "%pip install --upgrade \\\n", 29 | " \"datasets==3.1.0\" \\\n", 30 | " \"accelerate==1.2.1\" \\\n", 31 | " \"hf-transfer==0.1.8\"\n", 32 | " \n", 33 | "# ModernBERT is not yet available in an official release, so we need to install it from github\n", 34 | "%pip install \"git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1\" --upgrade" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## The problem\n", 42 | "\n", 43 | "The [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier), is a model that can classify the domain of a text which can help with curating data. This model is cool but is based on the Deberta V3 Base, which is an outdated architecture that requires custom code to run, has a context length of 512 tokens, and is not as fast as the ModernBERT model. The labels for the model are:\n", 44 | "\n", 45 | "```\n", 46 | "'Adult', 'Arts_and_Entertainment', 'Autos_and_Vehicles', 'Beauty_and_Fitness', 'Books_and_Literature', 'Business_and_Industrial', 'Computers_and_Electronics', 'Finance', 'Food_and_Drink', 'Games', 'Health', 'Hobbies_and_Leisure', 'Home_and_Garden', 'Internet_and_Telecom', 'Jobs_and_Education', 'Law_and_Government', 'News', 'Online_Communities', 'People_and_Society', 'Pets_and_Animals', 'Real_Estate', 'Science', 'Sensitive_Subjects', 'Shopping', 'Sports', 'Travel_and_Transportation'\n", 47 | "```\n", 48 | "\n", 49 | "The data on which the model was trained is not available, so we cannot use it for our purposes. We can however generate a synthetic data to solve this problem." 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "vscode": { 56 | "languageId": "plaintext" 57 | } 58 | }, 59 | "source": [ 60 | "## Let's generate some data\n", 61 | "\n", 62 | "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps 1) we come up with a dataset description, 2) iterate on the task configuration, and 3) generate and push the data to Hugging Face. A more detailed flow can be found in [this blogpost](https://huggingface.co/blog/synthetic-data-generator). \n", 63 | "\n", 64 | "\n", 70 | "\n", 71 | "For this example, we will generate 1000 examples with a temperature of 1. 
After some iteration, we come up with the following system prompt:\n", 72 | "\n", 73 | "```\n", 74 | "Long texts (at least 2000 words) from various media sources like Wikipedia, Reddit, Common Crawl, websites, commercials, online forums, books, newspapers and folders that cover multiple topics. Classify the text based on its main subject matter into one of the following categories\n", 75 | "```\n", 76 | "\n", 77 | "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few minutes and we end up with a dataset with 1000 examples. The labels are nicely distributed across the categories, varied in length, and the texts look diverse and interesting.\n", 78 | "\n", 79 | "\n", 85 | "\n", 86 | "The data is pushed to Argilla too, so we recommend inspecting and validating the labels before finetuning the model." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Finetuning the ModernBERT model\n", 94 | "\n", 95 | "We mostly rely on the blog from [Philipp Schmid](https://www.philschmid.de/fine-tune-modern-bert-in-2025). I will use basic consumer hardware, my Apple M1 Max with 32GB of shared memory. We will use the `datasets` library to load the data and the `transformers` library to finetune the model." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 1, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stderr", 105 | "output_type": "stream", 106 | "text": [ 107 | "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 108 | " from .autonotebook import tqdm as notebook_tqdm\n" 109 | ] 110 | }, 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "{'text': 'Recently, there has been an increase in property values within the suburban areas of several cities due to improvements in infrastructure and lifestyle amenities such as parks, retail stores, and educational institutions nearby. Additionally, new housing developments are emerging, catering to different family needs with varying sizes and price ranges. These changes have influenced investment decisions for many looking to buy or sell properties.',\n", 115 | " 'label': 14}" 116 | ] 117 | }, 118 | "execution_count": 1, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "from datasets import load_dataset\n", 125 | "from datasets.arrow_dataset import Dataset\n", 126 | "from datasets.dataset_dict import DatasetDict, IterableDatasetDict\n", 127 | "from datasets.iterable_dataset import IterableDataset\n", 128 | " \n", 129 | "# Dataset id from huggingface.co/dataset\n", 130 | "dataset_id = \"argilla/synthetic-domain-text-classification\"\n", 131 | " \n", 132 | "# Load raw dataset\n", 133 | "train_dataset = load_dataset(dataset_id, split='train')\n", 134 | "\n", 135 | "split_dataset = train_dataset.train_test_split(test_size=0.1)\n", 136 | "split_dataset['train'][0]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "First, we need to tokenize the data. We will use the `AutoTokenizer` class from the `transformers` library to load the tokenizer." 
144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 2, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stderr", 153 | "output_type": "stream", 154 | "text": [ 155 | "Map: 100%|██████████| 900/900 [00:00<00:00, 4787.61 examples/s]\n", 156 | "Map: 100%|██████████| 100/100 [00:00<00:00, 4163.70 examples/s]\n" 157 | ] 158 | }, 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "dict_keys(['labels', 'input_ids', 'attention_mask'])" 163 | ] 164 | }, 165 | "execution_count": 2, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "from transformers import AutoTokenizer\n", 172 | " \n", 173 | "# Model id to load the tokenizer\n", 174 | "model_id = \"answerdotai/ModernBERT-base\"\n", 175 | "\n", 176 | "# Load Tokenizer\n", 177 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 178 | " \n", 179 | "# Tokenize helper function\n", 180 | "def tokenize(batch):\n", 181 | " return tokenizer(batch['text'], padding=True, truncation=True, return_tensors=\"pt\")\n", 182 | " \n", 183 | "# Tokenize dataset\n", 184 | "if \"label\" in split_dataset[\"train\"].features.keys():\n", 185 | " split_dataset = split_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n", 186 | "tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n", 187 | " \n", 188 | "tokenized_dataset[\"train\"].features.keys()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "Now, we need to prepare the model. We will use the `AutoModelForSequenceClassification` class from the `transformers` library to load the model." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 3, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stderr", 205 | "output_type": "stream", 206 | "text": [ 207 | "Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", 208 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "from transformers import AutoModelForSequenceClassification\n", 214 | " \n", 215 | "# Model id to load the tokenizer\n", 216 | "model_id = \"answerdotai/ModernBERT-base\"\n", 217 | " \n", 218 | "# Prepare model labels - useful for inference\n", 219 | "labels = tokenized_dataset[\"train\"].features[\"labels\"].names\n", 220 | "num_labels = len(labels)\n", 221 | "label2id, id2label = dict(), dict()\n", 222 | "for i, label in enumerate(labels):\n", 223 | " label2id[label] = str(i)\n", 224 | " id2label[str(i)] = label\n", 225 | " \n", 226 | "# Download the model from huggingface.co/models\n", 227 | "model = AutoModelForSequenceClassification.from_pretrained(\n", 228 | " model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,\n", 229 | ")" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "We will use a simple F1 score as the evaluation metric." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 4, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "import numpy as np\n", 246 | "from sklearn.metrics import f1_score\n", 247 | " \n", 248 | "# Metric helper method\n", 249 | "def compute_metrics(eval_pred):\n", 250 | " predictions, labels = eval_pred\n", 251 | " predictions = np.argmax(predictions, axis=1)\n", 252 | " score = f1_score(\n", 253 | " labels, predictions, labels=labels, pos_label=1, average=\"weighted\"\n", 254 | " )\n", 255 | " return {\"f1\": float(score) if score == 1 else score}" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Finally, we need to define the training arguments. We will use the `TrainingArguments` class from the `transformers` library to define the training arguments." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 6, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | "/Users/davidberenstein/Documents/programming/argilla/synthetic-data-generator/.venv/lib/python3.11/site-packages/transformers/training_args.py:2241: UserWarning: `use_mps_device` is deprecated and will be removed in version 5.0 of 🤗 Transformers. `mps` device will be used by default if available similar to the way `cuda` device is used.Therefore, no action from user is required. \n", 275 | " warnings.warn(\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "from huggingface_hub import HfFolder\n", 281 | "from transformers import Trainer, TrainingArguments\n", 282 | " \n", 283 | "# Define training args\n", 284 | "training_args = TrainingArguments(\n", 285 | " output_dir= \"ModernBERT-domain-classifier\",\n", 286 | " per_device_train_batch_size=32,\n", 287 | " per_device_eval_batch_size=16,\n", 288 | " learning_rate=5e-5,\n", 289 | "\t\tnum_train_epochs=5,\n", 290 | " bf16=True, # bfloat16 training \n", 291 | " optim=\"adamw_torch_fused\", # improved optimizer \n", 292 | " # logging & evaluation strategies\n", 293 | " logging_strategy=\"steps\",\n", 294 | " logging_steps=100,\n", 295 | " eval_strategy=\"epoch\",\n", 296 | " save_strategy=\"epoch\",\n", 297 | " save_total_limit=2,\n", 298 | " load_best_model_at_end=True,\n", 299 | " use_mps_device=True,\n", 300 | " metric_for_best_model=\"f1\",\n", 301 | " # push to hub parameters\n", 302 | " push_to_hub=True,\n", 303 | " hub_strategy=\"every_save\",\n", 304 | " hub_token=HfFolder.get_token(),\n", 305 | ")\n", 306 | " \n", 307 | "# Create a Trainer instance\n", 308 | "trainer = Trainer(\n", 309 | " model=model,\n", 310 | " args=training_args,\n", 311 | " train_dataset=tokenized_dataset[\"train\"],\n", 312 | " eval_dataset=tokenized_dataset[\"test\"],\n", 313 | " compute_metrics=compute_metrics,\n", 314 | ")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 7, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stderr", 324 | "output_type": "stream", 325 | "text": [ 326 | " \n", 327 | " 20%|██ | 29/145 [11:32<33:16, 17.21s/it]" 328 | ] 329 | }, 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "{'eval_loss': 0.729780912399292, 'eval_f1': 0.7743598318036522, 'eval_runtime': 3.5337, 'eval_samples_per_second': 28.299, 'eval_steps_per_second': 1.981, 'epoch': 1.0}\n" 335 | ] 336 | }, 337 | { 338 | "name": "stderr", 339 | "output_type": "stream", 340 | "text": [ 341 | " \n", 342 | " 40%|████ | 58/145 [22:57<25:56, 
17.89s/it]" 343 | ] 344 | }, 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "{'eval_loss': 0.4369044005870819, 'eval_f1': 0.8310764765820946, 'eval_runtime': 3.3266, 'eval_samples_per_second': 30.061, 'eval_steps_per_second': 2.104, 'epoch': 2.0}\n" 350 | ] 351 | }, 352 | { 353 | "name": "stderr", 354 | "output_type": "stream", 355 | "text": [ 356 | " \n", 357 | " 60%|██████ | 87/145 [35:16<17:06, 17.70s/it]" 358 | ] 359 | }, 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "{'eval_loss': 0.6091340184211731, 'eval_f1': 0.8399274488570763, 'eval_runtime': 3.2772, 'eval_samples_per_second': 30.514, 'eval_steps_per_second': 2.136, 'epoch': 3.0}\n" 365 | ] 366 | }, 367 | { 368 | "name": "stderr", 369 | "output_type": "stream", 370 | "text": [ 371 | " 69%|██████▉ | 100/145 [41:03<18:02, 24.06s/it]" 372 | ] 373 | }, 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "{'loss': 0.7663, 'grad_norm': 7.232136249542236, 'learning_rate': 1.5517241379310346e-05, 'epoch': 3.45}\n" 379 | ] 380 | }, 381 | { 382 | "name": "stderr", 383 | "output_type": "stream", 384 | "text": [ 385 | " \n", 386 | " 80%|████████ | 116/145 [47:23<08:50, 18.30s/it]" 387 | ] 388 | }, 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "{'eval_loss': 0.43516409397125244, 'eval_f1': 0.8797674004703547, 'eval_runtime': 3.2975, 'eval_samples_per_second': 30.326, 'eval_steps_per_second': 2.123, 'epoch': 4.0}\n" 394 | ] 395 | }, 396 | { 397 | "name": "stderr", 398 | "output_type": "stream", 399 | "text": [ 400 | " \n", 401 | "100%|██████████| 145/145 [1:00:40<00:00, 19.18s/it]" 402 | ] 403 | }, 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "{'eval_loss': 0.39272159337997437, 'eval_f1': 0.8914389523348718, 'eval_runtime': 3.5564, 'eval_samples_per_second': 28.118, 'eval_steps_per_second': 1.968, 'epoch': 5.0}\n" 409 | ] 410 | }, 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "100%|██████████| 145/145 [1:00:42<00:00, 25.12s/it]\n" 416 | ] 417 | }, 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "{'train_runtime': 3642.7783, 'train_samples_per_second': 1.235, 'train_steps_per_second': 0.04, 'train_loss': 0.535627057634551, 'epoch': 5.0}\n" 423 | ] 424 | }, 425 | { 426 | "name": "stderr", 427 | "output_type": "stream", 428 | "text": [ 429 | "events.out.tfevents.1735555878.Davids-MacBook-Pro.local.23438.0: 100%|██████████| 9.32k/9.32k [00:00<00:00, 55.0kB/s]\n" 430 | ] 431 | }, 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "CommitInfo(commit_url='https://huggingface.co/davidberenstein1957/domain-classifier/commit/915f4b03c230cc8f376f13729728f14347400041', commit_message='End of training', commit_description='', oid='915f4b03c230cc8f376f13729728f14347400041', pr_url=None, repo_url=RepoUrl('https://huggingface.co/davidberenstein1957/domain-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='davidberenstein1957/domain-classifier'), pr_revision=None, pr_num=None)" 436 | ] 437 | }, 438 | "execution_count": 7, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "trainer.train()\n", 445 | "# Save processor and create model card\n", 446 | "tokenizer.save_pretrained(\"ModernBERT-domain-classifier\")\n", 447 | "trainer.create_model_card()\n", 448 | "trainer.push_to_hub()" 449 | ] 450 | }, 451 | { 452 | "cell_type": 
"markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "We get an F1 score of 0.89 on the test set, which is pretty good for the small dataset and time spent." 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "## Run inference\n", 463 | "\n", 464 | "We can now load the model and run inference." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 11, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stderr", 474 | "output_type": "stream", 475 | "text": [ 476 | "Device set to use mps:0\n" 477 | ] 478 | }, 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "[{'label': 'health', 'score': 0.6779336333274841}]" 483 | ] 484 | }, 485 | "execution_count": 11, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "from transformers import pipeline\n", 492 | " \n", 493 | "# load model from huggingface.co/models using our repository id\n", 494 | "classifier = pipeline(\n", 495 | " task=\"text-classification\", \n", 496 | " model=\"argilla/ModernBERT-domain-classifier\", \n", 497 | " device=0,\n", 498 | ")\n", 499 | " \n", 500 | "sample = \"Smoking is bad for your health.\"\n", 501 | " \n", 502 | "classifier(sample)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "## Conclusion\n", 510 | "\n", 511 | "We have shown that we can generate a synthetic dataset from an LLM and finetune a ModernBERT model on it. This the effectiveness of synthetic data and the novel ModernBERT model, which is new and improved version of BERT models, with 8192 token context length, significantly better downstream performance, and much faster processing speeds. \n", 512 | "\n", 513 | "Pretty cool for 20 minutes of generating data, and an hour of fine-tuning on consumer hardware." 514 | ] 515 | } 516 | ], 517 | "metadata": { 518 | "kernelspec": { 519 | "display_name": ".venv", 520 | "language": "python", 521 | "name": "python3" 522 | }, 523 | "language_info": { 524 | "codemirror_mode": { 525 | "name": "ipython", 526 | "version": 3 527 | }, 528 | "file_extension": ".py", 529 | "mimetype": "text/x-python", 530 | "name": "python", 531 | "nbconvert_exporter": "python", 532 | "pygments_lexer": "ipython3", 533 | "version": "3.11.11" 534 | } 535 | }, 536 | "nbformat": 4, 537 | "nbformat_minor": 2 538 | } 539 | -------------------------------------------------------------------------------- /examples/fine-tune-smollm2-on-synthetic-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine-tune a SmolLM on domain-specific synthetic data from a LLM\n", 8 | "\n", 9 | "Yes, smoll models can beat GPT4-like models on domain-specific tasks but don't expect miracles. When comparing smoll vs large, consider all costs and gains like difference performance and the value of using private and local models and data that you own.\n", 10 | "\n", 11 | "The [Hugging Face SmolLM models](https://github.com/huggingface/smollm) are blazingly fast and remarkably powerful. With its 135M, 360M and 1.7B parameter models, it is a great choice for a small and fast model. The great thing about SmolLM is that it is a general-purpose model that can be fine-tuned on domain-specific data.\n", 12 | "\n", 13 | "A lack of domain-specific datasets is a common problem for smaller and more specialized models. 
This is because it is difficult to find a dataset that is both representative and diverse enough for a specific task. We solve this problem by generating a synthetic dataset from an LLM using the `synthetic-data-generator`, which is available as a [Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) or on [GitHub](https://github.com/argilla-io/synthetic-data-generator).\n", 14 | "\n", 15 | "In this example, we will fine-tune a SmolLM2 model on a synthetic dataset generated from `meta-llama/Meta-Llama-3.1-8B-Instruct` with the `synthetic-data-generator`.\n", 16 | "\n", 17 | "## Install the dependencies\n", 18 | "\n", 19 | "We will install some basic dependencies for the fine-tuning with `trl` but we will use the Synthetic Data Generator UI to generate the synthetic dataset." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!pip install transformers datasets trl torch" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## The problem\n", 36 | "\n", 37 | "Reasoning data has proven to make a fundamental difference in the performance of generative models. Reasoning is amazing, but it also means the model becomes more \"chatty\" during the token generation process, causing the model to become slower and more expensive. For this reason, we want to create a model that can reason without being too chatty. Therefore, we will generate a concise reasoning dataset and fine-tune a SmolLM2 model on it.\n", 38 | "\n", 39 | "## Let's generate some data\n", 40 | "\n", 41 | "Let's go to the [hosted Hugging Face Space](https://huggingface.co/spaces/argilla/synthetic-data-generator) to generate the data. This is done in three steps: 1) we come up with a dataset description, 2) iterate on the task configuration, and 3) generate and push the data to Hugging Face. A more detailed flow can be found in [this blog post](https://huggingface.co/blog/synthetic-data-generator). \n", 42 | "\n", 43 | "\n", 49 | "\n", 50 | "For this example, we will generate 5000 chat data examples for a single turn in the conversation. All examples have been generated with a temperature of 1. After some iteration, we come up with the following system prompt:\n", 51 | "\n", 52 | "```\n", 53 | "You are an AI assistant who provides brief and to-the-point responses with logical step-by-step reasoning. Your purpose is to offer straightforward explanations and answers so that you can get to the heart of the issue. Respond with extremely concise, direct justifications and evidence-based conclusions. User questions are direct and concise.\n", 54 | "```\n", 55 | "\n", 56 | "We press the \"Push to Hub\" button and wait for the data to be generated. This takes a few hours and we end up with a dataset with 5000 examples, which is the maximum number of examples we can generate in a single run. You can scale this by deploying a private instance of the Synthetic Data Generator. \n", 57 | "\n", 58 | "\n", 64 | "\n", 65 | "The data is pushed to Argilla too, so we recommend inspecting and validating the data before finetuning the actual model. We applied some basic filters and transformations to the data to make it more suitable for fine-tuning.\n", 66 | "\n", 67 | "## Fine-tune the model\n", 68 | "\n", 69 | "We will use TRL to fine-tune the model. 
It is part of the Hugging Face ecosystem and works seamlessly on top of datasets generated by the synthetic data generator without needing to do any data transformations.\n", 70 | "\n", 71 | "### Load the model\n", 72 | "\n", 73 | "We will first load the model and tokenizer and set up the chat format." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Import necessary libraries\n", 83 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 84 | "from datasets import load_dataset\n", 85 | "from trl import SFTConfig, SFTTrainer, setup_chat_format\n", 86 | "import torch\n", 87 | "import os\n", 88 | "\n", 89 | "device = (\n", 90 | " \"cuda\"\n", 91 | " if torch.cuda.is_available()\n", 92 | " else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n", 93 | ")\n", 94 | "\n", 95 | "# Load the model and tokenizer\n", 96 | "model_name = \"HuggingFaceTB/SmolLM2-360M\"\n", 97 | "model = AutoModelForCausalLM.from_pretrained(\n", 98 | " pretrained_model_name_or_path=model_name\n", 99 | ")\n", 100 | "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)\n", 101 | "\n", 102 | "# Set up the chat format\n", 103 | "model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "### Test the base model\n", 111 | "\n", 112 | "We will first test the base model to see how it performs on the task. During this step we will also generate a prompt for the model to respond to, to see how it performs on the task." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 2, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "Device set to use mps:0\n" 125 | ] 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "[{'generated_text': 'What is the primary function of mitochondria within a cell?\\n\\nMitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe mitochondria are the powerhouses of the cell. They are responsible for the production of ATP (adenosine triphosphate) and the energy required for cellular processes.\\n\\nWhat is the function of the mitochondria in the cell?\\n\\nThe'}]" 131 | ] 132 | }, 133 | "execution_count": 2, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "from transformers import pipeline\n", 140 | "\n", 141 | "prompt = \"What is the primary function of mitochondria within a cell?\"\n", 142 | "\n", 143 | "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=device)\n", 144 | "pipe(prompt, max_new_tokens=100)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Load the dataset\n", 152 | "\n", 153 | "For fine-tuning, we need to load the dataset and tokenize it. We will use the `synthetic-concise-reasoning-sft-filtered` dataset that we generated in the previous step." 
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 2, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stderr", 163 | "output_type": "stream", 164 | "text": [ 165 | "Map: 100%|██████████| 4133/4133 [00:00<00:00, 18478.53 examples/s]\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "from datasets import load_dataset\n", 171 | "\n", 172 | "ds = load_dataset(\"argilla/synthetic-concise-reasoning-sft-filtered\")\n", 173 | "def tokenize_function(examples):\n", 174 | " examples[\"text\"] = tokenizer.apply_chat_template([{\"role\": \"user\", \"content\": examples[\"prompt\"].strip()}, {\"role\": \"assistant\", \"content\": examples[\"completion\"].strip()}], tokenize=False)\n", 175 | " return examples\n", 176 | "ds = ds.map(tokenize_function)\n", 177 | "ds = ds.shuffle()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Fine-tune the model\n", 185 | "\n", 186 | "We will now fine-tune the model. We will use the `SFTTrainer` from the `trl` library to fine-tune the model. We will use a batch size of 4 and a learning rate of 5e-5. We will also use the `use_mps_device` flag to use the MPS device if available." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "os.environ[\"PYTORCH_MPS_HIGH_WATERMARK_RATIO\"] = \"0.0\"\n", 196 | "\n", 197 | "# Configure the SFTTrainer\n", 198 | "sft_config = SFTConfig(\n", 199 | " output_dir=\"./sft_output\",\n", 200 | " num_train_epochs=1,\n", 201 | " per_device_train_batch_size=4, # Set according to your GPU memory capacity\n", 202 | " learning_rate=5e-5, # Common starting point for fine-tuning\n", 203 | " logging_steps=100, # Frequency of logging training metrics\n", 204 | " use_mps_device= True if device == \"mps\" else False,\n", 205 | " hub_model_id=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\", # Set a unique name for your model\n", 206 | " push_to_hub=True,\n", 207 | ")\n", 208 | "\n", 209 | "# Initialize the SFTTrainer\n", 210 | "trainer = SFTTrainer(\n", 211 | " model=model,\n", 212 | " args=sft_config,\n", 213 | " train_dataset=ds[\"train\"],\n", 214 | " tokenizer=tokenizer,\n", 215 | ")\n", 216 | "trainer.train()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "```\n", 224 | "# {'loss': 1.4498, 'grad_norm': 2.3919131755828857, 'learning_rate': 4e-05, 'epoch': 0.1}\n", 225 | "# {'loss': 1.362, 'grad_norm': 1.6650595664978027, 'learning_rate': 3e-05, 'epoch': 0.19}\n", 226 | "# {'loss': 1.3778, 'grad_norm': 1.4778285026550293, 'learning_rate': 2e-05, 'epoch': 0.29}\n", 227 | "# {'loss': 1.3735, 'grad_norm': 2.1424977779388428, 'learning_rate': 1e-05, 'epoch': 0.39}\n", 228 | "# {'loss': 1.3512, 'grad_norm': 2.3498542308807373, 'learning_rate': 0.0, 'epoch': 0.48}\n", 229 | "# {'train_runtime': 1911.514, 'train_samples_per_second': 1.046, 'train_steps_per_second': 0.262, 'train_loss': 1.3828572998046875, 'epoch': 0.48}\n", 230 | "```\n", 231 | "\n", 232 | "For the example, we did not use a specific validation set but we can see the loss is decreasing, so we assume the model is generalsing well to the training data. 
To get a better understanding of the model's performance, let's test it again with the same prompt.\n", 233 | "\n", 234 | "### Run inference\n", 235 | "\n", 236 | "We can now run inference with [the fine-tuned model](https://huggingface.co/argilla/SmolLM2-360M-synthetic-concise-reasoning/blob/main/README.md)." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 12, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stderr", 246 | "output_type": "stream", 247 | "text": [ 248 | "Device set to use mps\n" 249 | ] 250 | }, 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "'The primary function of mitochondria is to generate energy for the cell. They are organelles found in eukaryotic cells that convert nutrients into ATP (adenosine triphosphate), which is the primary source of energy for cellular processes.\\nMitochondria are responsible for:\\n\\nEnergy production: Mitochondria produce ATP through a process called oxidative phosphorylation, which involves the transfer of electrons from food molecules to oxygen.\\nEnergy storage: Mitochondria store energy in the form of adenosine triphosphate (ATP), which is used by the cell for various cellular processes.\\nCellular respiration: Mitochondria also participate in cellular respiration, a'" 255 | ] 256 | }, 257 | "execution_count": 12, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "prompt = \"What is the primary function of mitochondria within a cell?\"\n", 264 | "\n", 265 | "generator = pipeline(\n", 266 | " \"text-generation\",\n", 267 | " model=\"argilla/SmolLM2-360M-synthetic-concise-reasoning\",\n", 268 | " device=\"mps\",\n", 269 | ")\n", 270 | "generator(\n", 271 | " [{\"role\": \"user\", \"content\": prompt}], max_new_tokens=128, return_full_text=False\n", 272 | ")[0][\"generated_text\"]" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "## Conclusion\n", 280 | "\n", 281 | "We have fine-tuned a SmolLM2 model on a synthetic dataset generated from a large language model. We have seen that the model performs well on the task and that the synthetic data is a great way to generate diverse and representative data for supervised fine-tuning. 
\n", 282 | "\n", 283 | "In practice, you would likely want to spend more time on the data quality and fine-tuning the model but the flow shows the Synthetic Data Generator is a great tool to generate synthetic data for any task.\n", 284 | "\n", 285 | "Overall, I think it is pretty cool for a couple of hours of generation and fine-tuning on consumer hardware.\n" 286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": ".venv", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.11.9" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /examples/hf-dedicated-or-tgi-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 12 | os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/" # dedicated endpoint/TGI 13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template 14 | os.environ["TOKENIZER_ID"] = ( 15 | "meta-llama/Llama-3.1-8B-Instruct" # tokenizer for model hosted on endpoint 16 | ) 17 | os.environ["MODEL"] = None # model is linked to endpoint 18 | 19 | launch() 20 | -------------------------------------------------------------------------------- /examples/hf-serverless-deployment-deepseek.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 12 | os.environ["MODEL"] = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # use model for instructions 13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|begin▁of▁sentence|>User: " # use the custom template for the model 14 | 15 | 16 | launch() 17 | -------------------------------------------------------------------------------- /examples/hf-serverless-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." 
# push the data to huggingface 12 | os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for generation 13 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model 14 | 15 | launch() 16 | -------------------------------------------------------------------------------- /examples/hf-serverless-different-model-for-completion.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | import os 8 | 9 | from synthetic_dataset_generator import launch 10 | 11 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 12 | os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use model for instruction generation 13 | os.environ["MODEL_COMPLETION"] = "meta-llama/Llama-3.1-70B-Instruct" # use model for completion generation 14 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model 15 | 16 | launch() 17 | -------------------------------------------------------------------------------- /examples/ollama-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | # ollama serve 8 | # ollama run qwen2.5:32b-instruct-q5_K_S 9 | import os 10 | 11 | from synthetic_dataset_generator import launch 12 | 13 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 14 | os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/" # ollama base url 15 | os.environ["MODEL"] = "qwen2.5:32b-instruct-q5_K_S" # model id 16 | os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-32B-Instruct" # tokenizer id 17 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2" 18 | os.environ["MAX_NUM_ROWS"] = "10000" 19 | os.environ["DEFAULT_BATCH_SIZE"] = "2" 20 | os.environ["MAX_NUM_TOKENS"] = "1024" 21 | 22 | launch() 23 | -------------------------------------------------------------------------------- /examples/ollama-different-model-for-completion.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | # ollama serve 8 | # ollama run llama3.2 9 | # ollama run llama3.2:1b 10 | import os 11 | 12 | from synthetic_dataset_generator import launch 13 | 14 | os.environ["OLLAMA_BASE_URL"] = ( 15 | "http://127.0.0.1:11434/" # in this case, the same base url for both models 16 | ) 17 | 18 | os.environ["MODEL"] = "llama3.2" # model for instruction generation 19 | os.environ["MODEL_COMPLETION"] = "llama3.2:1b" # model for completion generation 20 | 21 | os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-3B-Instruct" # tokenizer for instruction generation 22 | os.environ["TOKENIZER_ID_COMPLETION"] = "meta-llama/Llama-3.2-1B-Instruct" # tokenizer for completion generation 23 | 24 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template required for instruction generation 25 | 26 | launch() 27 | -------------------------------------------------------------------------------- /examples/openai-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | 8 | import os 9 | 10 | from 
synthetic_dataset_generator import launch 11 | 12 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 13 | os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/" # openai base url 14 | os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY") # openai api key 15 | os.environ["MODEL"] = "gpt-4o" # model id 16 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = None # chat data not supported with OpenAI 17 | 18 | launch() 19 | -------------------------------------------------------------------------------- /examples/vllm-deployment.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.11,<3.12" 3 | # dependencies = [ 4 | # "synthetic-dataset-generator", 5 | # ] 6 | # /// 7 | # vllm serve Qwen/Qwen2.5-1.5B-Instruct 8 | import os 9 | 10 | from synthetic_dataset_generator import launch 11 | 12 | os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface 13 | os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/" # vllm base url 14 | os.environ["MODEL"] = "Qwen/Qwen2.5-1.5B-Instruct" # model id 15 | os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-1.5B-Instruct" # tokenizer id 16 | os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2" 17 | os.environ["MAX_NUM_ROWS"] = "10000" 18 | os.environ["DEFAULT_BATCH_SIZE"] = "2" 19 | os.environ["MAX_NUM_TOKENS"] = "1024" 20 | 21 | launch() 22 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | poppler-utils 2 | tesseract-ocr -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "synthetic-dataset-generator" 3 | version = "0.2.0" 4 | description = "Build datasets using natural language" 5 | authors = [ 6 | {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"}, 7 | ] 8 | keywords = [ 9 | "gradio", 10 | "synthetic-data", 11 | "huggingface", 12 | "argilla", 13 | "generative-ai", 14 | "ai", 15 | ] 16 | requires-python = "<3.13,>=3.10" 17 | readme = "README.md" 18 | license = {text = "Apache 2"} 19 | 20 | dependencies = [ 21 | "argilla>=2.4.0,<3.0.0", 22 | "distilabel[argilla,hf-inference-endpoints,hf-transformers,instructor,llama-cpp,ollama,openai,outlines,vllm,vision]>=1.5.0,<2.00", 23 | "gradio[oauth]>=5.4.0,<6.0.0", 24 | "gradio-huggingfacehub-search>=0.0.12,<1.0.0", 25 | "huggingface-hub>=0.26.0,<0.28.0", 26 | "model2vec>=0.2.4,<1.0.0", 27 | "nltk>=3.9.1,<4.0.0", 28 | "pydantic>=2.10.5,<3.0.0", 29 | "sentence-transformers>=3.2.0,<4.0.0", 30 | "transformers>=4.44.2,<5.0.0", 31 | "unstructured[md,pdf,docx]>=0.16.3,<1.0.0", 32 | "setuptools", 33 | ] 34 | 35 | [build-system] 36 | requires = ["pdm-backend"] 37 | build-backend = "pdm.backend" 38 | 39 | [tool.pdm] 40 | distribution = true 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from gradio import TabbedInterface 3 | 4 | from synthetic_dataset_generator import ( # noqa 5 | 
_distiset, 6 | _inference_endpoints, 7 | ) 8 | 9 | def launch(*args, **kwargs): 10 | """Launch the synthetic dataset generator. 11 | Based on the `TabbedInterface` from Gradio. 12 | Parameters: https://www.gradio.app/docs/gradio/tabbedinterface 13 | """ 14 | from synthetic_dataset_generator.app import demo 15 | return demo.launch(*args, server_name="0.0.0.0", **kwargs) 16 | 17 | 18 | launch.__doc__ = TabbedInterface.launch.__doc__ 19 | launch.__signature__ = inspect.signature(TabbedInterface.launch) 20 | launch.__annotations__ = TabbedInterface.launch.__annotations__ 21 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from synthetic_dataset_generator import launch 3 | 4 | launch() 5 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/_distiset.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import distilabel 4 | import distilabel.distiset 5 | import gradio as gr 6 | from distilabel.utils.card.dataset_card import ( 7 | DistilabelDatasetCard, 8 | size_categories_parser, 9 | ) 10 | from huggingface_hub import DatasetCardData, HfApi 11 | 12 | 13 | class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset): 14 | def _generate_card( 15 | self, 16 | repo_id: str, 17 | token: str, 18 | include_script: bool = False, 19 | filename_py: Optional[str] = None, 20 | ) -> None: 21 | """Generates a dataset card and pushes it to the Hugging Face Hub, and 22 | if the `pipeline.yaml` path is available in the `Distiset`, uploads that 23 | to the same repository. 24 | 25 | Args: 26 | repo_id: The ID of the repository to push to, from the `push_to_hub` method. 27 | token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method. 28 | include_script: Whether to upload the script to the hugging face repository. 29 | filename_py: The name of the script. If `include_script` is True, the script will 30 | be uploaded to the repository using this name, otherwise it won't be used. 31 | """ 32 | card = self._get_card( 33 | repo_id=repo_id, 34 | token=token, 35 | include_script=include_script, 36 | filename_py=filename_py, 37 | ) 38 | 39 | card.push_to_hub( 40 | repo_id, 41 | repo_type="dataset", 42 | token=token, 43 | ) 44 | if self.pipeline_path: 45 | # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well. 46 | HfApi().upload_file( 47 | path_or_fileobj=self.pipeline_path, 48 | path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME, 49 | repo_id=repo_id, 50 | repo_type="dataset", 51 | token=token, 52 | ) 53 | 54 | def _get_card( 55 | self, 56 | repo_id: str, 57 | token: Optional[str] = None, 58 | include_script: bool = False, 59 | filename_py: Optional[str] = None, 60 | ) -> DistilabelDatasetCard: 61 | """Generates the dataset card for the `Distiset`. 62 | 63 | Note: 64 | If `repo_id` and `token` are provided, it will extract the metadata from the README.md file 65 | on the hub. 66 | 67 | Args: 68 | repo_id: Name of the repository to push to, or the path for the distiset if saved to disk. 69 | token: The token to authenticate with the Hugging Face Hub. 70 | We assume that if it's provided, the dataset will be in the Hugging Face Hub, 71 | so the README metadata will be extracted from there. 
72 | include_script: Whether to upload the script to the hugging face repository. 73 | filename_py: The name of the script. If `include_script` is True, the script will 74 | be uploaded to the repository using this name, otherwise it won't be used. 75 | 76 | Returns: 77 | The dataset card for the `Distiset`. 78 | """ 79 | sample_records = {} 80 | for name, dataset in self.items(): 81 | sample_records[name] = ( 82 | dataset[0] if not isinstance(dataset, dict) else dataset["train"][0] 83 | ) 84 | 85 | columns = self["default"].column_names 86 | columns = self["default"].column_names 87 | 88 | if ("label" in columns and "text" in columns) or ( 89 | "labels" in columns and "text" in columns 90 | ): 91 | task_categories = ["text-classification"] 92 | elif ("prompt" in columns and "completion" in columns) or ( 93 | "messages" in columns 94 | ): 95 | task_categories: list[str] = [ 96 | "text-generation", 97 | "text2text-generation", 98 | "question-answering", 99 | ] 100 | elif "context" in columns and "question" in columns and "response" in columns: 101 | task_categories: list[str] = [ 102 | "text-generation", 103 | "text2text-generation", 104 | "text-retrieval", 105 | "question-answering" 106 | ] 107 | if ( 108 | "positive_retrieval" in columns and "negative_retrieval" in columns 109 | ) or ("positive_reranking" in columns and "negative_reranking" in columns): 110 | task_categories.append("sentence-similarity") 111 | else: 112 | task_categories: list[str] = [] 113 | gr.Info( 114 | f"No task categories found for dataset with columns: {columns}. " 115 | "Please notify the distilabel team if you think this is an error." 116 | ) 117 | 118 | readme_metadata = {} 119 | if repo_id and token: 120 | readme_metadata = self._extract_readme_metadata(repo_id, token) 121 | 122 | metadata = { 123 | **readme_metadata, 124 | "size_categories": size_categories_parser( 125 | max(len(dataset) for dataset in self.values()) 126 | ), 127 | "task_categories": task_categories, 128 | "tags": [ 129 | "synthetic", 130 | "distilabel", 131 | "rlaif", 132 | "datacraft", 133 | ], 134 | } 135 | 136 | card = DistilabelDatasetCard.from_template( 137 | card_data=DatasetCardData(**metadata), 138 | repo_id=repo_id, 139 | sample_records=sample_records, 140 | include_script=include_script, 141 | filename_py=filename_py, 142 | references=self.citations, 143 | ) 144 | 145 | return card 146 | 147 | 148 | distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag 149 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/_inference_endpoints.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import distilabel 4 | import distilabel.distiset 5 | from distilabel.models import InferenceEndpointsLLM 6 | from pydantic import ( 7 | ValidationError, 8 | model_validator, 9 | ) 10 | 11 | 12 | class CustomInferenceEndpointsLLM(InferenceEndpointsLLM): 13 | @model_validator(mode="after") # type: ignore 14 | def only_one_of_model_id_endpoint_name_or_base_url_provided( 15 | self, 16 | ) -> "InferenceEndpointsLLM": 17 | """Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also 18 | provided, a warning will be shown informing the user that the provided `base_url` will be ignored in 19 | favour of the dynamically calculated one..""" 20 | 21 | if self.base_url and (self.model_id or self.endpoint_name): 22 | warnings.warn( # type: ignore 23 | f"Since the `base_url={self.base_url}` is available 
and either one of `model_id`" 24 | " or `endpoint_name` is also provided, the `base_url` will either be ignored" 25 | " or overwritten with the one generated from either of those args, for serverless" 26 | " or dedicated inference endpoints, respectively." 27 | ) 28 | 29 | if self.use_magpie_template and self.tokenizer_id is None: 30 | raise ValueError( 31 | "`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please," 32 | " set a `tokenizer_id` and try again." 33 | ) 34 | 35 | if ( 36 | self.model_id 37 | and self.tokenizer_id is None 38 | and self.structured_output is not None 39 | ): 40 | self.tokenizer_id = self.model_id 41 | 42 | if self.base_url and not (self.model_id or self.endpoint_name): 43 | return self 44 | 45 | if self.model_id and not self.endpoint_name: 46 | return self 47 | 48 | if self.endpoint_name and not self.model_id: 49 | return self 50 | 51 | raise ValidationError( 52 | f"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is" 53 | f" provided too, it will be overwritten instead. Found `model_id`={self.model_id}," 54 | f" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}." 55 | ) 56 | 57 | 58 | distilabel.models.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM 59 | -------------------------------------------------------------------------------- /src/synthetic_dataset_generator/_tabbedinterface.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines two useful high-level abstractions to build Gradio apps: Interface and TabbedInterface. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from collections.abc import Sequence 8 | 9 | import gradio as gr 10 | from gradio.blocks import Blocks 11 | from gradio.layouts import Tab, Tabs 12 | from gradio.themes import ThemeClass as Theme 13 | from gradio_client.documentation import document 14 | 15 | 16 | @document() 17 | class TabbedInterface(Blocks): 18 | """ 19 | A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets 20 | rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab. 21 | Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded. 22 | 23 | Demos: tabbed_interface_lite 24 | """ 25 | 26 | def __init__( 27 | self, 28 | interface_list: Sequence[Blocks], 29 | tab_names: list[str] | None = None, 30 | title: str | None = None, 31 | theme: Theme | str | None = None, 32 | analytics_enabled: bool | None = None, 33 | css: str | None = None, 34 | js: str | None = None, 35 | head: str | None = None, 36 | ): 37 | """ 38 | Parameters: 39 | interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs. 40 | tab_names: A list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc. 41 | title: The tab title to display when this demo is opened in a browser window. 42 | theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme. 43 | analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True. 44 | css: Custom css as a string or path to a css file. This css will be included in the demo webpage. 45 | js: Custom js as a string or path to a js file. 
The custom js should in the form of a single js function. This function will automatically be executed when the page loads. For more flexibility, use the head parameter to insert js inside