├── .github └── workflows │ ├── deploy.yaml │ └── test.yaml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── README.md ├── push_to_hf.py ├── requirements.txt ├── tiny_programs.py └── train_model.py ├── fastdata ├── __init__.py ├── _modidx.py └── core.py ├── nbs ├── 00_core.ipynb ├── _quarto.yml ├── index.ipynb ├── nbdev.yml ├── sidebar.yml └── styles.css ├── pyproject.toml ├── settings.ini └── setup.py /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | permissions: 4 | contents: write 5 | pages: write 6 | 7 | on: 8 | push: 9 | branches: [ "main", "master" ] 10 | workflow_dispatch: 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | steps: [uses: fastai/workflows/quarto-ghp@master] 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [workflow_dispatch, pull_request, push] 3 | 4 | jobs: 5 | test: 6 | runs-on: ubuntu-latest 7 | steps: [uses: fastai/workflows/nbdev-ci@master] 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _docs/ 2 | _proc/ 3 | 4 | *.bak 5 | .gitattributes 6 | .last_checked 7 | .gitconfig 8 | *.bak 9 | *.log 10 | *~ 11 | ~* 12 | _tmp* 13 | tmp* 14 | tags 15 | *.pkg 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | env/ 28 | build/ 29 | conda/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | 45 | # PyInstaller 46 | 
# Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | .hypothesis/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # dotenv 100 | .env 101 | 102 | # virtualenv 103 | .venv 104 | venv/ 105 | ENV/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | 120 | .vscode 121 | *.swp 122 | 123 | # osx generated files 124 | .DS_Store 125 | .DS_Store? 
126 | .Trashes 127 | ehthumbs.db 128 | Thumbs.db 129 | .idea 130 | 131 | # pytest 132 | .pytest_cache 133 | 134 | # tools/trust-doc-nbs 135 | docs_src/.last_checked 136 | 137 | # symlinks to fastai 138 | docs_src/fastai 139 | tools/fastai 140 | 141 | # link checker 142 | checklink/cookies.txt 143 | 144 | # .gitconfig is now autogenerated 145 | .gitconfig 146 | 147 | # Quarto installer 148 | .deb 149 | .pkg 150 | 151 | # Quarto 152 | .quarto 153 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Release notes 2 | 3 | 4 | 5 | ## 0.0.4 6 | 7 | ### Features Added 8 | 9 | - Added `generate_to_hf` which automatically saves generations to Hugging Face dataset repository ([#9](https://github.com/AnswerDotAI/fastdata/pull/9)) 10 | 11 | ### Bugs Squashed 12 | 13 | - When using max_workers > 1, the order of the returned synthetic data is not the same as the order of inputs given ([#7](https://github.com/AnswerDotAI/fastdata/issues/7)) 14 | 15 | 16 | ## 0.0.3 17 | - Update readme 18 | 19 | 20 | 21 | ## 0.0.1 22 | 23 | - initial release 24 | 25 | 26 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Make sure you have read the [doc on code style]( 4 | https://docs.fast.ai/dev/style.html) first. (Note that we don't follow PEP8, but instead follow a coding style designed specifically for numerical and interactive programming.) 5 | 6 | This project uses [nbdev](https://nbdev.fast.ai/getting_started.html) for development. Before beginning, make sure that nbdev and a jupyter-compatible client such as jupyterlab or nbclassic are installed. To make changes to the codebase, update the notebooks in the `nbs` folder, not the .py files directly. Then, run `nbdev_export`.
For more details, have a look at the [nbdev tutorial](https://nbdev.fast.ai/tutorials/tutorial.html). 7 | 8 | You may want to set up a `prep` alias in `~/.zshrc` or other shell startup file: 9 | 10 | ```sh 11 | alias prep='nbdev_export && nbdev_clean && nbdev_trust' 12 | ``` 13 | 14 | Run `prep` before each commit to ensure your python files are up to date, and your notebooks cleaned of metadata and notarized. 15 | 16 | ## Updating README.md 17 | 18 | Similar to updating Python source code files, to update the `README.md` file you will need to edit a notebook file, specifically `nbs/index.ipynb`. 19 | 20 | However, there are a couple of extra dependencies that you need to install first in order to make this work properly. Go to the directory you cloned the repo to, and type: 21 | 22 | ``` 23 | pip install -e '.[dev]' 24 | ``` 25 | 26 | And install quarto too: 27 | 28 | ``` 29 | nbdev_install_quarto 30 | ``` 31 | 32 | Then, after you make subsequent changes to `nbs/index.ipynb`, run the following from the repo's root directory to (re)build `README.md`: 33 | 34 | ``` 35 | nbdev_readme 36 | ``` 37 | 38 | ## Did you find a bug? 39 | 40 | * Ensure the bug was not already reported by searching on GitHub under Issues. 41 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring. 42 | * Be sure to add the complete error messages. 43 | 44 | ### Did you write a patch that fixes a bug? 45 | 46 | * Open a new GitHub pull request with the patch. 47 | * Ensure that your PR includes a test that fails without your patch, and passes with it. 48 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 49 | 50 | ## PR submission guidelines 51 | 52 | * Keep each PR focused.
While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needed to keep each PR focused. 53 | * Do not mix style changes/fixes with "functional" changes. It's very difficult to review such PRs and will most likely get rejected. 54 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can. 55 | * Do not turn an already-submitted PR into your development playground. If after you submit a PR, you discover that more work is needed: close the PR, do the required work, and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project. 56 | * If, however, you submit a PR and receive a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exception case where you realize it'll take many many commits to complete the requests, then it's probably best to close the PR, do the work, and then submit it again. Use common sense where you'd choose one way over another. 57 | 58 | ## Do you want to contribute to the documentation? 59 | 60 | * Docs are automatically created from the notebooks in the nbs folder. 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022, fastai 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastdata 2 | 3 | 4 | 5 | 6 | `fastdata` is a minimal library for generating synthetic data for 7 | training deep learning models. For example, below is how you can 8 | generate a dataset to train a language model to translate from English 9 | to Spanish. 10 | 11 | First you need to define the structure of the data you want to generate. 12 | `claudette`, which is the library that fastdata uses to generate data, 13 | requires you to define the schema of the data you want to generate. 
14 | 15 | ``` python 16 | from fastcore.utils import * 17 | ``` 18 | 19 | ``` python 20 | class Translation(): 21 | "Translation from an English phrase to a Spanish phrase" 22 | def __init__(self, english: str, spanish: str): store_attr() 23 | def __repr__(self): return f"{self.english} ➡ *{self.spanish}*" 24 | 25 | Translation("Hello, how are you today?", "Hola, ¿cómo estás hoy?") 26 | ``` 27 | 28 | Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?* 29 | 30 | Next, you need to define the prompt that will be used to generate the 31 | data and any inputs you want to pass to the prompt. 32 | 33 | ``` python 34 | prompt_template = """\ 35 | Generate English and Spanish translations on the following topic: 36 | {topic} 37 | """ 38 | 39 | inputs = [{"topic": "Otters are cute"}, {"topic": "I love programming"}] 40 | ``` 41 | 42 | Finally, we can generate some data with fastdata. 43 | 44 | > [!NOTE] 45 | > 46 | > We only support Anthropic models at the moment. Therefore, make sure 47 | > you have an API key for the model you want to use and the proper 48 | > environment variables set or pass the api key to the 49 | > [`FastData`](https://AnswerDotAI.github.io/fastdata/core.html#fastdata) 50 | > class `FastData(api_key="sk-ant-api03-...")`. 
51 | 52 | ``` python 53 | from fastdata.core import FastData 54 | ``` 55 | 56 | ``` python 57 | fast_data = FastData(model="claude-3-haiku-20240307") 58 | translations = fast_data.generate( 59 | prompt_template=prompt_template, 60 | inputs=inputs, 61 | schema=Translation, 62 | ) 63 | ``` 64 | 65 | 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.57it/s] 66 | 67 | ``` python 68 | from IPython.display import Markdown 69 | ``` 70 | 71 | ``` python 72 | Markdown("\n".join(f'- {t}' for t in translations)) 73 | ``` 74 | 75 | - I love programming ➡ *Me encanta la programación* 76 | - Otters are cute ➡ *Las nutrias son lindas* 77 | 78 | ### Installation 79 | 80 | Install latest from the GitHub 81 | [repository](https://github.com/AnswerDotAI/fastdata): 82 | 83 | ``` sh 84 | $ pip install git+https://github.com/AnswerDotAI/fastdata.git 85 | ``` 86 | 87 | or from [pypi](https://pypi.org/project/fastdata/) 88 | 89 | ``` sh 90 | $ pip install python-fastdata 91 | ``` 92 | 93 | If you’d like to see how best to generate data with fastdata, check out 94 | our blog post [here](https://www.answer.ai/blog/introducing-fastdata) 95 | and some of the examples in the 96 | [examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples) 97 | directory. 98 | 99 | ## Developer Guide 100 | 101 | If you are new to using `nbdev` here are some useful pointers to get you 102 | started. 103 | 104 | ### Install fastdata in Development mode 105 | 106 | ``` sh 107 | # make sure fastdata package is installed in development mode 108 | $ pip install -e . 109 | 110 | # make changes under nbs/ directory 111 | # ... 
112 | 113 | # compile to have changes apply to fastdata 114 | $ nbdev_prepare 115 | ``` 116 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # TinyPrograms 2 | 3 | This experiment was inspired by the paper [TinyStories: How Small Can Language Models Be and Still Speak Coherent English?](https://arxiv.org/abs/2305.07759). The authors used OpenAI GPT models to generate simple, synthetic children's books. They showed that training on this data allowed extremely tiny models (~10M parameters) to outperform 10x larger models. TinyPrograms is the coding equivalent. It has ~1,000 tiny python programs generated by Anthropic's Haiku model. I decided to deviate from the original paper by finetuning a model rather than train from scratch since I had such a small dataset. I used the following definition of what a tiny program is when using `fastdata` to generate them: 4 | 5 | ```python 6 | class TinyProgram(BasicRepr): 7 | """ 8 | A tiny program that is a valid python program that satisfies the requirements. 9 | """ 10 | def __init__( 11 | self, 12 | requirements: str, # A description of the requirements for the program to help the persona. 13 | code: str, # The code that satisfies the requirements. 14 | ): store_attr() 15 | ``` 16 | 17 | I also followed the advice in the blog post that accompanies this experiment, "How To ~~Train~~ Synthesize Your ~~Dragon~~ Data." It includes examples of the tiny programs I want. I also used the recently released PersonaHub to create a diverse set of tiny programs.
Here is what the prompt looks like: 18 | 19 | ```python 20 | prompt_template = """\ 21 | 22 | {examples} 23 | {persona} 27 | """ 28 | ``` 29 | 30 | Here is one of the programs that Haiku created: 31 | 32 | #### Persona 33 | 34 | ``` 35 | A striker who admires the player's defensive prowess and values their support on the field 36 | ``` 37 | 38 | #### Requirements 39 | 40 | ``` 41 | A Python program that simulates a soccer game between a striker and a defensive player. The striker admires the defensive player's prowess and values their support on the field. The program should model the interactions between the striker and the defensive player, demonstrating how the defensive player's support enables the striker to be more effective in scoring goals. 42 | ``` 43 | 44 | #### Program 45 | 46 | ```python 47 | import random 48 | 49 | class SoccerPlayer: 50 | def init(self, name, position): 51 | self.name = name 52 | self.position = position 53 | self.skill_level = random.randint(1, 10) 54 | 55 | class Striker(SoccerPlayer): 56 | def init(self, name): 57 | super().__init__(name, "Striker") 58 | self.scoring_chance = 0.5 59 | 60 | def shoot(self, defender): 61 | if defender.skill_level >= self.skill_level: 62 | print(f"{self.name} shoots, but {defender.name} blocks the shot!") 63 | else: 64 | if random.random() < self.scoring_chance: 65 | print(f"{self.name} scores a goal!") 66 | else: 67 | print(f"{self.name} misses the shot.") 68 | 69 | class Defender(SoccerPlayer): 70 | def init(self, name): 71 | super().__init__(name, "Defender") 72 | self.support_level = 7 73 | 74 | def support(self, striker): 75 | striker.scoring_chance += self.support_level / 100 76 | print(f"{self.name} provides support to {striker.name}, increasing their scoring chance.") 77 | 78 | def simulate_game(): 79 | striker = Striker("Alex") 80 | defender = Defender("Sarah") 81 | print(f"{striker.name} (Striker) vs. 
{defender.name} (Defender)") 82 | 83 | for _ in range(5): 84 | defender.support(striker) 85 | striker.shoot(defender) 86 | print() 87 | 88 | simulate_game() 89 | ``` 90 | 91 | ### The Experiment 92 | 93 | I took TinyPrograms and tried to finetune a strong LLM model to see if I could improve its coding ability. I used Huggingface's awesome SmolLM-360M. It's small and works well on coding tasks. Out of the box, SmolLM-360M scores 11.6% on a popular coding test called HumanEval. HumanEval is a popular coding test. I created 5 configurations of datasets to test which improves my model the most: 94 | 95 | 1. The first one is simply the 992 tiny Python programs. 96 | 2. The second is 992 Python files that have been taken from the popular Stack dataset. 97 | 3. The third is a high-quality, filtered version of the tiny Python programs. It uses an LLM to score the programs based on a rubric. 98 | 4. The fourth is the same as the third, but on the Python files taken from the Stack. 99 | 5. Finally, the fifth mixes half of the high quality filtered tiny Python programs and the high quality filtered Python files from the Stack. 100 | 101 | ## Filtering for Quality 102 | 103 | To filter the tiny programs, I used `fastdata` 104 | 105 | ```python 106 | class TinyProgramCritique(BasicRepr): 107 | """ 108 | A critique of a tiny program. 109 | """ 110 | def __init__( 111 | self, 112 | critique: str, # A critique of the code. 113 | score: Literal[1, 2, 3, 4, 5], # A score of the code from 1 to 5. 114 | ): store_attr() 115 | ``` 116 | 117 | And here is the prompt I used to guide to model to generating a score: 118 | 119 | ```python 120 | critique_template = """\ 121 | Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. 
Points are accumulated based on the satisfaction of each criterion: 122 | 123 | - Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language. 124 | - Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices. 125 | - Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand. 126 | - Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically. 127 | - Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points. 128 | 129 | The code snippet: 130 | 131 | {code} 132 | 133 | 134 | After examining the code: 135 | - Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners. 136 | - Conclude with the score. 
137 | """ 138 | ``` 139 | 140 | This is the distribution of the scores for the 992 tiny Python programs: 141 | 142 | | Score | Count | 143 | |-------|-------| 144 | | 1 | 25 | 145 | | 2 | 117 | 146 | | 3 | 96 | 147 | | 4 | 256 | 148 | | 5 | 498 | 149 | 150 | And here is the same for 10,000 of the Python files: 151 | 152 | | Score | Count | 153 | |-------|-------| 154 | | 1 | 2239 | 155 | | 2 | 5230 | 156 | | 3 | 1545 | 157 | | 4 | 618 | 158 | | 5 | 236 | 159 | 160 | I only kept a score of 4 and 5 as high quality data for both the tiny python programs and python files from the Stack. 161 | 162 | ### Results 163 | 164 | | Setup | pass@1 | 165 | |---------|--------| 166 | | Baseline | 11.6% | 167 | | TinyPrograms | 09.1% | 168 | | The Stack | 11.0% | 169 | | TinyPrograms Filtered | 12.2% | 170 | | The Stack Filtered | 08.5% | 171 | | Mixed Filtered | 09.8% | 172 | 173 | ### Key findings from the experiment: 174 | 175 | 1. Training on synthetic data is better than training on random GitHub programs when performing quality filtering, i.e., TinyPrograms Filtered vs The Stack Filtered. 176 | 2. Only high-quality synthetic data (TinyPrograms Filtered) improve performance over the baseline. 177 | 3. All other setups degrade performance. High-quality Python files from the Stack show the biggest drop. This warrants further investigation. Possible explanations include: 178 | - The scoring system may not be as effective for GitHub programs as it is for synthetic ones. 179 | - There might be a lack of diversity in the GitHub programs. 180 | 181 | For further exploration, I encourage you to: 182 | 1. Replicate this experiment with your own task. 183 | 2. Experiment with larger datasets to see how they affect model performance. 184 | 3. Share your findings with the community and reach out if you need help! 185 | 186 | To do this yourself, follow the rest of this README. It shows how to reproduce my results and serves as a starting point for your project. 
187 | 188 | ## Install 189 | 190 | Make sure you have installed `fastdata` with the following command from the root of the repo: 191 | 192 | ```bash 193 | pip install -e . 194 | ``` 195 | 196 | If you want to train a model, install the following dependencies in the `examples` folder: 197 | 198 | ```bash 199 | pip install -r requirements.txt 200 | ``` 201 | 202 | Then run the following if you will use flash attention: 203 | 204 | ```bash 205 | pip install flash-attn --no-build-isolation 206 | ``` 207 | 208 | ## Run 209 | 210 | ### Data Synthesis 211 | 212 | We have a script to generate our tiny programs dataset. It can be run with this command: 213 | 214 | ```bash 215 | python tiny_programs.py 216 | ``` 217 | 218 | You can see all the command-line arguments by running: 219 | 220 | ```bash 221 | python tiny_programs.py --help 222 | ``` 223 | 224 | ### Training 225 | 226 | To train a model, you can use the following command: 227 | 228 | ```bash 229 | python train.py 230 | ``` 231 | 232 | You can view all the command-line arguments by executing the following command: 233 | 234 | ```bash 235 | python train.py --help 236 | ``` 237 | -------------------------------------------------------------------------------- /examples/push_to_hf.py: -------------------------------------------------------------------------------- 1 | from fastcore.utils import * 2 | from fastdata.core import FastData 3 | 4 | 5 | class Translation: 6 | "Translation from an English phrase to a Spanish phrase" 7 | 8 | def __init__(self, english: str, spanish: str): 9 | self.english = english 10 | self.spanish = spanish 11 | 12 | def __repr__(self): 13 | return f"{self.english} ➡ *{self.spanish}*" 14 | 15 | 16 | prompt_template = """\ 17 | Generate English and Spanish translations on the following topic: 18 | {topic} 19 | """ 20 | 21 | inputs = [ 22 | {"topic": "I am going to the beach this weekend"}, 23 | {"topic": "I am going to the gym after work"}, 24 | {"topic": "I am going to the park with my 
kids"}, 25 | {"topic": "I am going to the movies with my friends"}, 26 | {"topic": "I am going to the store to buy some groceries"}, 27 | {"topic": "I am going to the library to read some books"}, 28 | {"topic": "I am going to the zoo to see the animals"}, 29 | {"topic": "I am going to the museum to see the art"}, 30 | {"topic": "I am going to the restaurant to eat some food"}, 31 | ] 32 | 33 | fast_data = FastData(model="claude-3-haiku-20240307") 34 | dataset_name = "my_dataset" 35 | 36 | repo_id, translations = fast_data.generate_to_hf( 37 | prompt_template=prompt_template, 38 | inputs=inputs, 39 | schema=Translation, 40 | repo_id=dataset_name, 41 | max_items_per_file=4, 42 | ) 43 | print(f"A new repository has been create on {repo_id}") 44 | print(translations) 45 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | minai 3 | torcheval -------------------------------------------------------------------------------- /examples/tiny_programs.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset, load_dataset 2 | from fastdata.core import FastData 3 | from fastcore.script import * 4 | from fastcore.utils import * 5 | 6 | class TinyProgram(BasicRepr): 7 | "A tiny program that is a valid python program that satisfies the requirements." 8 | def __init__(self,requirements: str,code: str): store_attr() 9 | 10 | examples = [ 11 | TinyProgram( 12 | requirements="A Python-based data aggregation and analysis tool that scrapes key Salvadoran news websites and government portals for the latest political updates, election results, and policy changes. The program would use standard libraries like requests for web scraping, re for text parsing, and pandas for data manipulation. 
It would store the collected information in a structured format, perform basic sentiment analysis on news articles, and generate a daily summary report highlighting significant political events, trending topics, and shifts in public opinion. The tool could also track mentions of key political figures and parties, providing a quick overview of their media presence and associated sentiments.", 13 | code="""\ 14 | ```python 15 | import requests 16 | from bs4 import BeautifulSoup 17 | import pandas as pd 18 | from textblob import TextBlob 19 | from collections import Counter 20 | import datetime 21 | 22 | def scrape_news(url): 23 | response = requests.get(url) 24 | soup = BeautifulSoup(response.content, 'html.parser') 25 | articles = soup.find_all('article', class_='article-item') 26 | 27 | news_data = [] 28 | for article in articles: 29 | title = article.find('h2', class_='article-title').text.strip() 30 | summary = article.find('p', class_='article-summary').text.strip() 31 | news_data.append({'title': title, 'summary': summary}) 32 | 33 | return news_data 34 | 35 | def analyze_sentiment(text): 36 | return TextBlob(text).sentiment.polarity 37 | 38 | def generate_report(data): 39 | df = pd.DataFrame(data) 40 | df['sentiment'] = df['summary'].apply(analyze_sentiment) 41 | 42 | # Calculate average sentiment 43 | avg_sentiment = df['sentiment'].mean() 44 | 45 | # Find most mentioned words 46 | all_words = ' '.join(df['title'] + ' ' + df['summary']).lower().split() 47 | word_freq = Counter(word for word in all_words if len(word) > 3) 48 | top_words = word_freq.most_common(5) 49 | 50 | # Generate report 51 | report = f"Daily Political Analysis Report for El Salvador - {datetime.date.today()}\n\n" 52 | report += f"Number of articles analyzed: {len(df)}\n" 53 | report += f"Average sentiment: {'Positive' if avg_sentiment > 0 else 'Negative'} ({avg_sentiment:.2f})\n\n" 54 | report += "Top mentioned words:\n" 55 | for word, count in top_words: 56 | report += f"- {word}: {count} 
times\n" 57 | 58 | report += "\nMost positive article:\n" 59 | pos_article = df.loc[df['sentiment'].idxmax()] 60 | report += f"Title: {pos_article['title']}\nSentiment: {pos_article['sentiment']:.2f}\n\n" 61 | 62 | report += "Most negative article:\n" 63 | neg_article = df.loc[df['sentiment'].idxmin()] 64 | report += f"Title: {neg_article['title']}\nSentiment: {neg_article['sentiment']:.2f}\n" 65 | 66 | return report 67 | 68 | def main(): 69 | url = "https://www.elsalvador.com/noticias/nacional/" # Example Salvadoran news website 70 | news_data = scrape_news(url) 71 | report = generate_report(news_data) 72 | print(report) 73 | 74 | # Optionally, save the report to a file 75 | with open(f"el_salvador_political_report_{datetime.date.today()}.txt", "w") as f: 76 | f.write(report) 77 | 78 | if __name__ == "__main__": 79 | main() 80 | ``` 81 | """ 82 | ), 83 | TinyProgram( 84 | requirements="A \"Joke Personalizer\" program that takes a comedian's standard jokes as input and automatically modifies them to include inside references and shared memories from their school days. The program should use a dictionary of preset keywords (e.g., \"cafeteria\", \"Mr. Johnson's class\") and replace generic terms in the jokes with these personalized references. It should also have a \"nostalgia meter\" that rates how many childhood callbacks are in each joke, and a \"groan factor\" estimator based on the corniness of the puns. Bonus feature: a \"detention probability calculator\" that humorously predicts how likely the joke would have gotten them in trouble back in school.", 85 | code="""\ 86 | ```python 87 | import random 88 | import re 89 | 90 | class JokePersonalizer: 91 | def __init__(self): 92 | self.inside_references = { 93 | "restaurant": "cafeteria", 94 | "teacher": "Mr. 
Johnson", 95 | "friend": "Danny 'Braces' Smith", 96 | "car": "rusty old bike", 97 | "mall": "corner store", 98 | "party": "detention", 99 | "cool": "totally radical", 100 | "phone": "Gameboy", 101 | "computer": "TI-83 calculator", 102 | "internet": "library encyclopedia" 103 | } 104 | self.pun_words = ["cheesy", "corny", "hilarious", "side-splitting", "knee-slapping"] 105 | 106 | def personalize_joke(self, joke): 107 | for generic, personal in self.inside_references.items(): 108 | joke = re.sub(r'\b' + generic + r'\b', personal, joke, flags=re.IGNORECASE) 109 | return joke 110 | 111 | def nostalgia_meter(self, joke): 112 | count = sum(1 for ref in self.inside_references.values() if ref.lower() in joke.lower()) 113 | return min(count * 20, 100) # 20 points per reference, max 100 114 | 115 | def groan_factor(self, joke): 116 | pun_count = sum(1 for word in self.pun_words if word.lower() in joke.lower()) 117 | return min(pun_count * 25, 100) # 25 points per pun word, max 100 118 | 119 | def detention_probability(self, joke): 120 | naughty_words = ["detention", "trouble", "principal's office", "suspended"] 121 | probability = sum(10 for word in naughty_words if word.lower() in joke.lower()) 122 | return min(probability, 100) # 10% per naughty word, max 100% 123 | 124 | def process_joke(self, original_joke): 125 | personalized_joke = self.personalize_joke(original_joke) 126 | nostalgia = self.nostalgia_meter(personalized_joke) 127 | groan = self.groan_factor(personalized_joke) 128 | detention_prob = self.detention_probability(personalized_joke) 129 | 130 | return { 131 | "original": original_joke, 132 | "personalized": personalized_joke, 133 | "nostalgia_rating": nostalgia, 134 | "groan_factor": groan, 135 | "detention_probability": detention_prob 136 | } 137 | 138 | # Example usage 139 | personalizer = JokePersonalizer() 140 | 141 | jokes = [ 142 | "I went to a restaurant last night and had the best meal ever!", 143 | "My teacher asked me to stay after class, it was so 
cool!", 144 | "I threw a party and nobody came. It was a real phone-y situation!", 145 | ] 146 | 147 | for joke in jokes: 148 | result = personalizer.process_joke(joke) 149 | print(f"Original: {result['original']}") 150 | print(f"Personalized: {result['personalized']}") 151 | print(f"Nostalgia Rating: {result['nostalgia_rating']}%") 152 | print(f"Groan Factor: {result['groan_factor']}%") 153 | print(f"Detention Probability: {result['detention_probability']}%") 154 | print() 155 | ``` 156 | """ 157 | ), 158 | ] 159 | examples = "\n".join(f"- {example}" for example in examples) 160 | 161 | class TinyProgramCritique(BasicRepr): 162 | "A critique of a tiny program." 163 | def __init__(self,critique: str,score: int): store_attr() 164 | 165 | def load_personas(num_personas: int = 1_000): 166 | return load_dataset("proj-persona/PersonaHub", "persona", split='train').select(range(num_personas))['persona'] 167 | 168 | def generate_tiny_programs(fast_data, personas, examples, sp): 169 | prompt_template = """\ 170 | 171 | {examples} 172 | {persona} 176 | """ 177 | tiny_programs = fast_data.generate( 178 | prompt_template=prompt_template, 179 | inputs=[{"persona": persona, "examples": examples} for persona in personas], 180 | schema=TinyProgram, 181 | sp=sp 182 | ) 183 | return [t for t in tiny_programs if t is not None] 184 | 185 | def generate_critiques(fast_data, tiny_programs, sp): 186 | critique_template = """\ 187 | Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion: 188 | 189 | - Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language. 
190 | - Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices. 191 | - Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand. 192 | - Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically. 193 | - Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points. 194 | 195 | The code snippet: 196 | 197 | {code} 198 | 199 | 200 | After examining the code: 201 | 202 | - Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners. 203 | - Conclude with the score. 
204 | """ 205 | return fast_data.generate( 206 | prompt_template=critique_template, 207 | inputs=[{"code": t.code} for t in tiny_programs], 208 | schema=TinyProgramCritique, 209 | sp=sp 210 | ) 211 | 212 | def update_programs_with_critiques(tiny_programs, critiques): 213 | programs = [] 214 | for program, critique in zip(tiny_programs, critiques): 215 | if program is None or critique is None: continue 216 | programs.append({ 217 | "requirements": program.requirements, 218 | "code": program.code, 219 | "critique": critique.critique, 220 | "score": critique.score 221 | }) 222 | return programs 223 | 224 | @call_parse 225 | def main(num_personas: Param("Number of personas to use", int) = 1_000, 226 | program_model: Param("Model to use for generating tiny programs", str) = "claude-3-haiku-20240307", 227 | program_sp: Param("System prompt for generating tiny programs", str) = "You are a helpful assistant for generating python programs.", 228 | critique_model: Param("Model to use for generating critiques", str) = "claude-3-5-sonnet-20240620", 229 | critique_sp: Param("System prompt for critiquing tiny programs", str) = "You are a helpful assistant tasked with critiquing python programs.", 230 | output_dataset: Param("Name of the output dataset", str) = "answerdotai/tiny_programs", 231 | private: Param("Whether to make the output dataset private", bool) = True): 232 | 233 | fast_data = FastData(model=program_model) 234 | personas = load_personas(num_personas) 235 | tiny_programs = generate_tiny_programs(fast_data, personas, examples, program_sp) 236 | fast_data = FastData(model=critique_model) 237 | critiques = generate_critiques(fast_data, tiny_programs, critique_sp) 238 | updated_programs = update_programs_with_critiques(tiny_programs, critiques) 239 | 240 | ds = Dataset.from_list(updated_programs) 241 | ds.push_to_hub(output_dataset, private=private) -------------------------------------------------------------------------------- /examples/train_model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from fastcore.script import * 3 | from minai.core import * 4 | from datasets import load_dataset 5 | from torch.utils.data import DataLoader 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | @call_parse 9 | def main( 10 | model_id: Param("Model ID", str) = 'HuggingFaceTB/SmolLM-360M', 11 | dataset_name: Param("Dataset name", str) = "answerdotai/tiny_programs", 12 | model_output_name: Param("Model output name", str) = "answerdotai/SmolLM-360M-finetuned-tiny_programs", 13 | batch_size: Param("Batch size", int) = 8, 14 | lr: Param("Learning rate", float) = 1e-3, 15 | num_epochs: Param("Number of epochs", int) = 5, 16 | filter_dataset: Param("Filter dataset", bool) = False, 17 | dataset_size: Param("Dataset size", int) = 754, 18 | dataset_column: Param("Dataset column", str) = "code", 19 | is_private: Param("Is private", bool) = True, 20 | ): 21 | set_seed(42) 22 | 23 | # Model and tokenizer setup 24 | m = AutoModelForCausalLM.from_pretrained( 25 | model_id, 26 | device_map=0, 27 | attn_implementation="flash_attention_2", 28 | trust_remote_code=True, 29 | low_cpu_mem_usage=True, 30 | torch_dtype=torch.bfloat16, 31 | use_cache=False 32 | ) 33 | tokenizer = AutoTokenizer.from_pretrained(model_id) 34 | tokenizer.pad_token = tokenizer.eos_token 35 | 36 | # Dataset preparation 37 | dataset = load_dataset(dataset_name, split="train").shuffle(42).select(range(dataset_size)) 38 | if filter_dataset: 39 | dataset = dataset.filter(lambda x: x['score'] in [4, 5]) 40 | 41 | def to_text(x): 42 | x['text'] = x[dataset_column] 43 | return x 44 | 45 | dataset = dataset.shuffle(42).map(to_text, remove_columns=dataset.column_names) 46 | train_dataset = dataset.select(range(0, len(dataset)-50)) 47 | eval_dataset = dataset.select(range(len(dataset)-50, len(dataset))) 48 | 49 | # DataLoader setup 50 | def collate_fn(examples): 51 | input_ids = tokenizer([e['text'] for 
e in examples], return_tensors='pt', padding=True, truncation=True, max_length=512)['input_ids'] 52 | return (input_ids[:, :-1], input_ids[:, 1:]) 53 | 54 | train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 55 | eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False) 56 | dls = DataLoaders(train_dataloader, eval_dataloader) 57 | 58 | # Training setup 59 | loss_fn = lambda x, y: torch.nn.functional.cross_entropy(x.view(-1, x.shape[-1]), y.view(-1)) 60 | # sz = len(dls.train) // 10 61 | 62 | cbs = [DeviceCB(), MetricsCB()] 63 | prog = ProgressCB(plot=True) 64 | learn = MomentumLearner(m, dls, loss_func=loss_fn, lr=lr, cbs=cbs, preds_nm='logits', mom=0.9) 65 | 66 | # Training 67 | learn.fit(num_epochs, cbs=prog) 68 | 69 | # push to the hub 70 | learn.model.push_to_hub(model_output_name, private=is_private) 71 | tokenizer.push_to_hub(model_output_name, private=is_private) 72 | 73 | # Test generation 74 | prompt = 'import requests\n' 75 | tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda() 76 | 77 | with torch.inference_mode(): 78 | output = m.generate(tokenized_prompt, max_new_tokens=90) 79 | 80 | print(prompt + tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)) 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /fastdata/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.5" 2 | -------------------------------------------------------------------------------- /fastdata/_modidx.py: -------------------------------------------------------------------------------- 1 | # Autogenerated by nbdev 2 | 3 | d = { 'settings': { 'branch': 'main', 4 | 'doc_baseurl': '/fastdata', 5 | 'doc_host': 'https://AnswerDotAI.github.io', 6 | 'git_url': 'https://github.com/AnswerDotAI/fastdata', 
7 | 'lib_path': 'fastdata'}, 8 | 'syms': { 'fastdata.core': { 'fastdata.core.FastData': ('core.html#fastdata', 'fastdata/core.py'), 9 | 'fastdata.core.FastData.__init__': ('core.html#fastdata.__init__', 'fastdata/core.py'), 10 | 'fastdata.core.FastData._process_input': ('core.html#fastdata._process_input', 'fastdata/core.py'), 11 | 'fastdata.core.FastData._save_results': ('core.html#fastdata._save_results', 'fastdata/core.py'), 12 | 'fastdata.core.FastData._set_rate_limit': ('core.html#fastdata._set_rate_limit', 'fastdata/core.py'), 13 | 'fastdata.core.FastData.generate': ('core.html#fastdata.generate', 'fastdata/core.py'), 14 | 'fastdata.core.FastData.generate_to_hf': ('core.html#fastdata.generate_to_hf', 'fastdata/core.py')}}} 15 | -------------------------------------------------------------------------------- /fastdata/core.py: -------------------------------------------------------------------------------- 1 | """Core functionality for synthetic data generation""" 2 | 3 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. 
4 | 5 | # %% auto 0 6 | __all__ = ['DATASET_CARD_TEMPLATE', 'FastData'] 7 | 8 | # %% ../nbs/00_core.ipynb 3 9 | import concurrent.futures 10 | import json 11 | import shutil 12 | from pathlib import Path 13 | from uuid import uuid4 14 | from typing import Optional, Union 15 | 16 | from tqdm import tqdm 17 | from fastcore.utils import * 18 | from ratelimit import limits, sleep_and_retry 19 | from huggingface_hub import CommitScheduler, DatasetCard 20 | from claudette import * 21 | 22 | # %% ../nbs/00_core.ipynb 4 23 | DATASET_CARD_TEMPLATE = """ 24 | --- 25 | tags: 26 | - fastdata 27 | - synthetic 28 | --- 29 | 30 | # {title} 31 | 32 | _Note: This is an AI-generated dataset, so its content may be inaccurate or false._ 33 | 34 | **Source of the data:** 35 | 36 | The dataset was generated using [Fastdata](https://github.com/AnswerDotAI/fastdata) library and {model_id} with the following input: 37 | 38 | ## System Prompt 39 | 40 | ``` 41 | {system_prompt} 42 | ``` 43 | 44 | ## Prompt Template 45 | 46 | ``` 47 | {prompt_template} 48 | ``` 49 | 50 | ## Sample Input 51 | 52 | ```json 53 | {sample_input} 54 | ``` 55 | 56 | """ 57 | 58 | 59 | class FastData: 60 | def __init__(self, 61 | model: str = "claude-3-haiku-20240307", 62 | calls: int = 100, 63 | period: int = 60): 64 | self.cli = Client(model) 65 | self._set_rate_limit(calls, period) 66 | 67 | def _set_rate_limit(self, calls: int, period: int): 68 | """Set a new rate limit.""" 69 | @sleep_and_retry 70 | @limits(calls=calls, period=period) 71 | def rate_limited_call(prompt: str, schema, temp: float, sp: str): 72 | return self.cli.structured( 73 | prompt, 74 | temp=temp, 75 | tools=schema, 76 | )[0] 77 | 78 | self._rate_limited_call = rate_limited_call 79 | 80 | def _process_input(self, prompt_template, schema, temp, sp, input_data): 81 | try: 82 | prompt = prompt_template.format(**input_data) 83 | return self._rate_limited_call( 84 | prompt=prompt, schema=schema, temp=temp, sp=sp 85 | ) 86 | except Exception as e: 87 
| print(f"Error processing input {input_data}: {e}") 88 | return None 89 | 90 | def _save_results(self, results: list[dict], save_path: Path) -> None: 91 | with open(save_path, "w") as f: 92 | for res in results: 93 | obj_dict = getattr(res, "__stored_args__", res.__dict__) 94 | f.write(json.dumps(obj_dict) + "\n") 95 | 96 | def generate(self, 97 | prompt_template: str, 98 | inputs: list[dict], 99 | schema, 100 | temp: float = 1., 101 | sp: str = "You are a helpful assistant.", 102 | max_workers: int = 64) -> list[dict]: 103 | "For every input in INPUTS, fill PROMPT_TEMPLATE and generate a value fitting SCHEMA" 104 | 105 | with tqdm(total=len(inputs)) as pbar: 106 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 107 | futures = [ 108 | executor.submit( 109 | self._process_input, 110 | prompt_template, 111 | schema, 112 | temp, 113 | sp, 114 | input_data, 115 | ) 116 | for input_data in inputs 117 | ] 118 | 119 | for completed_future in concurrent.futures.as_completed(futures): 120 | pbar.update(1) 121 | return [f.result() for f in futures] 122 | 123 | def generate_to_hf( 124 | self, 125 | prompt_template: str, 126 | inputs: list[dict], 127 | schema, 128 | repo_id: str, 129 | temp: float = 1.0, 130 | sp: str = "You are a helpful assistant.", 131 | max_workers: int = 64, 132 | max_items_per_file: int = 100, 133 | commit_every: Union[int, float] = 5, 134 | private: Optional[bool] = None, 135 | token: Optional[str] = None, 136 | delete_files_after: bool = True, 137 | ) -> tuple[str, list[dict]]: 138 | """ 139 | Generate data based on a prompt template and schema, and save it to Hugging Face dataset repository. 140 | This function writes the generated records to multiple files, each containing a maximum of `max_items_per_file` records. 141 | Due to the multi-threaded execution of the function, the order of the records in the files is not guaranteed to match the order of the input data. 
142 | 143 | Args: 144 | prompt_template (str): The template for generating prompts. 145 | inputs (list[dict]): A list of input dictionaries to be processed. 146 | schema: The schema to parse the generated data. 147 | repo_id (str): The HuggingFace dataset name. 148 | temp (float, optional): The temperature for generation. Defaults to 1.0. 149 | sp (str, optional): The system prompt for the assistant. Defaults to "You are a helpful assistant.". 150 | max_workers (int, optional): The maximum number of worker threads. Defaults to 64. 151 | max_items_per_file (int, optional): The maximum number of items to save in each file. Defaults to 100. 152 | commit_every (Union[int, float], optional): The number of minutes between each commit. Defaults to 5. 153 | private (bool, optional): Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. 154 | token (Optional[str], optional): The token to use to commit to the repo. Defaults to the token saved on the machine. 155 | delete_files_after (bool, optional): Whether to delete files after processing. Defaults to True. 156 | 157 | Returns: 158 | tuple[str, list[dict]]: A tuple with the generated repo_id and the list of generated data dictionaries. 
159 | """ 160 | dataset_dir = Path(repo_id.replace("/", "_")) 161 | dataset_dir.mkdir(parents=True, exist_ok=True) 162 | data_dir = dataset_dir / "data" 163 | data_dir.mkdir(exist_ok=True) 164 | 165 | try: 166 | scheduler = CommitScheduler( 167 | repo_id=repo_id, 168 | repo_type="dataset", 169 | folder_path=dataset_dir, 170 | every=commit_every, 171 | private=private, 172 | token=token, 173 | ) 174 | 175 | readme_path = dataset_dir / "README.md" 176 | 177 | if not readme_path.exists(): 178 | DatasetCard( 179 | DATASET_CARD_TEMPLATE.format( 180 | title=repo_id, 181 | model_id=self.cli.model, 182 | system_prompt=sp, 183 | prompt_template=prompt_template, 184 | sample_input=inputs[:2], 185 | ) 186 | ).save(readme_path) 187 | 188 | results = [] 189 | total_inputs = len(inputs) 190 | 191 | with tqdm(total=total_inputs) as pbar: 192 | with concurrent.futures.ThreadPoolExecutor( 193 | max_workers=max_workers 194 | ) as executor: 195 | futures = [ 196 | executor.submit( 197 | self._process_input, 198 | prompt_template, 199 | schema, 200 | temp, 201 | sp, 202 | input_data, 203 | ) 204 | for input_data in inputs 205 | ] 206 | 207 | current_file = data_dir / f"train-{uuid4()}.jsonl" 208 | for completed_future in concurrent.futures.as_completed(futures): 209 | result = completed_future.result() 210 | if result is not None: 211 | results.append(result) 212 | with scheduler.lock: 213 | self._save_results(results, current_file) 214 | pbar.update(1) 215 | if len(results) >= max_items_per_file: 216 | current_file = data_dir / f"train-{uuid4()}.jsonl" 217 | results.clear() 218 | finally: 219 | scheduler.trigger().result() # force upload last result 220 | if delete_files_after: 221 | shutil.rmtree(dataset_dir) 222 | 223 | return scheduler.repo_id, [f.result() for f in futures if f.done()] 224 | -------------------------------------------------------------------------------- /nbs/00_core.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# core\n", 8 | "\n", 9 | "> Core functionality for synthetic data generation" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#| default_exp core" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "#| hide\n", 28 | "from IPython.display import Markdown" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "#| export\n", 38 | "import concurrent.futures\n", 39 | "import json\n", 40 | "import shutil\n", 41 | "from pathlib import Path\n", 42 | "from uuid import uuid4\n", 43 | "from typing import Optional, Union\n", 44 | "\n", 45 | "from tqdm import tqdm\n", 46 | "from fastcore.utils import *\n", 47 | "from ratelimit import limits, sleep_and_retry\n", 48 | "from huggingface_hub import CommitScheduler, DatasetCard\n", 49 | "from claudette import *" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "#| export\n", 59 | "DATASET_CARD_TEMPLATE = \"\"\"\n", 60 | "---\n", 61 | "tags:\n", 62 | "- fastdata\n", 63 | "- synthetic\n", 64 | "---\n", 65 | "\n", 66 | "# {title}\n", 67 | "\n", 68 | "_Note: This is an AI-generated dataset, so its content may be inaccurate or false._\n", 69 | "\n", 70 | "**Source of the data:**\n", 71 | "\n", 72 | "The dataset was generated using [Fastdata](https://github.com/AnswerDotAI/fastdata) library and {model_id} with the following input:\n", 73 | "\n", 74 | "## System Prompt\n", 75 | "\n", 76 | "```\n", 77 | "{system_prompt}\n", 78 | "```\n", 79 | "\n", 80 | "## Prompt Template\n", 81 | "\n", 82 | "```\n", 83 | "{prompt_template}\n", 84 | "```\n", 85 | "\n", 86 | "## Sample Input\n", 87 | "\n", 88 | 
"```json\n", 89 | "{sample_input}\n", 90 | "```\n", 91 | "\n", 92 | "\"\"\"\n", 93 | "\n", 94 | "\n", 95 | "class FastData:\n", 96 | " def __init__(self,\n", 97 | " model: str = \"claude-3-haiku-20240307\",\n", 98 | " calls: int = 100,\n", 99 | " period: int = 60):\n", 100 | " self.cli = Client(model)\n", 101 | " self._set_rate_limit(calls, period)\n", 102 | "\n", 103 | " def _set_rate_limit(self, calls: int, period: int):\n", 104 | " \"\"\"Set a new rate limit.\"\"\"\n", 105 | " @sleep_and_retry\n", 106 | " @limits(calls=calls, period=period)\n", 107 | " def rate_limited_call(prompt: str, schema, temp: float, sp: str):\n", 108 | " return self.cli.structured(\n", 109 | " prompt,\n", 110 | " temp=temp,\n", 111 | " tools=schema,\n", 112 | " )[0]\n", 113 | " \n", 114 | " self._rate_limited_call = rate_limited_call\n", 115 | "\n", 116 | " def _process_input(self, prompt_template, schema, temp, sp, input_data):\n", 117 | " try:\n", 118 | " prompt = prompt_template.format(**input_data)\n", 119 | " return self._rate_limited_call(\n", 120 | " prompt=prompt, schema=schema, temp=temp, sp=sp\n", 121 | " )\n", 122 | " except Exception as e:\n", 123 | " print(f\"Error processing input {input_data}: {e}\")\n", 124 | " return None\n", 125 | "\n", 126 | " def _save_results(self, results: list[dict], save_path: Path) -> None:\n", 127 | " with open(save_path, \"w\") as f:\n", 128 | " for res in results:\n", 129 | " obj_dict = getattr(res, \"__stored_args__\", res.__dict__)\n", 130 | " f.write(json.dumps(obj_dict) + \"\\n\")\n", 131 | "\n", 132 | " def generate(self, \n", 133 | " prompt_template: str, \n", 134 | " inputs: list[dict], \n", 135 | " schema,\n", 136 | " temp: float = 1.,\n", 137 | " sp: str = \"You are a helpful assistant.\",\n", 138 | " max_workers: int = 64) -> list[dict]:\n", 139 | " \"For every input in INPUTS, fill PROMPT_TEMPLATE and generate a value fitting SCHEMA\"\n", 140 | " \n", 141 | " with tqdm(total=len(inputs)) as pbar:\n", 142 | " with 
concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n", 143 | " futures = [\n", 144 | " executor.submit(\n", 145 | " self._process_input,\n", 146 | " prompt_template,\n", 147 | " schema,\n", 148 | " temp,\n", 149 | " sp,\n", 150 | " input_data,\n", 151 | " )\n", 152 | " for input_data in inputs\n", 153 | " ]\n", 154 | "\n", 155 | " for completed_future in concurrent.futures.as_completed(futures):\n", 156 | " pbar.update(1)\n", 157 | " return [f.result() for f in futures]\n", 158 | "\n", 159 | " def generate_to_hf(\n", 160 | " self,\n", 161 | " prompt_template: str,\n", 162 | " inputs: list[dict],\n", 163 | " schema,\n", 164 | " repo_id: str,\n", 165 | " temp: float = 1.0,\n", 166 | " sp: str = \"You are a helpful assistant.\",\n", 167 | " max_workers: int = 64,\n", 168 | " max_items_per_file: int = 100,\n", 169 | " commit_every: Union[int, float] = 5,\n", 170 | " private: Optional[bool] = None,\n", 171 | " token: Optional[str] = None,\n", 172 | " delete_files_after: bool = True,\n", 173 | " ) -> tuple[str, list[dict]]:\n", 174 | " \"\"\"\n", 175 | " Generate data based on a prompt template and schema, and save it to Hugging Face dataset repository.\n", 176 | " This function writes the generated records to multiple files, each containing a maximum of `max_items_per_file` records. \n", 177 | " Due to the multi-threaded execution of the function, the order of the records in the files is not guaranteed to match the order of the input data. \n", 178 | "\n", 179 | " Args:\n", 180 | " prompt_template (str): The template for generating prompts.\n", 181 | " inputs (list[dict]): A list of input dictionaries to be processed.\n", 182 | " schema: The schema to parse the generated data.\n", 183 | " repo_id (str): The HuggingFace dataset name.\n", 184 | " temp (float, optional): The temperature for generation. Defaults to 1.0.\n", 185 | " sp (str, optional): The system prompt for the assistant. 
Defaults to \"You are a helpful assistant.\".\n", 186 | " max_workers (int, optional): The maximum number of worker threads. Defaults to 64.\n", 187 | " max_items_per_file (int, optional): The maximum number of items to save in each file. Defaults to 100.\n", 188 | " commit_every (Union[int, float], optional): The number of minutes between each commit. Defaults to 5.\n", 189 | " private (bool, optional): Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.\n", 190 | " token (Optional[str], optional): The token to use to commit to the repo. Defaults to the token saved on the machine.\n", 191 | " delete_files_after (bool, optional): Whether to delete files after processing. Defaults to True.\n", 192 | "\n", 193 | " Returns:\n", 194 | " tuple[str, list[dict]]: A tuple with the generated repo_id and the list of generated data dictionaries.\n", 195 | " \"\"\"\n", 196 | " dataset_dir = Path(repo_id.replace(\"/\", \"_\"))\n", 197 | " dataset_dir.mkdir(parents=True, exist_ok=True)\n", 198 | " data_dir = dataset_dir / \"data\"\n", 199 | " data_dir.mkdir(exist_ok=True)\n", 200 | "\n", 201 | " try:\n", 202 | " scheduler = CommitScheduler(\n", 203 | " repo_id=repo_id,\n", 204 | " repo_type=\"dataset\",\n", 205 | " folder_path=dataset_dir,\n", 206 | " every=commit_every,\n", 207 | " private=private,\n", 208 | " token=token,\n", 209 | " )\n", 210 | "\n", 211 | " readme_path = dataset_dir / \"README.md\"\n", 212 | "\n", 213 | " if not readme_path.exists():\n", 214 | " DatasetCard(\n", 215 | " DATASET_CARD_TEMPLATE.format(\n", 216 | " title=repo_id,\n", 217 | " model_id=self.cli.model,\n", 218 | " system_prompt=sp,\n", 219 | " prompt_template=prompt_template,\n", 220 | " sample_input=inputs[:2],\n", 221 | " )\n", 222 | " ).save(readme_path)\n", 223 | "\n", 224 | " results = []\n", 225 | " total_inputs = len(inputs)\n", 226 | "\n", 227 | " with 
tqdm(total=total_inputs) as pbar:\n", 228 | " with concurrent.futures.ThreadPoolExecutor(\n", 229 | " max_workers=max_workers\n", 230 | " ) as executor:\n", 231 | " futures = [\n", 232 | " executor.submit(\n", 233 | " self._process_input,\n", 234 | " prompt_template,\n", 235 | " schema,\n", 236 | " temp,\n", 237 | " sp,\n", 238 | " input_data,\n", 239 | " )\n", 240 | " for input_data in inputs\n", 241 | " ]\n", 242 | "\n", 243 | " current_file = data_dir / f\"train-{uuid4()}.jsonl\"\n", 244 | " for completed_future in concurrent.futures.as_completed(futures):\n", 245 | " result = completed_future.result()\n", 246 | " if result is not None:\n", 247 | " results.append(result)\n", 248 | " with scheduler.lock:\n", 249 | " self._save_results(results, current_file)\n", 250 | " pbar.update(1)\n", 251 | " if len(results) >= max_items_per_file:\n", 252 | " current_file = data_dir / f\"train-{uuid4()}.jsonl\"\n", 253 | " results.clear()\n", 254 | " finally:\n", 255 | " scheduler.trigger().result() # force upload last result\n", 256 | " if delete_files_after:\n", 257 | " shutil.rmtree(dataset_dir)\n", 258 | "\n", 259 | " return scheduler.repo_id, [f.result() for f in futures if f.done()]" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "#| hide\n", 269 | "def to_md(ss): return '\\n'.join(f'- {s}' for s in ss) \n", 270 | "def show(ss): return Markdown(to_md(ss))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "Hello, how are you today? 
➡ *Hola, ¿cómo estás hoy?*" 282 | ] 283 | }, 284 | "execution_count": null, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "class Translation():\n", 291 | " \"Translation from an English phrase to a Spanish phrase\"\n", 292 | " def __init__(self, english: str, spanish: str): store_attr()\n", 293 | " def __repr__(self): return f\"{self.english} ➡ *{self.spanish}*\"\n", 294 | "\n", 295 | "Translation(\"Hello, how are you today?\", \"Hola, ¿cómo estás hoy?\")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/markdown": [ 306 | "- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*\n", 307 | "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*\n", 308 | "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. 
Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*" 309 | ], 310 | "text/plain": [ 311 | "" 312 | ] 313 | }, 314 | "execution_count": null, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "examples = [\n", 321 | " Translation(\n", 322 | " english=\"Hello, my name is Nathan. I am a research scientist at an AI startup.\",\n", 323 | " spanish=\"Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.\"),\n", 324 | " Translation(\n", 325 | " english=\"How much wood could a woodchuck chuck if a woodchuck could chuck wood?\",\n", 326 | " spanish=\"¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?\"),\n", 327 | " Translation(\n", 328 | " english=\"Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See.\",\n", 329 | " spanish=\"Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. 
Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.\"\n", 330 | " ),\n", 331 | "]\n", 332 | "show(examples)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stderr", 342 | "output_type": "stream", 343 | "text": [ 344 | "/home/algal/miniconda/envs/jup3/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 345 | " from .autonotebook import tqdm as notebook_tqdm\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "from datasets import load_dataset" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stderr", 360 | "output_type": "stream", 361 | "text": [ 362 | "Generating train split: 100%|█| 200000/200000 [00:00<00:00, 3852934.04 examples/\n" 363 | ] 364 | }, 365 | { 366 | "data": { 367 | "text/markdown": [ 368 | "- A Political Analyst specialized in El Salvador's political landscape.\n", 369 | "- A legal advisor who understands the legal implications of incomplete or inaccurate project documentation\n", 370 | "- A maternal health advocate focused on raising awareness about postpartum complications." 
371 | ], 372 | "text/plain": [ 373 | "" 374 | ] 375 | }, 376 | "execution_count": null, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# Load personas\n", 383 | "personas = load_dataset(\"proj-persona/PersonaHub\", \"persona\", split='train').select(range(3))['persona']\n", 384 | "show(personas)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "sp = \"You will help generate synthetic data of English and Spanish phrases.\"\n", 394 | "prompt_template = \"\"\"\\\n", 395 | "\n", 396 | "{examples}\n", 397 | "\n", 398 | "\n", 399 | "Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:\n", 400 | "{persona}\n", 401 | "\"\"\"" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "Let's see what the prompt looks like in action:" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "\n", 421 | "- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*\n", 422 | "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*\n", 423 | "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. 
➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*\n", 424 | "\n", 425 | "\n", 426 | "Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:\n", 427 | "A Political Analyst specialized in El Salvador's political landscape.\n", 428 | "\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "examples_md = to_md(examples)\n", 434 | "prompt = prompt_template.format(examples=examples_md, persona=personas[0])\n", 435 | "print(prompt)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stderr", 445 | "output_type": "stream", 446 | "text": [ 447 | "100%|█████████████████████████████████████████████| 3/3 [00:01<00:00, 1.64it/s]\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "# Generate translations\n", 453 | "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n", 454 | "translations = fast_data.generate(\n", 455 | " prompt_template=prompt_template,\n", 456 | " inputs=[{\"persona\": persona, \"examples\": examples} for persona in personas],\n", 457 | " schema=Translation,\n", 458 | " sp=sp\n", 459 | ")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/markdown": [ 470 | "- The political situation in El Salvador continues to be complex, with ongoing tensions between the ruling party and opposition groups. President Nayib Bukele has consolidated significant power, raising concerns about the state of democracy in the country. 
➡ *La situación política en El Salvador sigue siendo compleja, con tensiones persistentes entre el partido gobernante y los grupos de oposición. El presidente Nayib Bukele ha consolidado un poder significativo, lo que genera preocupaciones sobre el estado de la democracia en el país.*\n", 471 | "- Thorough documentation is critical for any legal proceedings. Incomplete or inaccurate records can have serious consequences. ➡ *La documentación exhaustiva es fundamental para cualquier proceso legal. Los registros incompletos o inexactos pueden tener consecuencias graves.*\n", 472 | "- Postpartum complications can be life-threatening, but with proper care and support, new mothers can recover and thrive. Let's work together to ensure all women have access to the resources they need during this crucial time. ➡ *Las complicaciones posparto pueden poner en riesgo la vida, pero con la atención y el apoyo adecuados, las nuevas madres pueden recuperarse y prosperar. Trabajemos juntos para garantizar que todas las mujeres tengan acceso a los recursos que necesitan durante este momento crucial.*" 473 | ], 474 | "text/plain": [ 475 | "" 476 | ] 477 | }, 478 | "execution_count": null, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "show(translations)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "# Generate translations and push results to Hugging Face Hub as a dataset\n", 494 | "# Be sure to have the HF_TOKEN environment variable set to your Hugging Face API token\n", 495 | "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n", 496 | "repo_id, translations = fast_data.generate_to_hf(\n", 497 | " prompt_template=prompt_template,\n", 498 | " inputs=[{\"persona\": persona, \"examples\": examples} for persona in personas],\n", 499 | " schema=Translation,\n", 500 | " sp=sp,\n", 501 | " 
repo_id=f\"personas-translation-{uuid4()}\",\n", 502 | " max_items_per_file=2, # It will create a local file each 2 translations \n", 503 | ")\n", 504 | "assert len(translations) == len(personas)\n", 505 | "\n", 506 | "new_dataset = load_dataset(repo_id)\n", 507 | "assert len(translations) == len(personas)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "class TranslationCritique():\n", 517 | " \"A critique of the translation.\"\n", 518 | " def __init__(self, critique: str, score: int): store_attr()\n", 519 | " def __repr__(self): return f\"\\t- **Critique:** {self.critique}\\n\\t- **Score:** {self.score}\"" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "sp = \"You will help critique synthetic data of English and Spanish phrases.\"\n", 529 | "critique_template = \"\"\"\\\n", 530 | "Below is an extract of a translation. Evaluate its quality as a senior translator would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:\n", 531 | "\n", 532 | "- Add 1 point if the translation conveys the basic meaning of the source text, even if it includes some minor errors or awkward phrasing.\n", 533 | "- Add another point if the translation is generally accurate but lacks refinement in style or fails to capture some nuances of the original. It might use inconsistent terminology or have occasional lapses in register.\n", 534 | "- Award a third point if the translation is appropriate for professional use and accurately conveys key concepts of the source text. It demonstrates good understanding of both languages, though it may not be flawless or could include some slight inconsistencies. 
It resembles the work of a competent translator but may have room for improvement in fluency or precision.\n", 535 | "- Grant a fourth point if the translation is highly accurate and reads naturally in the target language, exhibiting a consistent and appropriate style. It could be similar to the work of an experienced translator, offering faithful rendering of content and tone, with minimal errors, and effectively handling complex concepts or cultural references. The result is coherent, well-expressed, and valuable for its intended purpose.\n", 536 | "- Bestow a fifth point if the translation is outstanding, demonstrating mastery of both source and target languages. It captures subtle nuances, maintains the author's voice and intent, and reads as if it were originally written in the target language. The translator has made excellent choices in dealing with challenging elements like wordplay, idiomatic expressions, or culture-specific content.\n", 537 | "\n", 538 | "{translation}\n", 539 | "\n", 540 | "After examining the translation:\n", 541 | "\n", 542 | "- Briefly justify your total score, up to 100 words.\n", 543 | "- Conclude with the score of the translation.\n", 544 | "\"\"\"" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "name": "stderr", 554 | "output_type": "stream", 555 | "text": [ 556 | "100%|█████████████████████████████████████████████| 3/3 [00:03<00:00, 1.27s/it]\n" 557 | ] 558 | } 559 | ], 560 | "source": [ 561 | "fast_data = FastData(model=\"claude-3-5-sonnet-20240620\")\n", 562 | "critiques = fast_data.generate(\n", 563 | " prompt_template=critique_template,\n", 564 | " inputs=[{\"translation\": f\"{t.english} -> {t.spanish}\"} for t in translations],\n", 565 | " schema=TranslationCritique,\n", 566 | " sp=sp\n", 567 | ")" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | 
"data": { 577 | "text/markdown": [ 578 | "- The political situation in El Salvador continues to be complex, with ongoing tensions between the ruling party and opposition groups. President Nayib Bukele has consolidated significant power, raising concerns about the state of democracy in the country. ➡ *La situación política en El Salvador sigue siendo compleja, con tensiones persistentes entre el partido gobernante y los grupos de oposición. El presidente Nayib Bukele ha consolidado un poder significativo, lo que genera preocupaciones sobre el estado de la democracia en el país.*\n", 579 | "\n", 580 | "\t- **Critique:** The translation accurately conveys the meaning of the source text, capturing the complexity of El Salvador's political situation. It effectively translates key terms and maintains the tone of the original. The translator demonstrates a strong grasp of both languages, rendering the content naturally in Spanish. The phrasing is appropriate and flows well, preserving the nuances of the English version. There are no noticeable errors or awkward constructions. The translation skillfully handles the political terminology and concepts, making it suitable for professional use. It reads as if it were originally written in Spanish, indicating the translator's expertise.\n", 581 | "\t- **Score:** 5\n", 582 | "- Thorough documentation is critical for any legal proceedings. Incomplete or inaccurate records can have serious consequences. ➡ *La documentación exhaustiva es fundamental para cualquier proceso legal. Los registros incompletos o inexactos pueden tener consecuencias graves.*\n", 583 | "\n", 584 | "\t- **Critique:** The translation accurately conveys the main message of the source text, maintaining both the meaning and tone. It effectively captures the importance of thorough documentation in legal proceedings and the potential consequences of incomplete or inaccurate records. 
The translator has chosen appropriate Spanish equivalents for key terms, such as \"exhaustiva\" for \"thorough\" and \"proceso legal\" for \"legal proceedings\". The sentence structure is natural in Spanish, and the translation maintains the formal register suitable for legal contexts. There are no errors in grammar or vocabulary. The translation demonstrates a high level of competence, accurately conveying complex concepts while reading naturally in the target language. It could be considered the work of an experienced translator.\n", 585 | "\t- **Score:** 4\n", 586 | "- Postpartum complications can be life-threatening, but with proper care and support, new mothers can recover and thrive. Let's work together to ensure all women have access to the resources they need during this crucial time. ➡ *Las complicaciones posparto pueden poner en riesgo la vida, pero con la atención y el apoyo adecuados, las nuevas madres pueden recuperarse y prosperar. Trabajemos juntos para garantizar que todas las mujeres tengan acceso a los recursos que necesitan durante este momento crucial.*\n", 587 | "\n", 588 | "\t- **Critique:** The translation accurately conveys the meaning of the original text, maintaining both the informative and encouraging tone. It correctly translates key terms like \"postpartum complications\" and \"life-threatening.\" The Spanish version flows naturally and captures the nuances of the original, including the call to action. The translator has made excellent choices in vocabulary and structure, resulting in a text that reads as if it were originally written in Spanish. 
The translation demonstrates a high level of proficiency in both languages and would be suitable for professional use in healthcare communications.\n", 589 | "\t- **Score:** 5" 590 | ], 591 | "text/plain": [ 592 | "" 593 | ] 594 | }, 595 | "execution_count": null, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "show(f'{t}\\n\\n{c}' for t, c in zip(translations, critiques))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "### Test that generate outputs align with inputs" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "Let's verify that the `FastData.generate` returns results in the same order as the inputs it was passed.\n", 616 | "\n", 617 | "To show this, we will define a new prompt template, where the model is asked only to echo a piece of data\n", 618 | "from the input. Then we will verify that the values in the inputs matches the values in the outputs, in order and in value." 
619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "sp=\"You will help with accurate and faithful data processing.\"\n", 628 | "prompt_template = \"\"\"\\\n", 629 | "Below you find an item of data, a datum, which is an alphanumeric string:\n", 630 | "\n", 631 | "{datum}\n", 632 | "\n", 633 | "After reviewing this datum, please echo is back exactly, without any preamble:\n", 634 | "\"\"\"" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": null, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "class Datum:\n", 644 | " \"A data value\"\n", 645 | " def __init__(self, datum: str): store_attr()\n", 646 | " def __repr__(self): return f\"{self.datum}\"" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "First we'll test that the prompt and schema class work as expected." 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": {}, 660 | "outputs": [ 661 | { 662 | "name": "stdout", 663 | "output_type": "stream", 664 | "text": [ 665 | "Below you find an item of data, a datum, which is an alphanumeric string:\n", 666 | "\n", 667 | "b9121446-e46c-47c0-9e6d-b4df35c0974b\n", 668 | "\n", 669 | "After reviewing this datum, please echo is back exactly, without any preamble:\n", 670 | "\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "print(prompt_template.format(**dict(datum=str(uuid4()))))" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "data": { 685 | "text/plain": [ 686 | "04da7de4-cc39-4699-9d25-5a476e366732" 687 | ] 688 | }, 689 | "execution_count": null, 690 | "metadata": {}, 691 | "output_type": "execute_result" 692 | } 693 | ], 694 | "source": [ 695 | "Datum(str(uuid4()))" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | 
"metadata": {}, 701 | "source": [ 702 | "Now we ask the model to \"generate\" (i.e., echo) 100 of these values." 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "name": "stderr", 712 | "output_type": "stream", 713 | "text": [ 714 | "100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 24.17it/s]\n" 715 | ] 716 | } 717 | ], 718 | "source": [ 719 | "in_vals = [{\"datum\":str(uuid4())} for _ in range(100)]\n", 720 | "out_vals = fast_data.generate(\n", 721 | " prompt_template=prompt_template,\n", 722 | " inputs=in_vals,\n", 723 | " schema=Datum,\n", 724 | " sp=sp\n", 725 | ")" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "Now we will see if the inputs and outputs are aligned.\n", 733 | "\n", 734 | "If they are aligned, then that shows two things. First it shows that the model is echoing the values faithfully. Second, it shows _either_ that the model itself returned outputs in the order in which they were submitted, or else that `generate` has returned outputs in submission order.\n", 735 | "\n", 736 | "We are submitting a large enough quantity of items, that we _asssume_ the model will return some results out of submission order. 
If you want confidence which does not depend on this assumption, then you could modify the test above to increase the number and complexity of the generation task, or simply inspect the implementation.\n", 737 | "\n", 738 | "Let's start by spot checking the first item:" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [ 746 | { 747 | "data": { 748 | "text/plain": [ 749 | "('f42ea0db-24ce-4e09-a50d-edf74d0eb611',\n", 750 | " 'f42ea0db-24ce-4e09-a50d-edf74d0eb611')" 751 | ] 752 | }, 753 | "execution_count": null, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "in_list = [x['datum'] for x in in_vals]\n", 760 | "out_list = [x.datum for x in out_vals]\n", 761 | "(in_list[0],out_list[0])" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "name": "stdout", 771 | "output_type": "stream", 772 | "text": [ 773 | "Success: output items are aligned with input items\n" 774 | ] 775 | } 776 | ], 777 | "source": [ 778 | "for (idx,(in_item,out_item)) in enumerate(zip(in_list,out_list)):\n", 779 | " if in_item != out_item:\n", 780 | " print(\"Failure: output items were not aligned with input items!\")\n", 781 | " print(f\"\\titem {idx} had in={in_item} and out={out_item}\")\n", 782 | " break\n", 783 | "else:\n", 784 | " print(\"Success: output items are aligned with input items\") " 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "#| hide\n", 794 | "import nbdev; nbdev.nbdev_export()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [] 803 | } 804 | ], 805 | "metadata": { 806 | "kernelspec": { 807 | "display_name": "python3", 808 | "language": "python", 809 | "name": "python3" 810 | } 
811 | }, 812 | "nbformat": 4, 813 | "nbformat_minor": 4 814 | } 815 | -------------------------------------------------------------------------------- /nbs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | format: 5 | html: 6 | theme: cosmo 7 | css: styles.css 8 | toc: true 9 | keep-md: true 10 | commonmark: default 11 | 12 | website: 13 | twitter-card: true 14 | open-graph: true 15 | repo-actions: [issue] 16 | navbar: 17 | background: primary 18 | search: true 19 | sidebar: 20 | style: floating 21 | 22 | metadata-files: [nbdev.yml, sidebar.yml] -------------------------------------------------------------------------------- /nbs/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# fastdata\n", 8 | "\n", 9 | "> Easiest and fastest way to generate 1B synthetic tokens\n", 10 | "\n", 11 | "Minimalist library that wraps around `claudette` to make generating synthetic data easy." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "`fastdata` is a minimal library for generating synthetic data for training deep learning models. For example, below is how you can generate a dataset to train a language model to\n", 19 | "translate from English to Spanish.\n", 20 | "\n", 21 | "First you need to define the structure of the data you want to generate. `claudette`, which is the library that fastdata uses to generate data, requires you to define the schema of the data you want to generate." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from fastcore.utils import *" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*" 42 | ] 43 | }, 44 | "execution_count": null, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "class Translation():\n", 51 | " \"Translation from an English phrase to a Spanish phrase\"\n", 52 | " def __init__(self, english: str, spanish: str): store_attr()\n", 53 | " def __repr__(self): return f\"{self.english} ➡ *{self.spanish}*\"\n", 54 | "\n", 55 | "Translation(\"Hello, how are you today?\", \"Hola, ¿cómo estás hoy?\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Next, you need to define the prompt that will be used to generate the data and any inputs you want to pass to the prompt." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "prompt_template = \"\"\"\\\n", 72 | "Generate English and Spanish translations on the following topic:\n", 73 | "{topic}\n", 74 | "\"\"\"\n", 75 | "\n", 76 | "inputs = [{\"topic\": \"Otters are cute\"}, {\"topic\": \"I love programming\"}]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Finally, we can generate some data with fastdata.\n", 84 | "\n", 85 | "::: {.callout-note}\n", 86 | "We only support Anthropic models at the moment. 
Therefore, make sure you have an API key for the model you want to use and the proper environment variables set or pass the api key to the `FastData` class `FastData(api_key=\"sk-ant-api03-...\")`.\n", 87 | ":::" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from fastdata.core import FastData" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stderr", 106 | "output_type": "stream", 107 | "text": [ 108 | "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.57it/s]\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n", 114 | "translations = fast_data.generate(\n", 115 | " prompt_template=prompt_template,\n", 116 | " inputs=inputs,\n", 117 | " schema=Translation,\n", 118 | ")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "from IPython.display import Markdown" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/markdown": [ 138 | "- I love programming ➡ *Me encanta la programación*\n", 139 | "- Otters are cute ➡ *Las nutrias son lindas*" 140 | ], 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": null, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "Markdown(\"\\n\".join(f'- {t}' for t in translations))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Installation" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Install latest from the 
GitHub [repository][repo]:\n", 166 |     "\n", 167 |     "```sh\n", 168 |     "$ pip install git+https://github.com/AnswerDotAI/fastdata.git\n", 169 |     "```\n", 170 |     "\n", 171 |     "or from [pypi][pypi]:\n", 172 |     "\n", 173 |     "\n", 174 |     "```sh\n", 175 |     "$ pip install python-fastdata\n", 176 |     "```\n", 177 |     "\n", 178 |     "[repo]: https://github.com/AnswerDotAI/fastdata\n", 179 |     "[docs]: https://AnswerDotAI.github.io/fastdata/\n", 180 |     "[pypi]: https://pypi.org/project/python-fastdata/" 181 |    ] 182 |   }, 183 |   { 184 |    "cell_type": "markdown", 185 |    "metadata": {}, 186 |    "source": [ 187 |     "If you'd like to see how best to generate data with fastdata, check out our blog post [here](https://www.answer.ai/blog/introducing-fastdata) and some of the examples in the [examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples) directory." 188 |    ] 189 |   }, 190 |   { 191 |    "cell_type": "markdown", 192 |    "metadata": {}, 193 |    "source": [ 194 |     "## Developer Guide" 195 |    ] 196 |   }, 197 |   { 198 |    "cell_type": "markdown", 199 |    "metadata": {}, 200 |    "source": [ 201 |     "If you are new to using `nbdev` here are some useful pointers to get you started."
202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Install fastdata in Development mode" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "```sh\n", 216 | "# make sure fastdata package is installed in development mode\n", 217 | "$ pip install -e .\n", 218 | "\n", 219 | "# make changes under nbs/ directory\n", 220 | "# ...\n", 221 | "\n", 222 | "# compile to have changes apply to fastdata\n", 223 | "$ nbdev_prepare\n", 224 | "```" 225 | ] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "python3", 231 | "language": "python", 232 | "name": "python3" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 4 237 | } 238 | -------------------------------------------------------------------------------- /nbs/nbdev.yml: -------------------------------------------------------------------------------- 1 | project: 2 | output-dir: _docs 3 | 4 | website: 5 | title: "fastdata" 6 | site-url: "https://AnswerDotAI.github.io/fastdata" 7 | description: "Easiest and fastest way to 1B synthetic tokens" 8 | repo-branch: main 9 | repo-url: "https://github.com/AnswerDotAI/fastdata" 10 | -------------------------------------------------------------------------------- /nbs/sidebar.yml: -------------------------------------------------------------------------------- 1 | website: 2 | sidebar: 3 | contents: 4 | - index.ipynb 5 | - 00_core.ipynb 6 | -------------------------------------------------------------------------------- /nbs/styles.css: -------------------------------------------------------------------------------- 1 | .cell { 2 | margin-bottom: 1rem; 3 | } 4 | 5 | .cell > .sourceCode { 6 | margin-bottom: 0; 7 | } 8 | 9 | .cell-output > pre { 10 | margin-bottom: 0; 11 | } 12 | 13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 14 | margin-left: 0.8rem; 15 | margin-top: 0; 16 | background: none; 17 | 
border-left: 2px solid lightsalmon; 18 | border-top-left-radius: 0; 19 | border-top-right-radius: 0; 20 | } 21 | 22 | .cell-output > .sourceCode { 23 | border: none; 24 | } 25 | 26 | .cell-output > .sourceCode { 27 | background: none; 28 | margin-top: 0; 29 | } 30 | 31 | div.description { 32 | padding-left: 2px; 33 | padding-top: 5px; 34 | font-style: italic; 35 | font-size: 135%; 36 | opacity: 70%; 37 | } 38 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | repo = fastdata 3 | lib_name = python-fastdata 4 | version = 0.0.5 5 | min_python = 3.9 6 | license = apache2 7 | black_formatting = False 8 | doc_path = _docs 9 | lib_path = fastdata 10 | nbs_path = nbs 11 | recursive = True 12 | tst_flags = notest 13 | put_version_in_init = True 14 | branch = main 15 | custom_sidebar = False 16 | doc_host = https://AnswerDotAI.github.io 17 | doc_baseurl = /fastdata 18 | git_url = https://github.com/AnswerDotAI/fastdata 19 | title = fastdata 20 | audience = Developers 21 | author = ncoop57 22 | author_email = nc@answer.ai 23 | copyright = 2024 onwards, ncoop57 24 | description = Easiest and fastest way to 1B synthetic tokens 25 | keywords = nbdev jupyter notebook python 26 | language = English 27 | status = 3 28 | user = AnswerDotAI 29 | requirements = claudette fastcore ratelimit tqdm huggingface_hub 30 | dev_requirements = black datasets ipykernel nbdev 31 | readme_nb = index.ipynb 32 | allowed_metadata_keys = 33 | allowed_cell_metadata_keys = 34 | jupyter_hooks = False 35 | clean_ids = True 36 | clear_all = False 37 | cell_number = True 38 | skip_procs 
= 39 | 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools, shlex 4 | assert parse_version(setuptools.__version__)>=parse_version('36.2') 5 | 6 | # note: all settings are in settings.ini; edit there, not here 7 | config = ConfigParser(delimiters=['=']) 8 | config.read('settings.ini', encoding='utf-8') 9 | cfg = config['DEFAULT'] 10 | 11 | cfg_keys = 'version description keywords author author_email'.split() 12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o) 14 | setup_cfg = {o:cfg[o] for o in cfg_keys} 15 | 16 | licenses = { 17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 22 | } 23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 25 | py_versions = '3.6 3.7 3.8 3.9 3.10 3.11 3.12'.split() 26 | 27 | requirements = shlex.split(cfg.get('requirements', '')) 28 | if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', '')) 29 | min_python = cfg['min_python'] 30 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 31 | dev_requirements = (cfg.get('dev_requirements') or '').split() 32 | 33 | package_data = dict() 34 | pkg_data = cfg.get('package_data', None) 35 | if pkg_data: 36 | package_data[cfg['lib_name']] = 
pkg_data.split() # split as multiple files might be listed 37 | # Add package data to setup_cfg for setuptools.setup(..., **setup_cfg) 38 | setup_cfg['package_data'] = package_data 39 | 40 | setuptools.setup( 41 | name = cfg['lib_name'], 42 | license = lic[0], 43 | classifiers = [ 44 | 'Development Status :: ' + statuses[int(cfg['status'])], 45 | 'Intended Audience :: ' + cfg['audience'].title(), 46 | 'Natural Language :: ' + cfg['language'].title(), 47 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), 48 | url = cfg['git_url'], 49 | packages = setuptools.find_packages(), 50 | include_package_data = True, 51 | install_requires = requirements, 52 | extras_require={ 'dev': dev_requirements }, 53 | dependency_links = cfg.get('dep_links','').split(), 54 | python_requires = '>=' + cfg['min_python'], 55 | long_description = open('README.md', encoding='utf-8').read(), 56 | long_description_content_type = 'text/markdown', 57 | zip_safe = False, 58 | entry_points = { 59 | 'console_scripts': cfg.get('console_scripts','').split(), 60 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] 61 | }, 62 | **setup_cfg) 63 | 64 | 65 | --------------------------------------------------------------------------------