├── .github
└── workflows
│ ├── deploy.yaml
│ └── test.yaml
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples
├── README.md
├── push_to_hf.py
├── requirements.txt
├── tiny_programs.py
└── train_model.py
├── fastdata
├── __init__.py
├── _modidx.py
└── core.py
├── nbs
├── 00_core.ipynb
├── _quarto.yml
├── index.ipynb
├── nbdev.yml
├── sidebar.yml
└── styles.css
├── pyproject.toml
├── settings.ini
└── setup.py
/.github/workflows/deploy.yaml:
--------------------------------------------------------------------------------
1 | name: Deploy to GitHub Pages
2 |
3 | permissions:
4 | contents: write
5 | pages: write
6 |
7 | on:
8 | push:
9 | branches: [ "main", "master" ]
10 | workflow_dispatch:
11 | jobs:
12 | deploy:
13 | runs-on: ubuntu-latest
14 | steps: [uses: fastai/workflows/quarto-ghp@master]
15 |
--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [workflow_dispatch, pull_request, push]
3 |
4 | jobs:
5 | test:
6 | runs-on: ubuntu-latest
7 | steps: [uses: fastai/workflows/nbdev-ci@master]
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _docs/
2 | _proc/
3 |
4 | *.bak
5 | .gitattributes
6 | .last_checked
7 | .gitconfig
8 | *.bak
9 | *.log
10 | *~
11 | ~*
12 | _tmp*
13 | tmp*
14 | tags
15 | *.pkg
16 |
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 |
22 | # C extensions
23 | *.so
24 |
25 | # Distribution / packaging
26 | .Python
27 | env/
28 | build/
29 | conda/
30 | develop-eggs/
31 | dist/
32 | downloads/
33 | eggs/
34 | .eggs/
35 | lib/
36 | lib64/
37 | parts/
38 | sdist/
39 | var/
40 | wheels/
41 | *.egg-info/
42 | .installed.cfg
43 | *.egg
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | .tox/
58 | .coverage
59 | .coverage.*
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 | *.cover
64 | .hypothesis/
65 |
66 | # Translations
67 | *.mo
68 | *.pot
69 |
70 | # Django stuff:
71 | *.log
72 | local_settings.py
73 |
74 | # Flask stuff:
75 | instance/
76 | .webassets-cache
77 |
78 | # Scrapy stuff:
79 | .scrapy
80 |
81 | # Sphinx documentation
82 | docs/_build/
83 |
84 | # PyBuilder
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # dotenv
100 | .env
101 |
102 | # virtualenv
103 | .venv
104 | venv/
105 | ENV/
106 |
107 | # Spyder project settings
108 | .spyderproject
109 | .spyproject
110 |
111 | # Rope project settings
112 | .ropeproject
113 |
114 | # mkdocs documentation
115 | /site
116 |
117 | # mypy
118 | .mypy_cache/
119 |
120 | .vscode
121 | *.swp
122 |
123 | # osx generated files
124 | .DS_Store
125 | .DS_Store?
126 | .Trashes
127 | ehthumbs.db
128 | Thumbs.db
129 | .idea
130 |
131 | # pytest
132 | .pytest_cache
133 |
134 | # tools/trust-doc-nbs
135 | docs_src/.last_checked
136 |
137 | # symlinks to fastai
138 | docs_src/fastai
139 | tools/fastai
140 |
141 | # link checker
142 | checklink/cookies.txt
143 |
144 | # .gitconfig is now autogenerated
145 | .gitconfig
146 |
147 | # Quarto installer
148 | .deb
149 | .pkg
150 |
151 | # Quarto
152 | .quarto
153 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Release notes
2 |
3 |
4 |
5 | ## 0.0.4
6 |
7 | ### Features Added
8 |
9 | - Added `generate_to_hf` which automatically saves generations to Hugging Face dataset repository ([#9](https://github.com/AnswerDotAI/fastdata/pull/9))
10 |
11 | ### Bugs Squashed
12 |
13 | - When using max_workers > 1, the order of the returned synthetic data is not the same as the order of inputs given ([#7](https://github.com/AnswerDotAI/fastdata/issues/7))
14 |
15 |
16 | ## 0.0.3
17 | - Update readme
18 |
19 |
20 |
21 | ## 0.0.1
22 |
23 | - initial release
24 |
25 |
26 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | Make sure you have read the [doc on code style](
4 | https://docs.fast.ai/dev/style.html) first. (Note that we don't follow PEP8, but instead follow a coding style designed specifically for numerical and interactive programming.)
5 |
6 | This project uses [nbdev](https://nbdev.fast.ai/getting_started.html) for development. Before beginning, make sure that nbdev and a jupyter-compatible client such as jupyterlab or nbclassic are installed. To make changes to the codebase, update the notebooks in the `nbs` folder, not the .py files directly. Then, run `nbdev_export`. For more details, have a look at the [nbdev tutorial](https://nbdev.fast.ai/tutorials/tutorial.html).
7 |
8 | You may want to set up a `prep` alias in `~/.zshrc` or other shell startup file:
9 |
10 | ```sh
11 | alias prep='nbdev_export && nbdev_clean && nbdev_trust'
12 | ```
13 |
14 | Run `prep` before each commit to ensure your python files are up to date, and your notebooks are cleaned of metadata and notarized.
15 |
16 | ## Updating README.md
17 |
18 | Similar to updating Python source code files, to update the `README.md` file you will need to edit a notebook file, specifically `nbs/index.ipynb`.
19 |
20 | However, there are a couple of extra dependencies that you need to install first in order to make this work properly. Go to the directory you cloned the repo to, and type:
21 |
22 | ```
23 | pip install -e '.[dev]'
24 | ```
25 |
26 | And install quarto too:
27 |
28 | ```
29 | nbdev_install_quarto
30 | ```
31 |
32 | Then, after you make subsequent changes to `nbs/index.ipynb`, run the following from the repo's root directory to (re)build `README.md`:
33 |
34 | ```
35 | nbdev_readme
36 | ```
37 |
38 | ## Did you find a bug?
39 |
40 | * Ensure the bug was not already reported by searching on GitHub under Issues.
41 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring.
42 | * Be sure to add the complete error messages.
43 |
44 | ### Did you write a patch that fixes a bug?
45 |
46 | * Open a new GitHub pull request with the patch.
47 | * Ensure that your PR includes a test that fails without your patch, and passes with it.
48 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
49 |
50 | ## PR submission guidelines
51 |
52 | * Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needed to keep each PR focused.
53 | * Do not mix style changes/fixes with "functional" changes. It's very difficult to review such PRs and will most likely get rejected.
54 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can.
55 | * Do not turn an already-submitted PR into your development playground. If after you submit a PR, you discover that more work is needed: close the PR, do the required work, and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project.
56 | * If, however, you submit a PR and receive a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exception case where you realize it'll take many many commits to complete the requests, then it's probably best to close the PR, do the work, and then submit it again. Use common sense where you'd choose one way over another.
57 |
58 | ## Do you want to contribute to the documentation?
59 |
60 | * Docs are automatically created from the notebooks in the nbs folder.
61 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2022, fastai
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include settings.ini
2 | include LICENSE
3 | include CONTRIBUTING.md
4 | include README.md
5 | recursive-exclude * __pycache__
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # fastdata
2 |
3 |
4 |
5 |
6 | `fastdata` is a minimal library for generating synthetic data for
7 | training deep learning models. For example, below is how you can
8 | generate a dataset to train a language model to translate from English
9 | to Spanish.
10 |
11 | First you need to define the structure of the data you want to generate.
12 | `claudette`, which is the library that fastdata uses to generate data,
13 | requires you to define the schema of the data you want to generate.
14 |
15 | ``` python
16 | from fastcore.utils import *
17 | ```
18 |
19 | ``` python
20 | class Translation():
21 | "Translation from an English phrase to a Spanish phrase"
22 | def __init__(self, english: str, spanish: str): store_attr()
23 | def __repr__(self): return f"{self.english} ➡ *{self.spanish}*"
24 |
25 | Translation("Hello, how are you today?", "Hola, ¿cómo estás hoy?")
26 | ```
27 |
28 | Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*
29 |
30 | Next, you need to define the prompt that will be used to generate the
31 | data and any inputs you want to pass to the prompt.
32 |
33 | ``` python
34 | prompt_template = """\
35 | Generate English and Spanish translations on the following topic:
36 | {topic}
37 | """
38 |
39 | inputs = [{"topic": "Otters are cute"}, {"topic": "I love programming"}]
40 | ```
41 |
42 | Finally, we can generate some data with fastdata.
43 |
44 | > [!NOTE]
45 | >
46 | > We only support Anthropic models at the moment. Therefore, make sure
47 | > you have an API key for the model you want to use and the proper
48 | > environment variables set or pass the api key to the
49 | > [`FastData`](https://AnswerDotAI.github.io/fastdata/core.html#fastdata)
50 | > class `FastData(api_key="sk-ant-api03-...")`.
51 |
52 | ``` python
53 | from fastdata.core import FastData
54 | ```
55 |
56 | ``` python
57 | fast_data = FastData(model="claude-3-haiku-20240307")
58 | translations = fast_data.generate(
59 | prompt_template=prompt_template,
60 | inputs=inputs,
61 | schema=Translation,
62 | )
63 | ```
64 |
65 | 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.57it/s]
66 |
67 | ``` python
68 | from IPython.display import Markdown
69 | ```
70 |
71 | ``` python
72 | Markdown("\n".join(f'- {t}' for t in translations))
73 | ```
74 |
75 | - I love programming ➡ *Me encanta la programación*
76 | - Otters are cute ➡ *Las nutrias son lindas*
77 |
78 | ### Installation
79 |
80 | Install latest from the GitHub
81 | [repository](https://github.com/AnswerDotAI/fastdata):
82 |
83 | ``` sh
84 | $ pip install git+https://github.com/AnswerDotAI/fastdata.git
85 | ```
86 |
87 | or from [pypi](https://pypi.org/project/fastdata/)
88 |
89 | ``` sh
90 | $ pip install python-fastdata
91 | ```
92 |
93 | If you’d like to see how best to generate data with fastdata, check out
94 | our blog post [here](https://www.answer.ai/blog/introducing-fastdata)
95 | and some of the examples in the
96 | [examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples)
97 | directory.
98 |
99 | ## Developer Guide
100 |
101 | If you are new to using `nbdev` here are some useful pointers to get you
102 | started.
103 |
104 | ### Install fastdata in Development mode
105 |
106 | ``` sh
107 | # make sure fastdata package is installed in development mode
108 | $ pip install -e .
109 |
110 | # make changes under nbs/ directory
111 | # ...
112 |
113 | # compile to have changes apply to fastdata
114 | $ nbdev_prepare
115 | ```
116 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # TinyPrograms
2 |
3 | This experiment was inspired by the paper [TinyStories: How Small Can Language Models Be and Still Speak Coherent English?](https://arxiv.org/abs/2305.07759). The authors used OpenAI GPT models to generate simple, synthetic children's books. They showed that training on this data allowed extremely tiny models (~10M parameters) to outperform 10x larger models. TinyPrograms is the coding equivalent. It has ~1,000 tiny python programs generated by Anthropic's Haiku model. I decided to deviate from the original paper by finetuning a model rather than train from scratch since I had such a small dataset. I used the following definition of what a tiny program is when using `fastdata` to generate them:
4 |
5 | ```python
6 | class TinyProgram(BasicRepr):
7 | """
8 | A tiny program that is a valid python program that satisfies the requirements.
9 | """
10 | def __init__(
11 | self,
12 | requirements: str, # A description of the requirements for the program to help the persona.
13 | code: str, # The code that satisfies the requirements.
14 | ): store_attr()
15 | ```
16 |
17 | I also followed the advice in the blog post that accompanies this experiment, "How To ~~Train~~ Synthesize Your ~~Dragon~~ Data." It includes examples of the tiny programs I want. I also used the recently released PersonaHub to create a diverse set of tiny programs. Here is what the prompt looks like:
18 |
19 | ```python
20 | prompt_template = """\
21 |
22 | {examples}
23 | {persona}
27 | """
28 | ```
29 |
30 | Here is one of the programs that Haiku created:
31 |
32 | #### Persona
33 |
34 | ```
35 | A striker who admires the player's defensive prowess and values their support on the field
36 | ```
37 |
38 | #### Requirements
39 |
40 | ```
41 | A Python program that simulates a soccer game between a striker and a defensive player. The striker admires the defensive player's prowess and values their support on the field. The program should model the interactions between the striker and the defensive player, demonstrating how the defensive player's support enables the striker to be more effective in scoring goals.
42 | ```
43 |
44 | #### Program
45 |
46 | ```python
47 | import random
48 |
49 | class SoccerPlayer:
50 |     def __init__(self, name, position):
51 | self.name = name
52 | self.position = position
53 | self.skill_level = random.randint(1, 10)
54 |
55 | class Striker(SoccerPlayer):
56 |     def __init__(self, name):
57 | super().__init__(name, "Striker")
58 | self.scoring_chance = 0.5
59 |
60 | def shoot(self, defender):
61 | if defender.skill_level >= self.skill_level:
62 | print(f"{self.name} shoots, but {defender.name} blocks the shot!")
63 | else:
64 | if random.random() < self.scoring_chance:
65 | print(f"{self.name} scores a goal!")
66 | else:
67 | print(f"{self.name} misses the shot.")
68 |
69 | class Defender(SoccerPlayer):
70 |     def __init__(self, name):
71 | super().__init__(name, "Defender")
72 | self.support_level = 7
73 |
74 | def support(self, striker):
75 | striker.scoring_chance += self.support_level / 100
76 | print(f"{self.name} provides support to {striker.name}, increasing their scoring chance.")
77 |
78 | def simulate_game():
79 | striker = Striker("Alex")
80 | defender = Defender("Sarah")
81 | print(f"{striker.name} (Striker) vs. {defender.name} (Defender)")
82 |
83 | for _ in range(5):
84 | defender.support(striker)
85 | striker.shoot(defender)
86 | print()
87 |
88 | simulate_game()
89 | ```
90 |
91 | ### The Experiment
92 |
93 | I took TinyPrograms and tried to finetune a strong LLM model to see if I could improve its coding ability. I used Huggingface's awesome SmolLM-360M. It's small and works well on coding tasks. Out of the box, SmolLM-360M scores 11.6% on a popular coding test called HumanEval. I created 5 configurations of datasets to test which improves my model the most:
94 |
95 | 1. The first one is simply the 992 tiny Python programs.
96 | 2. The second is 992 Python files that have been taken from the popular Stack dataset.
97 | 3. The third is a high-quality, filtered version of the tiny Python programs. It uses an LLM to score the programs based on a rubric.
98 | 4. The fourth is the same as the third, but on the Python files taken from the Stack.
99 | 5. Finally, the fifth mixes half of the high quality filtered tiny Python programs and the high quality filtered Python files from the Stack.
100 |
101 | ## Filtering for Quality
102 |
103 | To filter the tiny programs, I used `fastdata`
104 |
105 | ```python
106 | class TinyProgramCritique(BasicRepr):
107 | """
108 | A critique of a tiny program.
109 | """
110 | def __init__(
111 | self,
112 | critique: str, # A critique of the code.
113 | score: Literal[1, 2, 3, 4, 5], # A score of the code from 1 to 5.
114 | ): store_attr()
115 | ```
116 |
117 | And here is the prompt I used to guide the model to generate a score:
118 |
119 | ```python
120 | critique_template = """\
121 | Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:
122 |
123 | - Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language.
124 | - Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices.
125 | - Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand.
126 | - Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically.
127 | - Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points.
128 |
129 | The code snippet:
130 |
131 | {code}
132 |
133 |
134 | After examining the code:
135 | - Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners.
136 | - Conclude with the score.
137 | """
138 | ```
139 |
140 | This is the distribution of the scores for the 992 tiny Python programs:
141 |
142 | | Score | Count |
143 | |-------|-------|
144 | | 1 | 25 |
145 | | 2 | 117 |
146 | | 3 | 96 |
147 | | 4 | 256 |
148 | | 5 | 498 |
149 |
150 | And here is the same for 10,000 of the Python files:
151 |
152 | | Score | Count |
153 | |-------|-------|
154 | | 1 | 2239 |
155 | | 2 | 5230 |
156 | | 3 | 1545 |
157 | | 4 | 618 |
158 | | 5 | 236 |
159 |
160 | I only kept a score of 4 and 5 as high quality data for both the tiny python programs and python files from the Stack.
161 |
162 | ### Results
163 |
164 | | Setup | pass@1 |
165 | |---------|--------|
166 | | Baseline | 11.6% |
167 | | TinyPrograms | 09.1% |
168 | | The Stack | 11.0% |
169 | | TinyPrograms Filtered | 12.2% |
170 | | The Stack Filtered | 08.5% |
171 | | Mixed Filtered | 09.8% |
172 |
173 | ### Key findings from the experiment:
174 |
175 | 1. Training on synthetic data is better than training on random GitHub programs when performing quality filtering, i.e., TinyPrograms Filtered vs The Stack Filtered.
176 | 2. Only high-quality synthetic data (TinyPrograms Filtered) improves performance over the baseline.
177 | 3. All other setups degrade performance. High-quality Python files from the Stack show the biggest drop. This warrants further investigation. Possible explanations include:
178 | - The scoring system may not be as effective for GitHub programs as it is for synthetic ones.
179 | - There might be a lack of diversity in the GitHub programs.
180 |
181 | For further exploration, I encourage you to:
182 | 1. Replicate this experiment with your own task.
183 | 2. Experiment with larger datasets to see how they affect model performance.
184 | 3. Share your findings with the community and reach out if you need help!
185 |
186 | To do this yourself, follow the rest of this README. It shows how to reproduce my results and serves as a starting point for your project.
187 |
188 | ## Install
189 |
190 | Make sure you have installed `fastdata` with the following command from the root of the repo:
191 |
192 | ```bash
193 | pip install -e .
194 | ```
195 |
196 | If you want to train a model, install the following dependencies in the `examples` folder:
197 |
198 | ```bash
199 | pip install -r requirements.txt
200 | ```
201 |
202 | Then run the following if you will use flash attention:
203 |
204 | ```bash
205 | pip install flash-attn --no-build-isolation
206 | ```
207 |
208 | ## Run
209 |
210 | ### Data Synthesis
211 |
212 | We have a script to generate our tiny programs dataset. It can be run with this command:
213 |
214 | ```bash
215 | python tiny_programs.py
216 | ```
217 |
218 | You can see all the command-line arguments by running:
219 |
220 | ```bash
221 | python tiny_programs.py --help
222 | ```
223 |
224 | ### Training
225 |
226 | To train a model, you can use the following command:
227 |
228 | ```bash
229 | python train_model.py
230 | ```
231 |
232 | You can view all the command-line arguments by executing the following command:
233 |
234 | ```bash
235 | python train_model.py --help
236 | ```
237 |
--------------------------------------------------------------------------------
/examples/push_to_hf.py:
--------------------------------------------------------------------------------
1 | from fastcore.utils import *
2 | from fastdata.core import FastData
3 |
4 |
class Translation:
    "Translation from an English phrase to a Spanish phrase"

    def __init__(self, english: str, spanish: str):
        # Keep both sides of the translation pair on the instance.
        self.english, self.spanish = english, spanish

    def __repr__(self):
        # Render as "<english> ➡ *<spanish>*" with the Spanish side emphasized.
        pair = (self.english, self.spanish)
        return "{} ➡ *{}*".format(*pair)
14 |
15 |
# Prompt template: {topic} is filled from each input dict below.
prompt_template = """\
Generate English and Spanish translations on the following topic:
{topic}
"""

# One generation request per topic.
inputs = [
    {"topic": "I am going to the beach this weekend"},
    {"topic": "I am going to the gym after work"},
    {"topic": "I am going to the park with my kids"},
    {"topic": "I am going to the movies with my friends"},
    {"topic": "I am going to the store to buy some groceries"},
    {"topic": "I am going to the library to read some books"},
    {"topic": "I am going to the zoo to see the animals"},
    {"topic": "I am going to the museum to see the art"},
    {"topic": "I am going to the restaurant to eat some food"},
]

# Rate-limited client for the generation model.
fast_data = FastData(model="claude-3-haiku-20240307")
dataset_name = "my_dataset"

# Generate Translation records and push them to a Hugging Face dataset repo,
# writing at most 4 records per uploaded shard file.
repo_id, translations = fast_data.generate_to_hf(
    prompt_template=prompt_template,
    inputs=inputs,
    schema=Translation,
    repo_id=dataset_name,
    max_items_per_file=4,
)
# Fix: corrected grammar of the user-facing status message
# ("has been create on" -> "has been created at").
print(f"A new repository has been created at {repo_id}")
print(translations)
45 |
--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets
2 | minai
3 | torcheval
--------------------------------------------------------------------------------
/examples/tiny_programs.py:
--------------------------------------------------------------------------------
1 | from datasets import Dataset, load_dataset
2 | from fastdata.core import FastData
3 | from fastcore.script import *
4 | from fastcore.utils import *
5 |
class TinyProgram(BasicRepr):
    "A tiny program that is a valid python program that satisfies the requirements."
    # `store_attr` (fastcore) saves both ctor args on `self` and records them in
    # `__stored_args__`, which BasicRepr uses for repr and which
    # FastData._save_results uses for JSON serialization.
    def __init__(self,requirements: str,code: str): store_attr()
9 |
# Two hand-written few-shot examples (requirements + code) shown to the model
# when generating new tiny programs; each renders via BasicRepr in the prompt.
examples = [
    TinyProgram(
        requirements="A Python-based data aggregation and analysis tool that scrapes key Salvadoran news websites and government portals for the latest political updates, election results, and policy changes. The program would use standard libraries like requests for web scraping, re for text parsing, and pandas for data manipulation. It would store the collected information in a structured format, perform basic sentiment analysis on news articles, and generate a daily summary report highlighting significant political events, trending topics, and shifts in public opinion. The tool could also track mentions of key political figures and parties, providing a quick overview of their media presence and associated sentiments.",
        code="""\
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
from collections import Counter
import datetime

def scrape_news(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', class_='article-item')

    news_data = []
    for article in articles:
        title = article.find('h2', class_='article-title').text.strip()
        summary = article.find('p', class_='article-summary').text.strip()
        news_data.append({'title': title, 'summary': summary})

    return news_data

def analyze_sentiment(text):
    return TextBlob(text).sentiment.polarity

def generate_report(data):
    df = pd.DataFrame(data)
    df['sentiment'] = df['summary'].apply(analyze_sentiment)

    # Calculate average sentiment
    avg_sentiment = df['sentiment'].mean()

    # Find most mentioned words
    all_words = ' '.join(df['title'] + ' ' + df['summary']).lower().split()
    word_freq = Counter(word for word in all_words if len(word) > 3)
    top_words = word_freq.most_common(5)

    # Generate report
    report = f"Daily Political Analysis Report for El Salvador - {datetime.date.today()}\n\n"
    report += f"Number of articles analyzed: {len(df)}\n"
    report += f"Average sentiment: {'Positive' if avg_sentiment > 0 else 'Negative'} ({avg_sentiment:.2f})\n\n"
    report += "Top mentioned words:\n"
    for word, count in top_words:
        report += f"- {word}: {count} times\n"

    report += "\nMost positive article:\n"
    pos_article = df.loc[df['sentiment'].idxmax()]
    report += f"Title: {pos_article['title']}\nSentiment: {pos_article['sentiment']:.2f}\n\n"

    report += "Most negative article:\n"
    neg_article = df.loc[df['sentiment'].idxmin()]
    report += f"Title: {neg_article['title']}\nSentiment: {neg_article['sentiment']:.2f}\n"

    return report

def main():
    url = "https://www.elsalvador.com/noticias/nacional/" # Example Salvadoran news website
    news_data = scrape_news(url)
    report = generate_report(news_data)
    print(report)

    # Optionally, save the report to a file
    with open(f"el_salvador_political_report_{datetime.date.today()}.txt", "w") as f:
        f.write(report)

if __name__ == "__main__":
    main()
```
"""
    ),
    TinyProgram(
        requirements="A \"Joke Personalizer\" program that takes a comedian's standard jokes as input and automatically modifies them to include inside references and shared memories from their school days. The program should use a dictionary of preset keywords (e.g., \"cafeteria\", \"Mr. Johnson's class\") and replace generic terms in the jokes with these personalized references. It should also have a \"nostalgia meter\" that rates how many childhood callbacks are in each joke, and a \"groan factor\" estimator based on the corniness of the puns. Bonus feature: a \"detention probability calculator\" that humorously predicts how likely the joke would have gotten them in trouble back in school.",
        code="""\
```python
import random
import re

class JokePersonalizer:
    def __init__(self):
        self.inside_references = {
            "restaurant": "cafeteria",
            "teacher": "Mr. Johnson",
            "friend": "Danny 'Braces' Smith",
            "car": "rusty old bike",
            "mall": "corner store",
            "party": "detention",
            "cool": "totally radical",
            "phone": "Gameboy",
            "computer": "TI-83 calculator",
            "internet": "library encyclopedia"
        }
        self.pun_words = ["cheesy", "corny", "hilarious", "side-splitting", "knee-slapping"]

    def personalize_joke(self, joke):
        for generic, personal in self.inside_references.items():
            joke = re.sub(r'\b' + generic + r'\b', personal, joke, flags=re.IGNORECASE)
        return joke

    def nostalgia_meter(self, joke):
        count = sum(1 for ref in self.inside_references.values() if ref.lower() in joke.lower())
        return min(count * 20, 100) # 20 points per reference, max 100

    def groan_factor(self, joke):
        pun_count = sum(1 for word in self.pun_words if word.lower() in joke.lower())
        return min(pun_count * 25, 100) # 25 points per pun word, max 100

    def detention_probability(self, joke):
        naughty_words = ["detention", "trouble", "principal's office", "suspended"]
        probability = sum(10 for word in naughty_words if word.lower() in joke.lower())
        return min(probability, 100) # 10% per naughty word, max 100%

    def process_joke(self, original_joke):
        personalized_joke = self.personalize_joke(original_joke)
        nostalgia = self.nostalgia_meter(personalized_joke)
        groan = self.groan_factor(personalized_joke)
        detention_prob = self.detention_probability(personalized_joke)

        return {
            "original": original_joke,
            "personalized": personalized_joke,
            "nostalgia_rating": nostalgia,
            "groan_factor": groan,
            "detention_probability": detention_prob
        }

# Example usage
personalizer = JokePersonalizer()

jokes = [
    "I went to a restaurant last night and had the best meal ever!",
    "My teacher asked me to stay after class, it was so cool!",
    "I threw a party and nobody came. It was a real phone-y situation!",
]

for joke in jokes:
    result = personalizer.process_joke(joke)
    print(f"Original: {result['original']}")
    print(f"Personalized: {result['personalized']}")
    print(f"Nostalgia Rating: {result['nostalgia_rating']}%")
    print(f"Groan Factor: {result['groan_factor']}%")
    print(f"Detention Probability: {result['detention_probability']}%")
    print()
```
"""
    ),
]
# Flatten to a single bullet-list string for interpolation into the
# {examples} slot of the generation prompt (rebinds the name on purpose).
examples = "\n".join(f"- {example}" for example in examples)
160 |
class TinyProgramCritique(BasicRepr):
    "A critique of a tiny program."
    # `critique` is the model's free-text justification; `score` is the 1-5
    # educational-value rating defined in the critique prompt. `store_attr`
    # records both for BasicRepr and for JSON serialization.
    def __init__(self,critique: str,score: int): store_attr()
164 |
def load_personas(num_personas: int = 1_000):
    "Return the first `num_personas` persona strings from the PersonaHub dataset."
    persona_ds = load_dataset("proj-persona/PersonaHub", "persona", split='train')
    subset = persona_ds.select(range(num_personas))
    return subset['persona']
167 |
def generate_tiny_programs(fast_data, personas, examples, sp):
    # Prompt filled once per persona; `examples` is the pre-rendered bullet
    # list of few-shot TinyPrograms built at module level.
    # NOTE(review): the template below contains only bare placeholders -- any
    # surrounding instruction text appears to have been lost; confirm against
    # the intended prompt before relying on output quality.
    prompt_template = """\

{examples}
{persona}
"""
    tiny_programs = fast_data.generate(
        prompt_template=prompt_template,
        inputs=[{"persona": persona, "examples": examples} for persona in personas],
        schema=TinyProgram,
        sp=sp
    )
    # Drop failed generations: FastData.generate returns None for inputs
    # whose API call or prompt formatting raised.
    return [t for t in tiny_programs if t is not None]
184 |
def generate_critiques(fast_data, tiny_programs, sp):
    # Additive 5-point rubric prompt; {code} is filled with each program's
    # code and the model returns a TinyProgramCritique (text + integer score).
    critique_template = """\
Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:

- Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language.
- Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices.
- Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand.
- Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically.
- Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points.

The code snippet:

{code}


After examining the code:

- Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners.
- Conclude with the score.
"""
    # One critique request per surviving program; failures come back as None
    # and are filtered later in update_programs_with_critiques.
    return fast_data.generate(
        prompt_template=critique_template,
        inputs=[{"code": t.code} for t in tiny_programs],
        schema=TinyProgramCritique,
        sp=sp
    )
211 |
def update_programs_with_critiques(tiny_programs, critiques):
    "Pair each program with its critique as a plain dict, skipping pairs where either side is None."
    def _merge(prog, crit):
        # Flatten one (program, critique) pair into a single record.
        return {
            "requirements": prog.requirements,
            "code": prog.code,
            "critique": crit.critique,
            "score": crit.score
        }
    return [_merge(p, c) for p, c in zip(tiny_programs, critiques)
            if p is not None and c is not None]
223 |
@call_parse
def main(num_personas: Param("Number of personas to use", int) = 1_000,
         program_model: Param("Model to use for generating tiny programs", str) = "claude-3-haiku-20240307",
         program_sp: Param("System prompt for generating tiny programs", str) = "You are a helpful assistant for generating python programs.",
         critique_model: Param("Model to use for generating critiques", str) = "claude-3-5-sonnet-20240620",
         critique_sp: Param("System prompt for critiquing tiny programs", str) = "You are a helpful assistant tasked with critiquing python programs.",
         output_dataset: Param("Name of the output dataset", str) = "answerdotai/tiny_programs",
         private: Param("Whether to make the output dataset private", bool) = True):
    "Generate persona-based tiny programs, score them, and push the merged dataset to the Hugging Face Hub."

    # Generate programs with `program_model`...
    fast_data = FastData(model=program_model)
    personas = load_personas(num_personas)
    tiny_programs = generate_tiny_programs(fast_data, personas, examples, program_sp)
    # ...then score them with a second client using `critique_model`.
    fast_data = FastData(model=critique_model)
    critiques = generate_critiques(fast_data, tiny_programs, critique_sp)
    updated_programs = update_programs_with_critiques(tiny_programs, critiques)

    # Upload the merged records as a HF dataset.
    ds = Dataset.from_list(updated_programs)
    ds.push_to_hub(output_dataset, private=private)
--------------------------------------------------------------------------------
/examples/train_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from fastcore.script import *
3 | from minai.core import *
4 | from datasets import load_dataset
5 | from torch.utils.data import DataLoader
6 | from transformers import AutoModelForCausalLM, AutoTokenizer
7 |
@call_parse
def main(
    model_id: Param("Model ID", str) = 'HuggingFaceTB/SmolLM-360M',
    dataset_name: Param("Dataset name", str) = "answerdotai/tiny_programs",
    model_output_name: Param("Model output name", str) = "answerdotai/SmolLM-360M-finetuned-tiny_programs",
    batch_size: Param("Batch size", int) = 8,
    lr: Param("Learning rate", float) = 1e-3,
    num_epochs: Param("Number of epochs", int) = 5,
    filter_dataset: Param("Filter dataset", bool) = False,
    dataset_size: Param("Dataset size", int) = 754,
    dataset_column: Param("Dataset column", str) = "code",
    is_private: Param("Is private", bool) = True,
):
    "Fine-tune a causal LM on one text column of a HF dataset and push the result to the Hub."
    set_seed(42)

    # Model and tokenizer setup
    m = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=0,
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
        use_cache=False
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as the pad token so batches can be padded below.
    tokenizer.pad_token = tokenizer.eos_token

    # Dataset preparation
    dataset = load_dataset(dataset_name, split="train").shuffle(42).select(range(dataset_size))
    if filter_dataset:
        # Keep only high-scoring examples. Note this runs AFTER `select`,
        # so the filtered dataset ends up smaller than `dataset_size`.
        dataset = dataset.filter(lambda x: x['score'] in [4, 5])

    def to_text(x):
        # Copy the chosen training column into a canonical 'text' field.
        x['text'] = x[dataset_column]
        return x

    dataset = dataset.shuffle(42).map(to_text, remove_columns=dataset.column_names)
    # Hold out the last 50 shuffled rows for evaluation.
    train_dataset = dataset.select(range(0, len(dataset)-50))
    eval_dataset = dataset.select(range(len(dataset)-50, len(dataset)))

    # DataLoader setup
    def collate_fn(examples):
        # Tokenize the batch, then shift by one token: inputs are tokens [:-1]
        # and targets are tokens [1:] (next-token prediction).
        input_ids = tokenizer([e['text'] for e in examples], return_tensors='pt', padding=True, truncation=True, max_length=512)['input_ids']
        return (input_ids[:, :-1], input_ids[:, 1:])

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
    dls = DataLoaders(train_dataloader, eval_dataloader)

    # Training setup
    # Flatten (batch, seq, vocab) logits and (batch, seq) targets for CE loss.
    loss_fn = lambda x, y: torch.nn.functional.cross_entropy(x.view(-1, x.shape[-1]), y.view(-1))
    # sz = len(dls.train) // 10

    cbs = [DeviceCB(), MetricsCB()]
    prog = ProgressCB(plot=True)
    learn = MomentumLearner(m, dls, loss_func=loss_fn, lr=lr, cbs=cbs, preds_nm='logits', mom=0.9)

    # Training
    learn.fit(num_epochs, cbs=prog)

    # push to the hub
    learn.model.push_to_hub(model_output_name, private=is_private)
    tokenizer.push_to_hub(model_output_name, private=is_private)

    # Test generation: sample a continuation of a short prompt as a sanity check.
    prompt = 'import requests\n'
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()

    with torch.inference_mode():
        output = m.generate(tokenized_prompt, max_new_tokens=90)

    # Decode only the newly generated tokens (skip the echoed prompt).
    print(prompt + tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True))

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/fastdata/__init__.py:
--------------------------------------------------------------------------------
# fastdata package version string.
__version__ = "0.0.5"
2 |
--------------------------------------------------------------------------------
/fastdata/_modidx.py:
--------------------------------------------------------------------------------
1 | # Autogenerated by nbdev
2 |
# Symbol index mapping exported names to (documentation anchor, source file).
# This file is autogenerated (see header); manual edits will be overwritten.
d = { 'settings': { 'branch': 'main',
                    'doc_baseurl': '/fastdata',
                    'doc_host': 'https://AnswerDotAI.github.io',
                    'git_url': 'https://github.com/AnswerDotAI/fastdata',
                    'lib_path': 'fastdata'},
      'syms': { 'fastdata.core': { 'fastdata.core.FastData': ('core.html#fastdata', 'fastdata/core.py'),
                                   'fastdata.core.FastData.__init__': ('core.html#fastdata.__init__', 'fastdata/core.py'),
                                   'fastdata.core.FastData._process_input': ('core.html#fastdata._process_input', 'fastdata/core.py'),
                                   'fastdata.core.FastData._save_results': ('core.html#fastdata._save_results', 'fastdata/core.py'),
                                   'fastdata.core.FastData._set_rate_limit': ('core.html#fastdata._set_rate_limit', 'fastdata/core.py'),
                                   'fastdata.core.FastData.generate': ('core.html#fastdata.generate', 'fastdata/core.py'),
                                   'fastdata.core.FastData.generate_to_hf': ('core.html#fastdata.generate_to_hf', 'fastdata/core.py')}}}
15 |
--------------------------------------------------------------------------------
/fastdata/core.py:
--------------------------------------------------------------------------------
1 | """Core functionality for synthetic data generation"""
2 |
3 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
4 |
5 | # %% auto 0
6 | __all__ = ['DATASET_CARD_TEMPLATE', 'FastData']
7 |
8 | # %% ../nbs/00_core.ipynb 3
9 | import concurrent.futures
10 | import json
11 | import shutil
12 | from pathlib import Path
13 | from uuid import uuid4
14 | from typing import Optional, Union
15 |
16 | from tqdm import tqdm
17 | from fastcore.utils import *
18 | from ratelimit import limits, sleep_and_retry
19 | from huggingface_hub import CommitScheduler, DatasetCard
20 | from claudette import *
21 |
22 | # %% ../nbs/00_core.ipynb 4
# README ("dataset card") template written to generated HF dataset repos by
# FastData.generate_to_hf; placeholders are filled with str.format.
DATASET_CARD_TEMPLATE = """
---
tags:
- fastdata
- synthetic
---

# {title}

_Note: This is an AI-generated dataset, so its content may be inaccurate or false._

**Source of the data:**

The dataset was generated using [Fastdata](https://github.com/AnswerDotAI/fastdata) library and {model_id} with the following input:

## System Prompt

```
{system_prompt}
```

## Prompt Template

```
{prompt_template}
```

## Sample Input

```json
{sample_input}
```

"""
57 |
58 |
class FastData:
    """Generate structured synthetic data with Claude.

    Wraps a `claudette.Client` with a process-local rate limit of `calls`
    requests per `period` seconds and fans generation requests out over a
    thread pool.
    """
    def __init__(self,
                 model: str = "claude-3-haiku-20240307",
                 calls: int = 100,
                 period: int = 60):
        self.cli = Client(model)
        self._set_rate_limit(calls, period)

    def _set_rate_limit(self, calls: int, period: int):
        """Set a new rate limit."""
        @sleep_and_retry
        @limits(calls=calls, period=period)
        def rate_limited_call(prompt: str, schema, temp: float, sp: str):
            # Fix: `sp` was previously accepted here but never forwarded, so
            # callers' system prompts were silently ignored. Pass it through
            # to claudette's structured call (forwarded to Client.__call__).
            return self.cli.structured(
                prompt,
                temp=temp,
                tools=schema,
                sp=sp,
            )[0]

        self._rate_limited_call = rate_limited_call

    def _process_input(self, prompt_template, schema, temp, sp, input_data):
        """Fill `prompt_template` with `input_data` and call the model.

        Returns the parsed schema object, or None if formatting or the API
        call fails — errors are printed, not raised, so one bad input does
        not abort a whole batch.
        """
        try:
            prompt = prompt_template.format(**input_data)
            return self._rate_limited_call(
                prompt=prompt, schema=schema, temp=temp, sp=sp
            )
        except Exception as e:
            print(f"Error processing input {input_data}: {e}")
            return None

    def _save_results(self, results: list[dict], save_path: Path) -> None:
        """Write `results` to `save_path` as JSON lines (one object per line)."""
        with open(save_path, "w") as f:
            for res in results:
                # fastcore's `store_attr` records ctor args in __stored_args__;
                # fall back to the instance __dict__ for plain objects.
                obj_dict = getattr(res, "__stored_args__", res.__dict__)
                f.write(json.dumps(obj_dict) + "\n")

    def generate(self,
                 prompt_template: str,
                 inputs: list[dict],
                 schema,
                 temp: float = 1.,
                 sp: str = "You are a helpful assistant.",
                 max_workers: int = 64) -> list[dict]:
        "For every input in INPUTS, fill PROMPT_TEMPLATE and generate a value fitting SCHEMA"

        with tqdm(total=len(inputs)) as pbar:
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(
                        self._process_input,
                        prompt_template,
                        schema,
                        temp,
                        sp,
                        input_data,
                    )
                    for input_data in inputs
                ]

                # Tick the progress bar as calls complete (completion order).
                for completed_future in concurrent.futures.as_completed(futures):
                    pbar.update(1)
                # Collect in input order; failed generations are None.
                return [f.result() for f in futures]

    def generate_to_hf(
        self,
        prompt_template: str,
        inputs: list[dict],
        schema,
        repo_id: str,
        temp: float = 1.0,
        sp: str = "You are a helpful assistant.",
        max_workers: int = 64,
        max_items_per_file: int = 100,
        commit_every: Union[int, float] = 5,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        delete_files_after: bool = True,
    ) -> tuple[str, list[dict]]:
        """
        Generate data based on a prompt template and schema, and save it to Hugging Face dataset repository.
        This function writes the generated records to multiple files, each containing a maximum of `max_items_per_file` records.
        Due to the multi-threaded execution of the function, the order of the records in the files is not guaranteed to match the order of the input data.

        Args:
            prompt_template (str): The template for generating prompts.
            inputs (list[dict]): A list of input dictionaries to be processed.
            schema: The schema to parse the generated data.
            repo_id (str): The HuggingFace dataset name.
            temp (float, optional): The temperature for generation. Defaults to 1.0.
            sp (str, optional): The system prompt for the assistant. Defaults to "You are a helpful assistant.".
            max_workers (int, optional): The maximum number of worker threads. Defaults to 64.
            max_items_per_file (int, optional): The maximum number of items to save in each file. Defaults to 100.
            commit_every (Union[int, float], optional): The number of minutes between each commit. Defaults to 5.
            private (bool, optional): Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
            token (Optional[str], optional): The token to use to commit to the repo. Defaults to the token saved on the machine.
            delete_files_after (bool, optional): Whether to delete files after processing. Defaults to True.

        Returns:
            tuple[str, list[dict]]: A tuple with the generated repo_id and the list of generated data dictionaries.
        """
        dataset_dir = Path(repo_id.replace("/", "_"))
        dataset_dir.mkdir(parents=True, exist_ok=True)
        data_dir = dataset_dir / "data"
        data_dir.mkdir(exist_ok=True)

        # Fix: construct the scheduler *before* entering `try` so the
        # `finally` clause cannot raise NameError on an unbound `scheduler`
        # (masking the real error) if CommitScheduler construction fails.
        scheduler = CommitScheduler(
            repo_id=repo_id,
            repo_type="dataset",
            folder_path=dataset_dir,
            every=commit_every,
            private=private,
            token=token,
        )
        try:
            readme_path = dataset_dir / "README.md"

            # Only write the dataset card once per local run.
            if not readme_path.exists():
                DatasetCard(
                    DATASET_CARD_TEMPLATE.format(
                        title=repo_id,
                        model_id=self.cli.model,
                        system_prompt=sp,
                        prompt_template=prompt_template,
                        sample_input=inputs[:2],
                    )
                ).save(readme_path)

            results = []
            total_inputs = len(inputs)

            with tqdm(total=total_inputs) as pbar:
                with concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers
                ) as executor:
                    futures = [
                        executor.submit(
                            self._process_input,
                            prompt_template,
                            schema,
                            temp,
                            sp,
                            input_data,
                        )
                        for input_data in inputs
                    ]

                    current_file = data_dir / f"train-{uuid4()}.jsonl"
                    for completed_future in concurrent.futures.as_completed(futures):
                        result = completed_future.result()
                        if result is not None:
                            results.append(result)
                            # Hold the scheduler's lock so a background commit
                            # never uploads a half-written shard.
                            with scheduler.lock:
                                self._save_results(results, current_file)
                        pbar.update(1)
                        if len(results) >= max_items_per_file:
                            # Roll over to a fresh shard once this one is full.
                            current_file = data_dir / f"train-{uuid4()}.jsonl"
                            results.clear()
        finally:
            scheduler.trigger().result()  # force upload last result
            if delete_files_after:
                shutil.rmtree(dataset_dir)

        return scheduler.repo_id, [f.result() for f in futures if f.done()]
224 |
--------------------------------------------------------------------------------
/nbs/00_core.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# core\n",
8 | "\n",
9 | "> Core functionality for synthetic data generation"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "#| default_exp core"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "#| hide\n",
28 | "from IPython.display import Markdown"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "#| export\n",
38 | "import concurrent.futures\n",
39 | "import json\n",
40 | "import shutil\n",
41 | "from pathlib import Path\n",
42 | "from uuid import uuid4\n",
43 | "from typing import Optional, Union\n",
44 | "\n",
45 | "from tqdm import tqdm\n",
46 | "from fastcore.utils import *\n",
47 | "from ratelimit import limits, sleep_and_retry\n",
48 | "from huggingface_hub import CommitScheduler, DatasetCard\n",
49 | "from claudette import *"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "#| export\n",
59 | "DATASET_CARD_TEMPLATE = \"\"\"\n",
60 | "---\n",
61 | "tags:\n",
62 | "- fastdata\n",
63 | "- synthetic\n",
64 | "---\n",
65 | "\n",
66 | "# {title}\n",
67 | "\n",
68 | "_Note: This is an AI-generated dataset, so its content may be inaccurate or false._\n",
69 | "\n",
70 | "**Source of the data:**\n",
71 | "\n",
72 | "The dataset was generated using [Fastdata](https://github.com/AnswerDotAI/fastdata) library and {model_id} with the following input:\n",
73 | "\n",
74 | "## System Prompt\n",
75 | "\n",
76 | "```\n",
77 | "{system_prompt}\n",
78 | "```\n",
79 | "\n",
80 | "## Prompt Template\n",
81 | "\n",
82 | "```\n",
83 | "{prompt_template}\n",
84 | "```\n",
85 | "\n",
86 | "## Sample Input\n",
87 | "\n",
88 | "```json\n",
89 | "{sample_input}\n",
90 | "```\n",
91 | "\n",
92 | "\"\"\"\n",
93 | "\n",
94 | "\n",
95 | "class FastData:\n",
96 | " def __init__(self,\n",
97 | " model: str = \"claude-3-haiku-20240307\",\n",
98 | " calls: int = 100,\n",
99 | " period: int = 60):\n",
100 | " self.cli = Client(model)\n",
101 | " self._set_rate_limit(calls, period)\n",
102 | "\n",
103 | " def _set_rate_limit(self, calls: int, period: int):\n",
104 | " \"\"\"Set a new rate limit.\"\"\"\n",
105 | " @sleep_and_retry\n",
106 | " @limits(calls=calls, period=period)\n",
107 | " def rate_limited_call(prompt: str, schema, temp: float, sp: str):\n",
108 | " return self.cli.structured(\n",
109 | " prompt,\n",
110 | " temp=temp,\n",
111 | " tools=schema,\n",
112 | " )[0]\n",
113 | " \n",
114 | " self._rate_limited_call = rate_limited_call\n",
115 | "\n",
116 | " def _process_input(self, prompt_template, schema, temp, sp, input_data):\n",
117 | " try:\n",
118 | " prompt = prompt_template.format(**input_data)\n",
119 | " return self._rate_limited_call(\n",
120 | " prompt=prompt, schema=schema, temp=temp, sp=sp\n",
121 | " )\n",
122 | " except Exception as e:\n",
123 | " print(f\"Error processing input {input_data}: {e}\")\n",
124 | " return None\n",
125 | "\n",
126 | " def _save_results(self, results: list[dict], save_path: Path) -> None:\n",
127 | " with open(save_path, \"w\") as f:\n",
128 | " for res in results:\n",
129 | " obj_dict = getattr(res, \"__stored_args__\", res.__dict__)\n",
130 | " f.write(json.dumps(obj_dict) + \"\\n\")\n",
131 | "\n",
132 | " def generate(self, \n",
133 | " prompt_template: str, \n",
134 | " inputs: list[dict], \n",
135 | " schema,\n",
136 | " temp: float = 1.,\n",
137 | " sp: str = \"You are a helpful assistant.\",\n",
138 | " max_workers: int = 64) -> list[dict]:\n",
139 | " \"For every input in INPUTS, fill PROMPT_TEMPLATE and generate a value fitting SCHEMA\"\n",
140 | " \n",
141 | " with tqdm(total=len(inputs)) as pbar:\n",
142 | " with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
143 | " futures = [\n",
144 | " executor.submit(\n",
145 | " self._process_input,\n",
146 | " prompt_template,\n",
147 | " schema,\n",
148 | " temp,\n",
149 | " sp,\n",
150 | " input_data,\n",
151 | " )\n",
152 | " for input_data in inputs\n",
153 | " ]\n",
154 | "\n",
155 | " for completed_future in concurrent.futures.as_completed(futures):\n",
156 | " pbar.update(1)\n",
157 | " return [f.result() for f in futures]\n",
158 | "\n",
159 | " def generate_to_hf(\n",
160 | " self,\n",
161 | " prompt_template: str,\n",
162 | " inputs: list[dict],\n",
163 | " schema,\n",
164 | " repo_id: str,\n",
165 | " temp: float = 1.0,\n",
166 | " sp: str = \"You are a helpful assistant.\",\n",
167 | " max_workers: int = 64,\n",
168 | " max_items_per_file: int = 100,\n",
169 | " commit_every: Union[int, float] = 5,\n",
170 | " private: Optional[bool] = None,\n",
171 | " token: Optional[str] = None,\n",
172 | " delete_files_after: bool = True,\n",
173 | " ) -> tuple[str, list[dict]]:\n",
174 | " \"\"\"\n",
175 | " Generate data based on a prompt template and schema, and save it to Hugging Face dataset repository.\n",
176 | " This function writes the generated records to multiple files, each containing a maximum of `max_items_per_file` records. \n",
177 | " Due to the multi-threaded execution of the function, the order of the records in the files is not guaranteed to match the order of the input data. \n",
178 | "\n",
179 | " Args:\n",
180 | " prompt_template (str): The template for generating prompts.\n",
181 | " inputs (list[dict]): A list of input dictionaries to be processed.\n",
182 | " schema: The schema to parse the generated data.\n",
183 | " repo_id (str): The HuggingFace dataset name.\n",
184 | " temp (float, optional): The temperature for generation. Defaults to 1.0.\n",
185 | " sp (str, optional): The system prompt for the assistant. Defaults to \"You are a helpful assistant.\".\n",
186 | " max_workers (int, optional): The maximum number of worker threads. Defaults to 64.\n",
187 | " max_items_per_file (int, optional): The maximum number of items to save in each file. Defaults to 100.\n",
188 | " commit_every (Union[int, float], optional): The number of minutes between each commit. Defaults to 5.\n",
189 | " private (bool, optional): Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.\n",
190 | " token (Optional[str], optional): The token to use to commit to the repo. Defaults to the token saved on the machine.\n",
191 | " delete_files_after (bool, optional): Whether to delete files after processing. Defaults to True.\n",
192 | "\n",
193 | " Returns:\n",
194 | " tuple[str, list[dict]]: A tuple with the generated repo_id and the list of generated data dictionaries.\n",
195 | " \"\"\"\n",
196 | " dataset_dir = Path(repo_id.replace(\"/\", \"_\"))\n",
197 | " dataset_dir.mkdir(parents=True, exist_ok=True)\n",
198 | " data_dir = dataset_dir / \"data\"\n",
199 | " data_dir.mkdir(exist_ok=True)\n",
200 | "\n",
201 | " try:\n",
202 | " scheduler = CommitScheduler(\n",
203 | " repo_id=repo_id,\n",
204 | " repo_type=\"dataset\",\n",
205 | " folder_path=dataset_dir,\n",
206 | " every=commit_every,\n",
207 | " private=private,\n",
208 | " token=token,\n",
209 | " )\n",
210 | "\n",
211 | " readme_path = dataset_dir / \"README.md\"\n",
212 | "\n",
213 | " if not readme_path.exists():\n",
214 | " DatasetCard(\n",
215 | " DATASET_CARD_TEMPLATE.format(\n",
216 | " title=repo_id,\n",
217 | " model_id=self.cli.model,\n",
218 | " system_prompt=sp,\n",
219 | " prompt_template=prompt_template,\n",
220 | " sample_input=inputs[:2],\n",
221 | " )\n",
222 | " ).save(readme_path)\n",
223 | "\n",
224 | " results = []\n",
225 | " total_inputs = len(inputs)\n",
226 | "\n",
227 | " with tqdm(total=total_inputs) as pbar:\n",
228 | " with concurrent.futures.ThreadPoolExecutor(\n",
229 | " max_workers=max_workers\n",
230 | " ) as executor:\n",
231 | " futures = [\n",
232 | " executor.submit(\n",
233 | " self._process_input,\n",
234 | " prompt_template,\n",
235 | " schema,\n",
236 | " temp,\n",
237 | " sp,\n",
238 | " input_data,\n",
239 | " )\n",
240 | " for input_data in inputs\n",
241 | " ]\n",
242 | "\n",
243 | " current_file = data_dir / f\"train-{uuid4()}.jsonl\"\n",
244 | " for completed_future in concurrent.futures.as_completed(futures):\n",
245 | " result = completed_future.result()\n",
246 | " if result is not None:\n",
247 | " results.append(result)\n",
248 | " with scheduler.lock:\n",
249 | " self._save_results(results, current_file)\n",
250 | " pbar.update(1)\n",
251 | " if len(results) >= max_items_per_file:\n",
252 | " current_file = data_dir / f\"train-{uuid4()}.jsonl\"\n",
253 | " results.clear()\n",
254 | " finally:\n",
255 | " scheduler.trigger().result() # force upload last result\n",
256 | " if delete_files_after:\n",
257 | " shutil.rmtree(dataset_dir)\n",
258 | "\n",
259 | " return scheduler.repo_id, [f.result() for f in futures if f.done()]"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "#| hide\n",
269 | "def to_md(ss): return '\\n'.join(f'- {s}' for s in ss) \n",
270 | "def show(ss): return Markdown(to_md(ss))"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/plain": [
281 | "Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*"
282 | ]
283 | },
284 | "execution_count": null,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "class Translation():\n",
291 | " \"Translation from an English phrase to a Spanish phrase\"\n",
292 | " def __init__(self, english: str, spanish: str): store_attr()\n",
293 | " def __repr__(self): return f\"{self.english} ➡ *{self.spanish}*\"\n",
294 | "\n",
295 | "Translation(\"Hello, how are you today?\", \"Hola, ¿cómo estás hoy?\")"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/markdown": [
306 | "- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*\n",
307 | "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*\n",
308 | "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*"
309 | ],
310 | "text/plain": [
311 | ""
312 | ]
313 | },
314 | "execution_count": null,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "examples = [\n",
321 | " Translation(\n",
322 | " english=\"Hello, my name is Nathan. I am a research scientist at an AI startup.\",\n",
323 | " spanish=\"Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.\"),\n",
324 | " Translation(\n",
325 | " english=\"How much wood could a woodchuck chuck if a woodchuck could chuck wood?\",\n",
326 | " spanish=\"¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?\"),\n",
327 | " Translation(\n",
328 | " english=\"Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See.\",\n",
329 | " spanish=\"Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.\"\n",
330 | " ),\n",
331 | "]\n",
332 | "show(examples)"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [
340 | {
341 | "name": "stderr",
342 | "output_type": "stream",
343 | "text": [
344 | "/home/algal/miniconda/envs/jup3/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
345 | " from .autonotebook import tqdm as notebook_tqdm\n"
346 | ]
347 | }
348 | ],
349 | "source": [
350 | "from datasets import load_dataset"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "name": "stderr",
360 | "output_type": "stream",
361 | "text": [
362 | "Generating train split: 100%|█| 200000/200000 [00:00<00:00, 3852934.04 examples/\n"
363 | ]
364 | },
365 | {
366 | "data": {
367 | "text/markdown": [
368 | "- A Political Analyst specialized in El Salvador's political landscape.\n",
369 | "- A legal advisor who understands the legal implications of incomplete or inaccurate project documentation\n",
370 | "- A maternal health advocate focused on raising awareness about postpartum complications."
371 | ],
372 | "text/plain": [
373 | ""
374 | ]
375 | },
376 | "execution_count": null,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "# Load personas\n",
383 | "personas = load_dataset(\"proj-persona/PersonaHub\", \"persona\", split='train').select(range(3))['persona']\n",
384 | "show(personas)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "sp = \"You will help generate synthetic data of English and Spanish phrases.\"\n",
394 | "prompt_template = \"\"\"\\\n",
395 | "\n",
396 | "{examples}\n",
397 | "\n",
398 | "\n",
399 | "Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:\n",
400 | "{persona}\n",
401 | "\"\"\""
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {},
407 | "source": [
408 | "Let's see what the prompt looks like in action:"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "metadata": {},
415 | "outputs": [
416 | {
417 | "name": "stdout",
418 | "output_type": "stream",
419 | "text": [
420 | "\n",
421 | "- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*\n",
422 | "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*\n",
423 | "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*\n",
424 | "\n",
425 | "\n",
426 | "Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:\n",
427 | "A Political Analyst specialized in El Salvador's political landscape.\n",
428 | "\n"
429 | ]
430 | }
431 | ],
432 | "source": [
433 | "examples_md = to_md(examples)\n",
434 | "prompt = prompt_template.format(examples=examples_md, persona=personas[0])\n",
435 | "print(prompt)"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": null,
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "name": "stderr",
445 | "output_type": "stream",
446 | "text": [
447 | "100%|█████████████████████████████████████████████| 3/3 [00:01<00:00, 1.64it/s]\n"
448 | ]
449 | }
450 | ],
451 | "source": [
452 | "# Generate translations\n",
453 | "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n",
454 | "translations = fast_data.generate(\n",
455 | " prompt_template=prompt_template,\n",
456 | " inputs=[{\"persona\": persona, \"examples\": examples} for persona in personas],\n",
457 | " schema=Translation,\n",
458 | " sp=sp\n",
459 | ")"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "data": {
469 | "text/markdown": [
470 | "- The political situation in El Salvador continues to be complex, with ongoing tensions between the ruling party and opposition groups. President Nayib Bukele has consolidated significant power, raising concerns about the state of democracy in the country. ➡ *La situación política en El Salvador sigue siendo compleja, con tensiones persistentes entre el partido gobernante y los grupos de oposición. El presidente Nayib Bukele ha consolidado un poder significativo, lo que genera preocupaciones sobre el estado de la democracia en el país.*\n",
471 | "- Thorough documentation is critical for any legal proceedings. Incomplete or inaccurate records can have serious consequences. ➡ *La documentación exhaustiva es fundamental para cualquier proceso legal. Los registros incompletos o inexactos pueden tener consecuencias graves.*\n",
472 | "- Postpartum complications can be life-threatening, but with proper care and support, new mothers can recover and thrive. Let's work together to ensure all women have access to the resources they need during this crucial time. ➡ *Las complicaciones posparto pueden poner en riesgo la vida, pero con la atención y el apoyo adecuados, las nuevas madres pueden recuperarse y prosperar. Trabajemos juntos para garantizar que todas las mujeres tengan acceso a los recursos que necesitan durante este momento crucial.*"
473 | ],
474 | "text/plain": [
475 | ""
476 | ]
477 | },
478 | "execution_count": null,
479 | "metadata": {},
480 | "output_type": "execute_result"
481 | }
482 | ],
483 | "source": [
484 | "show(translations)"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "# Generate translations and push results to Hugging Face Hub as a dataset\n",
494 | "# Be sure to have the HF_TOKEN environment variable set to your Hugging Face API token\n",
495 | "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n",
496 | "repo_id, translations = fast_data.generate_to_hf(\n",
497 | " prompt_template=prompt_template,\n",
498 | " inputs=[{\"persona\": persona, \"examples\": examples} for persona in personas],\n",
499 | " schema=Translation,\n",
500 | " sp=sp,\n",
501 | " repo_id=f\"personas-translation-{uuid4()}\",\n",
502 |     "    max_items_per_file=2, # It will create a local file every 2 translations \n",
503 | ")\n",
504 | "assert len(translations) == len(personas)\n",
505 | "\n",
506 | "new_dataset = load_dataset(repo_id)\n",
507 | "assert len(translations) == len(personas)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": [
516 | "class TranslationCritique():\n",
517 | " \"A critique of the translation.\"\n",
518 | " def __init__(self, critique: str, score: int): store_attr()\n",
519 | " def __repr__(self): return f\"\\t- **Critique:** {self.critique}\\n\\t- **Score:** {self.score}\""
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "sp = \"You will help critique synthetic data of English and Spanish phrases.\"\n",
529 | "critique_template = \"\"\"\\\n",
530 | "Below is an extract of a translation. Evaluate its quality as a senior translator would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:\n",
531 | "\n",
532 | "- Add 1 point if the translation conveys the basic meaning of the source text, even if it includes some minor errors or awkward phrasing.\n",
533 | "- Add another point if the translation is generally accurate but lacks refinement in style or fails to capture some nuances of the original. It might use inconsistent terminology or have occasional lapses in register.\n",
534 | "- Award a third point if the translation is appropriate for professional use and accurately conveys key concepts of the source text. It demonstrates good understanding of both languages, though it may not be flawless or could include some slight inconsistencies. It resembles the work of a competent translator but may have room for improvement in fluency or precision.\n",
535 | "- Grant a fourth point if the translation is highly accurate and reads naturally in the target language, exhibiting a consistent and appropriate style. It could be similar to the work of an experienced translator, offering faithful rendering of content and tone, with minimal errors, and effectively handling complex concepts or cultural references. The result is coherent, well-expressed, and valuable for its intended purpose.\n",
536 | "- Bestow a fifth point if the translation is outstanding, demonstrating mastery of both source and target languages. It captures subtle nuances, maintains the author's voice and intent, and reads as if it were originally written in the target language. The translator has made excellent choices in dealing with challenging elements like wordplay, idiomatic expressions, or culture-specific content.\n",
537 | "\n",
538 | "{translation}\n",
539 | "\n",
540 | "After examining the translation:\n",
541 | "\n",
542 | "- Briefly justify your total score, up to 100 words.\n",
543 | "- Conclude with the score of the translation.\n",
544 | "\"\"\""
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "metadata": {},
551 | "outputs": [
552 | {
553 | "name": "stderr",
554 | "output_type": "stream",
555 | "text": [
556 | "100%|█████████████████████████████████████████████| 3/3 [00:03<00:00, 1.27s/it]\n"
557 | ]
558 | }
559 | ],
560 | "source": [
561 | "fast_data = FastData(model=\"claude-3-5-sonnet-20240620\")\n",
562 | "critiques = fast_data.generate(\n",
563 | " prompt_template=critique_template,\n",
564 | " inputs=[{\"translation\": f\"{t.english} -> {t.spanish}\"} for t in translations],\n",
565 | " schema=TranslationCritique,\n",
566 | " sp=sp\n",
567 | ")"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": null,
573 | "metadata": {},
574 | "outputs": [
575 | {
576 | "data": {
577 | "text/markdown": [
578 | "- The political situation in El Salvador continues to be complex, with ongoing tensions between the ruling party and opposition groups. President Nayib Bukele has consolidated significant power, raising concerns about the state of democracy in the country. ➡ *La situación política en El Salvador sigue siendo compleja, con tensiones persistentes entre el partido gobernante y los grupos de oposición. El presidente Nayib Bukele ha consolidado un poder significativo, lo que genera preocupaciones sobre el estado de la democracia en el país.*\n",
579 | "\n",
580 | "\t- **Critique:** The translation accurately conveys the meaning of the source text, capturing the complexity of El Salvador's political situation. It effectively translates key terms and maintains the tone of the original. The translator demonstrates a strong grasp of both languages, rendering the content naturally in Spanish. The phrasing is appropriate and flows well, preserving the nuances of the English version. There are no noticeable errors or awkward constructions. The translation skillfully handles the political terminology and concepts, making it suitable for professional use. It reads as if it were originally written in Spanish, indicating the translator's expertise.\n",
581 | "\t- **Score:** 5\n",
582 | "- Thorough documentation is critical for any legal proceedings. Incomplete or inaccurate records can have serious consequences. ➡ *La documentación exhaustiva es fundamental para cualquier proceso legal. Los registros incompletos o inexactos pueden tener consecuencias graves.*\n",
583 | "\n",
584 | "\t- **Critique:** The translation accurately conveys the main message of the source text, maintaining both the meaning and tone. It effectively captures the importance of thorough documentation in legal proceedings and the potential consequences of incomplete or inaccurate records. The translator has chosen appropriate Spanish equivalents for key terms, such as \"exhaustiva\" for \"thorough\" and \"proceso legal\" for \"legal proceedings\". The sentence structure is natural in Spanish, and the translation maintains the formal register suitable for legal contexts. There are no errors in grammar or vocabulary. The translation demonstrates a high level of competence, accurately conveying complex concepts while reading naturally in the target language. It could be considered the work of an experienced translator.\n",
585 | "\t- **Score:** 4\n",
586 | "- Postpartum complications can be life-threatening, but with proper care and support, new mothers can recover and thrive. Let's work together to ensure all women have access to the resources they need during this crucial time. ➡ *Las complicaciones posparto pueden poner en riesgo la vida, pero con la atención y el apoyo adecuados, las nuevas madres pueden recuperarse y prosperar. Trabajemos juntos para garantizar que todas las mujeres tengan acceso a los recursos que necesitan durante este momento crucial.*\n",
587 | "\n",
588 | "\t- **Critique:** The translation accurately conveys the meaning of the original text, maintaining both the informative and encouraging tone. It correctly translates key terms like \"postpartum complications\" and \"life-threatening.\" The Spanish version flows naturally and captures the nuances of the original, including the call to action. The translator has made excellent choices in vocabulary and structure, resulting in a text that reads as if it were originally written in Spanish. The translation demonstrates a high level of proficiency in both languages and would be suitable for professional use in healthcare communications.\n",
589 | "\t- **Score:** 5"
590 | ],
591 | "text/plain": [
592 | ""
593 | ]
594 | },
595 | "execution_count": null,
596 | "metadata": {},
597 | "output_type": "execute_result"
598 | }
599 | ],
600 | "source": [
601 | "show(f'{t}\\n\\n{c}' for t, c in zip(translations, critiques))"
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {},
607 | "source": [
608 | "### Test that generate outputs align with inputs"
609 | ]
610 | },
611 | {
612 | "cell_type": "markdown",
613 | "metadata": {},
614 | "source": [
615 | "Let's verify that the `FastData.generate` returns results in the same order as the inputs it was passed.\n",
616 | "\n",
617 | "To show this, we will define a new prompt template, where the model is asked only to echo a piece of data\n",
618 |     "from the input. Then we will verify that the values in the inputs match the values in the outputs, in order and in value."
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": null,
624 | "metadata": {},
625 | "outputs": [],
626 | "source": [
627 | "sp=\"You will help with accurate and faithful data processing.\"\n",
628 | "prompt_template = \"\"\"\\\n",
629 | "Below you find an item of data, a datum, which is an alphanumeric string:\n",
630 | "\n",
631 | "{datum}\n",
632 | "\n",
633 | "After reviewing this datum, please echo is back exactly, without any preamble:\n",
634 | "\"\"\""
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": null,
640 | "metadata": {},
641 | "outputs": [],
642 | "source": [
643 | "class Datum:\n",
644 | " \"A data value\"\n",
645 | " def __init__(self, datum: str): store_attr()\n",
646 | " def __repr__(self): return f\"{self.datum}\""
647 | ]
648 | },
649 | {
650 | "cell_type": "markdown",
651 | "metadata": {},
652 | "source": [
653 |     "First we'll test that the prompt and schema class work as expected."
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": null,
659 | "metadata": {},
660 | "outputs": [
661 | {
662 | "name": "stdout",
663 | "output_type": "stream",
664 | "text": [
665 | "Below you find an item of data, a datum, which is an alphanumeric string:\n",
666 | "\n",
667 | "b9121446-e46c-47c0-9e6d-b4df35c0974b\n",
668 | "\n",
669 | "After reviewing this datum, please echo is back exactly, without any preamble:\n",
670 | "\n"
671 | ]
672 | }
673 | ],
674 | "source": [
675 | "print(prompt_template.format(**dict(datum=str(uuid4()))))"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": null,
681 | "metadata": {},
682 | "outputs": [
683 | {
684 | "data": {
685 | "text/plain": [
686 | "04da7de4-cc39-4699-9d25-5a476e366732"
687 | ]
688 | },
689 | "execution_count": null,
690 | "metadata": {},
691 | "output_type": "execute_result"
692 | }
693 | ],
694 | "source": [
695 | "Datum(str(uuid4()))"
696 | ]
697 | },
698 | {
699 | "cell_type": "markdown",
700 | "metadata": {},
701 | "source": [
702 | "Now we ask the model to \"generate\" (i.e., echo) 100 of these values."
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": null,
708 | "metadata": {},
709 | "outputs": [
710 | {
711 | "name": "stderr",
712 | "output_type": "stream",
713 | "text": [
714 | "100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 24.17it/s]\n"
715 | ]
716 | }
717 | ],
718 | "source": [
719 | "in_vals = [{\"datum\":str(uuid4())} for _ in range(100)]\n",
720 | "out_vals = fast_data.generate(\n",
721 | " prompt_template=prompt_template,\n",
722 | " inputs=in_vals,\n",
723 | " schema=Datum,\n",
724 | " sp=sp\n",
725 | ")"
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "metadata": {},
731 | "source": [
732 | "Now we will see if the inputs and outputs are aligned.\n",
733 | "\n",
734 | "If they are aligned, then that shows two things. First it shows that the model is echoing the values faithfully. Second, it shows _either_ that the model itself returned outputs in the order in which they were submitted, or else that `generate` has returned outputs in submission order.\n",
735 | "\n",
736 |     "We are submitting a large enough quantity of items that we _assume_ the model will return some results out of submission order. If you want confidence which does not depend on this assumption, then you could modify the test above to increase the number and complexity of the generation task, or simply inspect the implementation.\n",
737 | "\n",
738 | "Let's start by spot checking the first item:"
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": null,
744 | "metadata": {},
745 | "outputs": [
746 | {
747 | "data": {
748 | "text/plain": [
749 | "('f42ea0db-24ce-4e09-a50d-edf74d0eb611',\n",
750 | " 'f42ea0db-24ce-4e09-a50d-edf74d0eb611')"
751 | ]
752 | },
753 | "execution_count": null,
754 | "metadata": {},
755 | "output_type": "execute_result"
756 | }
757 | ],
758 | "source": [
759 | "in_list = [x['datum'] for x in in_vals]\n",
760 | "out_list = [x.datum for x in out_vals]\n",
761 | "(in_list[0],out_list[0])"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": null,
767 | "metadata": {},
768 | "outputs": [
769 | {
770 | "name": "stdout",
771 | "output_type": "stream",
772 | "text": [
773 | "Success: output items are aligned with input items\n"
774 | ]
775 | }
776 | ],
777 | "source": [
778 | "for (idx,(in_item,out_item)) in enumerate(zip(in_list,out_list)):\n",
779 | " if in_item != out_item:\n",
780 | " print(\"Failure: output items were not aligned with input items!\")\n",
781 | " print(f\"\\titem {idx} had in={in_item} and out={out_item}\")\n",
782 | " break\n",
783 | "else:\n",
784 | " print(\"Success: output items are aligned with input items\") "
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": null,
790 | "metadata": {},
791 | "outputs": [],
792 | "source": [
793 | "#| hide\n",
794 | "import nbdev; nbdev.nbdev_export()"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": null,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": []
803 | }
804 | ],
805 | "metadata": {
806 | "kernelspec": {
807 | "display_name": "python3",
808 | "language": "python",
809 | "name": "python3"
810 | }
811 | },
812 | "nbformat": 4,
813 | "nbformat_minor": 4
814 | }
815 |
--------------------------------------------------------------------------------
/nbs/_quarto.yml:
--------------------------------------------------------------------------------
1 | project:
2 | type: website
3 |
4 | format:
5 | html:
6 | theme: cosmo
7 | css: styles.css
8 | toc: true
9 | keep-md: true
10 | commonmark: default
11 |
12 | website:
13 | twitter-card: true
14 | open-graph: true
15 | repo-actions: [issue]
16 | navbar:
17 | background: primary
18 | search: true
19 | sidebar:
20 | style: floating
21 |
22 | metadata-files: [nbdev.yml, sidebar.yml]
--------------------------------------------------------------------------------
/nbs/index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# fastdata\n",
8 | "\n",
9 | "> Easiest and fastest way to generate 1B synthetic tokens\n",
10 | "\n",
11 | "Minimalist library that wraps around `claudette` to make generating synthetic data easy."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "`fastdata` is a minimal library for generating synthetic data for training deep learning models. For example, below is how you can generate a dataset to train a language model to\n",
19 | "translate from English to Spanish.\n",
20 | "\n",
21 | "First you need to define the structure of the data you want to generate. `claudette`, which is the library that fastdata uses to generate data, requires you to define the schema of the data you want to generate."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "from fastcore.utils import *"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/plain": [
41 | "Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*"
42 | ]
43 | },
44 | "execution_count": null,
45 | "metadata": {},
46 | "output_type": "execute_result"
47 | }
48 | ],
49 | "source": [
50 | "class Translation():\n",
51 | " \"Translation from an English phrase to a Spanish phrase\"\n",
52 | " def __init__(self, english: str, spanish: str): store_attr()\n",
53 | " def __repr__(self): return f\"{self.english} ➡ *{self.spanish}*\"\n",
54 | "\n",
55 | "Translation(\"Hello, how are you today?\", \"Hola, ¿cómo estás hoy?\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "Next, you need to define the prompt that will be used to generate the data and any inputs you want to pass to the prompt."
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "prompt_template = \"\"\"\\\n",
72 | "Generate English and Spanish translations on the following topic:\n",
73 | "{topic}\n",
74 | "\"\"\"\n",
75 | "\n",
76 | "inputs = [{\"topic\": \"Otters are cute\"}, {\"topic\": \"I love programming\"}]"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Finally, we can generate some data with fastdata.\n",
84 | "\n",
85 | "::: {.callout-note}\n",
86 | "We only support Anthropic models at the moment. Therefore, make sure you have an API key for the model you want to use and the proper environment variables set or pass the api key to the `FastData` class `FastData(api_key=\"sk-ant-api03-...\")`.\n",
87 | ":::"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "from fastdata.core import FastData"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "name": "stderr",
106 | "output_type": "stream",
107 | "text": [
108 | "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.57it/s]\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n",
114 | "translations = fast_data.generate(\n",
115 | " prompt_template=prompt_template,\n",
116 | " inputs=inputs,\n",
117 | " schema=Translation,\n",
118 | ")"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "from IPython.display import Markdown"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "data": {
137 | "text/markdown": [
138 | "- I love programming ➡ *Me encanta la programación*\n",
139 | "- Otters are cute ➡ *Las nutrias son lindas*"
140 | ],
141 | "text/plain": [
142 | ""
143 | ]
144 | },
145 | "execution_count": null,
146 | "metadata": {},
147 | "output_type": "execute_result"
148 | }
149 | ],
150 | "source": [
151 | "Markdown(\"\\n\".join(f'- {t}' for t in translations))"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "### Installation"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "Install latest from the GitHub [repository][repo]:\n",
166 | "\n",
167 | "```sh\n",
168 | "$ pip install git+https://github.com/AnswerDotAI/fastdata.git\n",
169 | "```\n",
170 | "\n",
171 | "or from [pypi][pypi]\n",
172 | "\n",
173 | "\n",
174 | "```sh\n",
175 | "$ pip install python-fastdata\n",
176 | "```\n",
177 | "\n",
178 | "[repo]: https://github.com/AnswerDotAI/fastdata\n",
179 | "[docs]: https://AnswerDotAI.github.io/fastdata/\n",
180 |     "[pypi]: https://pypi.org/project/python-fastdata/"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "If you'd like to see how best to generate data with fastdata, check out our blog post [here](https://www.answer.ai/blog/introducing-fastdata) and some of the examples in the [examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples) directory."
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "## Developer Guide"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "If you are new to using `nbdev` here are some useful pointers to get you started."
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "### Install fastdata in Development mode"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "```sh\n",
216 | "# make sure fastdata package is installed in development mode\n",
217 | "$ pip install -e .\n",
218 | "\n",
219 | "# make changes under nbs/ directory\n",
220 | "# ...\n",
221 | "\n",
222 | "# compile to have changes apply to fastdata\n",
223 | "$ nbdev_prepare\n",
224 | "```"
225 | ]
226 | }
227 | ],
228 | "metadata": {
229 | "kernelspec": {
230 | "display_name": "python3",
231 | "language": "python",
232 | "name": "python3"
233 | }
234 | },
235 | "nbformat": 4,
236 | "nbformat_minor": 4
237 | }
238 |
--------------------------------------------------------------------------------
/nbs/nbdev.yml:
--------------------------------------------------------------------------------
1 | project:
2 | output-dir: _docs
3 |
4 | website:
5 | title: "fastdata"
6 | site-url: "https://AnswerDotAI.github.io/fastdata"
7 |   description: "Easiest and fastest way to generate 1B synthetic tokens"
8 | repo-branch: main
9 | repo-url: "https://github.com/AnswerDotAI/fastdata"
10 |
--------------------------------------------------------------------------------
/nbs/sidebar.yml:
--------------------------------------------------------------------------------
1 | website:
2 | sidebar:
3 | contents:
4 | - index.ipynb
5 | - 00_core.ipynb
6 |
--------------------------------------------------------------------------------
/nbs/styles.css:
--------------------------------------------------------------------------------
1 | .cell {
2 | margin-bottom: 1rem;
3 | }
4 |
5 | .cell > .sourceCode {
6 | margin-bottom: 0;
7 | }
8 |
9 | .cell-output > pre {
10 | margin-bottom: 0;
11 | }
12 |
13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre {
14 | margin-left: 0.8rem;
15 | margin-top: 0;
16 | background: none;
17 | border-left: 2px solid lightsalmon;
18 | border-top-left-radius: 0;
19 | border-top-right-radius: 0;
20 | }
21 |
22 | .cell-output > .sourceCode {
23 | border: none;
24 | }
25 |
26 | .cell-output > .sourceCode {
27 | background: none;
28 | margin-top: 0;
29 | }
30 |
31 | div.description {
32 | padding-left: 2px;
33 | padding-top: 5px;
34 | font-style: italic;
35 | font-size: 135%;
36 | opacity: 70%;
37 | }
38 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=64.0"]
3 | build-backend = "setuptools.build_meta"
4 |
--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | repo = fastdata
3 | lib_name = python-fastdata
4 | version = 0.0.5
5 | min_python = 3.9
6 | license = apache2
7 | black_formatting = False
8 | doc_path = _docs
9 | lib_path = fastdata
10 | nbs_path = nbs
11 | recursive = True
12 | tst_flags = notest
13 | put_version_in_init = True
14 | branch = main
15 | custom_sidebar = False
16 | doc_host = https://AnswerDotAI.github.io
17 | doc_baseurl = /fastdata
18 | git_url = https://github.com/AnswerDotAI/fastdata
19 | title = fastdata
20 | audience = Developers
21 | author = ncoop57
22 | author_email = nc@answer.ai
23 | copyright = 2024 onwards, ncoop57
24 | description = Easiest and fastest way to generate 1B synthetic tokens
25 | keywords = nbdev jupyter notebook python
26 | language = English
27 | status = 3
28 | user = AnswerDotAI
29 | requirements = claudette fastcore ratelimit tqdm huggingface_hub
30 | dev_requirements = black datasets ipykernel nbdev
31 | readme_nb = index.ipynb
32 | allowed_metadata_keys =
33 | allowed_cell_metadata_keys =
34 | jupyter_hooks = False
35 | clean_ids = True
36 | clear_all = False
37 | cell_number = True
38 | skip_procs =
39 |
40 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""nbdev-generated setup script: all packaging metadata lives in settings.ini."""
from pkg_resources import parse_version
from configparser import ConfigParser
import setuptools
import shlex

# nbdev relies on setuptools features that first appeared in 36.2.
assert parse_version(setuptools.__version__) >= parse_version('36.2')

# note: all settings are in settings.ini; edit there, not here
parser = ConfigParser(delimiters=['='])
parser.read('settings.ini', encoding='utf-8')
cfg = parser['DEFAULT']

# Settings forwarded verbatim to setuptools.setup as keyword arguments.
passthrough_keys = 'version description keywords author author_email'.split()
# Every setting this script needs to build a valid package.
required = passthrough_keys + "lib_name user branch license status min_python audience language".split()
for key in required:
    assert key in cfg, f"missing expected setting: {key}"
setup_cfg = {key: cfg[key] for key in passthrough_keys}

# Map the short license code from settings.ini to (display name, trove classifier).
licenses = {
    'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
    'mit': ('MIT License', 'OSI Approved :: MIT License'),
    'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
    'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
    'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
}
# Development-status trove classifiers, indexed by the integer `status` setting.
statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
    '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
py_versions = '3.6 3.7 3.8 3.9 3.10 3.11 3.12'.split()

# shlex-split so quoted requirement specifiers (e.g. "pkg>=1.0") stay intact.
requirements = shlex.split(cfg.get('requirements', ''))
if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', ''))
min_python = cfg['min_python']
# Unknown license codes fall back to the raw setting with no classifier.
lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
dev_requirements = (cfg.get('dev_requirements') or '').split()

package_data = dict()
pkg_data = cfg.get('package_data', None)
if pkg_data:
    package_data[cfg['lib_name']] = pkg_data.split()  # split as multiple files might be listed
    # Forwarded to setuptools.setup(...) via **setup_cfg below.
    setup_cfg['package_data'] = package_data

# Classifiers: development status, audience, language, supported Pythons, license.
classifiers = [
    'Development Status :: ' + statuses[int(cfg['status'])],
    'Intended Audience :: ' + cfg['audience'].title(),
    'Natural Language :: ' + cfg['language'].title(),
]
classifiers += [f'Programming Language :: Python :: {v}' for v in py_versions[py_versions.index(min_python):]]
if lic[1]: classifiers.append('License :: ' + lic[1])

lib_path = cfg.get('lib_path')

setuptools.setup(
    name = cfg['lib_name'],
    license = lic[0],
    classifiers = classifiers,
    url = cfg['git_url'],
    packages = setuptools.find_packages(),
    include_package_data = True,
    install_requires = requirements,
    extras_require = { 'dev': dev_requirements },
    dependency_links = cfg.get('dep_links','').split(),
    python_requires = '>=' + min_python,
    long_description = open('README.md', encoding='utf-8').read(),
    long_description_content_type = 'text/markdown',
    zip_safe = False,
    entry_points = {
        'console_scripts': cfg.get('console_scripts','').split(),
        # nbdev's doc/index hook resolves the module index from lib_path.
        'nbdev': [f'{lib_path}={lib_path}._modidx:d'],
    },
    **setup_cfg)
63 |
64 |
65 |
--------------------------------------------------------------------------------