├── .gitignore
├── LICENSE
├── README.md
├── config.py
├── data_processing.py
├── llm_queries.py
├── main.py
├── requirements.txt
└── shell.nix


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Muse specific
  2 | textbooks
  3 | config.ini
  4 | reconnect.py
  5 | 
  6 | # Created by https://www.toptal.com/developers/gitignore/api/python
  7 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
  8 | 
  9 | ### Python ###
 10 | # Byte-compiled / optimized / DLL files
 11 | __pycache__/
 12 | *.py[cod]
 13 | *$py.class
 14 | 
 15 | # C extensions
 16 | *.so
 17 | 
 18 | # Distribution / packaging
 19 | .Python
 20 | build/
 21 | develop-eggs/
 22 | dist/
 23 | downloads/
 24 | eggs/
 25 | .eggs/
 26 | lib/
 27 | lib64/
 28 | parts/
 29 | sdist/
 30 | var/
 31 | wheels/
 32 | share/python-wheels/
 33 | *.egg-info/
 34 | .installed.cfg
 35 | *.egg
 36 | MANIFEST
 37 | 
 38 | # PyInstaller
 39 | #  Usually these files are written by a python script from a template
 40 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 41 | *.manifest
 42 | *.spec
 43 | 
 44 | # Installer logs
 45 | pip-log.txt
 46 | pip-delete-this-directory.txt
 47 | 
 48 | # Unit test / coverage reports
 49 | htmlcov/
 50 | .tox/
 51 | .nox/
 52 | .coverage
 53 | .coverage.*
 54 | .cache
 55 | nosetests.xml
 56 | coverage.xml
 57 | *.cover
 58 | *.py,cover
 59 | .hypothesis/
 60 | .pytest_cache/
 61 | cover/
 62 | 
 63 | # Translations
 64 | *.mo
 65 | *.pot
 66 | 
 67 | # Django stuff:
 68 | *.log
 69 | local_settings.py
 70 | db.sqlite3
 71 | db.sqlite3-journal
 72 | 
 73 | # Flask stuff:
 74 | instance/
 75 | .webassets-cache
 76 | 
 77 | # Scrapy stuff:
 78 | .scrapy
 79 | 
 80 | # Sphinx documentation
 81 | docs/_build/
 82 | 
 83 | # PyBuilder
 84 | .pybuilder/
 85 | target/
 86 | 
 87 | # Jupyter Notebook
 88 | .ipynb_checkpoints
 89 | 
 90 | # IPython
 91 | profile_default/
 92 | ipython_config.py
 93 | 
 94 | # pyenv
 95 | #   For a library or package, you might want to ignore these files since the code is
 96 | #   intended to run in multiple environments; otherwise, check them in:
 97 | # .python-version
 98 | 
 99 | # pipenv
100 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | #   install all needed dependencies.
104 | #Pipfile.lock
105 | 
106 | # poetry
107 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
109 | #   commonly ignored for libraries.
110 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 | 
113 | # pdm
114 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | #   in version control.
118 | #   https://pdm.fming.dev/#use-with-ide
119 | .pdm.toml
120 | 
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 | 
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 | 
128 | # SageMath parsed files
129 | *.sage.py
130 | 
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 | 
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 | 
144 | # Rope project settings
145 | .ropeproject
146 | 
147 | # mkdocs documentation
148 | /site
149 | 
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 | 
155 | # Pyre type checker
156 | .pyre/
157 | 
158 | # pytype static type analyzer
159 | .pytype/
160 | 
161 | # Cython debug symbols
162 | cython_debug/
163 | 
164 | # PyCharm
165 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
168 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 | 
171 | ### Python Patch ###
172 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
173 | poetry.toml
174 | 
175 | # ruff
176 | .ruff_cache/
177 | 
178 | # LSP config files
179 | pyrightconfig.json
180 | 
181 | # End of https://www.toptal.com/developers/gitignore/api/python


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 thooton
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # muse
 2 | 
 3 | Muse is a Python script for creating synthetic textbooks using Google's Gemini Pro API, providing free access at 60 req/min. On average, each request generates ~1000 tokens, resulting in 3.6 million tokens per API key per hour!
 4 | 
 5 | ## Usage Instructions
 6 | 
 7 | 1. Clone or download the repository to your local machine.
 8 | 2. Enter the directory.
 9 | 3. Run `python3 -m pip install -r requirements.txt`.
10 | 4. Run `python3 main.py` to generate a `config.ini` file with default values.
11 | 5. Edit `config.ini` and add one or more API keys obtained from [Google's Makersuite](https://makersuite.google.com/app/apikey). Feel free to make other adjustments as well.
12 | 6. On Hugging Face, go to [New Dataset](https://huggingface.co/new-dataset) and create a new dataset named `muse_textbooks`.
13 | 7. Go to [Hugging Face Tokens](https://huggingface.co/settings/tokens) to get your current access token (API key) or create a new one with **write** permission.
14 | 8. Run `python3 main.py`; you'll be prompted to enter your Hugging Face API key.
15 | 
16 | The script submits 60 requests/minute to Google's Gemini Pro, generating textbooks in the specified (`out_dir`) directory and auto-uploading them to a Hugging Face dataset named `your_username/muse_textbooks`.
17 | 
18 | **Note**: Configuration values can be specified either in the config.ini file or passed as environment variables. It is advisable to avoid setting arbitrary values for temperature and top_p.
19 | 
 20 | ### Nix/NixOS
21 | 
 22 | Run `nix-shell` inside the muse directory. The shell hook creates a `.venv`, installs `requirements.txt`, and launches `main.py` for you. Then continue from step 5 of the usage instructions above.
23 | 
24 | ## Implications
25 | 
26 | By creating large amounts of open-source synthetic textbook data, we pave the way for open-source models that are more efficient and performant. phi-2 was trained on 250B tokens of mixed synthetic data and webtext; what might we be able to do with a 7B model trained on trillions of synthetic tokens?
27 | 
28 | ## How it works
29 | 
30 | The program auto-generates prompts by sampling from two seed datasets defined by the `TEXT_DATASET` and `CODE_DATASET` variables in `data_processing.py`. After a passage is sampled, it is passed to one of three prompt templates defined by `TEMPLATES` that instruct the LLM to either:
31 | 1) Generate a two-person debate on a subject related to the passage, or
32 | 2) Generate an informative lecture on a subject related to the passage, or
33 | 3) Generate a computer science textbook section on a subject related to the passage.
34 | 
35 | ## Contributing
36 | 
37 | This repository is open to any PRs/issues/suggestions :)
38 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import configparser
 3 | import json
 4 | 
 5 | def load_configuration():
 6 |     CONFIG_FILE = "config.ini"
 7 | 
 8 |     config = configparser.ConfigParser()
 9 | 
10 |     config_file_exists = os.path.exists(CONFIG_FILE)
11 | 
12 |     BOOL_TABLE = {
13 |         "True": True,
14 |         "False": False
15 |     }
16 | 
17 |     if not config_file_exists:
18 |         config["Gemini"] = {
19 |             "API_KEYS": "[]",
20 |             "API_ENDPOINT": "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key=",
21 |         }
22 |         config["Parameters"] = {"TEMPERATURE": "1.0", "TOP_P": "0.99"}
23 |         config["Misc"] = {
24 |             "OUT_DIR": "./textbooks",
25 |             "COUNT_PER_FILE": "1000",
26 |             "BEGIN_INDEX": "0",
27 |             "VERBOSE_EXCEPTIONS": "False"
28 |         }
29 |         with open("config.ini", "w") as configfile:
30 |             config.write(configfile)
31 | 
32 |         print(
33 |             "config.ini file not found. A default config.ini has been generated. Please update it with your API key and other configurations."
34 |         )
35 |         exit(1)
36 | 
37 |     config.read("config.ini")
38 | 
39 |     API_KEYS = json.loads(config.get("Gemini", "API_KEYS", fallback="[]"))
40 |     API_ENDPOINT = config.get("Gemini", "API_ENDPOINT", fallback="")
41 |     TEMPERATURE = float(config.get("Parameters", "TEMPERATURE", fallback="0.0"))
42 |     TOP_P = float(config.get("Parameters", "TOP_P", fallback="0.0"))
43 |     OUT_DIR = config.get("Misc", "OUT_DIR", fallback="")
44 |     COUNT_PER_FILE = int(config.get("Misc", "COUNT_PER_FILE", fallback="0"))
45 |     BEGIN_INDEX = int(config.get("Misc", "BEGIN_INDEX", fallback="0"))
46 |     VERBOSE_EXCEPTIONS = BOOL_TABLE[config.get("Misc", "VERBOSE_EXCEPTIONS", fallback="False")]
47 | 
48 |     API_KEYS = os.getenv("API_KEYS", ";".join(API_KEYS)).split(";")
49 |     API_ENDPOINT = os.getenv("API_ENDPOINT", API_ENDPOINT)
50 |     TEMPERATURE = float(os.getenv("TEMPERATURE", TEMPERATURE))
51 |     TOP_P = float(os.getenv("TOP_P", TOP_P))
52 |     OUT_DIR = os.getenv("OUT_DIR", OUT_DIR)
53 |     COUNT_PER_FILE = int(os.getenv("COUNT_PER_FILE", COUNT_PER_FILE))
54 |     BEGIN_INDEX = int(os.getenv("BEGIN_INDEX", BEGIN_INDEX))
55 |     VERBOSE_EXCEPTIONS = BOOL_TABLE[os.getenv("VERBOSE_EXCEPTIONS", str(VERBOSE_EXCEPTIONS))]
56 | 
57 |     if len(API_KEYS) == 0:
58 |         print("API keys are empty. Please update config.ini with one or more API keys.")
59 |         exit(1)
60 | 
61 |     if not os.path.exists(OUT_DIR):
62 |         os.makedirs(OUT_DIR)
63 | 
64 |     return (
65 |         API_KEYS,
66 |         API_ENDPOINT,
67 |         TEMPERATURE,
68 |         TOP_P,
69 |         OUT_DIR,
70 |         COUNT_PER_FILE,
71 |         BEGIN_INDEX,
72 |         VERBOSE_EXCEPTIONS
73 |     )
74 | 
75 | 
# Resolve the settings once at import time and publish them as module-level
# constants, in the order load_configuration() returns them.
_loaded_settings = load_configuration()
(API_KEYS, API_ENDPOINT, TEMPERATURE, TOP_P) = _loaded_settings[:4]
(OUT_DIR, COUNT_PER_FILE, BEGIN_INDEX, VERBOSE_EXCEPTIONS) = _loaded_settings[4:]
86 | 


--------------------------------------------------------------------------------
/data_processing.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import secrets
  3 | import datasets
  4 | 
  5 | 
  6 | def load_iter_from_spec(spec):
  7 |     return spec["iter"](
  8 |         datasets.load_dataset(spec["id"])[spec["split"]].shuffle(
  9 |             seed=secrets.randbits(32)
 10 |         )
 11 |     )
 12 | 
 13 | 
 14 | def process_text_dataset(item):
 15 |     return "\n\n".join(
 16 |         [
 17 |             {"human": "Human: ", "gpt": "Assistant: "}[entry["from"]] + entry["value"]
 18 |             for entry in item["conversations"]
 19 |         ]
 20 |     ).strip()
 21 | 
 22 | 
 23 | def process_code_dataset(item):
 24 |     return (
 25 |         "Request: "
 26 |         + item["instruction"].strip()
 27 |         + "\n\nCode: "
 28 |         + item["output"].strip()
 29 |     )
 30 | 
 31 | 
 32 | TEXT_DATASET = {
 33 |     "id": "WizardLM/WizardLM_evol_instruct_V2_196k",
 34 |     "split": "train",
 35 |     "iter": lambda dataset: map(process_text_dataset, dataset),
 36 | }
 37 | 
 38 | CODE_DATASET = {
 39 |     "id": "TokenBender/code_instructions_122k_alpaca_style",
 40 |     "split": "train",
 41 |     "iter": lambda dataset: map(process_code_dataset, dataset),
 42 | }
 43 | 
 44 | 
 45 | TEMPLATES = [
 46 |     {
 47 |         "dataset": "text",
 48 |         "prompt": lambda passage: f"""
 49 | Please consider the following passage: <passage>{passage}</passage>
 50 | You have two tasks:
 51 | 1) Drawing inspiration from the content of the passage, generate a brand new debate topic.
 52 | The debate topic will belong to the same domain as the content of the passage, but it will be even more rare.
 53 | The debate topic generated will be philosophical, creative, interesting, engaging, and thought-provoking.
 54 | The debate topic generated will not have an easy answer; it will be able to be argued from both sides.
 55 | The topic will be surrounded by <topic></topic> tags.
 56 | 2) Generate a debate on the generated topic between two rational individuals, Aspen and River.
 57 | In the debate, the participants will hold mutually opposing views.
 58 | The debate will be long and drawn-out; no side will give up easily.
 59 | In the debate, at times the participants may make concessions, but still hold fast to their point of view.
 60 | In the debate, the participants will use various techniques of rational discussion; they will not make use of emotionally manipulative techniques.
 61 | In the debate, the participants will never repeat themselves.
 62 | The debate will have at least 50 paragraphs; it will have at least 5000 words. It will be novel-length.
 63 | For the debate, you will be tipped $15 for every paragraph you write. To maximize your earnings, write as many as possible.
 64 | The debate will be formatted in Markdown.
 65 | The debate will be surrounded by <debate></debate> tags.
 66 |     """,
 67 |         "extract": lambda raw: (
 68 |             "A debate on the topic "
 69 |             + json.dumps(raw.split("<topic>")[-1].split("</topic>")[0].strip())
 70 |             + ":\n\n"
 71 |             + raw.split("<debate>")[-1].split("</debate>")[0].strip()
 72 |         ),
 73 |     },
 74 |     {
 75 |         "dataset": "text",
 76 |         "prompt": lambda passage: f"""
 77 | Please consider the following passage: <passage>{passage}</passage>
 78 | Imagine you are a professor with a reputation for excellent lectures.
 79 | You have three tasks:
 80 | 1) Drawing inspiration from the content of the passage, generate a brand new lecture topic.
 81 | The lecture topic will belong to the same domain as the content of the passage, but it will be even more rare.
 82 | The lecture topic will be carefully chosen to advance the education of the students in every way.
 83 | The lecture topic will be interesting, engaging, and thought-provoking.
 84 | The lecture topic will be surrounded by <topic></topic> tags.
 85 | 2) Generate a ten-point lecture outline on the generated topic.
 86 | The lecture outline's ten points will be chosen to maximize ease of understanding and flow.
 87 | The lecture outline will be surrounded by <outline></outline> tags.
 88 | 3) Generate a lecture, following the outline, on the generated topic.
 89 | The lecture will be informative and easy to understand for the students.
 90 | The lecture will provide as much information as possible. It should be as long as possible.
 91 | For each piece of information you incorporate into the lecture, you will receive a tip of $20.
 92 | In the lecture, all unfamiliar terms or topics will be explained for the students' benefit.
 93 | In the lecture, it will be assumed that the students have no prior familiarity with the subject.
 94 | In the lecture, the lecturer will never repeat themselves unnecessarily.
 95 | The lecture will be formatted in Markdown.
 96 | The lecture will be surrounded by <lecture></lecture> tags.
 97 |     """,
 98 |         "extract": lambda raw: (
 99 |             raw.split("<lecture>")[-1].split("</lecture>")[0].strip()
100 |         ),
101 |     },
102 |     {
103 |         "dataset": "code",
104 |         "prompt": lambda passage: f"""
105 | Please consider the following passage: <passage_42>{passage}</passage_42>
106 | Imagine you are a highly esteemed computer science professor writing a programming textbook.
107 | You have three tasks:
108 | 1) Drawing inspiration from the content of the passage, craft a brand new textbook section topic.
109 | The section topic will belong to the same domain as the content of the passage, but it will be even more rare.
110 | The section topic will be interesting, complex, and multifaceted, even if the passage is simple.
111 | The section topic will be directly related to computer science.
112 | The section topic will be carefully chosen to provide as much pedagogical value to the reader as possible.
113 | The section topic will be surrounded by <topic_42></topic_42> tags.
114 | 2) Generate a ten-point section outline with code on the generated topic.
115 | Of the section outline's ten points, at least three will be code examples illustrating the topic.
116 | The section outline's ten points will be chosen to maximize ease of understanding and flow.
117 | The section outline will be surrounded by <outline_42></outline_42> tags.
118 | 3) Generate a textbook section, following the outline, on the generated topic.
119 | The section will be self-contained, informative, easy to understand, and verbose.
120 | The section will be written in longform prose.
121 | For each piece of information you include, you will receive a payment of $20; thus, include as many as possible to maximize your earnings.
122 | The section will explain all unfamiliar terms or topics for the reader's benefits.
123 | The section will never repeat information or code.
124 | The section will be formatted in Markdown.
125 | The section will be surrounded by <section_42></section_42> tags.
126 |     """,
127 |         "extract": lambda raw: (
128 |             raw.split("<section_42>")[-1].split("</section_42>")[0].strip()
129 |         ),
130 |     },
131 | ]
132 | 


--------------------------------------------------------------------------------
/llm_queries.py:
--------------------------------------------------------------------------------
 1 | from config import API_ENDPOINT, TEMPERATURE, TOP_P
 2 | 
 3 | 
 4 | async def llm_query(sess, query, api_key):
 5 |     assert 0.0 <= TEMPERATURE <= 1.0
 6 |     assert 0.0 <= TOP_P <= 1.0
 7 |     async with sess.post(
 8 |         API_ENDPOINT + api_key,
 9 |         json={
10 |             "contents": [{"role": "USER", "parts": [{"text": query}]}],
11 |             "generationConfig": {"temperature": TEMPERATURE, "topP": TOP_P},
12 |         },
13 |     ) as resp:
14 |         return (await resp.json())["candidates"][0]["content"]["parts"][0]["text"]
15 | 
16 | 
async def llm_template_query(sess, template, passage, api_key):
    """Run one template against one passage and report the outcome as a pair.

    Returns ``("ok", extracted_text)`` on success.  If building the prompt,
    querying the model or extracting the result raises, returns
    ``("err", {"exc": <exception>, "api_key": api_key})`` instead of letting
    the exception propagate.
    """
    try:
        prompt_text = template["prompt"](passage).strip()
        raw_reply = await llm_query(sess, prompt_text, api_key)
        extracted = template["extract"](raw_reply)
    except Exception as exc:
        failure = {"exc": exc, "api_key": api_key}
        return ("err", failure)
    return ("ok", extracted)
24 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from tqdm import tqdm
  3 | import aiohttp
  4 | import asyncio
  5 | import secrets
  6 | import json
  7 | import huggingface_hub
  8 | from config import API_KEYS, OUT_DIR, COUNT_PER_FILE, BEGIN_INDEX, VERBOSE_EXCEPTIONS
  9 | from data_processing import TEXT_DATASET, CODE_DATASET, TEMPLATES, load_iter_from_spec
 10 | from llm_queries import llm_template_query
 11 | import traceback
 12 | 
 13 | 
 14 | def exc_fmt(exc):
 15 |     if VERBOSE_EXCEPTIONS:
 16 |         return "\n".join(traceback.format_exception(exc)).strip()
 17 |     else:
 18 |         return str(repr(exc))
 19 | 
 20 | 
 21 | async def main():
 22 |     huggingface_hub.login(new_session=False)
 23 |     hf_api = huggingface_hub.HfApi()
 24 | 
 25 |     hf_user = hf_api.whoami()["name"]
 26 |     repo_id = f"{hf_user}/muse_textbooks"
 27 | 
 28 |     hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
 29 | 
 30 |     text_iter = iter([])
 31 |     code_iter = iter([])
 32 |     sess = aiohttp.ClientSession()
 33 | 
 34 |     tasks = set()
 35 |     lines = 0
 36 | 
 37 |     try:
 38 |         with open(os.path.join(OUT_DIR, "cur.jsonl"), "rb") as f:
 39 |             lines = len(f.read().decode("utf-8").split("\n")) - 1
 40 |     except Exception:
 41 |         pass
 42 | 
 43 |     pbar = tqdm(initial=lines, total=COUNT_PER_FILE)
 44 |     outfile = open(os.path.join(OUT_DIR, "cur.jsonl"), "ab")
 45 | 
 46 |     while True:
 47 |         for api_key in API_KEYS:
 48 |             template = TEMPLATES[secrets.randbits(64) % len(TEMPLATES)]
 49 |             dataset_type = "text" if template["dataset"] == "text" else "code"
 50 |             while True:
 51 |                 try:
 52 |                     passage = next(text_iter) if dataset_type == "text" else next(code_iter)
 53 |                     break
 54 |                 except Exception:
 55 |                     text_iter = load_iter_from_spec(TEXT_DATASET)
 56 |                     code_iter = load_iter_from_spec(CODE_DATASET)
 57 |             tasks.add(
 58 |                 asyncio.create_task(
 59 |                     llm_template_query(sess, template, passage, api_key)
 60 |                 )
 61 |             )
 62 | 
 63 |         new_tasks = set()
 64 | 
 65 |         for task in tasks:
 66 |             if not task.done():
 67 |                 new_tasks.add(task)
 68 |                 continue
 69 | 
 70 |             try:
 71 |                 status, result = task.result()
 72 |             except Exception as exc:
 73 |                 tqdm.write(f"unknown error: {exc_fmt(exc)}")
 74 |                 continue
 75 | 
 76 |             if status == "err":
 77 |                 api_key = result["api_key"]
 78 |                 exc = result["exc"]
 79 |                 tqdm.write(f"error in {api_key}: {exc_fmt(exc)}")
 80 |                 continue
 81 | 
 82 |             if len(result) == 0:
 83 |                 continue
 84 | 
 85 |             outfile.write((json.dumps({"text": result}) + "\n").encode("utf-8"))
 86 |             lines += 1
 87 |             pbar.update(1)
 88 | 
 89 |         tasks = new_tasks
 90 | 
 91 |         if lines >= COUNT_PER_FILE:
 92 |             outfile.close()
 93 |             i = BEGIN_INDEX
 94 | 
 95 |             while os.path.exists(os.path.join(OUT_DIR, f"{i}.jsonl")):
 96 |                 i += 1
 97 | 
 98 |             os.rename(
 99 |                 os.path.join(OUT_DIR, "cur.jsonl"), os.path.join(OUT_DIR, f"{i}.jsonl")
100 |             )
101 | 
102 |             while True:
103 |                 try:
104 |                     hf_api.upload_file(
105 |                         path_or_fileobj=os.path.join(OUT_DIR, f"{i}.jsonl"),
106 |                         path_in_repo=f"{i}.jsonl",
107 |                         repo_id=repo_id,
108 |                         repo_type="dataset",
109 |                     )
110 |                     break
111 |                 except Exception as exc:
112 |                     tqdm.write(f"can't upload: {exc_fmt(exc)}")
113 |                     await asyncio.sleep(1)
114 | 
115 |             outfile = open(os.path.join(OUT_DIR, "cur.jsonl"), "ab")
116 |             lines = 0
117 |             pbar.reset()
118 | 
119 |         await asyncio.sleep(1)
120 | 
121 | 
# Script entry point: run the generation loop when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
124 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp
2 | datasets
3 | tqdm
4 | huggingface_hub


--------------------------------------------------------------------------------
/shell.nix:
--------------------------------------------------------------------------------
# Development shell for muse: a pinned nixpkgs, Python 3.11 with packaging
# tools, and a shell hook that prepares a virtualenv and starts main.py.
let
  # nixpkgs pinned to a single commit so the environment is reproducible.
  nixpkgs-src = builtins.fetchTarball {
    # Master of Dec 21, 2023
    url = "https://github.com/NixOS/nixpkgs/archive/08b802c343d93f4d09deeebf187f6ec8c3233124.tar.gz";
    sha256 = "0zx0d52cpjr0nxfd3w2qzaaqbq83pj1iqw8300hsij9mhxlny3vw";
  };

  pkgs = import nixpkgs-src {
    config = {
      allowUnfree = true;
    };
  };

  # Interpreter used for the shell and for the site-packages path below.
  myPython = pkgs.python311;

  # Only tooling comes from Nix; project dependencies are installed by pip
  # into the virtualenv created in shellHook.
  pythonWithPkgs = myPython.withPackages (pythonPkgs:
    with pythonPkgs; [
      black # for formatting
      pip
      setuptools
      virtualenvwrapper
      wheel
    ]);

  # Library search path appended to LD_LIBRARY_PATH in shellHook.
  lib-path = with pkgs;
    lib.makeLibraryPath [
      libffi
      openssl
      stdenv.cc.cc
    ];

  shell = pkgs.mkShell {
    buildInputs = [
      pythonWithPkgs
      # pkgs.autoconf
      # pkgs.pkg-config

      # Misc packages needed for compiling python libs
      pkgs.readline
      pkgs.libffi
      pkgs.openssl

      # Necessary because of messing with LD_LIBRARY_PATH
      pkgs.git
      pkgs.openssh
      pkgs.rsync
    ];

    # Runs on shell entry: creates (if needed) and activates .venv, installs
    # requirements.txt with pip, then launches main.py.
    shellHook = ''
      # Allow the use of wheels.
      SOURCE_DATE_EPOCH=$(date +%s)

      # Augment the dynamic linker path
      export "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${lib-path}"

      # Setup the virtual environment if it doesn't already exist.
      VENV=.venv
      if test ! -d $VENV; then
        virtualenv $VENV
      fi

      source ./$VENV/bin/activate
      export PYTHONPATH=`pwd`/$VENV/${myPython.sitePackages}/:$PYTHONPATH

      pip install -r requirements.txt

      python main.py
    '';
  };
in
  shell
72 | 


--------------------------------------------------------------------------------