├── .gitignore ├── LICENSE ├── README.md ├── config.py ├── data_processing.py ├── llm_queries.py ├── main.py ├── requirements.txt └── shell.nix /.gitignore: -------------------------------------------------------------------------------- 1 | # Muse specific 2 | textbooks 3 | config.ini 4 | reconnect.py 5 | 6 | # Created by https://www.toptal.com/developers/gitignore/api/python 7 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 8 | 9 | ### Python ### 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 169 | #.idea/ 170 | 171 | ### Python Patch ### 172 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 173 | poetry.toml 174 | 175 | # ruff 176 | .ruff_cache/ 177 | 178 | # LSP config files 179 | pyrightconfig.json 180 | 181 | # End of https://www.toptal.com/developers/gitignore/api/python -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 thooton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # muse 2 | 3 | Muse is a Python script for creating synthetic textbooks using Google's Gemini Pro API, providing free access at 60 req/min. On average, each request generates ~1000 tokens, resulting in 3.6 million tokens per API key per hour! 4 | 5 | ## Usage Instructions 6 | 7 | 1. Clone or download the repository to your local machine. 8 | 2. Enter the directory. 9 | 3. Run `python3 -m pip install -r requirements.txt`. 10 | 4. Run `python3 main.py` to generate a `config.ini` file with default values. 11 | 5. Edit `config.ini` and add one or more API keys obtained from [Google's Makersuite](https://makersuite.google.com/app/apikey). Feel free to make other adjustments as well. 12 | 6. On Hugging Face, go to [New Dataset](https://huggingface.co/new-dataset) and create a new dataset named `muse_textbooks`. 13 | 7. Go to [Hugging Face Tokens](https://huggingface.co/settings/tokens) to get your current access token (API key) or create a new one with **write** permission. 14 | 8. Run `python3 main.py`; you'll be prompted to enter your Hugging Face API key. 15 | 16 | The script submits 60 requests/minute to Google's Gemini Pro, generating textbooks in the specified (`out_dir`) directory and auto-uploading them to a Hugging Face dataset named `your_username/muse_textbooks`. 17 | 18 | **Note**: Configuration values can be specified either in the config.ini file or passed as environment variables. It is advisable to avoid setting arbitrary values for temperature and top_p. 19 | 20 | ### Nix/Nixos 21 | 22 | Run: `nix-shell` inside the muse directory. Then follow instructions from the above-mentioned step 5. 23 | 24 | ## Implications 25 | 26 | By creating large amounts of open-source synthetic textbook data, we pave the way for open-source models that are more efficient and performant. phi-2 was trained on 250B tokens of mixed synthetic data and webtext; what might we be able to do with a 7B model trained on trillions of synthetic tokens? 27 | 28 | ## How it works 29 | 30 | The program auto-generates prompts by sampling from two seed datasets defined by the `TEXT_DATASET` and `CODE_DATASET` variables in `data_processing.py`. After a passage is sampled, it is passed to one of three prompt templates defined by `TEMPLATES` that instruct the LLM to either: 31 | 1) Generate a two-person debate on a subject related to the passage, or 32 | 2) Generate an informative lecture on a subject related to the passage, or 33 | 3) Generate a computer science textbook section on a subject related to the passage. 34 | 35 | ## Contributing 36 | 37 | This repository is open to any PRs/issues/suggestions :) 38 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import configparser 3 | import json 4 | 5 | def load_configuration(): 6 | CONFIG_FILE = "config.ini" 7 | 8 | config = configparser.ConfigParser() 9 | 10 | config_file_exists = os.path.exists(CONFIG_FILE) 11 | 12 | BOOL_TABLE = { 13 | "True": True, 14 | "False": False 15 | } 16 | 17 | if not config_file_exists: 18 | config["Gemini"] = { 19 | "API_KEYS": "[]", 20 | "API_ENDPOINT": "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key=", 21 | } 22 | config["Parameters"] = {"TEMPERATURE": "1.0", "TOP_P": "0.99"} 23 | config["Misc"] = { 24 | "OUT_DIR": "./textbooks", 25 | "COUNT_PER_FILE": "1000", 26 | "BEGIN_INDEX": "0", 27 | "VERBOSE_EXCEPTIONS": "False" 28 | } 29 | with open("config.ini", "w") as configfile: 30 | config.write(configfile) 31 | 32 | print( 33 | "config.ini file not found. A default config.ini has been generated. Please update it with your API key and other configurations." 34 | ) 35 | exit(1) 36 | 37 | config.read("config.ini") 38 | 39 | API_KEYS = json.loads(config.get("Gemini", "API_KEYS", fallback="[]")) 40 | API_ENDPOINT = config.get("Gemini", "API_ENDPOINT", fallback="") 41 | TEMPERATURE = float(config.get("Parameters", "TEMPERATURE", fallback="0.0")) 42 | TOP_P = float(config.get("Parameters", "TOP_P", fallback="0.0")) 43 | OUT_DIR = config.get("Misc", "OUT_DIR", fallback="") 44 | COUNT_PER_FILE = int(config.get("Misc", "COUNT_PER_FILE", fallback="0")) 45 | BEGIN_INDEX = int(config.get("Misc", "BEGIN_INDEX", fallback="0")) 46 | VERBOSE_EXCEPTIONS = BOOL_TABLE[config.get("Misc", "VERBOSE_EXCEPTIONS", fallback="False")] 47 | 48 | API_KEYS = os.getenv("API_KEYS", ";".join(API_KEYS)).split(";") 49 | API_ENDPOINT = os.getenv("API_ENDPOINT", API_ENDPOINT) 50 | TEMPERATURE = float(os.getenv("TEMPERATURE", TEMPERATURE)) 51 | TOP_P = float(os.getenv("TOP_P", TOP_P)) 52 | OUT_DIR = os.getenv("OUT_DIR", OUT_DIR) 53 | COUNT_PER_FILE = int(os.getenv("COUNT_PER_FILE", COUNT_PER_FILE)) 54 | BEGIN_INDEX = int(os.getenv("BEGIN_INDEX", BEGIN_INDEX)) 55 | VERBOSE_EXCEPTIONS = BOOL_TABLE[os.getenv("VERBOSE_EXCEPTIONS", str(VERBOSE_EXCEPTIONS))] 56 | 57 | if len(API_KEYS) == 0: 58 | print("API keys are empty. Please update config.ini with one or more API keys.") 59 | exit(1) 60 | 61 | if not os.path.exists(OUT_DIR): 62 | os.makedirs(OUT_DIR) 63 | 64 | return ( 65 | API_KEYS, 66 | API_ENDPOINT, 67 | TEMPERATURE, 68 | TOP_P, 69 | OUT_DIR, 70 | COUNT_PER_FILE, 71 | BEGIN_INDEX, 72 | VERBOSE_EXCEPTIONS 73 | ) 74 | 75 | 76 | ( 77 | API_KEYS, 78 | API_ENDPOINT, 79 | TEMPERATURE, 80 | TOP_P, 81 | OUT_DIR, 82 | COUNT_PER_FILE, 83 | BEGIN_INDEX, 84 | VERBOSE_EXCEPTIONS 85 | ) = load_configuration() 86 | -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import secrets 3 | import datasets 4 | 5 | 6 | def load_iter_from_spec(spec): 7 | return spec["iter"]( 8 | datasets.load_dataset(spec["id"])[spec["split"]].shuffle( 9 | seed=secrets.randbits(32) 10 | ) 11 | ) 12 | 13 | 14 | def process_text_dataset(item): 15 | return "\n\n".join( 16 | [ 17 | {"human": "Human: ", "gpt": "Assistant: "}[entry["from"]] + entry["value"] 18 | for entry in item["conversations"] 19 | ] 20 | ).strip() 21 | 22 | 23 | def process_code_dataset(item): 24 | return ( 25 | "Request: " 26 | + item["instruction"].strip() 27 | + "\n\nCode: " 28 | + item["output"].strip() 29 | ) 30 | 31 | 32 | TEXT_DATASET = { 33 | "id": "WizardLM/WizardLM_evol_instruct_V2_196k", 34 | "split": "train", 35 | "iter": lambda dataset: map(process_text_dataset, dataset), 36 | } 37 | 38 | CODE_DATASET = { 39 | "id": "TokenBender/code_instructions_122k_alpaca_style", 40 | "split": "train", 41 | "iter": lambda dataset: map(process_code_dataset, dataset), 42 | } 43 | 44 | 45 | TEMPLATES = [ 46 | { 47 | "dataset": "text", 48 | "prompt": lambda passage: f""" 49 | Please consider the following passage: {passage} 50 | You have two tasks: 51 | 1) Drawing inspiration from the content of the passage, generate a brand new debate topic. 52 | The debate topic will belong to the same domain as the content of the passage, but it will be even more rare. 53 | The debate topic generated will be philosophical, creative, interesting, engaging, and thought-provoking. 54 | The debate topic generated will not have an easy answer; it will be able to be argued from both sides. 55 | The topic will be surrounded by tags. 56 | 2) Generate a debate on the generated topic between two rational individuals, Aspen and River. 57 | In the debate, the participants will hold mutually opposing views. 58 | The debate will be long and drawn-out; no side will give up easily. 59 | In the debate, at times the participants may make concessions, but still hold fast to their point of view. 60 | In the debate, the participants will use various techniques of rational discussion; they will not make use of emotionally manipulative techniques. 61 | In the debate, the participants will never repeat themselves. 62 | The debate will have at least 50 paragraphs; it will have at least 5000 words. It will be novel-length. 63 | For the debate, you will be tipped $15 for every paragraph you write. To maximize your earnings, write as many as possible. 64 | The debate will be formatted in Markdown. 65 | The debate will be surrounded by tags. 66 | """, 67 | "extract": lambda raw: ( 68 | "A debate on the topic " 69 | + json.dumps(raw.split("")[-1].split("")[0].strip()) 70 | + ":\n\n" 71 | + raw.split("")[-1].split("")[0].strip() 72 | ), 73 | }, 74 | { 75 | "dataset": "text", 76 | "prompt": lambda passage: f""" 77 | Please consider the following passage: {passage} 78 | Imagine you are a professor with a reputation for excellent lectures. 79 | You have three tasks: 80 | 1) Drawing inspiration from the content of the passage, generate a brand new lecture topic. 81 | The lecture topic will belong to the same domain as the content of the passage, but it will be even more rare. 82 | The lecture topic will be carefully chosen to advance the education of the students in every way. 83 | The lecture topic will be interesting, engaging, and thought-provoking. 84 | The lecture topic will be surrounded by tags. 85 | 2) Generate a ten-point lecture outline on the generated topic. 86 | The lecture outline's ten points will be chosen to maximize ease of understanding and flow. 87 | The lecture outline will be surrounded by tags. 88 | 3) Generate a lecture, following the outline, on the generated topic. 89 | The lecture will be informative and easy to understand for the students. 90 | The lecture will provide as much information as possible. It should be as long as possible. 91 | For each piece of information you incorporate into the lecture, you will receive a tip of $20. 92 | In the lecture, all unfamiliar terms or topics will be explained for the students' benefit. 93 | In the lecture, it will be assumed that the students have no prior familiarity with the subject. 94 | In the lecture, the lecturer will never repeat themselves unnecessarily. 95 | The lecture will be formatted in Markdown. 96 | The lecture will be surrounded by tags. 97 | """, 98 | "extract": lambda raw: ( 99 | raw.split("")[-1].split("")[0].strip() 100 | ), 101 | }, 102 | { 103 | "dataset": "code", 104 | "prompt": lambda passage: f""" 105 | Please consider the following passage: {passage} 106 | Imagine you are a highly esteemed computer science professor writing a programming textbook. 107 | You have three tasks: 108 | 1) Drawing inspiration from the content of the passage, craft a brand new textbook section topic. 109 | The section topic will belong to the same domain as the content of the passage, but it will be even more rare. 110 | The section topic will be interesting, complex, and multifaceted, even if the passage is simple. 111 | The section topic will be directly related to computer science. 112 | The section topic will be carefully chosen to provide as much pedagogical value to the reader as possible. 113 | The section topic will be surrounded by tags. 114 | 2) Generate a ten-point section outline with code on the generated topic. 115 | Of the section outline's ten points, at least three will be code examples illustrating the topic. 116 | The section outline's ten points will be chosen to maximize ease of understanding and flow. 117 | The section outline will be surrounded by tags. 118 | 3) Generate a textbook section, following the outline, on the generated topic. 119 | The section will be self-contained, informative, easy to understand, and verbose. 120 | The section will be written in longform prose. 121 | For each piece of information you include, you will receive a payment of $20; thus, include as many as possible to maximize your earnings. 122 | The section will explain all unfamiliar terms or topics for the reader's benefits. 123 | The section will never repeat information or code. 124 | The section will be formatted in Markdown. 125 | The section will be surrounded by tags. 126 | """, 127 | "extract": lambda raw: ( 128 | raw.split("")[-1].split("")[0].strip() 129 | ), 130 | }, 131 | ] 132 | -------------------------------------------------------------------------------- /llm_queries.py: -------------------------------------------------------------------------------- 1 | from config import API_ENDPOINT, TEMPERATURE, TOP_P 2 | 3 | 4 | async def llm_query(sess, query, api_key): 5 | assert 0.0 <= TEMPERATURE <= 1.0 6 | assert 0.0 <= TOP_P <= 1.0 7 | async with sess.post( 8 | API_ENDPOINT + api_key, 9 | json={ 10 | "contents": [{"role": "USER", "parts": [{"text": query}]}], 11 | "generationConfig": {"temperature": TEMPERATURE, "topP": TOP_P}, 12 | }, 13 | ) as resp: 14 | return (await resp.json())["candidates"][0]["content"]["parts"][0]["text"] 15 | 16 | 17 | async def llm_template_query(sess, template, passage, api_key): 18 | try: 19 | query = template["prompt"](passage).strip() 20 | response = await llm_query(sess, query, api_key) 21 | return ("ok", template["extract"](response)) 22 | except Exception as exc: 23 | return ("err", {"exc": exc, "api_key": api_key}) 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import aiohttp 4 | import asyncio 5 | import secrets 6 | import json 7 | import huggingface_hub 8 | from config import API_KEYS, OUT_DIR, COUNT_PER_FILE, BEGIN_INDEX, VERBOSE_EXCEPTIONS 9 | from data_processing import TEXT_DATASET, CODE_DATASET, TEMPLATES, load_iter_from_spec 10 | from llm_queries import llm_template_query 11 | import traceback 12 | 13 | 14 | def exc_fmt(exc): 15 | if VERBOSE_EXCEPTIONS: 16 | return "\n".join(traceback.format_exception(exc)).strip() 17 | else: 18 | return str(repr(exc)) 19 | 20 | 21 | async def main(): 22 | huggingface_hub.login(new_session=False) 23 | hf_api = huggingface_hub.HfApi() 24 | 25 | hf_user = hf_api.whoami()["name"] 26 | repo_id = f"{hf_user}/muse_textbooks" 27 | 28 | hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True) 29 | 30 | text_iter = iter([]) 31 | code_iter = iter([]) 32 | sess = aiohttp.ClientSession() 33 | 34 | tasks = set() 35 | lines = 0 36 | 37 | try: 38 | with open(os.path.join(OUT_DIR, "cur.jsonl"), "rb") as f: 39 | lines = len(f.read().decode("utf-8").split("\n")) - 1 40 | except Exception: 41 | pass 42 | 43 | pbar = tqdm(initial=lines, total=COUNT_PER_FILE) 44 | outfile = open(os.path.join(OUT_DIR, "cur.jsonl"), "ab") 45 | 46 | while True: 47 | for api_key in API_KEYS: 48 | template = TEMPLATES[secrets.randbits(64) % len(TEMPLATES)] 49 | dataset_type = "text" if template["dataset"] == "text" else "code" 50 | while True: 51 | try: 52 | passage = next(text_iter) if dataset_type == "text" else next(code_iter) 53 | break 54 | except Exception: 55 | text_iter = load_iter_from_spec(TEXT_DATASET) 56 | code_iter = load_iter_from_spec(CODE_DATASET) 57 | tasks.add( 58 | asyncio.create_task( 59 | llm_template_query(sess, template, passage, api_key) 60 | ) 61 | ) 62 | 63 | new_tasks = set() 64 | 65 | for task in tasks: 66 | if not task.done(): 67 | new_tasks.add(task) 68 | continue 69 | 70 | try: 71 | status, result = task.result() 72 | except Exception as exc: 73 | tqdm.write(f"unknown error: {exc_fmt(exc)}") 74 | continue 75 | 76 | if status == "err": 77 | api_key = result["api_key"] 78 | exc = result["exc"] 79 | tqdm.write(f"error in {api_key}: {exc_fmt(exc)}") 80 | continue 81 | 82 | if len(result) == 0: 83 | continue 84 | 85 | outfile.write((json.dumps({"text": result}) + "\n").encode("utf-8")) 86 | lines += 1 87 | pbar.update(1) 88 | 89 | tasks = new_tasks 90 | 91 | if lines >= COUNT_PER_FILE: 92 | outfile.close() 93 | i = BEGIN_INDEX 94 | 95 | while os.path.exists(os.path.join(OUT_DIR, f"{i}.jsonl")): 96 | i += 1 97 | 98 | os.rename( 99 | os.path.join(OUT_DIR, "cur.jsonl"), os.path.join(OUT_DIR, f"{i}.jsonl") 100 | ) 101 | 102 | while True: 103 | try: 104 | hf_api.upload_file( 105 | path_or_fileobj=os.path.join(OUT_DIR, f"{i}.jsonl"), 106 | path_in_repo=f"{i}.jsonl", 107 | repo_id=repo_id, 108 | repo_type="dataset", 109 | ) 110 | break 111 | except Exception as exc: 112 | tqdm.write(f"can't upload: {exc_fmt(exc)}") 113 | await asyncio.sleep(1) 114 | 115 | outfile = open(os.path.join(OUT_DIR, "cur.jsonl"), "ab") 116 | lines = 0 117 | pbar.reset() 118 | 119 | await asyncio.sleep(1) 120 | 121 | 122 | if __name__ == "__main__": 123 | asyncio.run(main()) 124 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | datasets 3 | tqdm 4 | huggingface_hub -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | let 2 | nixpkgs-src = builtins.fetchTarball { 3 | # Master of Dec 21, 2023 4 | url = "https://github.com/NixOS/nixpkgs/archive/08b802c343d93f4d09deeebf187f6ec8c3233124.tar.gz"; 5 | sha256 = "0zx0d52cpjr0nxfd3w2qzaaqbq83pj1iqw8300hsij9mhxlny3vw"; 6 | }; 7 | 8 | pkgs = import nixpkgs-src { 9 | config = { 10 | allowUnfree = true; 11 | }; 12 | }; 13 | 14 | myPython = pkgs.python311; 15 | 16 | pythonWithPkgs = myPython.withPackages (pythonPkgs: 17 | with pythonPkgs; [ 18 | black # for formatting 19 | pip 20 | setuptools 21 | virtualenvwrapper 22 | wheel 23 | ]); 24 | 25 | lib-path = with pkgs; 26 | lib.makeLibraryPath [ 27 | libffi 28 | openssl 29 | stdenv.cc.cc 30 | ]; 31 | 32 | shell = pkgs.mkShell { 33 | buildInputs = [ 34 | pythonWithPkgs 35 | # pkgs.autoconf 36 | # pkgs.pkg-config 37 | 38 | # Misc packages needed for compiling python libs 39 | pkgs.readline 40 | pkgs.libffi 41 | pkgs.openssl 42 | 43 | # Necessary because of messing with LD_LIBRARY_PATH 44 | pkgs.git 45 | pkgs.openssh 46 | pkgs.rsync 47 | ]; 48 | 49 | shellHook = '' 50 | # Allow the use of wheels. 51 | SOURCE_DATE_EPOCH=$(date +%s) 52 | 53 | # Augment the dynamic linker path 54 | export "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${lib-path}" 55 | 56 | # Setup the virtual environment if it doesn't already exist. 57 | VENV=.venv 58 | if test ! -d $VENV; then 59 | virtualenv $VENV 60 | fi 61 | 62 | source ./$VENV/bin/activate 63 | export PYTHONPATH=`pwd`/$VENV/${myPython.sitePackages}/:$PYTHONPATH 64 | 65 | pip install -r requirements.txt 66 | 67 | python main.py 68 | ''; 69 | }; 70 | in 71 | shell 72 | --------------------------------------------------------------------------------