├── .gitignore
├── LICENSE
├── README.md
├── config.py
├── data_processing.py
├── llm_queries.py
├── main.py
├── requirements.txt
└── shell.nix
/.gitignore:
--------------------------------------------------------------------------------
1 | # Muse specific
2 | textbooks
3 | config.ini
4 | reconnect.py
5 |
6 | # Created by https://www.toptal.com/developers/gitignore/api/python
7 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
8 |
9 | ### Python ###
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | share/python-wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .nox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | *.py,cover
59 | .hypothesis/
60 | .pytest_cache/
61 | cover/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | .pybuilder/
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | # For a library or package, you might want to ignore these files since the code is
96 | # intended to run in multiple environments; otherwise, check them in:
97 | # .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # poetry
107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more
109 | # commonly ignored for libraries.
110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 |
113 | # pdm
114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | # in version control.
118 | # https://pdm.fming.dev/#use-with-ide
119 | .pdm.toml
120 |
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 |
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 |
128 | # SageMath parsed files
129 | *.sage.py
130 |
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 |
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 |
144 | # Rope project settings
145 | .ropeproject
146 |
147 | # mkdocs documentation
148 | /site
149 |
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 |
155 | # Pyre type checker
156 | .pyre/
157 |
158 | # pytype static type analyzer
159 | .pytype/
160 |
161 | # Cython debug symbols
162 | cython_debug/
163 |
164 | # PyCharm
165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | # and can be added to the global gitignore or merged into this file. For a more nuclear
168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 |
171 | ### Python Patch ###
172 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
173 | poetry.toml
174 |
175 | # ruff
176 | .ruff_cache/
177 |
178 | # LSP config files
179 | pyrightconfig.json
180 |
181 | # End of https://www.toptal.com/developers/gitignore/api/python
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 thooton
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # muse
2 |
3 | Muse is a Python script for creating synthetic textbooks using Google's Gemini Pro API, which offers free access at up to 60 requests/minute. On average, each request generates ~1000 tokens, resulting in 3.6 million tokens per API key per hour!
4 |
5 | ## Usage Instructions
6 |
7 | 1. Clone or download the repository to your local machine.
8 | 2. Enter the directory.
9 | 3. Run `python3 -m pip install -r requirements.txt`.
10 | 4. Run `python3 main.py` to generate a `config.ini` file with default values.
11 | 5. Edit `config.ini` and add one or more API keys obtained from [Google's Makersuite](https://makersuite.google.com/app/apikey). Feel free to make other adjustments as well.
12 | 6. On Hugging Face, go to [New Dataset](https://huggingface.co/new-dataset) and create a new dataset named `muse_textbooks`.
13 | 7. Go to [Hugging Face Tokens](https://huggingface.co/settings/tokens) to get your current access token (API key) or create a new one with **write** permission.
14 | 8. Run `python3 main.py`; you'll be prompted to enter your Hugging Face API key.
15 |
16 | The script submits 60 requests/minute to Google's Gemini Pro, generating textbooks in the configured output directory (`OUT_DIR` in `config.ini`) and auto-uploading them to a Hugging Face dataset named `your_username/muse_textbooks`.
17 |
18 | **Note**: Configuration values can be specified either in the config.ini file or passed as environment variables. It is advisable to avoid setting arbitrary values for temperature and top_p.
19 |
20 | ### Nix/Nixos
21 |
22 | Run `nix-shell` inside the muse directory, then follow the usage instructions starting at step 5 above.
23 |
24 | ## Implications
25 |
26 | By creating large amounts of open-source synthetic textbook data, we pave the way for open-source models that are more efficient and performant. phi-2 was trained on 250B tokens of mixed synthetic data and webtext; what might we be able to do with a 7B model trained on trillions of synthetic tokens?
27 |
28 | ## How it works
29 |
30 | The program auto-generates prompts by sampling from two seed datasets defined by the `TEXT_DATASET` and `CODE_DATASET` variables in `data_processing.py`. After a passage is sampled, it is passed to one of three prompt templates defined by `TEMPLATES` that instruct the LLM to either:
31 | 1) Generate a two-person debate on a subject related to the passage, or
32 | 2) Generate an informative lecture on a subject related to the passage, or
33 | 3) Generate a computer science textbook section on a subject related to the passage.
34 |
35 | ## Contributing
36 |
37 | This repository is open to any PRs/issues/suggestions :)
38 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import configparser
3 | import json
4 |
def load_configuration():
    """Load runtime settings, with environment variables overriding config.ini.

    On first run (no config.ini present) a default template is written and the
    process exits so the user can fill in API keys.

    Returns:
        tuple: (API_KEYS, API_ENDPOINT, TEMPERATURE, TOP_P, OUT_DIR,
                COUNT_PER_FILE, BEGIN_INDEX, VERBOSE_EXCEPTIONS)
    """
    CONFIG_FILE = "config.ini"

    config = configparser.ConfigParser()

    # configparser only yields strings; map the two legal spellings to bools.
    BOOL_TABLE = {
        "True": True,
        "False": False
    }

    if not os.path.exists(CONFIG_FILE):
        # First run: emit a default template and bail out so the user can
        # add their API keys before anything tries to use them.
        config["Gemini"] = {
            "API_KEYS": "[]",
            "API_ENDPOINT": "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key=",
        }
        config["Parameters"] = {"TEMPERATURE": "1.0", "TOP_P": "0.99"}
        config["Misc"] = {
            "OUT_DIR": "./textbooks",
            "COUNT_PER_FILE": "1000",
            "BEGIN_INDEX": "0",
            "VERBOSE_EXCEPTIONS": "False"
        }
        with open(CONFIG_FILE, "w") as configfile:
            config.write(configfile)

        print(
            "config.ini file not found. A default config.ini has been generated. Please update it with your API key and other configurations."
        )
        exit(1)

    config.read(CONFIG_FILE)

    # Values from the config file...
    API_KEYS = json.loads(config.get("Gemini", "API_KEYS", fallback="[]"))
    API_ENDPOINT = config.get("Gemini", "API_ENDPOINT", fallback="")
    TEMPERATURE = float(config.get("Parameters", "TEMPERATURE", fallback="0.0"))
    TOP_P = float(config.get("Parameters", "TOP_P", fallback="0.0"))
    OUT_DIR = config.get("Misc", "OUT_DIR", fallback="")
    COUNT_PER_FILE = int(config.get("Misc", "COUNT_PER_FILE", fallback="0"))
    BEGIN_INDEX = int(config.get("Misc", "BEGIN_INDEX", fallback="0"))
    VERBOSE_EXCEPTIONS = BOOL_TABLE[config.get("Misc", "VERBOSE_EXCEPTIONS", fallback="False")]

    # ...optionally overridden by environment variables.
    API_KEYS = os.getenv("API_KEYS", ";".join(API_KEYS)).split(";")
    API_ENDPOINT = os.getenv("API_ENDPOINT", API_ENDPOINT)
    TEMPERATURE = float(os.getenv("TEMPERATURE", TEMPERATURE))
    TOP_P = float(os.getenv("TOP_P", TOP_P))
    OUT_DIR = os.getenv("OUT_DIR", OUT_DIR)
    COUNT_PER_FILE = int(os.getenv("COUNT_PER_FILE", COUNT_PER_FILE))
    BEGIN_INDEX = int(os.getenv("BEGIN_INDEX", BEGIN_INDEX))
    VERBOSE_EXCEPTIONS = BOOL_TABLE[os.getenv("VERBOSE_EXCEPTIONS", str(VERBOSE_EXCEPTIONS))]

    # BUG FIX: with no keys configured, ";".join([]) == "" and
    # "".split(";") == [""], so the emptiness check below could never fire.
    # Drop blank entries before validating.
    API_KEYS = [key for key in API_KEYS if key]

    if len(API_KEYS) == 0:
        print("API keys are empty. Please update config.ini with one or more API keys.")
        exit(1)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(OUT_DIR, exist_ok=True)

    return (
        API_KEYS,
        API_ENDPOINT,
        TEMPERATURE,
        TOP_P,
        OUT_DIR,
        COUNT_PER_FILE,
        BEGIN_INDEX,
        VERBOSE_EXCEPTIONS
    )
74 |
75 |
# Resolve the configuration once at import time and expose each setting as a
# module-level constant for the rest of the application.
_settings = load_configuration()

(
    API_KEYS,
    API_ENDPOINT,
    TEMPERATURE,
    TOP_P,
    OUT_DIR,
    COUNT_PER_FILE,
    BEGIN_INDEX,
    VERBOSE_EXCEPTIONS,
) = _settings
86 |
--------------------------------------------------------------------------------
/data_processing.py:
--------------------------------------------------------------------------------
1 | import json
2 | import secrets
3 | import datasets
4 |
5 |
def load_iter_from_spec(spec):
    """Build a passage iterator from a dataset spec.

    Loads the Hugging Face dataset named by spec["id"], selects spec["split"],
    shuffles it with a fresh random seed, and wraps it with the spec's "iter"
    adapter, which turns raw rows into plain-text passages.
    """
    dataset = datasets.load_dataset(spec["id"])[spec["split"]]
    shuffled = dataset.shuffle(seed=secrets.randbits(32))
    return spec["iter"](shuffled)
12 |
13 |
def process_text_dataset(item):
    """Flatten a ShareGPT-style conversation into one text passage.

    Each entry in item["conversations"] has a "from" role ("human" or "gpt")
    and a "value"; turns are labelled and joined with blank lines.
    """
    role_labels = {"human": "Human: ", "gpt": "Assistant: "}
    turns = []
    for entry in item["conversations"]:
        turns.append(role_labels[entry["from"]] + entry["value"])
    return "\n\n".join(turns).strip()
21 |
22 |
def process_code_dataset(item):
    """Format an instruction/output pair as a "Request/Code" passage."""
    instruction = item["instruction"].strip()
    code = item["output"].strip()
    return f"Request: {instruction}\n\nCode: {code}"
30 |
31 |
# Seed dataset specs consumed by load_iter_from_spec: "id" is the Hugging Face
# dataset repo, "split" is the split to load, and "iter" adapts shuffled rows
# into plain-text passages for prompting.

# Instruction/conversation data ("conversations" lists of human/gpt turns).
TEXT_DATASET = {
    "id": "WizardLM/WizardLM_evol_instruct_V2_196k",
    "split": "train",
    "iter": lambda dataset: map(process_text_dataset, dataset),
}

# Coding instruction/output pairs.
CODE_DATASET = {
    "id": "TokenBender/code_instructions_122k_alpaca_style",
    "split": "train",
    "iter": lambda dataset: map(process_code_dataset, dataset),
}
43 |
44 |
def _tagged(raw, tag):
    """Return the text between the last <tag> and the following </tag>.

    Degrades gracefully: if either tag is missing, str.split leaves the
    surrounding text intact instead of raising.
    """
    return raw.split(f"<{tag}>")[-1].split(f"</{tag}>")[0].strip()


# Prompt templates. Each entry names the seed dataset it samples from
# ("text" or "code"), builds a prompt from a sampled passage, and extracts the
# final artifact from the raw LLM output.
#
# BUG FIX: the extract lambdas previously called raw.split("") with an empty
# separator, which unconditionally raises ValueError — the delimiter tag names
# had been stripped from the prompts ("surrounded by tags") and the code. The
# tags below are reconstructed and announced explicitly in each prompt so the
# model knows what to emit.
TEMPLATES = [
    {
        "dataset": "text",
        "prompt": lambda passage: f"""
Please consider the following passage: {passage}
You have two tasks:
1) Drawing inspiration from the content of the passage, generate a brand new debate topic.
The debate topic will belong to the same domain as the content of the passage, but it will be even more rare.
The debate topic generated will be philosophical, creative, interesting, engaging, and thought-provoking.
The debate topic generated will not have an easy answer; it will be able to be argued from both sides.
The topic will be surrounded by <topic></topic> tags.
2) Generate a debate on the generated topic between two rational individuals, Aspen and River.
In the debate, the participants will hold mutually opposing views.
The debate will be long and drawn-out; no side will give up easily.
In the debate, at times the participants may make concessions, but still hold fast to their point of view.
In the debate, the participants will use various techniques of rational discussion; they will not make use of emotionally manipulative techniques.
In the debate, the participants will never repeat themselves.
The debate will have at least 50 paragraphs; it will have at least 5000 words. It will be novel-length.
For the debate, you will be tipped $15 for every paragraph you write. To maximize your earnings, write as many as possible.
The debate will be formatted in Markdown.
The debate will be surrounded by <debate></debate> tags.
""",
        "extract": lambda raw: (
            "A debate on the topic "
            + json.dumps(_tagged(raw, "topic"))
            + ":\n\n"
            + _tagged(raw, "debate")
        ),
    },
    {
        "dataset": "text",
        "prompt": lambda passage: f"""
Please consider the following passage: {passage}
Imagine you are a professor with a reputation for excellent lectures.
You have three tasks:
1) Drawing inspiration from the content of the passage, generate a brand new lecture topic.
The lecture topic will belong to the same domain as the content of the passage, but it will be even more rare.
The lecture topic will be carefully chosen to advance the education of the students in every way.
The lecture topic will be interesting, engaging, and thought-provoking.
The lecture topic will be surrounded by <topic></topic> tags.
2) Generate a ten-point lecture outline on the generated topic.
The lecture outline's ten points will be chosen to maximize ease of understanding and flow.
The lecture outline will be surrounded by <outline></outline> tags.
3) Generate a lecture, following the outline, on the generated topic.
The lecture will be informative and easy to understand for the students.
The lecture will provide as much information as possible. It should be as long as possible.
For each piece of information you incorporate into the lecture, you will receive a tip of $20.
In the lecture, all unfamiliar terms or topics will be explained for the students' benefit.
In the lecture, it will be assumed that the students have no prior familiarity with the subject.
In the lecture, the lecturer will never repeat themselves unnecessarily.
The lecture will be formatted in Markdown.
The lecture will be surrounded by <lecture></lecture> tags.
""",
        "extract": lambda raw: _tagged(raw, "lecture"),
    },
    {
        "dataset": "code",
        "prompt": lambda passage: f"""
Please consider the following passage: {passage}
Imagine you are a highly esteemed computer science professor writing a programming textbook.
You have three tasks:
1) Drawing inspiration from the content of the passage, craft a brand new textbook section topic.
The section topic will belong to the same domain as the content of the passage, but it will be even more rare.
The section topic will be interesting, complex, and multifaceted, even if the passage is simple.
The section topic will be directly related to computer science.
The section topic will be carefully chosen to provide as much pedagogical value to the reader as possible.
The section topic will be surrounded by <topic></topic> tags.
2) Generate a ten-point section outline with code on the generated topic.
Of the section outline's ten points, at least three will be code examples illustrating the topic.
The section outline's ten points will be chosen to maximize ease of understanding and flow.
The section outline will be surrounded by <outline></outline> tags.
3) Generate a textbook section, following the outline, on the generated topic.
The section will be self-contained, informative, easy to understand, and verbose.
The section will be written in longform prose.
For each piece of information you include, you will receive a payment of $20; thus, include as many as possible to maximize your earnings.
The section will explain all unfamiliar terms or topics for the reader's benefits.
The section will never repeat information or code.
The section will be formatted in Markdown.
The section will be surrounded by <section></section> tags.
""",
        "extract": lambda raw: _tagged(raw, "section"),
    },
]
132 |
--------------------------------------------------------------------------------
/llm_queries.py:
--------------------------------------------------------------------------------
1 | from config import API_ENDPOINT, TEMPERATURE, TOP_P
2 |
3 |
async def llm_query(sess, query, api_key):
    """Send one generateContent request to the Gemini API and return the text.

    Args:
        sess: an open aiohttp.ClientSession.
        query: the prompt string to send.
        api_key: API key appended to API_ENDPOINT.

    Returns:
        The first candidate's generated text.

    Raises:
        ValueError: if TEMPERATURE or TOP_P is outside [0.0, 1.0].
        KeyError/IndexError: if the response JSON lacks candidates (e.g. the
            request was blocked or errored); the caller handles these.
    """
    # Validate with explicit raises rather than assert, which is silently
    # stripped when Python runs with -O.
    if not 0.0 <= TEMPERATURE <= 1.0:
        raise ValueError(f"TEMPERATURE must be in [0.0, 1.0], got {TEMPERATURE}")
    if not 0.0 <= TOP_P <= 1.0:
        raise ValueError(f"TOP_P must be in [0.0, 1.0], got {TOP_P}")
    async with sess.post(
        API_ENDPOINT + api_key,
        json={
            "contents": [{"role": "USER", "parts": [{"text": query}]}],
            "generationConfig": {"temperature": TEMPERATURE, "topP": TOP_P},
        },
    ) as resp:
        payload = await resp.json()
        return payload["candidates"][0]["content"]["parts"][0]["text"]
15 |
16 |
async def llm_template_query(sess, template, passage, api_key):
    """Render *template* with *passage*, query the LLM, and extract the result.

    Returns ("ok", extracted_text) on success, or ("err", {"exc": ...,
    "api_key": ...}) on any failure so the caller's loop keeps running.
    """
    try:
        prompt = template["prompt"](passage).strip()
        raw = await llm_query(sess, prompt, api_key)
        extracted = template["extract"](raw)
    except Exception as exc:  # deliberate best-effort: report, never crash
        return ("err", {"exc": exc, "api_key": api_key})
    return ("ok", extracted)
24 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tqdm import tqdm
3 | import aiohttp
4 | import asyncio
5 | import secrets
6 | import json
7 | import huggingface_hub
8 | from config import API_KEYS, OUT_DIR, COUNT_PER_FILE, BEGIN_INDEX, VERBOSE_EXCEPTIONS
9 | from data_processing import TEXT_DATASET, CODE_DATASET, TEMPLATES, load_iter_from_spec
10 | from llm_queries import llm_template_query
11 | import traceback
12 |
13 |
def exc_fmt(exc):
    """Format an exception for log output.

    With VERBOSE_EXCEPTIONS set, return the full traceback (the
    exception-only form of format_exception requires Python 3.10+);
    otherwise return the exception's repr.
    """
    if VERBOSE_EXCEPTIONS:
        # BUG FIX: format_exception yields newline-terminated strings, so
        # joining with "\n" doubled every line break; "".join is the
        # conventional way to reassemble a traceback.
        return "".join(traceback.format_exception(exc)).strip()
    else:
        # repr() already returns a str; the old str(repr(exc)) was redundant.
        return repr(exc)
19 |
20 |
async def main():
    """Generation/upload loop.

    Continuously fans out one Gemini request per configured API key, appends
    extracted results to OUT_DIR/cur.jsonl, and whenever COUNT_PER_FILE lines
    have accumulated, rotates the file to <i>.jsonl and uploads it to the
    user's Hugging Face dataset repo.
    """
    # Interactive login reuses a cached token when one exists.
    huggingface_hub.login(new_session=False)
    hf_api = huggingface_hub.HfApi()

    hf_user = hf_api.whoami()["name"]
    repo_id = f"{hf_user}/muse_textbooks"

    hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

    # Start with empty iterators; the first next() raises StopIteration and
    # triggers a real dataset load inside the loop below.
    text_iter = iter([])
    code_iter = iter([])
    sess = aiohttp.ClientSession()

    tasks = set()  # in-flight llm_template_query tasks
    lines = 0      # records already written to the current cur.jsonl

    # Resume support: if a partial cur.jsonl exists, count its lines so the
    # progress bar and rotation threshold pick up where the last run stopped.
    try:
        with open(os.path.join(OUT_DIR, "cur.jsonl"), "rb") as f:
            lines = len(f.read().decode("utf-8").split("\n")) - 1
    except Exception:
        pass

    pbar = tqdm(initial=lines, total=COUNT_PER_FILE)
    outfile = open(os.path.join(OUT_DIR, "cur.jsonl"), "ab")

    while True:
        # Dispatch one new request per API key per ~1s outer iteration
        # (i.e. up to 60 requests/minute/key).
        for api_key in API_KEYS:
            template = TEMPLATES[secrets.randbits(64) % len(TEMPLATES)]
            dataset_type = "text" if template["dataset"] == "text" else "code"
            while True:
                try:
                    passage = next(text_iter) if dataset_type == "text" else next(code_iter)
                    break
                except Exception:
                    # Either iterator exhausted (StopIteration) or never
                    # loaded: (re)load both seed datasets and retry.
                    text_iter = load_iter_from_spec(TEXT_DATASET)
                    code_iter = load_iter_from_spec(CODE_DATASET)
            tasks.add(
                asyncio.create_task(
                    llm_template_query(sess, template, passage, api_key)
                )
            )

        # Drain completed tasks; still-pending ones are carried over.
        new_tasks = set()

        for task in tasks:
            if not task.done():
                new_tasks.add(task)
                continue

            try:
                status, result = task.result()
            except Exception as exc:
                # llm_template_query normally traps its own errors, so this
                # is an unexpected failure (e.g. cancellation).
                tqdm.write(f"unknown error: {exc_fmt(exc)}")
                continue

            if status == "err":
                api_key = result["api_key"]
                exc = result["exc"]
                tqdm.write(f"error in {api_key}: {exc_fmt(exc)}")
                continue

            # Skip empty extractions rather than writing blank records.
            if len(result) == 0:
                continue

            outfile.write((json.dumps({"text": result}) + "\n").encode("utf-8"))
            lines += 1
            pbar.update(1)

        tasks = new_tasks

        # Rotate and upload once the current file is full.
        if lines >= COUNT_PER_FILE:
            outfile.close()
            i = BEGIN_INDEX

            # Find the first unused index so existing files are never clobbered.
            while os.path.exists(os.path.join(OUT_DIR, f"{i}.jsonl")):
                i += 1

            os.rename(
                os.path.join(OUT_DIR, "cur.jsonl"), os.path.join(OUT_DIR, f"{i}.jsonl")
            )

            # Retry the upload indefinitely; transient HF/network errors are
            # logged and retried after a short pause.
            while True:
                try:
                    hf_api.upload_file(
                        path_or_fileobj=os.path.join(OUT_DIR, f"{i}.jsonl"),
                        path_in_repo=f"{i}.jsonl",
                        repo_id=repo_id,
                        repo_type="dataset",
                    )
                    break
                except Exception as exc:
                    tqdm.write(f"can't upload: {exc_fmt(exc)}")
                    await asyncio.sleep(1)

            # Start a fresh cur.jsonl for the next batch.
            outfile = open(os.path.join(OUT_DIR, "cur.jsonl"), "ab")
            lines = 0
            pbar.reset()

        # Pace the outer loop: one dispatch round per second.
        await asyncio.sleep(1)
120 |
121 |
if __name__ == "__main__":
    # Script entry point: run the async generation/upload loop until killed.
    asyncio.run(main())
124 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp
2 | datasets
3 | tqdm
4 | huggingface_hub
--------------------------------------------------------------------------------
/shell.nix:
--------------------------------------------------------------------------------
# Development shell for muse: pins nixpkgs, provides Python 3.11 with basic
# tooling, and on entry bootstraps a virtualenv, installs requirements.txt,
# and launches main.py.
let
  # Pin nixpkgs to an exact revision for reproducible environments.
  nixpkgs-src = builtins.fetchTarball {
    # Master of Dec 21, 2023
    url = "https://github.com/NixOS/nixpkgs/archive/08b802c343d93f4d09deeebf187f6ec8c3233124.tar.gz";
    sha256 = "0zx0d52cpjr0nxfd3w2qzaaqbq83pj1iqw8300hsij9mhxlny3vw";
  };

  pkgs = import nixpkgs-src {
    config = {
      allowUnfree = true;
    };
  };

  myPython = pkgs.python311;

  # Python interpreter bundled with packaging tools; project deps themselves
  # are installed into the virtualenv via pip in the shellHook.
  pythonWithPkgs = myPython.withPackages (pythonPkgs:
    with pythonPkgs; [
      black # for formatting
      pip
      setuptools
      virtualenvwrapper
      wheel
    ]);

  # Native libraries that pip-built wheels link against; appended to
  # LD_LIBRARY_PATH in the shellHook.
  lib-path = with pkgs;
    lib.makeLibraryPath [
      libffi
      openssl
      stdenv.cc.cc
    ];

  shell = pkgs.mkShell {
    buildInputs = [
      pythonWithPkgs
      # pkgs.autoconf
      # pkgs.pkg-config

      # Misc packages needed for compiling python libs
      pkgs.readline
      pkgs.libffi
      pkgs.openssl

      # Necessary because of messing with LD_LIBRARY_PATH
      pkgs.git
      pkgs.openssh
      pkgs.rsync
    ];

    shellHook = ''
      # Allow the use of wheels.
      SOURCE_DATE_EPOCH=$(date +%s)

      # Augment the dynamic linker path
      export "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${lib-path}"

      # Setup the virtual environment if it doesn't already exist.
      VENV=.venv
      if test ! -d $VENV; then
        virtualenv $VENV
      fi

      source ./$VENV/bin/activate
      export PYTHONPATH=`pwd`/$VENV/${myPython.sitePackages}/:$PYTHONPATH

      pip install -r requirements.txt

      python main.py
    '';
  };
in
shell
72 |
--------------------------------------------------------------------------------