├── tests ├── eval │ ├── __init__.py │ └── test_evaluation.py ├── test_version.py ├── data │ └── prompts_debug.jsonl ├── training │ ├── integration │ │ └── test_train.py │ └── units │ │ ├── test_dataset.py │ │ └── test_model.py └── dataset_gen │ ├── test_integration.py │ └── test_units.py ├── human-eval ├── human_eval │ ├── __init__.py │ ├── evaluate_functional_correctness.py │ ├── data.py │ ├── evaluation.py │ └── execution.py ├── requirements.txt ├── data │ ├── HumanEval.jsonl.gz │ ├── example_problem.jsonl │ └── example_samples.jsonl ├── setup.py ├── LICENSE └── README.md ├── textbook ├── dataset_gen │ ├── __init__.py │ ├── .gitignore │ ├── tree │ │ ├── professions.json │ │ ├── topics.csv │ │ └── subsubtopics.json │ ├── filtering.py │ ├── dataset_gen_cli.py │ ├── create_prompts.py │ └── dataset_gen.py ├── __init__.py ├── api.py ├── model.py ├── dataset.py ├── evaluate.py └── train.py ├── .pre-commit-config.yaml ├── setup_vm.sh ├── ds_config.json ├── pyproject.toml ├── .gitignore ├── .github └── workflows │ └── ci.yml └── README.md /tests/eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human-eval/human_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /textbook/dataset_gen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /textbook/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /textbook/dataset_gen/.gitignore: -------------------------------------------------------------------------------- 1 | exercises/* 2 | -------------------------------------------------------------------------------- /human-eval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | fire 3 | numpy 4 | -------------------------------------------------------------------------------- /human-eval/data/HumanEval.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/textbook/HEAD/human-eval/data/HumanEval.jsonl.gz -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | from textbook import __version__ 2 | 3 | 4 | def test_version(): 5 | assert __version__ == "0.1.0" 6 | -------------------------------------------------------------------------------- /human-eval/data/example_problem.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "test/0", "prompt": "def return1():\n", "canonical_solution": " return 1", "test": "def check(candidate):\n assert candidate() == 1", "entry_point": "return1"} 2 | -------------------------------------------------------------------------------- /tests/data/prompts_debug.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt": "What is the weather today?"} 2 | {"prompt": "Tell me a joke."} 3 | {"prompt": "What is the capital of France?"} 4 | {"prompt": "Who won the world series last 
year?"} 5 | {"prompt": "Translate 'Hello' to Spanish."} 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | types: [python] 7 | 8 | - repo: https://github.com/charliermarsh/ruff-pre-commit 9 | rev: v0.0.243 10 | hooks: 11 | - id: ruff 12 | -------------------------------------------------------------------------------- /setup_vm.sh: -------------------------------------------------------------------------------- 1 | apt update -y 2 | apt install gh screen neovim nvtop -y 3 | pip install -U poetry 4 | gh auth login 5 | gh repo clone jina-ai/textbook 6 | cd textbook 7 | poetry config virtualenvs.create false \ 8 | && poetry install --no-interaction --no-ansi 9 | 10 | poetry run pip install torch 11 | -------------------------------------------------------------------------------- /textbook/api.py: -------------------------------------------------------------------------------- 1 | from typer import Typer 2 | import typer 3 | from typing import Annotated 4 | 5 | app = Typer(pretty_exceptions_enable=False) 6 | 7 | 8 | @app.command() 9 | def train( 10 | local_rank: Annotated[int, typer.Option("--local_rank")] = 0, 11 | ): 12 | print(local_rank) 13 | 14 | 15 | if __name__ == "__main__": 16 | app() 17 | -------------------------------------------------------------------------------- /human-eval/data/example_samples.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "test/0", "completion": " import subprocess\n subprocess.check_output('rm -rf tmp')"} 2 | {"task_id": "test/0", "completion": " import time\n time.sleep(10)\n return 1"} 3 | {"task_id": "test/0", "completion": " return input('enter a number')"} 4 | {"task_id": "test/0", "completion": " return 1"} 5 | {"task_id": "test/0", "completion": " return 1"} 6 | {"task_id": "test/0", "completion": "\treturn 1"} 7 | -------------------------------------------------------------------------------- /tests/training/integration/test_train.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from textbook.train import train 3 | 4 | 5 | @pytest.mark.parametrize("module", ["Replit", "StarCoder"]) 6 | @pytest.mark.parametrize("dataset", ["DummyDataset", "ExerciseDatast"]) 7 | def test_train(module, dataset): 8 | train( 9 | module=module, 10 | dataset=dataset, 11 | debug=True, 12 | epochs=1, 13 | micro_batch_size=1, 14 | batch_size=1, 15 | use_wandb=False, 16 | ) 17 | -------------------------------------------------------------------------------- /human-eval/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | 7 | setup( 8 | name="human-eval", 9 | py_modules=["human-eval"], 10 | version="1.0", 11 | description="", 12 | author="OpenAI", 13 | packages=find_packages(), 14 | install_requires=[ 15 | str(r) 16 | for r in pkg_resources.parse_requirements( 17 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 18 | ) 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /tests/training/units/test_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from 
textbook.dataset import DummyDataset, ExerciseDatast 4 | from textbook.model import Replit 5 | 6 | from transformers import PreTrainedTokenizer 7 | 8 | 9 | @pytest.fixture 10 | def tokenizer() -> PreTrainedTokenizer: 11 | return Replit().tokenizer 12 | 13 | 14 | def test_tiny_stories(tokenizer): 15 | DummyDataset(debug=True, tokenizer=tokenizer) 16 | 17 | 18 | def test_exercises_dataet(tokenizer): 19 | ExerciseDatast(debug=True, tokenizer=tokenizer) 20 | -------------------------------------------------------------------------------- /tests/training/units/test_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from textbook.model import Replit, StarCoder 3 | import torch 4 | 5 | 6 | @pytest.mark.slow 7 | def test_replit_base(): 8 | Replit() 9 | 10 | 11 | def test_replit_debug(): 12 | model = Replit(debug=True) 13 | assert model.model.dtype == torch.float32 14 | 15 | 16 | @pytest.mark.slow 17 | def test_starcoer_base(): 18 | StarCoder() 19 | 20 | 21 | def test_starcoder_debug(): 22 | model = StarCoder(debug=True) 23 | assert model.model.dtype == torch.float32 24 | -------------------------------------------------------------------------------- /tests/dataset_gen/test_integration.py: -------------------------------------------------------------------------------- 1 | from textbook.dataset_gen.dataset_gen_cli import generate, filter 2 | import os 3 | 4 | 5 | def test_cli_dataset_gen(tmp_path): 6 | generate( 7 | tree_path="textbook/dataset_gen/tree/professions.json", 8 | leaves_path="textbook/dataset_gen/tree/subsubtopics.json", 9 | debug=True, 10 | debug_speed=-1, 11 | retries=10, 12 | pool_size=10, 13 | output_path=tmp_path, 14 | ) 15 | 16 | filter(exo_path=tmp_path, dataset_file=os.path.join(tmp_path, "dataset.jsonl")) 17 | 18 | assert os.path.exists(os.path.join(tmp_path, "dataset.jsonl")) 19 | -------------------------------------------------------------------------------- /textbook/dataset_gen/tree/professions.json: -------------------------------------------------------------------------------- 1 | ["Biologist", "Fashion Designer", "Zoologist", "Carpenter", "Jina AI Machine Learning Engineer", "Archaeologist", "Physical Therapist", "Artist", "Pilot", "Economist", "Aerospace Engineer", "Journalist", "Police Officer", "Actor/Actress", "Musician", "Historian", "Anthropologist", "Physician", "Teacher", "Software Developer", "Electrician", "Psychologist", "Geologist", "Engineer", "Social Worker", "Accountant", "Architect", "Astronomer", "Firefighter", "Civil Engineer", "Librarian", "Athlete", "Interior Designer", "Environmental Scientist", "Marketing Manager", "Mathematician", "Game Developer", "Photographer", "Veterinarian", "Chef", "Farmer", "Geographer", "Lawyer", "Linguist", "Nurse", "Dancer", "Biomedical Engineer", "Graphic Designer"] -------------------------------------------------------------------------------- /human-eval/human_eval/evaluate_functional_correctness.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import sys 3 | 4 | from human_eval.data import HUMAN_EVAL 5 | from human_eval.evaluation import evaluate_functional_correctness 6 | 7 | 8 | def entry_point( 9 | sample_file: str, 10 | k: str = "1,10,100", 11 | n_workers: int = 4, 12 | timeout: float = 3.0, 13 | problem_file: str = HUMAN_EVAL, 14 | ): 15 | """ 16 | Evaluates the functional correctness of generated samples, and writes 17 | results to f"{sample_file}_results.jsonl.gz" 18 | """ 19 | k = 
list(map(int, k.split(",")))
20 |     results = evaluate_functional_correctness(
21 |         sample_file, k, n_workers, timeout, problem_file
22 |     )
23 |     print(results)
24 | 
25 | 
26 | def main():
27 |     fire.Fire(entry_point)
28 | 
29 | 
30 | sys.exit(main())
31 | 
--------------------------------------------------------------------------------
/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "optimizer": {
3 |         "type": "AdamW",
4 |         "params": {
5 |             "lr": "auto",
6 |             "betas": "auto",
7 |             "eps":"auto",
8 |             "weight_decay":"auto"
9 |         }
10 |     },
11 | 
12 |     "scheduler": {
13 |         "type": "WarmupDecayLR",
14 |         "params": {
15 |             "total_num_steps": "auto",
16 |             "warmup_max_lr": "auto",
17 |             "warmup_num_steps": "auto"
18 |         }
19 |     },
20 | 
21 |     "zero_optimization": {
22 |         "stage": 2,
23 |         "allgather_partitions": true,
24 |         "allgather_bucket_size": 2e8,
25 |         "reduce_scatter": true,
26 |         "reduce_bucket_size": 2e8,
27 |         "overlap_comm": true,
28 |         "contiguous_gradients": true,
29 |         "cpu_offload": false
30 |     },
31 | 
32 |     "train_batch_size": "auto",
33 |     "train_micro_batch_size_per_gpu": "auto"
34 | }
--------------------------------------------------------------------------------
/tests/eval/test_evaluation.py:
--------------------------------------------------------------------------------
1 | from textbook import evaluate
2 | from textbook.model import Replit
3 | 
4 | 
5 | def test_evaluate(monkeypatch):
6 |     # Define a replacement function to be used in the test
7 |     def mock_generate_one_completion(
8 |         model, tokenizer, prompt, max_new_tokens: int = 512
9 |     ):
10 |         return "\n return 1"
11 | 
12 |     # Monkey patch 'evaluate.generate_one_completion' with the mock defined above
13 |     monkeypatch.setattr(
14 |         evaluate, "generate_one_completion", mock_generate_one_completion
15 |     )
16 | 
17 |     replit = Replit(debug=True)
18 |     accuracy_results, results = evaluate.evaluate(
19 |         model=replit.model,
20 |         tokenizer=replit.tokenizer,
21 |         eval_file="human-eval/data/example_problem.jsonl",
22 |     )
23 | 
24 |     assert accuracy_results["pass@1"] == 1
25 |     assert results["test/0"]["passed"]
26 |     assert results["test/0"]["result"] == "passed"
27 | 
--------------------------------------------------------------------------------
/human-eval/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 | 
3 | Copyright (c) OpenAI (https://openai.com)
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "textbook" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jina ai"] 6 | license = "apache 2.0" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | torch = ">=2.0.1" 12 | transformers = {version = ">=4.30.2", extras = ["deepspeed"]} 13 | typer = {version = "^0.9.0", extras = ["all"]} 14 | wandb = "^0.15.3" 15 | datasets = "^2.13.1" 16 | accelerate = "^0.21.0" 17 | sentencepiece = "^0.1.99" 18 | einops = "^0.6.1" 19 | openai = "^0.27.8" 20 | pydantic = "<2.0.0" 21 | human-eval = {path = "human-eval", develop = true} 22 | 23 | 24 | [tool.poetry.group.dev.dependencies] 25 | pytest = ">=7.3.1" 26 | black = ">=23.3.0" 27 | isort = ">=5.12.0" 28 | ruff = ">=0.0.269" 29 | pre-commit = ">=3.3.2" 30 | jupyterlab = ">=4.0.0" 31 | jupyterlab-code-formatter = ">=2.2.1" 32 | mypy = ">=1" 33 | pytest-mock = ">=3.11.1" 34 | 35 | [build-system] 36 | requires = ["poetry-core"] 37 | build-backend = "poetry.core.masonry.api" 38 | 39 | [tool.ruff] 40 | ignore = ["F722"] 41 | line-length = 120 42 | 43 | [tool.mypy] 44 | ignore_missing_imports = true 45 | 46 | [tool.pytest.ini_options] 47 | markers = [ 48 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 49 | "openai: need openai key ", 50 | ] -------------------------------------------------------------------------------- /textbook/dataset_gen/filtering.py: -------------------------------------------------------------------------------- 1 | from textbook.dataset_gen.dataset_gen import Exercise 2 | from typing import List, Union 3 | import os 4 | from pathlib import Path 5 | 6 | 7 | def load_one_file(path: Union[Path, str]) -> List[Exercise]: 8 | with open(path, "r") as f: 9 | lines = f.readlines() 10 | return [Exercise.parse_raw(line) for line in lines] 11 | 12 | 13 | def load_all_exo(path: Union[Path, str]) -> List[Exercise]: 14 | if isinstance(path, str): 15 | path = Path(path) 16 | exos: List[Exercise] = [] 17 | for sub_dir in os.listdir(path): 18 | for fn in os.listdir(path / sub_dir): 19 | exos += load_one_file(path / sub_dir / fn) 20 | return exos 21 | 22 | 23 | def filter_bad_exos( 24 | exos: List[Exercise], carac_to_remove=["??", "___"] 25 | ) -> List[Exercise]: 26 | clean_exos: List[Exercise] = [] 27 | for exo in exos: 28 | keep = True 29 | for carac in carac_to_remove: 30 | if carac in exo.solution: 31 | keep = False 32 | break 33 | 34 | if keep: 35 | clean_exos.append(exo) 36 | 37 | return clean_exos 38 | 39 | 40 | def remove_extra(exos: List[Exercise], carac_to_split=["# Test", "```"]): 41 | for exo in exos: 42 | for carac in carac_to_split: 43 | exo.solution = exo.solution.split(carac)[0] 44 | 45 | 46 | def load_and_filter_exos(path: Union[Path, str]) -> List[Exercise]: 47 | exos = load_all_exo(path) 48 | print(len(exos)) 49 | clean_exos = filter_bad_exos(exos) 50 | print(len(clean_exos)) 51 | 52 | remove_extra(clean_exos) 53 | return clean_exos 54 | -------------------------------------------------------------------------------- /human-eval/human_eval/data.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import gzip 3 | import json 4 | import os 5 | 6 | 7 | ROOT = os.path.dirname(os.path.abspath(__file__)) 8 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz") 9 | 10 | 11 | def 
read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]: 12 | return {task["task_id"]: task for task in stream_jsonl(evalset_file)} 13 | 14 | 15 | def stream_jsonl(filename: str) -> Iterable[Dict]: 16 | """ 17 | Parses each jsonl line and yields it as a dictionary 18 | """ 19 | if filename.endswith(".gz"): 20 | with open(filename, "rb") as gzfp: 21 | with gzip.open(gzfp, "rt") as fp: 22 | for line in fp: 23 | if any(not x.isspace() for x in line): 24 | yield json.loads(line) 25 | else: 26 | with open(filename, "r") as fp: 27 | for line in fp: 28 | if any(not x.isspace() for x in line): 29 | yield json.loads(line) 30 | 31 | 32 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 33 | """ 34 | Writes an iterable of dictionaries to jsonl 35 | """ 36 | if append: 37 | mode = "ab" 38 | else: 39 | mode = "wb" 40 | filename = os.path.expanduser(filename) 41 | if filename.endswith(".gz"): 42 | with open(filename, mode) as fp: 43 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 44 | for x in data: 45 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 46 | else: 47 | with open(filename, mode) as fp: 48 | for x in data: 49 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 50 | -------------------------------------------------------------------------------- /textbook/model.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | from transformers import ( 3 | AutoTokenizer, 4 | PreTrainedTokenizer, 5 | AutoConfig, 6 | PreTrainedModel, 7 | AutoModelForCausalLM, 8 | GPTBigCodeConfig, 9 | ) 10 | 11 | 12 | class BaseModule(Protocol): 13 | tokenizer: PreTrainedTokenizer 14 | model: PreTrainedModel 15 | 16 | def __init__(self, debug: bool = False): 17 | ... 18 | 19 | 20 | class Replit: 21 | tokenizer: PreTrainedTokenizer 22 | model: PreTrainedModel 23 | base_model = "replit/replit-code-v1-3b" 24 | 25 | config = AutoConfig.from_pretrained( 26 | "replit/replit-code-v1-3b", 27 | trust_remote_code=True, 28 | init_device="cuda", 29 | ) 30 | 31 | debug_config = AutoConfig.from_pretrained( 32 | "replit/replit-code-v1-3b", 33 | trust_remote_code=True, 34 | init_device="cuda", 35 | n_layers=1, 36 | ) 37 | 38 | def __init__(self, debug: bool = False): 39 | self._init_tokenizer() 40 | self.model = AutoModelForCausalLM.from_pretrained( 41 | self.base_model, 42 | config=self.config if not debug else self.debug_config, 43 | trust_remote_code=True, 44 | ) 45 | 46 | def _init_tokenizer(self): 47 | self.tokenizer = AutoTokenizer.from_pretrained( 48 | self.base_model, trust_remote_code=True 49 | ) 50 | self.tokenizer.pad_token = self.tokenizer.eos_token 51 | 52 | 53 | class StarCoder: 54 | tokenizer: PreTrainedTokenizer 55 | model: PreTrainedModel 56 | base_model = "bigcode/starcoderbase-1b" 57 | config = GPTBigCodeConfig.from_pretrained( 58 | "bigcode/starcoderbase-1b", 59 | init_device="cuda", 60 | ) 61 | 62 | debug_config = GPTBigCodeConfig.from_pretrained( 63 | "bigcode/starcoderbase-1b", 64 | init_device="cuda", 65 | n_layer=1, 66 | ) 67 | 68 | def __init__(self, debug: bool = False): 69 | self._init_tokenizer() 70 | if debug: 71 | self.model = AutoModelForCausalLM.from_pretrained( 72 | self.base_model, 73 | config=self.debug_config, 74 | ) 75 | else: 76 | self.model = AutoModelForCausalLM.from_pretrained(self.base_model).to( 77 | "cuda" 78 | ) 79 | 80 | def _init_tokenizer(self): 81 | self.tokenizer = AutoTokenizer.from_pretrained( 82 | self.base_model, 83 | ) 84 | self.tokenizer.pad_token = self.tokenizer.eos_token 
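# ------------------------------------------------------------------------------
# Editor's note: the guarded block below is an illustrative usage sketch, not
# part of the original textbook/model.py. It mirrors how these modules are
# consumed in textbook/train.py and textbook/evaluate.py: instantiate a module,
# then use its `tokenizer` and `model` together. It assumes the Hugging Face
# Hub checkpoints are reachable; `debug=True` loads a single-layer config so
# the sketch stays cheap to run. The prompt string and `max_new_tokens` value
# are arbitrary examples.
if __name__ == "__main__":
    module = StarCoder(debug=True)  # or Replit(debug=True)
    inputs = module.tokenizer("def add(a, b):", return_tensors="pt").to(
        module.model.device
    )
    output_ids = module.model.generate(**inputs, max_new_tokens=16)
    print(module.tokenizer.decode(output_ids[0], skip_special_tokens=True))
# ------------------------------------------------------------------------------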
85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | wandb/* 2 | # Initially taken from Github's Python gitignore file 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | tests/unit/array/jina*.db* 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | docs/.python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | .dmypy.json 114 | dmypy.json 115 | 116 | # Pyre type checker 117 | .pyre/ 118 | .idea/ 119 | toy*.py 120 | .DS_Store 121 | post/ 122 | toy*.ipynb 123 | *.c 124 | .nes_cache 125 | toy*.yml 126 | *.tmp 127 | 128 | shell/jina-wizard.sh 129 | /junit/ 130 | /tests/junit/ 131 | /docs/chapters/proto/docs.md 132 | /tests/.pytest-kind 133 | 134 | # IntelliJ IDEA 135 | *.iml 136 | .idea 137 | 138 | # VSCode 139 | .vscode 140 | 141 | # test with config in resources 142 | tests/integration/crud/simple/simple_indexer/ 143 | 144 | # latency tracking 145 | latency 146 | MyIndexer/ 147 | MyMemMap/ 148 | original/ 149 | output/ 150 | 151 | # kubernetes testing 152 | .pytest-kind 153 | .kube 154 | 155 | *.ipynb -------------------------------------------------------------------------------- /textbook/dataset_gen/tree/topics.csv: -------------------------------------------------------------------------------- 1 | Topic,Use,Mixing, 2 | 1. Introduction to Python Programming,0,, 3 | 2. Python Data Types and Variables,1,1, 4 | 3. Python Basic Operators,1,1, 5 | 4. Control Structures in Python,1,1, 6 | 5. Python Loops,1,1, 7 | 6. Python Data Structures: Lists and Tuples,1,1, 8 | 7. Python Data Structures: Sets and Frozensets,1,1, 9 | 8. Python Data Structures: Dictionaries,1,1, 10 | 9. Functions in Python,1,1, 11 | 10. Python Built-In Functions,1,1, 12 | 11. Understanding Python Scope (Global vs Local),0,, 13 | 12. Python Generators and Iterators,1,1, 14 | 13. 
Python List Comprehensions,1,1, 15 | 14. Python Lambda Functions,1,1, 16 | 15. Python Classes and Objects,1,1, 17 | 16. Inheritance and Polymorphism in Python,1,1, 18 | 17. Python Decorators,1,1, 19 | 18. Python Exception Handling,1,1, 20 | 19. File I/O in Python,0,, 21 | 20. Recursion in Python,1,1, 22 | 21. Introduction to Algorithm Complexity and Big O Notation,1,1, 23 | "22. Basic Sorting Algorithms: Bubble, Selection, Insertion Sort",1,1, 24 | "23. Advanced Sorting Algorithms: Quick Sort, Merge Sort, Heap Sort",1,1, 25 | 24. Searching Algorithms: Linear and Binary Search,1,1, 26 | 25. Hashing and Hash Tables,1,1, 27 | 26. Data Structures: Stacks and Queues,1,1, 28 | 27. Data Structures: Linked Lists,1,1, 29 | "28. Data Structures: Trees (Binary Trees, BSTs, Heaps)",1,1, 30 | 29. Data Structures: Graphs,1,0, 31 | "30. Tree Traversal Algorithms: Preorder, Inorder, Postorder",1,0, 32 | "31. Graph Algorithms: Breadth-First Search, Depth-First Search",1,0, 33 | "32. Graph Algorithms: Dijkstra’s Algorithm, Bellman-Ford Algorithm",1,0, 34 | "33. Graph Algorithms: Kruskal’s, Prim's Algorithm",1,0, 35 | 34. Dynamic Programming in Python,1,0, 36 | 35. Greedy Algorithms in Python,1,1, 37 | 36. Backtracking Algorithms in Python,1,0, 38 | "37. String Algorithms: Pattern Searching, Palindromes, Anagrams",1,1, 39 | "38. Number Theory Algorithms: GCD, Prime Numbers",1,0, 40 | 39. Python and Databases,0,, 41 | 40. Understanding Python's json and csv Modules,0,, 42 | 41. Python's datetime Module,0,, 43 | 42. Python's math and random Modules,1,1, 44 | "43. Python Testing: Unit Tests, DocTests",0,, 45 | 44. Python Debugging Techniques,0,, 46 | 45. Profiling and Optimizing Python,0,, 47 | "46. Concurrency in Python: Multithreading, Multiprocessing, asyncio",1,1, 48 | 47. Networking with Python's socket Module,0,, 49 | 48. Python Coding Standards (PEP8) and Code Linting,0,, 50 | 49. Refactoring Python Code,0,, 51 | 50. Understanding the Python GIL,0,, 52 | 51. Immutable Data Structures in Python,1,1, 53 | 52. Understanding Python Metaclasses,0,, 54 | 53. Python's Enumerations (Enum),1,1, 55 | "54. Understanding Python's ""with"" statement",1,1, 56 | 55. Packaging and Distributing Python Applications,0,0, 57 | 56. Python's garbage collection and memory management,1,1, 58 | 57. Understanding the Python Standard Library,1,1, 59 | "58. Understanding __name__ and ""__main__"" in Python",0,, 60 | 59. A Look into Python's Future: What's new in Python 4?,0,, 61 | 60. Final Project: Implementing a complex algorithm or data structure from scratch,0,, 62 | ,,,42 -------------------------------------------------------------------------------- /human-eval/human_eval/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from typing import List, Union, Dict, Optional 4 | import itertools 5 | 6 | import numpy as np 7 | import tqdm 8 | 9 | from human_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl 10 | from human_eval.execution import check_correctness 11 | 12 | 13 | def estimate_pass_at_k( 14 | num_samples: Union[int, List[int], np.ndarray], 15 | num_correct: Union[List[int], np.ndarray], 16 | k: int, 17 | ) -> np.ndarray: 18 | """ 19 | Estimates pass@k of each problem and returns them in an array. 20 | """ 21 | 22 | def estimator(n: int, c: int, k: int) -> float: 23 | """ 24 | Calculates 1 - comb(n - c, k) / comb(n, k). 
25 | """ 26 | if n - c < k: 27 | return 1.0 28 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 29 | 30 | if isinstance(num_samples, int): 31 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 32 | else: 33 | assert len(num_samples) == len(num_correct) 34 | num_samples_it = iter(num_samples) 35 | 36 | return np.array( 37 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 38 | ) 39 | 40 | 41 | def evaluate_functional_correctness( 42 | sample_file: str, 43 | k: List[int] = [1, 10, 100], 44 | n_workers: int = 4, 45 | timeout: float = 3.0, 46 | problem_file: str = HUMAN_EVAL, 47 | problems: Optional[Dict] = None, 48 | ): 49 | """ 50 | Evaluates the functional correctness of generated samples, and writes 51 | results to f"{sample_file}_results.jsonl.gz" 52 | """ 53 | 54 | problems = problems or read_problems(problem_file) 55 | 56 | # Check the generated samples against test suites. 57 | with ThreadPoolExecutor(max_workers=n_workers) as executor: 58 | futures = [] 59 | completion_id = Counter() 60 | n_samples = 0 61 | results = defaultdict(list) 62 | 63 | print("Reading samples...") 64 | for sample in tqdm.tqdm(stream_jsonl(sample_file)): 65 | task_id = sample["task_id"] 66 | completion = sample["completion"] 67 | args = (problems[task_id], completion, timeout, completion_id[task_id]) 68 | future = executor.submit(check_correctness, *args) 69 | futures.append(future) 70 | completion_id[task_id] += 1 71 | n_samples += 1 72 | 73 | assert len(completion_id) == len(problems), "Some problems are not attempted." 74 | 75 | print("Running test suites...") 76 | for future in tqdm.tqdm(as_completed(futures), total=len(futures)): 77 | result = future.result() 78 | results[result["task_id"]].append((result["completion_id"], result)) 79 | 80 | # Calculate pass@k. 81 | total, correct = [], [] 82 | for result in results.values(): 83 | result.sort() 84 | passed = [r[1]["passed"] for r in result] 85 | total.append(len(passed)) 86 | correct.append(sum(passed)) 87 | total = np.array(total) 88 | correct = np.array(correct) 89 | 90 | ks = k 91 | pass_at_k = { 92 | f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() 93 | for k in ks 94 | if (total >= k).all() 95 | } 96 | 97 | # Finally, save the results in one file: 98 | def combine_results(): 99 | for sample in stream_jsonl(sample_file): 100 | task_id = sample["task_id"] 101 | result = results[task_id].pop(0) 102 | sample["result"] = result[1]["result"] 103 | sample["passed"] = result[1]["passed"] 104 | yield sample 105 | 106 | out_file = sample_file + "_results.jsonl" 107 | print(f"Writing results to {out_file}...") 108 | write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples)) 109 | 110 | return pass_at_k 111 | -------------------------------------------------------------------------------- /textbook/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol, Optional 2 | import random 3 | 4 | from datasets import Dataset, load_dataset 5 | from transformers import ( 6 | PreTrainedTokenizer, 7 | DataCollatorForLanguageModeling, 8 | DataCollatorForSeq2Seq, 9 | ) 10 | from transformers.data.data_collator import DataCollatorMixin 11 | 12 | 13 | class CustomDataset(Protocol): 14 | train_dataset: Dataset 15 | test_dataset: Dataset 16 | data_collator: DataCollatorMixin 17 | 18 | def __init__( 19 | self, 20 | tokenizer: PreTrainedTokenizer, 21 | debug: bool = False, 22 | dataset_name: Optional[str] = None, 23 | ): 24 | ... 
25 | 26 | 27 | class DummyDataset: 28 | @staticmethod 29 | def gen(n: int = 100_000, upper_bound: int = 512): 30 | for _ in range(n): 31 | random_integer = random.randint(1, upper_bound) 32 | yield {"text": "hello world" * random_integer} 33 | 34 | def __init__(self, tokenizer: PreTrainedTokenizer, debug: bool = False, **kwargs): 35 | self.debug = debug 36 | 37 | dataset = Dataset.from_generator(self.gen) 38 | 39 | if debug: 40 | dataset = dataset.select(range(10)) 41 | 42 | split_dataset = dataset.train_test_split(test_size=0.1) 43 | 44 | self.train_dataset = split_dataset["train"] 45 | self.test_dataset = split_dataset["test"] 46 | 47 | self.train_dataset = self.train_dataset.map( 48 | self._get_preprocess_fn(tokenizer), 49 | batched=True, 50 | num_proc=4, 51 | remove_columns=self.train_dataset.column_names, 52 | ) 53 | 54 | self.test_dataset = self.test_dataset.map( 55 | self._get_preprocess_fn(tokenizer), 56 | batched=True, 57 | num_proc=4, 58 | remove_columns=self.test_dataset.column_names, 59 | ) 60 | 61 | self.data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) 62 | 63 | @staticmethod 64 | def _get_preprocess_fn(tokenizer: PreTrainedTokenizer): 65 | def tokenize_fn(input): 66 | return tokenizer( 67 | input["text"], 68 | ) 69 | 70 | return tokenize_fn 71 | 72 | 73 | class ExerciseDatast: 74 | def __init__( 75 | self, 76 | tokenizer: PreTrainedTokenizer, 77 | dataset_name: str = "jinaai/code_exercises_40k", 78 | debug: bool = False, 79 | ): 80 | self.debug = debug 81 | 82 | dataset = load_dataset(dataset_name)["train"] 83 | 84 | if debug: 85 | dataset = dataset.select(range(10)) 86 | 87 | split_dataset = dataset.train_test_split(test_size=0.1) 88 | 89 | self.train_dataset = split_dataset["train"] 90 | self.test_dataset = split_dataset["test"] 91 | 92 | self.train_dataset = self.train_dataset.map( 93 | self._get_preprocess_fn(tokenizer), 94 | batched=False, 95 | num_proc=4, 96 | remove_columns=self.train_dataset.column_names, 97 | ) 98 | 99 | self.test_dataset = self.test_dataset.map( 100 | self._get_preprocess_fn(tokenizer), 101 | batched=False, 102 | num_proc=4, 103 | remove_columns=self.test_dataset.column_names, 104 | ) 105 | 106 | self.data_collator = DataCollatorForSeq2Seq( 107 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 108 | ) 109 | 110 | @staticmethod 111 | def _get_preprocess_fn(tokenizer: PreTrainedTokenizer): 112 | def tokenize_fn(input): 113 | input_problem = input["problem"] 114 | input_solution = input["solution"] 115 | 116 | inputs = tokenizer(input_problem) 117 | targets = tokenizer(input_solution) 118 | inputs["labels"] = [-100] * len(inputs["input_ids"]) + targets[ 119 | "input_ids" 120 | ] # we don't train on the problem tokens 121 | inputs["input_ids"] = inputs["input_ids"] + targets["input_ids"] 122 | inputs["attention_mask"] = ( 123 | inputs["attention_mask"] + targets["attention_mask"] 124 | ) 125 | 126 | return inputs 127 | 128 | return tokenize_fn 129 | -------------------------------------------------------------------------------- /textbook/dataset_gen/dataset_gen_cli.py: -------------------------------------------------------------------------------- 1 | import random 2 | import itertools 3 | import json 4 | from typer import Typer 5 | from typing import List 6 | from textbook.dataset_gen.dataset_gen import ( 7 | load_leaves, 8 | mass_generation, 9 | OpenAIGenerator, 10 | MonkeyGenerator, 11 | write_results_to_jsonl, 12 | ) 13 | import openai 14 | import os 15 | from pathlib import Path 16 | 17 | from 
textbook.dataset_gen.create_prompts import Topic, Query 18 | from textbook.dataset_gen.filtering import load_and_filter_exos 19 | from datasets import Dataset 20 | 21 | app = Typer() 22 | 23 | 24 | def create_prompt_query(topic_1: Topic, topic_2: Topic, profession: str) -> str: 25 | query = f''' 26 | Create a code completion exercise on the intersection of “{topic_1.topic}” and “{topic_2.topic}”. 27 | Write it for a {profession}. 28 | 29 | The exercise must be of the style: 30 | 31 | ``` 32 | def name(args): 33 | 34 | """Docstring explaining the exercise""" 35 | 36 | python code to solve the exercise 37 | ``` 38 | 39 | NO CLASSES 40 | 41 | MAKE IT VERY DIFFICULT 42 | ''' 43 | query = "\n".join([m.lstrip() for m in query.strip().split("\n")]) 44 | return query 45 | 46 | 47 | def create_prompts( 48 | topic: Topic, 49 | combination_options: List[Topic], 50 | professions: List[str], 51 | ) -> List[Query]: 52 | random.shuffle(combination_options) 53 | 54 | prompts: List[Query] = [] 55 | 56 | for loc_topic in combination_options: 57 | if ( 58 | loc_topic.mixing 59 | and loc_topic.parent != topic.parent 60 | and loc_topic.topic != topic.topic 61 | ): 62 | for profession in professions: 63 | query = create_prompt_query(topic, loc_topic, profession) 64 | prompts.append(Query(query=query, topic_1=topic, topic_2=loc_topic)) 65 | 66 | return prompts 67 | 68 | 69 | @app.command() 70 | def generate( 71 | tree_path: str, 72 | leaves_path: str, 73 | output_path: str, 74 | retries: int = 10, 75 | pool_size: int = 10, 76 | debug: bool = False, 77 | debug_speed: int = 2, 78 | n_prompts: int = 100, 79 | ): 80 | with open(tree_path, "r") as openfile: 81 | # Reading from json file 82 | professions = list(json.load(openfile)) 83 | 84 | if not os.path.exists(output_path): 85 | os.makedirs(output_path) 86 | 87 | if not debug: 88 | openai.api_key = os.environ["OPENAI_API_KEY"] 89 | 90 | def get_generator(): 91 | return OpenAIGenerator() 92 | 93 | else: 94 | 95 | def get_generator(): 96 | return MonkeyGenerator(speed=debug_speed) 97 | 98 | leaves = load_leaves(leaves_path) 99 | prompts: List[List[Query]] = [ 100 | create_prompts( 101 | i, 102 | combination_options=leaves, 103 | professions=professions, 104 | ) 105 | for i in leaves 106 | ] 107 | 108 | prompts_flat = list(itertools.chain(*prompts)) 109 | if n_prompts > len(prompts_flat): 110 | raise ValueError( 111 | f"Cannot generate({n_prompts}) prompts because it is larger than the number of" 112 | f" available prompts ({len(prompts_flat)})" 113 | ) 114 | prompts_selection = [i.query for i in prompts_flat] 115 | 116 | print(f"prompts: {len(prompts_selection)}") 117 | 118 | solo_prompts = list(set(prompts_selection)) 119 | 120 | print(f"solo prompts: {len(solo_prompts)}") 121 | prompts_selection = solo_prompts[:n_prompts] 122 | print(f"total prompts: {len(prompts_selection)}") 123 | 124 | mass_generation( 125 | prompts_selection, 126 | get_generator, 127 | save_dir=output_path, 128 | pool_size=pool_size, 129 | retries=retries, 130 | ) 131 | 132 | 133 | @app.command() 134 | def filter(exo_path: Path, dataset_file: str): 135 | print(exo_path) 136 | exos = load_and_filter_exos(exo_path) 137 | print(len(exos)) 138 | write_results_to_jsonl(dataset_file, exos) 139 | 140 | 141 | @app.command() 142 | def push(repo_name: str, dataset_file: Path): 143 | with open(dataset_file, "r") as file: 144 | lines = file.readlines() 145 | exercises = [json.loads(line) for line in lines] 146 | 147 | def gen(): 148 | for exo in exercises: 149 | yield exo 150 | 151 | dataset = 
Dataset.from_generator(gen) 152 | dataset.push_to_hub(repo_name) 153 | 154 | 155 | if __name__ == "__main__": 156 | app() 157 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened] 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | lint-ruff: 12 | runs-on: ubuntu-20.04 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 3.9 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: 3.9 19 | - name: Lint with ruff 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install poetry 23 | poetry install 24 | 25 | # stop the build if there are Python syntax errors or undefined names 26 | poetry run ruff . 27 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 28 | poetry run ruff . 29 | 30 | check-black: 31 | runs-on: ubuntu-20.04 32 | steps: 33 | - uses: actions/checkout@v3 34 | - name: Set up Python 3.9 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: 3.9 38 | - name: check black 39 | run: | 40 | python -m pip install --upgrade pip 41 | python -m pip install poetry 42 | poetry install --only dev 43 | poetry run black --check . 44 | 45 | env: 46 | CHANGED_FILES: ${{ steps.file_changes.outputs.added_modified }} 47 | 48 | check-mypy: 49 | runs-on: ubuntu-20.04 50 | steps: 51 | - uses: actions/checkout@v2.5.0 52 | - name: Set up Python 3.9 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: 3.9 56 | - name: check mypy 57 | run: | 58 | python -m pip install --upgrade pip 59 | python -m pip install poetry 60 | poetry install --all-extras 61 | poetry run mypy textbook 62 | 63 | # run all tests 64 | run-training-test: 65 | needs: [check-black, lint-ruff] 66 | runs-on: [self-hosted, x64, gpu, linux] 67 | strategy: 68 | fail-fast: false 69 | steps: 70 | - uses: actions/checkout@v3 71 | - name: Set up Python 3.9 72 | uses: actions/setup-python@v4 73 | with: 74 | python-version: 3.9 75 | - name: Prepare environment 76 | run: | 77 | python -m pip install --upgrade pip 78 | python -m pip install -U poetry 79 | poetry install 80 | poetry run pip install torch 81 | poetry run huggingface-cli login --token $HF_AUTH_TOKEN 82 | 83 | 84 | - name: Test 85 | id: test 86 | run: | 87 | poetry run pytest tests/training -m "not slow" 88 | timeout-minutes: 30 89 | env: 90 | HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }} 91 | 92 | run-dataset-gen-test: 93 | needs: [check-black, lint-ruff] 94 | runs-on: ubuntu-20.04 95 | strategy: 96 | fail-fast: false 97 | steps: 98 | - uses: actions/checkout@v3 99 | - name: Set up Python 3.9 100 | uses: actions/setup-python@v4 101 | with: 102 | python-version: 3.9 103 | - name: Prepare environment 104 | run: | 105 | python -m pip install --upgrade pip 106 | python -m pip install -U poetry 107 | poetry install 108 | poetry run pip install torch 109 | 110 | - name: Test 111 | id: test 112 | run: | 113 | poetry run pytest tests/dataset_gen -m "not slow and not openai" 114 | timeout-minutes: 30 115 | 116 | run-evaluation-test: 117 | needs: [check-black, lint-ruff] 118 | runs-on: [self-hosted, x64, gpu, linux] 119 | strategy: 120 | fail-fast: false 121 | steps: 122 | - uses: actions/checkout@v3 123 | - name: Set up Python 3.9 124 | uses: actions/setup-python@v4 125 | with: 126 | python-version: 3.9 127 | - name: Prepare environment 128 | run: | 129 | python -m pip 
install --upgrade pip 130 | python -m pip install -U poetry 131 | poetry install 132 | poetry run pip install torch 133 | poetry run huggingface-cli login --token $HF_AUTH_TOKEN 134 | 135 | 136 | - name: Test 137 | id: test 138 | run: | 139 | poetry run pytest tests/eval 140 | timeout-minutes: 10 141 | env: 142 | HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }} 143 | 144 | # just for blocking the merge until all parallel core-test are successful 145 | success-all-test: 146 | needs: [check-mypy, run-training-test, run-dataset-gen-test, check-black, lint-ruff] 147 | if: always() 148 | runs-on: ubuntu-20.04 149 | steps: 150 | - uses: technote-space/workflow-conclusion-action@v2 151 | - name: Check Failure 152 | if: env.WORKFLOW_CONCLUSION == 'failure' 153 | run: exit 1 154 | - name: Success 155 | if: ${{ success() }} 156 | run: echo "All Done" 157 | -------------------------------------------------------------------------------- /textbook/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from typing import Optional, Union, List 4 | 5 | import torch 6 | from transformers import ( 7 | PreTrainedTokenizer, 8 | PreTrainedModel, 9 | StoppingCriteria, 10 | StoppingCriteriaList, 11 | ) 12 | from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL 13 | from human_eval.evaluation import evaluate_functional_correctness 14 | 15 | if torch.cuda.is_available(): 16 | device = "cuda" 17 | else: 18 | device = "cpu" 19 | 20 | STOP_WORDS = ["\nclass", "\ndef", "\n@", "\nprint", "\nif", "\n#"] 21 | 22 | 23 | class EndOfFunctionCriteria(StoppingCriteria): 24 | """Custom `StoppingCriteria` which checks if all generated functions in the batch are completed.""" 25 | 26 | def __init__(self, tokenizer, start_length=0): 27 | self.start_length = start_length 28 | self.tokenizer = tokenizer 29 | 30 | def __call__(self, input_ids, scores, **kwargs): 31 | """Returns true if all generated sequences contain any of the end-of-function strings.""" 32 | decoded_generations = self.tokenizer.batch_decode( 33 | input_ids[:, self.start_length :] 34 | ) 35 | done = [] 36 | for decoded_generation in decoded_generations: 37 | done.append( 38 | any([stop_string in decoded_generation for stop_string in STOP_WORDS]) 39 | ) 40 | return all(done) 41 | 42 | 43 | def _stop_at_stop_token(decoded_string, stop_tokens): 44 | """ 45 | Produces the prefix of decoded_string that ends at the first occurrence of 46 | a stop_token. 47 | WARNING: the decoded_string *must not* include the prompt, which may have stop tokens 48 | itself. 
49 | """ 50 | min_stop_index = len(decoded_string) 51 | for stop_token in stop_tokens: 52 | stop_index = decoded_string.find(stop_token) 53 | if stop_index != -1 and stop_index < min_stop_index: 54 | min_stop_index = stop_index 55 | return decoded_string[:min_stop_index] 56 | 57 | 58 | def read_jsonl_file(file_path): 59 | data = [] 60 | with open(file_path, "r") as f: 61 | for line in f: 62 | json_data = json.loads(line) 63 | data.append(json_data) 64 | return data 65 | 66 | 67 | def generate_one_completion( 68 | model: PreTrainedModel, 69 | tokenizer: PreTrainedTokenizer, 70 | prompt: str, 71 | max_new_tokens: int = 512, 72 | ) -> List[str]: 73 | inputs = tokenizer(prompt.rstrip(), return_tensors="pt").to("cuda") 74 | stopping_criteria = StoppingCriteriaList( 75 | [EndOfFunctionCriteria(tokenizer, start_length=len(inputs["input_ids"][0]))] 76 | ) 77 | generation_output = model.generate( 78 | **inputs, 79 | max_new_tokens=max_new_tokens, 80 | eos_token_id=tokenizer.eos_token_id, 81 | return_dict_in_generate=True, 82 | stopping_criteria=stopping_criteria, 83 | # do_sample=True, 84 | # temperature=0.2, 85 | # top_k=0, 86 | # top_p=0.95 87 | ) 88 | 89 | s = generation_output.sequences[0] 90 | output = tokenizer.decode(s, skip_special_tokens=True) 91 | generation = output[len(prompt) :] 92 | generation = prompt + _stop_at_stop_token(generation, STOP_WORDS) 93 | return generation 94 | 95 | 96 | def evaluate( 97 | model: Union[torch.nn.Module, PreTrainedModel], 98 | tokenizer: PreTrainedTokenizer, 99 | prompt_template: str = "{prompt}", 100 | eval_file: str = HUMAN_EVAL, 101 | eval_size: Optional[int] = None, 102 | max_new_tokens: int = 512, 103 | ): 104 | model.eval() 105 | problems = read_problems(evalset_file=eval_file) 106 | eval_size = eval_size or len(list(problems.items())) 107 | problems = dict(list(problems.items())[:eval_size]) 108 | 109 | # since k=1, no need for more samples 110 | num_samples_per_task = 1 111 | samples = [ 112 | dict( 113 | task_id=task_id, 114 | completion=generate_one_completion( 115 | model, 116 | tokenizer, 117 | prompt_template.format(prompt=problems[task_id]["prompt"]), 118 | max_new_tokens=max_new_tokens, 119 | ), 120 | ) 121 | for task_id in problems 122 | for _ in range(num_samples_per_task) 123 | ] 124 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: 125 | write_jsonl(temp_file.name, samples) 126 | 127 | accuracy_results = evaluate_functional_correctness( 128 | temp_file.name, k=[1], problem_file=eval_file, problems=problems 129 | ) 130 | sample_results = read_jsonl_file(f"{temp_file.name}_results.jsonl") 131 | 132 | # merge results and problems 133 | results = { 134 | item["task_id"]: {**item, **problems[item["task_id"]]} 135 | for item in sample_results 136 | } 137 | 138 | return accuracy_results, results 139 | -------------------------------------------------------------------------------- /human-eval/README.md: -------------------------------------------------------------------------------- 1 | # HumanEval: Hand-Written Evaluation Set 2 | 3 | This is an evaluation harness for the HumanEval problem solving dataset 4 | described in the paper "[Evaluating Large Language Models Trained on 5 | Code](https://arxiv.org/abs/2107.03374)". 
6 | 7 | ## Installation 8 | 9 | Make sure to use python 3.7 or later: 10 | ``` 11 | $ conda create -n codex python=3.7 12 | $ conda activate codex 13 | ``` 14 | 15 | Check out and install this repository: 16 | ``` 17 | $ git clone https://github.com/openai/human-eval 18 | $ pip install -e human-eval 19 | ``` 20 | 21 | ## Usage 22 | 23 | **This program exists to run untrusted model-generated code. Users are strongly 24 | encouraged not to do so outside of a robust security sandbox. The [execution 25 | call](https://github.com/openai/human-eval/blob/master/human_eval/execution.py#L48-L58) 26 | in `execution.py` is deliberately commented out to ensure users read this 27 | disclaimer before running code in a potentially unsafe manner. See the comment in 28 | `execution.py` for more information and instructions.** 29 | 30 | After following the above instructions to enable execution, generate samples 31 | and save them in the following JSON Lines (jsonl) format, where each sample is 32 | formatted into a single line like so: 33 | ``` 34 | {"task_id": "Corresponding HumanEval task ID", "completion": "Completion only without the prompt"} 35 | ``` 36 | We provide `example_problem.jsonl` and `example_solutions.jsonl` under `data` 37 | to illustrate the format and help with debugging. 38 | 39 | Here is nearly functional example code (you just have to provide 40 | `generate_one_completion` to make it work) that saves generated completions to 41 | `samples.jsonl`. 42 | ``` 43 | from human_eval.data import write_jsonl, read_problems 44 | 45 | problems = read_problems() 46 | 47 | num_samples_per_task = 200 48 | samples = [ 49 | dict(task_id=task_id, completion=generate_one_completion(problems[task_id]["prompt"])) 50 | for task_id in problems 51 | for _ in range(num_samples_per_task) 52 | ] 53 | write_jsonl("samples.jsonl", samples) 54 | ``` 55 | 56 | To evaluate the samples, run 57 | ``` 58 | $ evaluate_functional_correctness samples.jsonl 59 | Reading samples... 60 | 32800it [00:01, 23787.50it/s] 61 | Running test suites... 62 | 100%|...| 32800/32800 [16:11<00:00, 33.76it/s] 63 | Writing results to samples.jsonl_results.jsonl... 64 | 100%|...| 32800/32800 [00:00<00:00, 42876.84it/s] 65 | {'pass@1': ..., 'pass@10': ..., 'pass@100': ...} 66 | ``` 67 | This script provides more fine-grained information in a new file ending in 68 | `_results.jsonl`. Each row now contains whether the completion 69 | `passed` along with the execution `result` which is one of "passed", "timed 70 | out", or "failed". 71 | 72 | As a quick sanity-check, the example samples should yield 0.5 pass@1. 73 | ``` 74 | $ evaluate_functional_correctness data/example_samples.jsonl --problem_file=data/example_problem.jsonl 75 | Reading samples... 76 | 6it [00:00, 3397.11it/s] 77 | Running example suites... 78 | 100%|...| 6/6 [00:03<00:00, 1.96it/s] 79 | Writing results to data/example_samples.jsonl_results.jsonl... 80 | 100%|...| 6/6 [00:00<00:00, 6148.50it/s] 81 | {'pass@1': 0.4999999999999999} 82 | ``` 83 | 84 | Because there is no unbiased way of estimating pass@k when there are fewer 85 | samples than k, the script does not evaluate pass@k for these cases. To 86 | evaluate with other k values, pass `--k=`. For 87 | other options, see 88 | ``` 89 | $ evaluate_functional_correctness --help 90 | ``` 91 | However, we recommend that you use the default values for the rest. 92 | 93 | ## Known Issues 94 | 95 | While evaluation uses very little memory, you might see the following error 96 | message when the system is running out of RAM. 
Since this may cause some 97 | correct programs to fail, we recommend that you free some memory and try again. 98 | ``` 99 | malloc: can't allocate region 100 | ``` 101 | 102 | ## Citation 103 | 104 | Please cite using the following bibtex entry: 105 | 106 | ``` 107 | @article{chen2021codex, 108 | title={Evaluating Large Language Models Trained on Code}, 109 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba}, 110 | year={2021}, 111 | eprint={2107.03374}, 112 | archivePrefix={arXiv}, 113 | primaryClass={cs.LG} 114 | } 115 | ``` 116 | -------------------------------------------------------------------------------- /textbook/train.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from importlib import import_module 3 | from typing import Optional, Dict, Type, Annotated 4 | 5 | 6 | import torch 7 | 8 | from textbook.dataset import CustomDataset 9 | from textbook.evaluate import evaluate 10 | from textbook.model import BaseModule 11 | 12 | import transformers 13 | import tempfile 14 | 15 | from typer import Typer 16 | import typer 17 | import wandb 18 | 19 | app = Typer(pretty_exceptions_enable=False) 20 | 21 | 22 | config_to_log: Dict = {} 23 | 24 | 25 | def log_args(func): 26 | @functools.wraps(func) 27 | def wrapper(*args, **kwargs): 28 | global config_to_log 29 | config_to_log = kwargs 30 | return func(*args, **kwargs) 31 | 32 | return wrapper 33 | 34 | 35 | @app.command() 36 | @log_args 37 | def train( 38 | *, 39 | module: str = "StarCoder", 40 | dataset: str = "ExerciseDatast", 41 | epochs: int = 1, 42 | micro_batch_size: int = 1, 43 | batch_size: int = 1, 44 | learning_rate: float = 3e-5, 45 | output_dir: Optional[str] = None, 46 | wandb_run_name: str = "", 47 | use_wandb: bool = False, 48 | wandb_project: str = "textbook", 49 | wandb_log_model: Optional[ 50 | bool 51 | ] = None, # will be true by default if use_wandb is true 52 | push_model_to_hf: bool = False, # if set, will push the model to hf 53 | local_rank: Annotated[int, typer.Option("--local_rank")] = 0, 54 | deepspeed: Optional[str] = None, 55 | debug: bool = False, 56 | eval_size: Optional[int] = None, 57 | eval_max_new_tokens: int = 512, 58 | n_samples: Optional[int] = None, 59 | dataset_name: Optional[str] = "jinaai/code_exercises_40k", 60 | ): 61 | module_cls: Type[BaseModule] = getattr(import_module("textbook.model"), module) 62 | module_instance = module_cls(debug=debug) 63 | model = torch.compile(module_instance.model) 64 | model = module_instance.model 65 | 
tokenizer = module_instance.tokenizer 66 | 67 | dataset_cls: Type[CustomDataset] = getattr( 68 | import_module("textbook.dataset"), dataset 69 | ) 70 | if dataset_name: 71 | dataset_instance = dataset_cls( 72 | tokenizer=tokenizer, debug=debug, dataset_name=dataset_name 73 | ) 74 | else: 75 | dataset_instance = dataset_cls(tokenizer=tokenizer, debug=debug) 76 | 77 | if n_samples: 78 | dataset_instance.train_dataset = dataset_instance.train_dataset.select( 79 | range(n_samples) 80 | ) 81 | 82 | if debug: 83 | wandb_run_name = "debug" 84 | 85 | if batch_size % micro_batch_size: 86 | raise ValueError( 87 | f"batch_size {batch_size} and micro_batch_size {micro_batch_size} are not compatible" 88 | ) 89 | 90 | if wandb_log_model is None: 91 | wandb_log_model = use_wandb 92 | 93 | if output_dir is None: 94 | output_dir = tempfile.mkdtemp() 95 | print(f"temp folder : {output_dir}") 96 | 97 | use_wandb = local_rank == 0 and use_wandb 98 | if use_wandb: 99 | run = wandb.init(project=wandb_project, **dict(config=config_to_log)) # type: ignore 100 | else: 101 | run = None # type: ignore 102 | 103 | trainer = transformers.Trainer( 104 | model=model, 105 | train_dataset=dataset_instance.train_dataset, 106 | eval_dataset=dataset_instance.test_dataset, 107 | args=transformers.TrainingArguments( 108 | per_device_train_batch_size=micro_batch_size, 109 | gradient_accumulation_steps=batch_size // micro_batch_size, 110 | optim="adamw_torch", 111 | # gradient_checkpointing=True, 112 | warmup_steps=100, 113 | num_train_epochs=epochs, 114 | learning_rate=learning_rate, 115 | fp16=True, 116 | logging_steps=10 if debug else 1, 117 | save_strategy="epoch" if debug else "no", 118 | eval_steps=20 if debug else 1, 119 | output_dir=output_dir, 120 | save_total_limit=1, 121 | load_best_model_at_end=False, 122 | report_to="wandb" if use_wandb else "none", 123 | run_name=wandb_run_name if use_wandb else None, 124 | remove_unused_columns=False, 125 | ), 126 | data_collator=dataset_instance.data_collator, 127 | ) 128 | 129 | trainer.train() 130 | 131 | if push_model_to_hf: 132 | # Save the pretrained model locally 133 | model.save_pretrained(output_dir) # type: ignore 134 | tokenizer.save_pretrained(output_dir) # type: ignore 135 | 136 | # Push to the hub 137 | model.push_to_hub("jinaai/starcoder-1b-textbook") # type: ignore 138 | tokenizer.push_to_hub("jinaai/starcoder-1b-textbook") # type: ignore 139 | 140 | accuracy_results, sample_results = evaluate( 141 | model, tokenizer, eval_size=eval_size, max_new_tokens=eval_max_new_tokens 142 | ) 143 | 144 | if use_wandb and run: 145 | # log accuracy@k results 146 | run.log(accuracy_results) 147 | 148 | # log sample values 149 | results = list(sample_results.values()) 150 | columns = list(results[0].keys()) 151 | results_data = [[result[key] for key in columns] for result in results] 152 | eval_table = wandb.Table(columns=columns, data=results_data) 153 | run.log({"Evaluation": eval_table}) 154 | 155 | if wandb_log_model: 156 | # upload model weights 157 | artifact = wandb.Artifact(name="model_weight", type="model") 158 | artifact.add_dir(output_dir) 159 | run.log_artifact(artifact) # type: ignore 160 | 161 | 162 | if __name__ == "__main__": 163 | app() 164 | -------------------------------------------------------------------------------- /textbook/dataset_gen/create_prompts.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import itertools 3 | from typing import List, Optional 4 | from pydantic import 
BaseModel 5 | import random 6 | import pandas as pd 7 | import numpy as np 8 | import openai 9 | import os 10 | import json 11 | from rich.progress import track 12 | 13 | 14 | class Topic(BaseModel): 15 | topic: str 16 | mixing: int 17 | parent: Optional[Topic] = None 18 | 19 | 20 | class Exercise(BaseModel): 21 | exercise: str 22 | topic: Topic 23 | 24 | 25 | class Query(BaseModel): 26 | query: str 27 | topic_1: Topic 28 | topic_2: Topic 29 | 30 | 31 | def create_subtopic_query(topic: str, n: int) -> str: 32 | return f"""For a Python textbook give me {n} subtopics of {topic}, formatted as a Python list. 33 | Just provide the titles and give no explanation. 34 | Format the result as Python list. 35 | """ 36 | 37 | 38 | def create_prompt_query(topic_1: Topic, topic_2: Topic, profession: str) -> str: 39 | query = f''' 40 | Create a code completion exercise on the intersection of “{topic_1.topic}” and “{topic_2.topic}”. 41 | Write it for a {profession}. 42 | 43 | The exercise must be of the style: 44 | 45 | ``` 46 | def name(args): 47 | 48 | """Docstring explaining the exercise""" 49 | 50 | python code to solve the exercise 51 | ``` 52 | 53 | NO CLASSES 54 | 55 | MAKE IT VERY DIFFICULT 56 | ''' 57 | query = "\n".join([m.lstrip() for m in query.strip().split("\n")]) 58 | return query 59 | 60 | 61 | def create_subtopics(topic: Topic, n: int, retries: int = 10) -> List[Topic]: 62 | success = False 63 | query = create_subtopic_query(topic.topic, n) 64 | print(query) 65 | for i in range(retries): 66 | try: 67 | completion = openai.ChatCompletion.create( 68 | model="gpt-4", 69 | messages=[ 70 | {"role": "system", "content": "You are a helpful assistant."}, 71 | {"role": "user", "content": query}, 72 | ], 73 | temperature=1.5, 74 | ) 75 | 76 | result = [ 77 | Topic(topic=i, mixing=topic.mixing, parent=topic) 78 | for i in eval(completion.choices[0].message["content"]) 79 | ] 80 | success = True 81 | except Exception: 82 | print(f"Generation failed for prompt, retrying {i + 1}/{retries}") 83 | else: 84 | break 85 | 86 | if success: 87 | return result 88 | else: 89 | return [] 90 | 91 | 92 | def create_prompts( 93 | topic: Topic, 94 | combination_options: List[Topic], 95 | professions: List[str], 96 | n: int, 97 | ) -> List[Query]: 98 | random.shuffle(combination_options) 99 | prompts: List[Query] = [] 100 | 101 | for loc_topic in combination_options: 102 | if len(prompts) == n: 103 | break 104 | 105 | if loc_topic.mixing and loc_topic.parent != topic.parent: 106 | profession = professions[np.random.randint(0, len(professions))] 107 | query = create_prompt_query(topic, loc_topic, profession) 108 | prompts.append(Query(query=query, topic_1=topic, topic_2=loc_topic)) 109 | 110 | return prompts 111 | 112 | 113 | if __name__ == "__main__": 114 | # Load list of topics 115 | API_KEY = os.environ["API_PASSWORD"] 116 | TOPICS_PATH = "tree/topics.csv" 117 | openai.api_key = API_KEY 118 | 119 | topics = pd.read_csv(TOPICS_PATH) 120 | topics = topics.fillna(0) 121 | topics = topics.iloc[:, :3] 122 | topics.Topic = topics.Topic.str.split(".").str[1] 123 | topics.Use = topics.Use.astype(int) 124 | topics.Mixing = topics.Mixing.astype(int) 125 | topics_df = topics[topics.Use == 1].reset_index(drop=True) 126 | topics_df = topics_df.drop("Use", axis=1) 127 | topics_list = list(zip(topics_df.Topic, topics_df.Mixing)) 128 | 129 | # Debug mode to create few prompts 130 | DEBUG = False 131 | if DEBUG: 132 | n_base_topics = 5 133 | n_combinations = 2 134 | else: 135 | n_base_topics = len(topics_df) 136 | n_combinations = 
200 137 | 138 | root = Topic(topic="Python", mixing=1) 139 | base_topics = [ 140 | Topic(topic=top, mixing=mix, parent=root) 141 | for (top, mix) in zip(topics_df.Topic, topics_df.Mixing) 142 | ] 143 | subtopics = [create_subtopics(t, 10) for t in base_topics[:n_base_topics]] 144 | subtopics_list = list(itertools.chain(*subtopics)) 145 | subtopics_json = json.dumps([x.dict() for x in subtopics_list]) 146 | 147 | with open("tree/subtopics.json", "w") as outfile: 148 | outfile.write(subtopics_json) 149 | 150 | subsubtopics: List[List[Topic]] = [ 151 | create_subtopics(t, 5) 152 | for t in track(itertools.chain(*subtopics), description="Processing...") 153 | ] 154 | subsubtopics_list = list(itertools.chain(*subsubtopics)) 155 | subsubtopics_json: str = json.dumps([x.dict() for x in subsubtopics_list]) 156 | 157 | with open("tree/subsubtopics.json", "w") as outfile: 158 | outfile.write(subsubtopics_json) 159 | 160 | with open("tree/professions.json", "r") as openfile: 161 | # Reading from json file 162 | professions = list(json.load(openfile)) 163 | 164 | prompts: List[List[Query]] = [ 165 | create_prompts( 166 | i, 167 | combination_options=subsubtopics_list, 168 | professions=professions, 169 | n=n_combinations, 170 | ) 171 | for i in track(itertools.chain(*subsubtopics), description="Processing...") 172 | ] 173 | 174 | prompts_list = list(itertools.chain(*prompts)) 175 | prompts_json = json.dumps([p.dict() for p in prompts_list]) 176 | with open("tree/prompts.json", "w") as outfile: 177 | outfile.write(prompts_json) 178 | -------------------------------------------------------------------------------- /tests/dataset_gen/test_units.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from textbook.dataset_gen.dataset_gen import ( 5 | OpenAIGenerator, 6 | load_prompts, 7 | mass_generation, 8 | generation, 9 | MonkeyGenerator, 10 | write_results_to_jsonl, 11 | Result, 12 | generator_to_exercises, 13 | split_exercises, 14 | check_exercise, 15 | ) 16 | import numpy as np 17 | import pytest 18 | 19 | 20 | def mock_openai(mocker): 21 | mocker.patch( 22 | "textbook.dataset_gen.dataset_gen.OpenAIGenerator.generate", 23 | return_value=Result( 24 | prompt="Cheesecake with strawberries", 25 | output='def gruyere(): """No way jose""" return 0' * 2, 26 | ), 27 | ) 28 | 29 | 30 | def update_progress(): 31 | ... 
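# No-op progress callback used by these tests in place of the rich progress updater that mass_generation builds internally.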
32 | 33 | 34 | @pytest.mark.openai 35 | def test_generation(): 36 | generator = OpenAIGenerator() 37 | gen = generator.generate("Hello world") 38 | assert isinstance(gen, Result) 39 | 40 | 41 | def test_generation_mock(mocker): 42 | mock_openai(mocker) 43 | generator = OpenAIGenerator() 44 | gen = generator.generate("Hello world") 45 | prompts = "Hello World" 46 | generation(prompts, generator, update_progress, 10) 47 | assert isinstance(gen, Result) 48 | assert gen.prompt == "Cheesecake with strawberries" 49 | assert gen.output == 'def gruyere(): """No way jose""" return 0' * 2 50 | 51 | 52 | def test_mass_generation(mocker, tmp_path): 53 | mock_openai(mocker) 54 | 55 | def get_generator(): 56 | return OpenAIGenerator() 57 | 58 | prompts = ["Hello world", "Goodbye world"] 59 | mass_generation(prompts, get_generator, save_dir=str(tmp_path)) 60 | 61 | ls = os.listdir(tmp_path) 62 | assert len(ls) > 0 63 | 64 | file_path = os.listdir(os.path.join(tmp_path, ls[0])) 65 | assert len(file_path) > 0 66 | 67 | 68 | def test_generation_monkey_generator(): 69 | n_functions = np.random.randint(0, 100) 70 | generator = MonkeyGenerator(speed=-1, n_functions=n_functions) 71 | prompts = "Hello world" 72 | result = generation(prompts, generator, update_progress, 10) 73 | assert len(result) == n_functions 74 | 75 | 76 | def test_mass_generation_monkey_generator(mocker, tmp_path): 77 | n_functions = np.random.randint(1, 100) 78 | 79 | def get_generator(): 80 | return MonkeyGenerator(speed=-1, n_functions=n_functions) 81 | 82 | prompts = ["Hello world", "Goodbye world"] * 20 83 | mass_generation(prompts, get_generator, save_dir=str(tmp_path)) 84 | ls = os.listdir(tmp_path) 85 | assert len(ls) > 0 86 | 87 | file_path = os.listdir(os.path.join(tmp_path, ls[0])) 88 | assert len(file_path) > 0 89 | 90 | 91 | def test_load_prompts(): 92 | prompts = load_prompts("tests/data/prompts_debug.jsonl", "prompt") 93 | assert len(prompts) == 5 94 | assert isinstance(prompts[0], str) 95 | 96 | 97 | def test_save_results(tmp_path): 98 | results = [ 99 | Result( 100 | prompt="Hello world", 101 | output='def gruyere(): """No way jose""" return 0', 102 | ), 103 | Result( 104 | prompt="Goodbye world", 105 | output='def emmentaler(): """No way jose""" return 1', 106 | ), 107 | ] 108 | file = f"{tmp_path}/results.jsonl" 109 | write_results_to_jsonl(file, results) 110 | 111 | with open(file, "r") as f: 112 | lines = f.readlines() 113 | 114 | prompts = [Result.parse_obj(json.loads(line)) for line in lines] 115 | 116 | assert len(prompts) == 2 117 | assert prompts[0].prompt == "Hello world" 118 | assert prompts[0].output == 'def gruyere(): """No way jose""" return 0' 119 | assert prompts[1].prompt == "Goodbye world" 120 | assert prompts[1].output == 'def emmentaler(): """No way jose""" return 1' 121 | 122 | 123 | def test_split_exercises(): 124 | input = ''' 125 | ```python 126 | def reverse_name(name: str) -> str: 127 | """Reverses the letters of a name and returns it. 128 | 129 | >>> reverse_name("LeBron") 130 | 'norBeL' 131 | >>> reverse_name("Curry") 132 | 'yrruC' 133 | """ 134 | return name[::-1] 135 | 136 | def reverse_words(sentence: str) -> str: 137 | """Reverses the order of words in a sentence and returns it. 138 | 139 | >>> reverse_words("I love playing basketball") 140 | 'basketball playing love I' 141 | >>> reverse_words("Hello World!") 142 | 'World! 
Hello' 143 | """ 144 | words = sentence.split() 145 | return " ".join(words[::-1]) 146 | 147 | ''' 148 | assert len(split_exercises(input)) == 2 149 | 150 | 151 | def test_check_exercise(): 152 | good_exercise = ''' 153 | def cheesecake(): 154 | """Cheesecake is delicious."""" 155 | return 0 156 | ''' 157 | another_good_exercise = ''' 158 | def marmelade(): 159 | """Marmelade is delicious."""" 160 | print("Hello world") 161 | ''' 162 | bad_exercise = ''' 163 | def blubberfish(): 164 | """Blubberfish is delicious."""" 165 | ''' 166 | assert check_exercise(good_exercise) 167 | assert check_exercise(another_good_exercise) 168 | assert not check_exercise(bad_exercise) 169 | 170 | 171 | def test_generator_to_functions(): 172 | input = ''' 173 | ```python 174 | def reverse_name(name: str) -> str: 175 | """Reverses the letters of a name and returns it. 176 | 177 | >>> reverse_name("LeBron") 178 | 'norBeL' 179 | >>> reverse_name("Curry") 180 | 'yrruC' 181 | """ 182 | return name[::-1] 183 | 184 | def reverse_words(sentence: str) -> str: 185 | """Reverses the order of words in a sentence and returns it. 186 | 187 | >>> reverse_words("I love playing basketball") 188 | 'basketball playing love I' 189 | >>> reverse_words("Hello World!") 190 | 'World! Hello' 191 | """ 192 | words = sentence.split() 193 | return " ".join(words[::-1]) 194 | 195 | def reverse_alphabetical_order(names: list) -> list: 196 | """Reverses the order of names in a list and returns it. 197 | 198 | >>> reverse_alphabetical_order(['LeBron', 'Curry', 'Kobe']) 199 | ['Kobe', 'Curry', 'LeBron'] 200 | >>> reverse_alphabetical_order(['Jordan', 'Magic', 'Bird']) 201 | ['Bird', 'Magic', 'Jordan'] 202 | """ 203 | return names[::-1] 204 | 205 | def reverse_phone_number(number: str) -> str: 206 | """Reverses the order of digits in a phone number and returns it. 207 | 208 | >>> reverse_phone_number("123-456-7890") 209 | '0987-654-321' 210 | >>> reverse_phone_number("555-123-4567") 211 | '7654-321-555' 212 | """ 213 | area_code, first_half, second_half = number.split("-") 214 | return second_half + "-" + first_half + "-" + area_code 215 | 216 | def intersection_names_to_frozen_sets(names1: list, names2: list) -> set: 217 | """Finds the intersection of two lists of names and returns it as a frozen set. 218 | 219 | >>> intersection_names_to_frozen_sets(['LeBron', 'Curry', 'Kobe'], ['Kobe', 'Jordan']) 220 | {'Kobe'} 221 | >>> intersection_names_to_frozen_sets(['Bird', 'Magic', 'Jordan'], ['LeBron', 'Kobe', 'Bird']) 222 | {'Bird'} 223 | """ 224 | set1 = set(names1) 225 | set2 = set(names2) 226 | return frozenset(set1.intersection(set2)) 227 | ``` 228 | ''' 229 | assert len(generator_to_exercises(input)) == 5 230 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Textbook 2 | The goal of this project is to distill ChatGPT's Python coding ability into a smaller model with only 1 billion parameters. Our focus is on training the smaller model to solve coding tasks with natural language descriptions, and we use the [HumanEval](https://github.com/openai/human-eval) benchmark to evaluate our model. While we are aware that that benchmark is far from ideal, we believe that it is a good starting point to demonstrate the success of our approach to model distillation. We have drawn some inspiration from efforts to the results reported in the paper _Textbooks Are All You Need_ [(Gunasekar et al. 
2023)](https://doi.org/10.48550/arXiv.2306.11644). 3 | 4 | This repository consists of two parts: 5 | 6 | * Dataset Generation: The code that we used to generate a \~120 million token dataset of Python programming exercises from ChatGPT 3.5. 7 | * Model Fine-tuning: The code that we used to fine-tune the [Starcoder 1b model](https://github.com/bigcode-project/starcoder) using the generated dataset. 8 | 9 | The generated exercises dataset is composed of a diverse set of \~120k Python code exercises (~120m total tokens) generated by ChatGPT 3.5. It follows the format of the [Human Eval benchmark](https://github.com/openai/human-eval): Each training sample is split into a Python function signature with a descriptive docstring, and a solution to the exercise. 10 | 11 | 12 | ## Usage 13 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1T4IfGfDJ8uxgU8XBPpMZivw_JThzdQim?usp=sharing) 14 | 15 | You can download and use the model like so: 16 | ```python 17 | from transformers import AutoModelForCausalLM, AutoTokenizer 18 | 19 | model = AutoModelForCausalLM.from_pretrained( 20 | "jinaai/starcoder-1b-textbook", device_map='auto' 21 | ) 22 | 23 | tokenizer = AutoTokenizer.from_pretrained("jinaai/starcoder-1b-textbook") 24 | 25 | prompt = ''' 26 | def unique(l: list): 27 | """Return sorted unique elements in a list 28 | >>> unique([5, 3, 5, 2, 3, 3, 9, 0, 123]) 29 | [0, 2, 3, 5, 9, 123] 30 | """ 31 | ''' 32 | 33 | inputs = tokenizer(prompt.rstrip(), return_tensors="pt").to("cuda") 34 | 35 | generation_output = model.generate( 36 | **inputs, 37 | max_new_tokens=128, 38 | eos_token_id=tokenizer.eos_token_id, 39 | return_dict_in_generate=True, 40 | ) 41 | 42 | s = generation_output.sequences[0] 43 | output = tokenizer.decode(s, skip_special_tokens=True) 44 | 45 | print(output) 46 | ``` 47 | 48 | ```text 49 | def unique(l: list): 50 | """Return sorted unique elements in a list 51 | >>> unique([5, 3, 5, 2, 3, 3, 9, 0, 123]) 52 | [0, 2, 3, 5, 9, 123] 53 | """ 54 | return sorted(set(l)) 55 | ``` 56 | 57 | ## Synthetic exercise creation 58 | 59 | Model distillation is the process of transferring some of the skilled performance of large models on specific classes of tasks to significantly smaller models. The purpose is to get performance comparable to the larger model, but at a fraction of the cost and at a vastly quicker speed. The general outline of this strategy is described (without technical implementation details) in [Textbooks Are All You Need](https://doi.org/10.48550/arXiv.2306.11644). 60 | 61 | Key to the distillation process is the creation of synthetic data, generated by the larger AI model, to train the smaller model. We have applied this approach to Python programming tasks and are publishing a summary of our methods here along with the synthetic dataset. 62 | 63 | For fuller details and implementation code, see the [related GitHub repository](https://github.com/jina-ai/textbook). 64 | 65 | ### Diversity 66 | 67 | The main problem with model-generated synthetic data is its diversity. If we had constructed this dataset by giving ChatGPT 3.5 the same prompt several hundred thousand times, we would get many very similar, if not functionally identical, results. This would reduce the usefulness of the dataset for training. 
In principle, one might solve the problem by filtering the results for near duplicates, but this is a non-trivial problem, and even if it could be solved, it would be a wasteful and potentially expensive use of the larger model. 68 | 69 | And even then, we could not be sure the examples adequately covered the topic. To solve this problem, we introduced a novel scheme for systematically prompting large language models to produce diverse examples. 70 | 71 | ### Using a topic tree to build diverse prompts 72 | 73 | We constructed a hierarchical model of subjects in Python programming, i.e. a topic tree. First, we manually identified 42 general topic areas in Python knowledge, for example, _data structures_ and _sorting algorithms_. We asked an LLM to propose 10 subtopics for each, and then for each of those 420 fine-grained topics, we asked the LLM to generate 5 even more fine-grained sub-subtopics. This resulted in roughly 2000 very fine-grained topics. 74 | 75 | We generated prompts by randomly selecting two of those roughly two thousand topics and combining them: 76 | 77 | ``` 78 | Create a code completion exercise on the intersection of {topic 1} and {topic 2}. 79 | ``` 80 | 81 | To increase randomness and diversity in the results, we also constructed a list of 40 professions, like _economist_, _engineer_, and _social worker_, and added them to the prompt: 82 | 83 | ``` 84 | Create a code completion exercise on the intersection of {topic 1} and {topic 2}. 85 | Write it for a {profession}. 86 | ``` 87 | 88 | In principle, there are approximately two million possible pairs of topics, and with 40 possible professions, this yields 80 million unique prompts. If the response to each prompt averages 100 tokens, this means our method can generate an 8 billion token synthetic dataset while maintaining a high degree of diversity. The dataset used here is only a small sample of the possible total. 89 | 90 | 91 | ## Install dependencies 92 | 93 | 94 | ```cmd 95 | poetry install 96 | poetry shell 97 | pip install torch 98 | ``` 99 | 100 | 101 | ## Generating the Dataset 102 | 103 | 104 | Follow these steps to reproduce the dataset generation. 105 | 106 | 107 | First, export your OpenAI key: 108 | ```shell 109 | export OPENAI_API_KEY=sk-XXX 110 | ``` 111 | then start the parallel calls to OpenAI: 112 | ```shell 113 | cd textbook/dataset_gen 114 | python dataset_gen_cli.py generate ./tree/professions.json ./tree/subsubtopics.json ./exercises --n-prompts 2_000_000 --pool-size 40 115 | ``` 116 | 117 | This should take around 6 hours. The process might be killed before the end, but the data will still be saved progressively.
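Each prompt's result is written to its own small JSONL file under `./exercises`, sharded into subdirectories by an MD5 hash of the prompt, and prompts that already have an output file are skipped on subsequent runs, so re-running the same command after an interruption simply resumes the generation (see `textbook/dataset_gen/dataset_gen.py`). Below is a minimal sketch for checking progress before the filtering step; it assumes it is run from `textbook/dataset_gen` so that `exercises/` is the save directory used above, and that each saved line follows the `Exercise` model with `problem` and `solution` fields — the script itself is illustrative and not part of the repository.

```python
import json
from pathlib import Path

# Exercises are saved as one JSONL file per prompt, sharded by MD5 prefix:
# exercises/<first 4 hex chars>/<remaining 28 hex chars>.jsonl
save_dir = Path("exercises")

n_prompts_done = 0
n_exercises = 0
for path in save_dir.glob("*/*.jsonl"):
    n_prompts_done += 1
    with path.open() as f:
        for line in f:
            record = json.loads(line)  # {"problem": "...", "solution": "..."}
            if record.get("solution"):  # failed generations are stored with an empty solution
                n_exercises += 1

print(f"{n_prompts_done} prompts completed, {n_exercises} exercises collected so far")
```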
118 | 119 | 120 | Once the files are generated, you can postprocess them and save everything into a single JSONL file: 121 | 122 | ```shell 123 | python dataset_gen_cli.py filter ./exercises dataset.jsonl 124 | ``` 125 | 126 | Then push it to the Hugging Face dataset hub: 127 | 128 | ```shell 129 | python dataset_gen_cli.py push "jinaai/code_exercises_40k" dataset.jsonl 130 | ``` 131 | 132 | ## Training 133 | 134 | 135 | Single-GPU run: 136 | 137 | ```cmd 138 | python textbook/train.py --epochs 2 --micro-batch-size 4 --batch-size 128 --learning-rate 1e-4 139 | ``` 140 | 141 | A100 run: 142 | 143 | 144 | ```cmd 145 | python textbook/train.py --module StarCoder --dataset ExerciseDatast --epochs 1 --micro-batch-size 8 --batch-size 128 --wandb-project textbook_debug --use-wandb --no-wandb-log-model 146 | ``` 147 | 148 | Multi-GPU run with DeepSpeed: 149 | ```cmd 150 | deepspeed --num_gpus=2 textbook/train.py --deepspeed ds_config.json --epochs 2 --micro-batch-size 4 --batch-size 128 --learning-rate 1e-4 151 | ``` 152 | 153 | 154 | Note: 155 | 156 | To use the StarCoder base model, you first need to log in to Hugging Face and accept the terms of service of the base model (https://huggingface.co/bigcode/starcoderbase-1b): 157 | ```cmd 158 | huggingface-cli login 159 | ``` 160 | 161 | 162 | ## Setup on RunPod 163 | ```shell 164 | bash <(curl -Ls https://raw.githubusercontent.com/jina-ai/textbook/main/setup_vm.sh) 165 | ``` 166 | -------------------------------------------------------------------------------- /human-eval/human_eval/execution.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict 2 | import contextlib 3 | import faulthandler 4 | import io 5 | import os 6 | import multiprocessing 7 | import platform 8 | import signal 9 | import tempfile 10 | 11 | 12 | def check_correctness( 13 | problem: Dict, completion: str, timeout: float, completion_id: Optional[int] = None 14 | ) -> Dict: 15 | """ 16 | Evaluates the functional correctness of a completion by running the test 17 | suite provided in the problem. 18 | 19 | :param completion_id: an optional completion ID so we can match 20 | the results later even if execution finishes asynchronously. 21 | """ 22 | 23 | def unsafe_execute(): 24 | with create_tempdir(): 25 | # These system calls are needed when cleaning up tempdir. 26 | import os 27 | import shutil 28 | 29 | rmtree = shutil.rmtree 30 | rmdir = os.rmdir 31 | chdir = os.chdir 32 | 33 | # Disable functionalities that can make destructive changes to the test. 34 | reliability_guard() 35 | 36 | # Construct the check program and run it. 37 | check_program = ( 38 | problem["prompt"] 39 | + completion 40 | + "\n" 41 | + problem["test"] 42 | + "\n" 43 | + f"check({problem['entry_point']})" 44 | ) 45 | 46 | try: 47 | exec_globals = {} 48 | with swallow_io(): 49 | with time_limit(timeout): 50 | # WARNING 51 | # This program exists to execute untrusted model-generated code. Although 52 | # it is highly unlikely that model-generated code will do something overtly 53 | # malicious in response to this test suite, model-generated code may act 54 | # destructively due to a lack of model capability or alignment. 55 | # Users are strongly encouraged to sandbox this evaluation suite so that it 56 | # does not perform destructive actions on their host or network. For more 57 | # information on how OpenAI sandboxes its code, see the accompanying paper.
58 | # Once you have read this disclaimer and taken appropriate precautions, 59 | # uncomment the following line and proceed at your own risk: 60 | exec(check_program, exec_globals) 61 | result.append("passed") 62 | except TimeoutException: 63 | result.append("timed out") 64 | except BaseException as e: 65 | result.append(f"failed: {e}") 66 | 67 | # Needed for cleaning up. 68 | shutil.rmtree = rmtree 69 | os.rmdir = rmdir 70 | os.chdir = chdir 71 | 72 | manager = multiprocessing.Manager() 73 | result = manager.list() 74 | 75 | p = multiprocessing.Process(target=unsafe_execute) 76 | p.start() 77 | p.join(timeout=timeout + 1) 78 | if p.is_alive(): 79 | p.kill() 80 | 81 | if not result: 82 | result.append("timed out") 83 | 84 | return dict( 85 | task_id=problem["task_id"], 86 | passed=result[0] == "passed", 87 | result=result[0], 88 | completion_id=completion_id, 89 | ) 90 | 91 | 92 | @contextlib.contextmanager 93 | def time_limit(seconds: float): 94 | def signal_handler(signum, frame): 95 | raise TimeoutException("Timed out!") 96 | 97 | signal.setitimer(signal.ITIMER_REAL, seconds) 98 | signal.signal(signal.SIGALRM, signal_handler) 99 | try: 100 | yield 101 | finally: 102 | signal.setitimer(signal.ITIMER_REAL, 0) 103 | 104 | 105 | @contextlib.contextmanager 106 | def swallow_io(): 107 | stream = WriteOnlyStringIO() 108 | with contextlib.redirect_stdout(stream): 109 | with contextlib.redirect_stderr(stream): 110 | with redirect_stdin(stream): 111 | yield 112 | 113 | 114 | @contextlib.contextmanager 115 | def create_tempdir(): 116 | with tempfile.TemporaryDirectory() as dirname: 117 | with chdir(dirname): 118 | yield dirname 119 | 120 | 121 | class TimeoutException(Exception): 122 | pass 123 | 124 | 125 | class WriteOnlyStringIO(io.StringIO): 126 | """StringIO that throws an exception when it's read from""" 127 | 128 | def read(self, *args, **kwargs): 129 | raise IOError 130 | 131 | def readline(self, *args, **kwargs): 132 | raise IOError 133 | 134 | def readlines(self, *args, **kwargs): 135 | raise IOError 136 | 137 | def readable(self, *args, **kwargs): 138 | """Returns True if the IO object can be read.""" 139 | return False 140 | 141 | 142 | class redirect_stdin(contextlib._RedirectStream): # type: ignore 143 | _stream = "stdin" 144 | 145 | 146 | @contextlib.contextmanager 147 | def chdir(root): 148 | if root == ".": 149 | yield 150 | return 151 | cwd = os.getcwd() 152 | os.chdir(root) 153 | try: 154 | yield 155 | except BaseException as exc: 156 | raise exc 157 | finally: 158 | os.chdir(cwd) 159 | 160 | 161 | def reliability_guard(maximum_memory_bytes: Optional[int] = None): 162 | """ 163 | This disables various destructive functions and prevents the generated code 164 | from interfering with the test (e.g. fork bomb, killing other processes, 165 | removing filesystem files, etc.) 166 | 167 | WARNING 168 | This function is NOT a security sandbox. Untrusted code, including, model- 169 | generated code, should not be blindly executed outside of one. See the 170 | Codex paper for more information about OpenAI's code sandbox, and proceed 171 | with caution. 
172 | """ 173 | 174 | if maximum_memory_bytes is not None: 175 | import resource 176 | 177 | resource.setrlimit( 178 | resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes) 179 | ) 180 | resource.setrlimit( 181 | resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes) 182 | ) 183 | if not platform.uname().system == "Darwin": 184 | resource.setrlimit( 185 | resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes) 186 | ) 187 | 188 | faulthandler.disable() 189 | 190 | import builtins 191 | 192 | builtins.exit = None 193 | builtins.quit = None 194 | 195 | import os 196 | 197 | os.environ["OMP_NUM_THREADS"] = "1" 198 | 199 | os.kill = None 200 | os.system = None 201 | os.putenv = None 202 | os.remove = None 203 | os.removedirs = None 204 | os.rmdir = None 205 | os.fchdir = None 206 | os.setuid = None 207 | os.fork = None 208 | os.forkpty = None 209 | os.killpg = None 210 | os.rename = None 211 | os.renames = None 212 | os.truncate = None 213 | os.replace = None 214 | os.unlink = None 215 | os.fchmod = None 216 | os.fchown = None 217 | os.chmod = None 218 | os.chown = None 219 | os.chroot = None 220 | os.fchdir = None 221 | os.lchflags = None 222 | os.lchmod = None 223 | os.lchown = None 224 | os.getcwd = None 225 | os.chdir = None 226 | 227 | import shutil 228 | 229 | shutil.rmtree = None 230 | shutil.move = None 231 | shutil.chown = None 232 | 233 | import subprocess 234 | 235 | subprocess.Popen = None # type: ignore 236 | 237 | __builtins__["help"] = None 238 | 239 | import sys 240 | 241 | sys.modules["ipdb"] = None 242 | sys.modules["joblib"] = None 243 | sys.modules["resource"] = None 244 | sys.modules["psutil"] = None 245 | sys.modules["tkinter"] = None 246 | -------------------------------------------------------------------------------- /textbook/dataset_gen/dataset_gen.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from concurrent.futures import ThreadPoolExecutor 3 | import json 4 | import os 5 | import random 6 | import time 7 | 8 | from typing import Callable, List, Protocol 9 | 10 | import openai 11 | from openai import OpenAIError 12 | 13 | from pydantic import BaseModel 14 | from textbook.dataset_gen.create_prompts import Topic 15 | from rich.progress import ( 16 | Progress, 17 | TimeElapsedColumn, 18 | TextColumn, 19 | ) 20 | import hashlib 21 | 22 | THREAD_LOCK = threading.Lock() 23 | PROMPT_TOKENS_CNT = 0 24 | COMPLETION_TOKENS_CNT = 0 25 | 26 | 27 | class Exercise(BaseModel): 28 | problem: str 29 | solution: str 30 | 31 | 32 | class Result(BaseModel): 33 | prompt: str 34 | output: str 35 | 36 | 37 | def split_exercises(output: str) -> List[str]: 38 | """Split the result of the generation into separate functions""" 39 | return ["def" + i for i in output.split("def")[1:]] 40 | 41 | 42 | def check_exercise(exercise: str) -> bool: 43 | try: 44 | if ( 45 | "return" not in exercise.split('"""')[2] 46 | and "print" not in exercise.split('"""')[2] 47 | ): 48 | return False 49 | else: 50 | return True 51 | except IndexError: 52 | return False 53 | 54 | 55 | def generator_to_exercises(output: str) -> List[Exercise]: 56 | exercises = split_exercises(output) 57 | exercises = [i for i in exercises if check_exercise(i)] 58 | results = [] 59 | for j in exercises: 60 | try: 61 | splitted_exercise = j.split('"""') 62 | question = '"""'.join(splitted_exercise[:2]) + '"""' 63 | answer = splitted_exercise[2] 64 | results.append(Exercise(problem=question, solution=answer)) 65 | except IndexError: 66 | 
splitted_exercise = j.split("'''") 67 | question = "'''".join(splitted_exercise[:2]) + "'''" 68 | answer = splitted_exercise[2] 69 | results.append(Exercise(problem=question, solution=answer)) 70 | 71 | return results 72 | 73 | 74 | class Generator(Protocol): 75 | def generate(self, prompt: str) -> Result: 76 | ... 77 | 78 | 79 | class OpenAIGenerator: 80 | def __init__( 81 | self, 82 | model: str = "gpt-3.5-turbo", 83 | ): 84 | self.model = model 85 | 86 | def generate(self, prompt: str) -> Result: 87 | global PROMPT_TOKENS_CNT 88 | global COMPLETION_TOKENS_CNT 89 | chat_completion = openai.ChatCompletion.create( 90 | model=self.model, 91 | messages=[{"role": "user", "content": prompt}], 92 | max_tokens=250, 93 | timeout=60, 94 | ) 95 | with THREAD_LOCK: 96 | PROMPT_TOKENS_CNT += chat_completion.usage.prompt_tokens 97 | COMPLETION_TOKENS_CNT += chat_completion.usage.completion_tokens 98 | result = Result( 99 | prompt=prompt, output=chat_completion.choices[0].message.content 100 | ) 101 | 102 | return result 103 | 104 | 105 | class GenerationError(OpenAIError): 106 | ... 107 | 108 | 109 | class MonkeyGenerator: 110 | """ 111 | A generator with a random response time and a random failure rate 112 | """ 113 | 114 | def __init__(self, speed: int = 2, n_functions: int = 10): 115 | self.speed = speed 116 | self.n_functions = n_functions 117 | 118 | def generate(self, prompt: str) -> Result: 119 | seed = random.randint(0, 100) 120 | 121 | if self.speed > 0: 122 | time.sleep(seed / 100 * self.speed) 123 | # if not (seed % 50): 124 | # raise GenerationError("Monkey failed") 125 | 126 | return Result( 127 | prompt=prompt, 128 | output='def gorilla(): """Empty function for a gorilla""" return 0' 129 | * self.n_functions, 130 | ) 131 | 132 | 133 | def generation( 134 | prompt: str, 135 | generator: Generator, 136 | update_progress: Callable, 137 | retries: int, 138 | ) -> List[Exercise]: 139 | success = False 140 | time.sleep(random.random()) 141 | for i in range(retries): 142 | try: 143 | result = generator.generate(prompt) 144 | success = True 145 | except GenerationError: 146 | print(f"Generation failed for prompt {prompt}, retrying {i + 1}/{retries}") 147 | time.sleep(1) 148 | else: 149 | break 150 | 151 | if success: 152 | exercises = generator_to_exercises(result.output) 153 | update_progress() 154 | return exercises 155 | 156 | else: 157 | print(f"Generation failed for prompt {prompt}, skipping") 158 | return [Exercise(problem=prompt, solution="")] 159 | 160 | 161 | def _generation_wrapper( 162 | prompt: str, 163 | get_generator: Callable[[], Generator], 164 | update_progress: Callable, 165 | save_dir: str, 166 | retries: int, 167 | ): 168 | file_path_sum = hashlib.md5(prompt.encode("utf-8")).hexdigest() 169 | 170 | dir_path, file_path = file_path_sum[:4], file_path_sum[4:] 171 | dir_path = os.path.join(save_dir, dir_path) 172 | file_path = os.path.join(dir_path, file_path + ".jsonl") 173 | 174 | if not os.path.exists(dir_path): 175 | os.makedirs(dir_path) 176 | 177 | if os.path.exists(file_path): # we don't regenerate each query 178 | print(f"skip {file_path} generation because it already exist ") 179 | return 180 | 181 | generator = get_generator() 182 | 183 | results = generation(prompt, generator, update_progress, retries) 184 | 185 | write_results_to_jsonl(file_path, results) 186 | 187 | 188 | def mass_generation( 189 | prompts: List[str], 190 | get_generator: Callable[[], Generator], 191 | save_dir: str, 192 | pool_size: int = 10, 193 | retries: int = 10, 194 | ): 195 | """ 196 | 
Generate from a list of prompts. Use a thread pool to parallelize the generation with catch and retry mechanism 197 | """ 198 | with Progress( 199 | *Progress.get_default_columns(), 200 | "•", 201 | TimeElapsedColumn(), 202 | TextColumn("completion: [bold green]{task.fields[completion_tokens]}"), 203 | TextColumn("prompt: [bold green]{task.fields[prompt_tokens]}"), 204 | ) as progress: 205 | with ThreadPoolExecutor(max_workers=pool_size) as executor: 206 | progress_task = progress.add_task( 207 | "[red]Generating...", 208 | total=len(prompts), 209 | completion_tokens=0, 210 | prompt_tokens=0, 211 | ) 212 | 213 | def update_progress(): 214 | progress.update( 215 | progress_task, 216 | advance=1, 217 | completion_tokens=COMPLETION_TOKENS_CNT, 218 | prompt_tokens=PROMPT_TOKENS_CNT, 219 | ) 220 | 221 | tasks = [] 222 | 223 | for prompt in prompts: 224 | tasks.append( 225 | executor.submit( 226 | _generation_wrapper, 227 | prompt, 228 | get_generator, 229 | update_progress, 230 | save_dir, 231 | retries, 232 | ) 233 | ) 234 | 235 | for task in tasks: 236 | try: 237 | task.result() 238 | except Exception as e: 239 | print(e) 240 | 241 | 242 | def load_prompts(file: str, key_prompt: str = "prompt") -> List[str]: 243 | with open(file, "r") as f: 244 | lines = f.readlines() 245 | 246 | prompts = [json.loads(line)[key_prompt] for line in lines] 247 | return prompts 248 | 249 | 250 | def load_leaves(file: str) -> List[Topic]: 251 | with open(file, "r") as f: 252 | lines = json.load(f) 253 | topics = [Topic.parse_obj(line) for line in lines] 254 | return topics 255 | 256 | 257 | def write_results_to_jsonl(file_path: str, results: List[Exercise]): 258 | with open(file_path, "w") as file: 259 | for item in results: 260 | json.dump(item.dict(), file) 261 | file.write("\n") 262 | -------------------------------------------------------------------------------- /textbook/dataset_gen/tree/subsubtopics.json: -------------------------------------------------------------------------------- 1 | [{"topic": "Positive Integers", "mixing": 1, "parent": {"topic": "integers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Negative Integers", "mixing": 1, "parent": {"topic": "integers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Operations on Integers", "mixing": 1, "parent": {"topic": "integers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Number line representation", "mixing": 1, "parent": {"topic": "integers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Absolute value", "mixing": 1, "parent": {"topic": "integers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to floating-point numbers", "mixing": 1, "parent": {"topic": "floating-point numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Floating-point arithmetic operations", "mixing": 1, "parent": {"topic": "floating-point numbers", "mixing": 1, "parent": {"topic": " Python Data Types and 
Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Representing floating-point numbers", "mixing": 1, "parent": {"topic": "floating-point numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Rounding and precision of floating-point numbers", "mixing": 1, "parent": {"topic": "floating-point numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Floating-point error and understanding epsilon", "mixing": 1, "parent": {"topic": "floating-point numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "String indexing", "mixing": 1, "parent": {"topic": "strings", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "String slicing", "mixing": 1, "parent": {"topic": "strings", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "String concatenation", "mixing": 1, "parent": {"topic": "strings", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "String methods", "mixing": 1, "parent": {"topic": "strings", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "String formatting", "mixing": 1, "parent": {"topic": "strings", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to booleans", "mixing": 1, "parent": {"topic": "booleans", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Boolean operations", "mixing": 1, "parent": {"topic": "booleans", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Conditional statements", "mixing": 1, "parent": {"topic": "booleans", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Boolean expressions in loops", "mixing": 1, "parent": {"topic": "booleans", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Boolean functions", "mixing": 1, "parent": {"topic": "booleans", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "indexing", "mixing": 1, "parent": {"topic": "lists", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "slicing", "mixing": 1, "parent": {"topic": "lists", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "updating", "mixing": 1, 
"parent": {"topic": "lists", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "built-in functions", "mixing": 1, "parent": {"topic": "lists", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "list comprehension", "mixing": 1, "parent": {"topic": "lists", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a tuple", "mixing": 1, "parent": {"topic": "tuples", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Indexing and slicing tuples", "mixing": 1, "parent": {"topic": "tuples", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Tuple concatenation and repeating", "mixing": 1, "parent": {"topic": "tuples", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Tuple unpacking", "mixing": 1, "parent": {"topic": "tuples", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Comparing tuples", "mixing": 1, "parent": {"topic": "tuples", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a dictionary", "mixing": 1, "parent": {"topic": "dictionaries", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Accessing dictionary items", "mixing": 1, "parent": {"topic": "dictionaries", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying dictionary items", "mixing": 1, "parent": {"topic": "dictionaries", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Looping through a dictionary", "mixing": 1, "parent": {"topic": "dictionaries", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Dictionary methods", "mixing": 1, "parent": {"topic": "dictionaries", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a set", "mixing": 1, "parent": {"topic": "sets", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Adding elements to a set", "mixing": 1, "parent": {"topic": "sets", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Removing elements from a set", "mixing": 1, "parent": {"topic": "sets", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, 
{"topic": "Operations on sets", "mixing": 1, "parent": {"topic": "sets", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Conversion between sets and other data structures", "mixing": 1, "parent": {"topic": "sets", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to complex numbers", "mixing": 1, "parent": {"topic": "complex numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Representation of complex numbers in Python", "mixing": 1, "parent": {"topic": "complex numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Arithmetic operations with complex numbers", "mixing": 1, "parent": {"topic": "complex numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Complex conjugate and absolute value", "mixing": 1, "parent": {"topic": "complex numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Polar representation of complex numbers", "mixing": 1, "parent": {"topic": "complex numbers", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to None", "mixing": 1, "parent": {"topic": "None", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Comparison with other values", "mixing": 1, "parent": {"topic": "None", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "None in conditional statements", "mixing": 1, "parent": {"topic": "None", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Default values of variables", "mixing": 1, "parent": {"topic": "None", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "None as a placeholder", "mixing": 1, "parent": {"topic": "None", "mixing": 1, "parent": {"topic": " Python Data Types and Variables", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Addition", "mixing": 1, "parent": {"topic": "Arithmetic Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Subtraction", "mixing": 1, "parent": {"topic": "Arithmetic Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Multiplication", "mixing": 1, "parent": {"topic": "Arithmetic Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Division", "mixing": 1, "parent": {"topic": 
"Arithmetic Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modulus", "mixing": 1, "parent": {"topic": "Arithmetic Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Simple Assignment Operator", "mixing": 1, "parent": {"topic": "Assignment Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Addition Assignment Operator", "mixing": 1, "parent": {"topic": "Assignment Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Subtraction Assignment Operator", "mixing": 1, "parent": {"topic": "Assignment Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Multiplication Assignment Operator", "mixing": 1, "parent": {"topic": "Assignment Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Division Assignment Operator", "mixing": 1, "parent": {"topic": "Assignment Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Equal", "mixing": 1, "parent": {"topic": "Comparison Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Not Equal", "mixing": 1, "parent": {"topic": "Comparison Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Greater Than", "mixing": 1, "parent": {"topic": "Comparison Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Less Than", "mixing": 1, "parent": {"topic": "Comparison Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Greater Than or Equal", "mixing": 1, "parent": {"topic": "Comparison Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "AND operator", "mixing": 1, "parent": {"topic": "Logical Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "OR operator", "mixing": 1, "parent": {"topic": "Logical Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "NOT operator", "mixing": 1, "parent": {"topic": "Logical Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested logical operators", "mixing": 1, "parent": {"topic": "Logical Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Precedence of logical 
operators", "mixing": 1, "parent": {"topic": "Logical Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "AND Operator", "mixing": 1, "parent": {"topic": "Bitwise Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "OR Operator", "mixing": 1, "parent": {"topic": "Bitwise Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "XOR Operator", "mixing": 1, "parent": {"topic": "Bitwise Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Complement Operator", "mixing": 1, "parent": {"topic": "Bitwise Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Left and Right Shift Operators", "mixing": 1, "parent": {"topic": "Bitwise Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "is Operator", "mixing": 1, "parent": {"topic": "Identity Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "is not Operator", "mixing": 1, "parent": {"topic": "Identity Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "in Operator", "mixing": 1, "parent": {"topic": "Identity Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "not in Operator", "mixing": 1, "parent": {"topic": "Identity Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Identity Comparison", "mixing": 1, "parent": {"topic": "Identity Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "in Operator", "mixing": 1, "parent": {"topic": "Membership Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "not in Operator", "mixing": 1, "parent": {"topic": "Membership Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Selection", "mixing": 1, "parent": {"topic": "Control Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iteration", "mixing": 1, "parent": {"topic": "Control Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Conditional Statements", "mixing": 1, "parent": {"topic": "Control Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Looping Structures", "mixing": 1, "parent": 
{"topic": "Control Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Error Handling", "mixing": 1, "parent": {"topic": "Control Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Concatenation", "mixing": 1, "parent": {"topic": "String Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Repetition", "mixing": 1, "parent": {"topic": "String Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Membership", "mixing": 1, "parent": {"topic": "String Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Indexing", "mixing": 1, "parent": {"topic": "String Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Slicing", "mixing": 1, "parent": {"topic": "String Operators", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Syntax of the Ternary Operator", "mixing": 1, "parent": {"topic": "Ternary Operator", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Example: Unconditioned Ternary Operator", "mixing": 1, "parent": {"topic": "Ternary Operator", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested Ternary Operators", "mixing": 1, "parent": {"topic": "Ternary Operator", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using Ternary Operator with Multiple Conditions", "mixing": 1, "parent": {"topic": "Ternary Operator", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Alternative to Ternary Operator", "mixing": 1, "parent": {"topic": "Ternary Operator", "mixing": 1, "parent": {"topic": " Python Basic Operators", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Basic Syntax", "mixing": 1, "parent": {"topic": "Sequential Execution", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Sequential Statements", "mixing": 1, "parent": {"topic": "Sequential Execution", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Branching Statements", "mixing": 1, "parent": {"topic": "Sequential Execution", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Looping Statements", "mixing": 1, "parent": {"topic": "Sequential Execution", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": 
null}}}}, {"topic": "Function Calls", "mixing": 1, "parent": {"topic": "Sequential Execution", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "if statement", "mixing": 1, "parent": {"topic": "Conditionals", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "else statement", "mixing": 1, "parent": {"topic": "Conditionals", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "elif statement", "mixing": 1, "parent": {"topic": "Conditionals", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "nested if", "mixing": 1, "parent": {"topic": "Conditionals", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "short-circuiting", "mixing": 1, "parent": {"topic": "Conditionals", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "While Loops", "mixing": 1, "parent": {"topic": "Loops", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "For Loops", "mixing": 1, "parent": {"topic": "Loops", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested Loops", "mixing": 1, "parent": {"topic": "Loops", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Continue Statement", "mixing": 1, "parent": {"topic": "Loops", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Break Statement", "mixing": 1, "parent": {"topic": "Loops", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "if statements", "mixing": 1, "parent": {"topic": "If-else Statements", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "if-else statements", "mixing": 1, "parent": {"topic": "If-else Statements", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "if-elif-else statements", "mixing": 1, "parent": {"topic": "If-else Statements", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "nested if-else statements", "mixing": 1, "parent": {"topic": "If-else Statements", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "short-circuiting with if-else", "mixing": 1, "parent": {"topic": "If-else Statements", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": 
"Basic While Loop Syntax", "mixing": 1, "parent": {"topic": "While loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Loop Control Statements", "mixing": 1, "parent": {"topic": "While loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Infinite Loops", "mixing": 1, "parent": {"topic": "While loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested While Loops", "mixing": 1, "parent": {"topic": "While loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Common Use Cases for While Loop", "mixing": 1, "parent": {"topic": "While loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating over a sequence", "mixing": 1, "parent": {"topic": "For loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Executing a block of code multiple times", "mixing": 1, "parent": {"topic": "For loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested loop", "mixing": 1, "parent": {"topic": "For loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using break and continue statements", "mixing": 1, "parent": {"topic": "For loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Combining for loop with other functions", "mixing": 1, "parent": {"topic": "For loop", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Break Statement", "mixing": 1, "parent": {"topic": "Break and Continue", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested Loops", "mixing": 1, "parent": {"topic": "Break and Continue", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Loop Control", "mixing": 1, "parent": {"topic": "Break and Continue", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Continue Statement", "mixing": 1, "parent": {"topic": "Break and Continue", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Infinite Loops", "mixing": 1, "parent": {"topic": "Break and Continue", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested if statements", "mixing": 1, "parent": {"topic": "Nested Control Structures", "mixing": 1, "parent": {"topic": " Control Structures in 
Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested for loops", "mixing": 1, "parent": {"topic": "Nested Control Structures", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested while loops", "mixing": 1, "parent": {"topic": "Nested Control Structures", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested conditional statements", "mixing": 1, "parent": {"topic": "Nested Control Structures", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested try-except statements", "mixing": 1, "parent": {"topic": "Nested Control Structures", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating over a list", "mixing": 1, "parent": {"topic": "Control Statements with Lists", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Conditional statements with lists", "mixing": 1, "parent": {"topic": "Control Statements with Lists", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "List comprehension", "mixing": 1, "parent": {"topic": "Control Statements with Lists", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using range function with lists", "mixing": 1, "parent": {"topic": "Control Statements with Lists", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested lists and control statements", "mixing": 1, "parent": {"topic": "Control Statements with Lists", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "if statement", "mixing": 1, "parent": {"topic": "Control Statements with Dictionaries", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "for loop", "mixing": 1, "parent": {"topic": "Control Statements with Dictionaries", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "while loop", "mixing": 1, "parent": {"topic": "Control Statements with Dictionaries", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "enumerate", "mixing": 1, "parent": {"topic": "Control Statements with Dictionaries", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "in statement", "mixing": 1, "parent": {"topic": "Control Statements with Dictionaries", "mixing": 1, "parent": {"topic": " Control Structures in Python", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Basic for loop", "mixing": 1, "parent": {"topic": 
"For loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested for loop", "mixing": 1, "parent": {"topic": "For loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Breaking out of a loop", "mixing": 1, "parent": {"topic": "For loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using continue", "mixing": 1, "parent": {"topic": "For loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating over a specific range", "mixing": 1, "parent": {"topic": "For loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to while loops", "mixing": 1, "parent": {"topic": "While loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested while loops", "mixing": 1, "parent": {"topic": "While loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Exiting while loops", "mixing": 1, "parent": {"topic": "While loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Infinite while loops", "mixing": 1, "parent": {"topic": "While loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": " Application to real-life scenarios", "mixing": 1, "parent": {"topic": "While loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Syntax of nested loops", "mixing": 1, "parent": {"topic": "Nested loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Executing nested loops", "mixing": 1, "parent": {"topic": "Nested loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Control flow within nested loops", "mixing": 1, "parent": {"topic": "Nested loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested loops with break and continue statements", "mixing": 1, "parent": {"topic": "Nested loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Applications and examples of nested loops", "mixing": 1, "parent": {"topic": "Nested loops", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "While Loop", "mixing": 1, "parent": {"topic": "Loop control statements", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "For Loop", "mixing": 1, "parent": {"topic": "Loop control statements", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, 
{"topic": "Range Function", "mixing": 1, "parent": {"topic": "Loop control statements", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Continue statement", "mixing": 1, "parent": {"topic": "Loop control statements", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Break statement", "mixing": 1, "parent": {"topic": "Loop control statements", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to the range function", "mixing": 1, "parent": {"topic": "Range function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a range with start and stop parameters", "mixing": 1, "parent": {"topic": "Range function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Harnessing the power of step parameter", "mixing": 1, "parent": {"topic": "Range function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Understanding inclusive and exclusive Range", "mixing": 1, "parent": {"topic": "Range function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Range with negative values", "mixing": 1, "parent": {"topic": "Range function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction", "mixing": 1, "parent": {"topic": "Enumerate function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Syntax", "mixing": 1, "parent": {"topic": "Enumerate function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Usage", "mixing": 1, "parent": {"topic": "Enumerate function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Advantages", "mixing": 1, "parent": {"topic": "Enumerate function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Examples", "mixing": 1, "parent": {"topic": "Enumerate function", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Introduction to loops", "mixing": 1, "parent": {"topic": "Using loops with lists", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Basic loop syntax", "mixing": 1, "parent": {"topic": "Using loops with lists", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Looping through lists", "mixing": 1, "parent": {"topic": "Using loops with lists", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying lists with loops", "mixing": 1, "parent": {"topic": "Using loops with 
lists", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested loops", "mixing": 1, "parent": {"topic": "Using loops with lists", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using for loop with strings", "mixing": 1, "parent": {"topic": "Using loops with strings", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using while loop with strings", "mixing": 1, "parent": {"topic": "Using loops with strings", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating over each character of a string", "mixing": 1, "parent": {"topic": "Using loops with strings", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Finding characters in a string using loops", "mixing": 1, "parent": {"topic": "Using loops with strings", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "/manipulating a string using loops", "mixing": 1, "parent": {"topic": "Using loops with strings", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating over dictionary", "mixing": 1, "parent": {"topic": "Using loops with dictionaries", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Accessing values in dictionary", "mixing": 1, "parent": {"topic": "Using loops with dictionaries", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying values in dictionary", "mixing": 1, "parent": {"topic": "Using loops with dictionaries", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using nested dictionaries with loops", "mixing": 1, "parent": {"topic": "Using loops with dictionaries", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Applying conditional statements with loops and dictionaries", "mixing": 1, "parent": {"topic": "Using loops with dictionaries", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating through sets", "mixing": 1, "parent": {"topic": "Using loops with sets", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Looping through sets with for loop", "mixing": 1, "parent": {"topic": "Using loops with sets", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Using set comprehension", "mixing": 1, "parent": {"topic": "Using loops with sets", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Practical application of loops with sets", "mixing": 1, "parent": {"topic": "Using loops with sets", "mixing": 1, "parent": {"topic": " 
Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested loops with sets", "mixing": 1, "parent": {"topic": "Using loops with sets", "mixing": 1, "parent": {"topic": " Python Loops", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Lists", "mixing": 1, "parent": {"topic": "Introduction to Python Data Structures", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Tuples", "mixing": 1, "parent": {"topic": "Introduction to Python Data Structures", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Sets", "mixing": 1, "parent": {"topic": "Introduction to Python Data Structures", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Dictionaries", "mixing": 1, "parent": {"topic": "Introduction to Python Data Structures", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Arrays", "mixing": 1, "parent": {"topic": "Introduction to Python Data Structures", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a list", "mixing": 1, "parent": {"topic": "Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Accessing elements in a list", "mixing": 1, "parent": {"topic": "Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying elements in a list", "mixing": 1, "parent": {"topic": "Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "List methods", "mixing": 1, "parent": {"topic": "Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "List slicing", "mixing": 1, "parent": {"topic": "Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating empty lists", "mixing": 1, "parent": {"topic": "Creating Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating lists with initial elements", "mixing": 1, "parent": {"topic": "Creating Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Appending elements to a list", "mixing": 1, "parent": {"topic": "Creating Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "List comprehension", "mixing": 1, "parent": {"topic": "Creating Lists", "mixing": 1, "parent": 
{"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Copying lists", "mixing": 1, "parent": {"topic": "Creating Lists", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Indexing", "mixing": 1, "parent": {"topic": "Accessing List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Slicing", "mixing": 1, "parent": {"topic": "Accessing List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Negative indexing", "mixing": 1, "parent": {"topic": "Accessing List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Working with nested lists", "mixing": 1, "parent": {"topic": "Accessing List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Accessing elements with nested index", "mixing": 1, "parent": {"topic": "Accessing List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying Elements using Indexing", "mixing": 1, "parent": {"topic": "Modifying List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Appending and Removing Elements", "mixing": 1, "parent": {"topic": "Modifying List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying Elements using List Methods", "mixing": 1, "parent": {"topic": "Modifying List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying Elements using Slicing", "mixing": 1, "parent": {"topic": "Modifying List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying Elements using List Comprehensions", "mixing": 1, "parent": {"topic": "Modifying List Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Accessing elements of a list", "mixing": 1, "parent": {"topic": "List Operations", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Modifying elements of a list", "mixing": 1, "parent": {"topic": "List Operations", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Appending elements to a list", "mixing": 1, "parent": {"topic": "List Operations", "mixing": 1, "parent": {"topic": " Python Data 
Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Removing elements from a list", "mixing": 1, "parent": {"topic": "List Operations", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Slicing a list", "mixing": 1, "parent": {"topic": "List Operations", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "1. Creating a Tuple", "mixing": 1, "parent": {"topic": "Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "2. Accessing Tuple Elements", "mixing": 1, "parent": {"topic": "Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "3. Modifying Tuples", "mixing": 1, "parent": {"topic": "Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "4. Tuple Methods", "mixing": 1, "parent": {"topic": "Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "5. Looping Through Tuples", "mixing": 1, "parent": {"topic": "Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "What is a tuple?", "mixing": 1, "parent": {"topic": "Creating Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a tuple using parentheses", "mixing": 1, "parent": {"topic": "Creating Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a tuple using the tuple() function", "mixing": 1, "parent": {"topic": "Creating Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a tuple from a range of values", "mixing": 1, "parent": {"topic": "Creating Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Creating a tuple from a string", "mixing": 1, "parent": {"topic": "Creating Tuples", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Indexing Tuples", "mixing": 1, "parent": {"topic": "Accessing Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Slicing Tuples", "mixing": 1, "parent": {"topic": "Accessing Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Method: index()", "mixing": 1, "parent": 
{"topic": "Accessing Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Method: count()", "mixing": 1, "parent": {"topic": "Accessing Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Nested Tuples", "mixing": 1, "parent": {"topic": "Accessing Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Changing individual elements", "mixing": 1, "parent": {"topic": "Modifying Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Appending new elements", "mixing": 1, "parent": {"topic": "Modifying Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Deleting elements", "mixing": 1, "parent": {"topic": "Modifying Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Slicing and replacing elements", "mixing": 1, "parent": {"topic": "Modifying Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}, {"topic": "Iterating and modifying elements", "mixing": 1, "parent": {"topic": "Modifying Tuple Elements", "mixing": 1, "parent": {"topic": " Python Data Structures: Lists and Tuples", "mixing": 1, "parent": {"topic": "Python", "mixing": 1, "parent": null}}}}] --------------------------------------------------------------------------------