├── .gitignore
├── README.md
├── datatools
    ├── __init__.py
    ├── io_utils.py
    ├── load.py
    ├── merge_index.py
    ├── process.py
    └── scripts
    │   ├── merge_index.py
    │   ├── pack.py
    │   ├── peek.py
    │   ├── tokenize.py
    │   ├── tokenizers
    │       ├── llama2_tokenizer.model
    │       ├── llama2_tokenizer.py
    │       ├── llama3_tokenizer.model
    │       └── llama3_tokenizer.py
    │   └── wrangle.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | 
163 | checkpoints
164 | datasets
165 | .cache
166 | slurm
167 | init 
168 | 
169 | 
170 | wandb
171 | 
172 | 
173 | # Utils for creating math datasets
174 | math_utils/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 🛠️ *datatools*: Simple utilities for common data actions
 2 | 
 3 | Minimal scripts and reusable functions for implementing common data operations (tokenization, splitting, subsampling, packing, and more).
 4 | 
 5 | Built with special support for [Mosaic Streaming Datasets (MDS)](https://docs.mosaicml.com/projects/streaming/en/stable/index.html).
 6 | 
 7 | ## Table of contents
 8 | - [Installation](#installation)
 9 | - [Library](#library)
10 |   - [Core Functions](#core-functions)
11 |   - [Example](#example)
12 | - [Scripts](#scripts)
13 | 
14 | ## Installation
15 | 
16 | Clone this repo and install via `pip install -e .` or install from PyPI via `pip install datatools-py`.
17 | 
18 | ## Library
19 | 
20 | *datatools* provides core libraries that can be used to easily build custom data pipelines, specifically through `from datatools import load, process`.
21 | 
22 | ### Core functions
23 | 
24 | ```python
25 | load(path, load_options)
26 | ```
27 | Loads the dataset at the path and **automatically infers its format** (e.g., compressed JSON, PyArrow, MDS, etc.) based on clues from the file format and directory structure.
28 | 
29 | ---
30 | 
31 | ```python
32 | process(input_dataset, process_fn, output_path, process_options)
33 | ```
34 | Processes an input dataset and writes the results to disk. It supports:
35 | 
36 | 1. **Multi-processing** with many CPUs, e.g. `ProcessOptions(num_proc=16)` (or as flag `-w 16`)
37 | 2. **Slurm array parallelization**, e.g. `ProcessOptions(slurm_array=True)` (or `--slurm_array`) automatically sets up `job_id` and `num_jobs` using Slurm environment variables
38 | 3. **Custom indexing**, e.g. only working on a subset `--index_range 0 30` or using a custom index file `--index_path path/to/index.npy`
39 |    See [ProcessOptions](https://github.com/CodeCreator/datatools/blob/main/datatools/process.py#L30) for details.
40 | 4. By default, output is written as mosaic-streaming MDS shards, which are merged into a single MDS dataset when the job finishes. The code also supports writing to JSONL files (`--jsonl`) and ndarray files for each column (`--ndarray`). The shards for these output formats are not automatically merged.
41 | 
42 | The `process_fn` should be a function that takes one to three arguments:
43 | 1. A subset of the data with `len(...)` and `.[...]` access
44 | 2. The global indices corresponding to the subset (optional)
45 | 3. The `process_id` for logging or sharding purposes (optional)
46 | 
47 | ### Example
48 | 
49 | ```python
50 | from datatools import load, process, ProcessOptions
51 | from transformers import AutoTokenizer
52 | 
53 | # Load dataset (can be JSON, Parquet, MDS, etc.)
54 | dataset = load("path/to/dataset")
55 | 
56 | # Setup tokenizer and processing function
57 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
58 | def tokenize_docs(data_subset):
59 |     for item in data_subset:
60 |         # Tokenize text and return dict with tokens and length
61 |         tokens = tokenizer.encode(item["text"], add_special_tokens=False)
62 |         
63 |         # Chunk the text into 1024 token chunks
64 |         for i in range(0, len(tokens), 1024):
65 |             yield {
66 |                 "input_ids": tokens[i:i+1024],
67 |                 "length": len(tokens[i:i+1024])
68 |             }
69 | 
70 | # Process dataset with 4 workers and write to disk
71 | process(dataset, tokenize_docs, "path/to/output", process_options=ProcessOptions(num_proc=4))
72 | ```
73 | 
74 | ## Scripts
75 | 
76 | *datatools* comes with the following default scripts:
77 | 
78 | * `tokenize`: Tokenize datasets per document
79 | * `pack`: Pack tokenized documents into fixed sequences
80 | * `peek`: Print datasets as JSON to stdout
81 | * `wrangle`: Subsample, merge datasets, make random splits (e.g., train/test/validation), etc.
82 | * `merge_index`: Merge Mosaic streaming datasets in subfolders into a larger dataset
83 | 
84 | Run `<script> --help` for detailed arguments. Many scripts automatically include all arguments from `ProcessOptions` (e.g., number of processes `-w <processes>`) and `LoadOptions`.
85 | 


--------------------------------------------------------------------------------
/datatools/__init__.py:
--------------------------------------------------------------------------------
1 | from datatools.load import load, load_pandas, LoadOptions
2 | from datatools.process import process, ProcessOptions, identity_fn, load_indices
3 | from datatools.merge_index import merge_index_recursively
4 | 


--------------------------------------------------------------------------------
/datatools/io_utils.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import io
  4 | 
  5 | from typing import Any, Dict, Union, List
  6 | from collections.abc import Sequence
  7 | from pathlib import Path
  8 | 
  9 | import numpy as np
 10 | from datetime import datetime
 11 | 
 12 | from streaming.base.array import Array
 13 | from streaming.base.format import get_index_basename, reader_from_json
 14 | from streaming.base.spanner import Spanner
 15 | 
 16 | import zstandard
 17 | from contextlib import contextmanager
 18 | 
 19 | 
 20 | class ZstdUtf8WriteFile:
 21 |     def __init__(self, filename, level=3):
 22 |         self.filename = filename
 23 |         self.level = level
 24 |         self.file = None
 25 | 
 26 |     def open(self):
 27 |         self.file = open(self.filename, "wb")
 28 |         self.compressor = zstandard.ZstdCompressor(level=self.level)
 29 |         self.writer = self.compressor.stream_writer(self.file)
 30 |         self.text_writer = io.TextIOWrapper(self.writer, encoding='utf-8')
 31 |         return self.text_writer
 32 | 
 33 |     def close(self):
 34 |         self.text_writer.flush()
 35 |         self.writer.flush(zstandard.FLUSH_FRAME)
 36 |         self.file.close()
 37 | 
 38 | 
 39 | @contextmanager
 40 | def zstd_utf8_read_open(filename, level=3):
 41 |     with open(filename, "rb") as f:
 42 |         decompressor = zstandard.ZstdDecompressor(max_window_size=2147483648)
 43 |         with decompressor.stream_reader(f) as stream_reader:
 44 |             yield io.TextIOWrapper(stream_reader, encoding='utf-8')
 45 | 
 46 | 
 47 | class Subset(Array):
 48 |     def __init__(self, dataset: Array, indices: Sequence[int]):
 49 |         self.dataset = dataset
 50 |         self.indices = indices
 51 | 
 52 |     @classmethod
 53 |     def shard(cls, dataset: Array, shard_id: int, num_shards: int):
 54 |         N = len(dataset)
 55 |         shard_indices = np.linspace(0, N, num_shards + 1)
 56 | 
 57 |         return cls(dataset, range(int(shard_indices[shard_id]), int(shard_indices[shard_id + 1])))
 58 | 
 59 |     def __len__(self) -> int:
 60 |         return len(self.indices)
 61 | 
 62 |     @property
 63 |     def size(self) -> int:
 64 |         return len(self.indices)
 65 | 
 66 |     def __getitem__(self, idx: int) -> Dict[str, Any]:
 67 |         return self.dataset[int(self.indices[int(idx)])]
 68 | 
 69 | 
 70 | class LocalDatasets(Array):
 71 |     def __init__(self, paths: List[Union[Path, str]]):
 72 |         self.shards = []
 73 |         for path in paths:
 74 |             filename = os.path.join(path, get_index_basename())  # pyright: ignore
 75 |             obj = json.load(open(filename))
 76 | 
 77 |             for info in obj['shards']:
 78 |                 shard = reader_from_json(path, "", info)
 79 |                 self.shards.append(shard)
 80 |         self.num_samples = sum([shard.samples for shard in self.shards])
 81 | 
 82 |         shard_sizes = np.array([x.samples for x in self.shards])
 83 |         self.spanner = Spanner(shard_sizes)
 84 | 
 85 |     def __len__(self) -> int:
 86 |         return self.num_samples
 87 | 
 88 |     @property
 89 |     def size(self) -> int:
 90 |         return self.num_samples
 91 | 
 92 |     def get_item(self, sample_id: int) -> Dict[str, Any]:
 93 |         shard_id, index_in_shard = self.spanner[sample_id]
 94 |         shard = self.shards[shard_id]
 95 |         return shard[index_in_shard]
 96 | 
 97 | 
 98 | class JsonlDataset(Array):
 99 |     def __init__(self, paths: List[Union[str, Path]]):
100 |         self.paths = paths
101 | 
102 |         self.lines = []
103 |         for path in paths:
104 |             path = Path(path)
105 |             if path.suffixes[-1] in [".zstd", ".zst"]:
106 |                 with zstd_utf8_read_open(path) as f:
107 |                     self.lines.extend(f.readlines())
108 |             else:
109 |                 with open(path) as f:
110 |                     self.lines.extend(f.readlines())
111 | 
112 |     def __len__(self) -> int:
113 |         return len(self.lines)
114 | 
115 |     @property
116 |     def size(self) -> int:
117 |         return len(self.lines)
118 | 
119 |     def get_item(self, idx: int) -> Dict[str, Any]:
120 |         return json.loads(self.lines[idx])
121 | 
122 | 
123 | 
124 | class NDArrayWriter:
125 |     def __init__(self, columns, out, compression=None):
126 |         self.columns = columns
127 |         self.out = out
128 | 
129 |         self.buffers = {}
130 | 
131 |     def write(self, item):
132 |         for column in self.columns:
133 |             if column not in self.buffers:
134 |                 self.buffers[column] = []
135 |             self.buffers[column].append(item[column])
136 | 
137 |     def finish(self):
138 |         if self.buffers:
139 |             parent_folder = os.path.dirname(self.out)
140 |             if parent_folder:
141 |                 os.makedirs(parent_folder, exist_ok=True)
142 |             for column, buffer in self.buffers.items():
143 |                 if column:
144 |                     np.save(f"{self.out}__{column}.npy", np.array(buffer))
145 |                 else:
146 |                     np.save(f"{self.out}.npy", np.array(buffer))
147 | 
148 | 
149 | 
150 | 
151 | class DatetimeJsonEncoder(json.JSONEncoder):
152 |     def default(self, obj):
153 |         if isinstance(obj, datetime):
154 |             return str(obj)
155 |         if isinstance(obj, np.number):
156 |             return obj.item()
157 |         if isinstance(obj, np.ndarray):
158 |             return obj.tolist()
159 | 
160 |         return super().default(obj)
161 | 
162 | 
163 | class JsonlWriter:
164 |     def __init__(self, columns, out, compression=None):
165 |         self.columns = set(columns)
166 |         self.out = out
167 |         
168 |         parent_folder = os.path.dirname(self.out)
169 |         if parent_folder:
170 |             os.makedirs(parent_folder, exist_ok=True)
171 |         if compression is not None and compression.startswith("zst"):
172 |             ext = compression.split(":")[0]
173 |             level = int(compression.split(":")[1]) if ":" in compression else 3
174 |             self.file_handle = ZstdUtf8WriteFile(f"{out}.jsonl.{ext}", level)
175 |             self.file = self.file_handle.open()
176 |         else:
177 |             self.file_handle = open(f"{out}.jsonl", "w")
178 |             self.file = self.file_handle
179 | 
180 |     def write(self, item):
181 |         if not self.columns.issubset(item.keys()):
182 |             print(f"Warning: Item {item} does not contain all columns: {self.columns - item.keys()}")
183 |         self.file.write(json.dumps(item, cls=DatetimeJsonEncoder) + "\n")
184 | 
185 |     def finish(self):
186 |         self.file_handle.close()
187 | 


--------------------------------------------------------------------------------
/datatools/load.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | import numpy as np
 4 | from typing import Optional, List, Union
 5 | from collections.abc import Sequence
 6 | from dataclasses import dataclass
 7 | from multiprocessing import Pool
 8 | 
 9 | from pathlib import Path
10 | 
11 | from datatools.io_utils import LocalDatasets, JsonlDataset
12 | 
13 | @dataclass
14 | class LoadOptions:
15 |     """Options for load function"""
16 | 
17 |     # Type of data {mds, hf, jsonl, mds, parquet, arrow, hub}. Default: Infer from files
18 |     input_type: Optional[str] = None
19 | 
20 | 
21 | def load_from_hub(path: str):
22 |     from datasets import load_dataset
23 |     # split a path like path/to/dataset>name#split into path, name, split
24 |     rest, *split = path.rsplit("#", 1)
25 |     path, *name = rest.split(">", 1)
26 |     return load_dataset(path, name=(name[0] if name else None), split=(split[0] if split else None))
27 | 
28 | 
29 | def load_hf_dataset(path: Union[Path, str], input_type: str):
30 |     from datasets import load_from_disk, Dataset
31 |     path = str(path)
32 | 
33 |     return {
34 |         "hf": load_from_disk,
35 |         "arrow": Dataset.from_file,
36 |         "parquet": Dataset.from_parquet,
37 |         "hub": load_from_hub,
38 |     }[input_type](path)
39 | 
40 | 
41 | def load(*input_paths: List[Union[Path, str]], options: Optional[LoadOptions] = None) -> Sequence:
42 |     assert len(input_paths) > 0, "Empty input list"
43 | 
44 |     options = options or LoadOptions()
45 |     input_type = options.input_type
46 | 
47 |     if input_type is None:
48 |         path = Path(input_paths[0])
49 |         if path.is_dir() and (path / "index.json").is_file():
50 |             input_type = "mosaic"
51 |         elif path.is_dir() and (path / "state.json").is_file():
52 |             input_type = "hf"
53 |         elif path.is_file():
54 |             # Best guess from file extension
55 |             # Iterate over suffixes in reverse order to handle cases like .jsonl.zst
56 |             for suffix in path.suffixes[::-1]:
57 |                 if suffix in [".arrow", ".parquet", ".npy", ".jsonl"]:
58 |                     input_type = suffix[1:]
59 |                     break
60 | 
61 |     if input_type == "mosaic":
62 |         return LocalDatasets(input_paths)
63 |     elif input_type == "jsonl":
64 |         return JsonlDataset(input_paths)
65 |     elif input_type == "npy":
66 |         return np.concatenate([np.load(path) for path in input_paths])
67 |     elif input_type in {"hf", "arrow", "parquet", "hub"}:
68 |         from datasets import concatenate_datasets
69 |         return concatenate_datasets([load_hf_dataset(path, input_type) for path in input_paths])
70 |     else:
71 |         if input_type is None and not Path(input_paths[0]).exists():
72 |             raise ValueError(f"Could not infer input type from non-existent local path: {input_paths[0]}")
73 |         raise ValueError(f"Unknown input type: {input_type}")
74 | 
75 | def load_pandas(*input_paths: List[Union[Path, str]], n_rows: int = -1, num_proc: int = 1, options: Optional[LoadOptions] = None) -> pd.DataFrame:
76 |     dataset = load(*input_paths, options=options)
77 |     n_rows = n_rows if n_rows > 0 else len(dataset)
78 | 
79 |     if num_proc > 1:
80 |         with Pool(num_proc) as pool:
81 |             return pd.DataFrame(pool.map(dataset.__getitem__, range(n_rows)))
82 |     else:
83 |         return pd.DataFrame([dataset[i] for i in range(n_rows)])
84 | 
85 | 


--------------------------------------------------------------------------------
/datatools/merge_index.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import io
 4 | 
 5 | from typing import Any, Dict, Union, List
 6 | from collections.abc import Sequence
 7 | from pathlib import Path
 8 | 
 9 | import numpy as np
10 | 
11 | from streaming.base.array import Array
12 | from streaming.base.format import get_index_basename, reader_from_json
13 | from streaming.base.spanner import Spanner
14 | 
15 | 
16 | import zstandard
17 | from filelock import FileLock
18 | 
19 | 
20 | 
21 | def merge_index(directory : Path):
22 |     print(f"Merge the index for {directory}")
23 |     index_filename = get_index_basename()
24 | 
25 |     # sorting should ensure that the index order of the original datasets is retained
26 |     subfolders = sorted([
27 |         d for d in directory.iterdir()
28 |         if d.is_dir() and (d / index_filename).is_file()
29 |     ])
30 | 
31 |     if len(subfolders) > 0:
32 |         shards = []
33 |         for d in subfolders:
34 |             with (d / index_filename).open() as fp:
35 |                 subindex = json.load(fp)
36 |             for shard in subindex["shards"]:
37 |                 shard['raw_data']['basename'] = str(d.relative_to(d.parent) / shard['raw_data']['basename'])
38 |                 if 'zip_data' in shard and shard['zip_data'] is not None and 'basename' in shard['zip_data']:
39 |                     shard['zip_data']['basename'] = str(d.relative_to(d.parent) / shard['zip_data']['basename'])
40 |                 shards.append(shard)
41 |         new_index = {
42 |             'version': 2,
43 |             'shards': shards,
44 |         }
45 |         with (directory / index_filename).open("w") as fp:
46 |             json.dump(new_index, fp, indent=4)
47 | 
48 | 
49 | def merge_index_recursively(directory : Path):
50 |     assert directory.is_dir(), f"{directory} is not a directory"
51 |     index_filename = get_index_basename()
52 | 
53 |     subfolders = sorted([d for d in directory.iterdir() if d.is_dir()])
54 | 
55 |     for subfolder in subfolders:
56 |         merge_index_recursively(subfolder)
57 | 
58 |     if subfolders and all((subfolder / index_filename).is_file() for subfolder in subfolders):
59 |         # Add a lock to avoid multiple jobs writing to the same index.json
60 |         with FileLock(directory / (index_filename + ".lock")):
61 |             index_file = directory / index_filename
62 |             if index_file.exists():
63 |                 assert index_file.is_file(), f"{index_file} must be a file"
64 |                 index_file.unlink()
65 | 
66 |             merge_index(directory)
67 | 
68 | 


--------------------------------------------------------------------------------
/datatools/process.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from inspect import signature
  4 | from typing import Optional, Tuple, Dict, Any, Callable, Union
  5 | from collections.abc import Sequence, Iterator
  6 | from copy import copy
  7 | import shutil
  8 | 
  9 | from dataclasses import dataclass
 10 | from pathlib import Path
 11 | import logging
 12 | 
 13 | from multiprocessing import Pool
 14 | from tqdm import tqdm
 15 | 
 16 | import numpy as np
 17 | 
 18 | from streaming import MDSWriter
 19 | 
 20 | from simple_parsing import field
 21 | 
 22 | from datatools.merge_index import merge_index_recursively
 23 | from datatools.io_utils import Subset, NDArrayWriter, JsonlWriter
 24 | 
 25 | logger = logging.getLogger(__name__)
 26 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
 27 | 
 28 | 
 29 | 
 30 | @dataclass
 31 | class ProcessOptions:
 32 |     """Options for process function"""
 33 | 
 34 |     # Number of workers to use
 35 |     num_proc: Optional[int] = field(alias=["-w", "--num_workers"], default=None)
 36 | 
 37 |     # Range of rows to process
 38 |     index_range: Optional[Tuple[int, int]] = None
 39 |     index_path: Optional[Path] = None
 40 |     indices: Optional[np.ndarray] = field(default=None)
 41 |     sort_index: bool = False
 42 | 
 43 |     job_id: Optional[int] = None    # Job id
 44 |     num_jobs: Optional[int] = None  # Number of jobs
 45 | 
 46 |     # Read slurm job array environment variables. Gets overridden by job_id/num_jobs
 47 |     slurm_array: bool = False
 48 | 
 49 |     compression: Optional[str] = None  # Compress output files
 50 |     jsonl: bool = False                # Write JSONL files
 51 |     ndarray: bool = False              # Write ndarray files for each column
 52 | 
 53 |     # Specify column_types like "input_ids=ndarray:uint32,domain=str"
 54 |     column_types: str = field(alias=["-c"], default=None)
 55 |     columns: Dict[str, str] = field(cmd=False, default=None)
 56 | 
 57 |     overwrite: bool = False
 58 | 
 59 |     log_level: int = logging.INFO
 60 | 
 61 |     def __post_init__(self):
 62 |         if self.slurm_array and self.job_id is None:
 63 |             self.job_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
 64 |             logger.warning(f"Using SLURM array environment variable: job_id={self.job_id}")
 65 |         if self.slurm_array and self.num_jobs is None:
 66 |             self.num_jobs = int(os.environ.get("SLURM_ARRAY_TASK_COUNT", 1))
 67 |             logger.warning(f"Using SLURM array environment variable: num_jobs={self.num_jobs}")
 68 |         if self.column_types is not None and self.columns is None:
 69 |             self.columns = dict([col.split("=") for col in self.column_types.split(",")])
 70 | 
 71 | 
 72 | def subset_output_path(output_path: Path, subset: str, process_id: int, options: ProcessOptions) -> str:
 73 |     parts = []
 74 | 
 75 |     if options.job_id is not None and options.num_jobs is not None:
 76 |         num_jobs = str(options.num_jobs)
 77 |         parts.append(f"job{options.job_id:0{len(num_jobs)}}-{num_jobs}")
 78 |     if options.num_proc is not None:
 79 |         num_proc = str(options.num_proc)
 80 |         parts.append(f"proc{process_id:0{len(num_proc)}}-{num_proc}")
 81 | 
 82 |     return str(output_path / subset / ("_".join(parts) if parts else ""))
 83 | 
 84 | 
 85 | def infer_columns(item):
 86 |     columns = {}
 87 |     for key, value in item.items():
 88 |         if isinstance(value, np.ndarray):
 89 |             columns[key] = f"ndarray:{value.dtype}"
 90 |         elif isinstance(value, np.number):
 91 |             columns[key] = str(value.dtype)
 92 |         elif isinstance(value, str):
 93 |             columns[key] = "str"
 94 |         elif isinstance(value, int):
 95 |             columns[key] = "int"
 96 |         elif isinstance(value, float):
 97 |             columns[key] = "float"
 98 |         else:
 99 |             columns[key] = "pkl"
100 |     return columns
101 | 
102 | 
103 | def identity_fn(dataset, indices, process_id, desc=None, disable=False):
104 |     for i in tqdm(range(len(dataset)), disable=(process_id != 0) or disable, desc=desc):
105 |         if isinstance(dataset[i], dict):
106 |             yield dataset[i]
107 |         else:
108 |             # Numpy arrays as datasets will not return dicts and
109 |             # they need to be wrapped in a dict to be accepted by writer classes
110 |             yield {"": dataset[i]}
111 | 
112 | 
113 | def write_process_(args):
114 |     dataset, indices, process_fn, output_path, options, process_id = args
115 | 
116 |     dataset = Subset.shard(dataset, process_id, options.num_proc or 1)
117 |     indices = Subset.shard(indices, process_id, options.num_proc or 1)
118 | 
119 |     writer_cls = MDSWriter
120 |     assert not (options.jsonl and options.ndarray), "Must choose between either --jsonl or --ndarray output mode"
121 |     if options.jsonl:
122 |         writer_cls = JsonlWriter
123 |     if options.ndarray:
124 |         writer_cls = NDArrayWriter
125 |     writers = {}
126 | 
127 |     process_fn_params = signature(process_fn).parameters
128 |     num_positional_params = sum(1 for param in process_fn_params.values() if param.default == param.empty)
129 | 
130 |     try:
131 |         if num_positional_params == 1:
132 |             results_generator = process_fn(dataset)
133 |         elif num_positional_params == 2 and 'process_id' not in process_fn_params:
134 |             results_generator = process_fn(dataset, indices)
135 |         elif num_positional_params == 2 and 'process_id' in process_fn_params:
136 |             results_generator = process_fn(dataset, process_id=process_id)
137 |         else:
138 |             results_generator = process_fn(dataset, indices, process_id)
139 | 
140 |         for result in results_generator:
141 |             if isinstance(result, tuple):
142 |                 subset, item = result
143 |                 subset = Path(subset if subset is not None else "")
144 |             else:
145 |                 item = result
146 |                 subset = Path("")
147 | 
148 | 
149 |             if subset not in writers:
150 |                 if options.columns is None:
151 |                     columns = infer_columns(item)
152 |                     logger.info(f"Inferred columns \"{subset}\": {columns}")
153 |                 else:
154 |                     columns = options.columns
155 | 
156 |                 writers[subset] = writer_cls(
157 |                     columns=columns,
158 |                     out=subset_output_path(output_path, subset, process_id, options),
159 |                     compression=options.compression)
160 |             writers[subset].write(item)
161 |     finally:
162 |         for writer in writers.values():
163 |             writer.finish()
164 | 
165 | 
166 | def load_indices(options):
167 |     indices = None
168 |     if options.indices is not None:
169 |         assert options.index_path is None, "Cannot specify both indices and index_path"
170 |         assert options.index_range is None, "Cannot specify both indices and index_range"
171 | 
172 |         indices = options.indices
173 | 
174 |     if options.index_path is not None:
175 |         assert options.index_range is None, "Cannot specify both index_path and index_range"
176 | 
177 |         indices = np.load(options.index_path)
178 |         logger.info(f"Loaded {len(indices)} indices from {options.index_path}")
179 | 
180 |     if indices is not None and options.sort_index:
181 |         indices = np.sort(indices)
182 | 
183 |     if options.index_range is not None:
184 |         logger.info(f"Using indices from {options.index_range[0]} to {options.index_range[1]}")
185 |         indices = range(*options.index_range)
186 | 
187 |     return indices
188 | 
189 | 
190 | def process(dataset: Sequence,
191 |             process_fn: Callable[[Subset, Subset, int], Iterator[Tuple[Path, Dict[str, Any]]]],
192 |             output_path: Union[Path, str],
193 |             options: Optional[ProcessOptions] = None):
194 |     options = copy(options)
195 |     logger.setLevel(options.log_level)
196 | 
197 |     output_path = Path(output_path)
198 | 
199 |     if options.overwrite and output_path.exists():
200 |         assert options.num_jobs is None or options.num_jobs == 1, "overwrite is incompatible with multiple jobs"
201 |         shutil.rmtree(output_path)
202 |         logger.info(f"Removed existing output directory: {output_path}")
203 | 
204 |     indices = load_indices(options)
205 |     if indices is not None:
206 |         dataset = Subset(dataset, indices)
207 |         logger.info(f"Selected {len(dataset)} indices")
208 |     else:
209 |         indices = range(len(dataset))
210 | 
211 |     if options.job_id is not None and options.num_jobs is not None:
212 |         if len(dataset) < options.num_jobs:
213 |             options.num_jobs = len(dataset)
214 |             logger.warning(f"Setting num_jobs={options.num_jobs} to match the dataset size")
215 | 
216 |         dataset = Subset.shard(dataset, options.job_id, options.num_jobs)
217 |         indices = Subset.shard(indices, options.job_id, options.num_jobs)
218 | 
219 |     if options.num_proc and len(dataset) < options.num_proc:
220 |         logger.warning(f"Setting num_proc={len(dataset)} to match the dataset size")
221 |         options.num_proc = len(dataset)
222 | 
223 |     process_args = [(dataset, indices, process_fn, output_path, options, i) for i in range(options.num_proc or 1)]
224 |     if options.num_proc is None or options.num_proc == 1:
225 |         write_process_(process_args[0])
226 |     else:
227 |         with Pool(options.num_proc) as pool:
228 |             pool.map(write_process_, process_args)
229 | 
230 |     if not options.jsonl and not options.ndarray:
231 |         # This gets executed by all jobs but each job will update all index.json
232 |         merge_index_recursively(output_path)
233 | 


--------------------------------------------------------------------------------
/datatools/scripts/merge_index.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from functools import partial, reduce
 4 | 
 5 | from simple_parsing import ArgumentParser
 6 | 
 7 | from datatools.merge_index import merge_index_recursively
 8 | 
 9 | 
10 | def main():
11 |     parser = ArgumentParser()
12 | 
13 |     parser.add_argument("paths", type=Path, nargs="+", help="Input dataset paths")
14 | 
15 |     args = parser.parse_args()
16 | 
17 |     print("Arguments:", args)
18 |     for path in args.paths:
19 |         print(f"Merging {path}")
20 |         merge_index_recursively(path)
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     main()


--------------------------------------------------------------------------------
/datatools/scripts/pack.py:
--------------------------------------------------------------------------------
  1 | from tqdm import tqdm
  2 | 
  3 | from typing import List, Optional
  4 | 
  5 | import numpy as np
  6 | from numpy.typing import NDArray
  7 | from pathlib import Path
  8 | 
  9 | from functools import partial, reduce
 10 | 
 11 | from dataclasses import dataclass
 12 | from simple_parsing import ArgumentParser, field
 13 | 
 14 | from collections import defaultdict
 15 | 
 16 | from datatools import load, LoadOptions, load_indices, process, ProcessOptions
 17 | from streaming.base.array import Array
 18 | 
 19 | 
 20 | @dataclass
 21 | class PackOptions:
 22 |     """Options for packing"""
 23 | 
 24 |     pack_length: int = field(alias=["-l"], default=8192)
 25 | 
 26 |     # Skip documents chunks with fewer tokens than `min_length`
 27 |     min_length: int = field(alias=["-s"], default=1)
 28 | 
 29 |     overlap: int = field(alias=["-o"], default=0)
 30 | 
 31 |     # Pack sequences into separate subset of fixed lengths
 32 |     # This will always pack to the longest available lengths,
 33 |     # Note that it still requires setting pack_length
 34 |     split_by_lengths: List[int] = field(alias=["-f"], default=None)
 35 | 
 36 |     # Every sequence starts with the beginning of the original document
 37 |     intact: bool = False
 38 | 
 39 |     bfd: bool = False
 40 |     bfd_num_bins: int = 50
 41 | 
 42 |     add_special_tokens: bool = True
 43 |     bos_id: Optional[int] = None
 44 |     eos_id: Optional[int] = None
 45 |     mos_id: Optional[int] = None  # middle-of-sentence token (<bos> for mid-document)
 46 |     use_for_mos: Optional[str] = field(choices=["bos", "eos"], default=None)
 47 | 
 48 |     # Use preset tokenizer config to set bos_id and eos_id
 49 |     tokenizer: Optional[str] = field(alias=["-T"], default=None)
 50 | 
 51 |     sort_by_length: bool = False
 52 | 
 53 |     split_by_column: str = None
 54 | 
 55 |     token_field: str = "input_ids"
 56 |     length_field: str = "length"
 57 |     indices_field: str = "indices"
 58 |     domain_field: str = "domain"
 59 |     other_fields: List[str] = field(default_factory=list)
 60 | 
 61 |     def special_tokens_from_tokenizer(self, tokenizer):
 62 |         if tokenizer == "llama3":
 63 |             return 128000, 128001
 64 |         elif tokenizer == "llama2":
 65 |             return 1, 2
 66 |         else:
 67 |             from transformers import AutoTokenizer
 68 |             tokenizer = AutoTokenizer.from_pretrained(tokenizer)
 69 |             return tokenizer.bos_token_id, tokenizer.eos_token_id
 70 | 
 71 | 
 72 |     def __post_init__(self):
 73 |         if self.tokenizer is not None:
 74 |             bos_id, eos_id = self.special_tokens_from_tokenizer(self.tokenizer)
 75 |             if self.bos_id is None:
 76 |                 self.bos_id = bos_id
 77 |             if self.eos_id is None:
 78 |                 self.eos_id = eos_id
 79 | 
 80 |         if self.use_for_mos is not None:
 81 |             if self.use_for_mos == "bos":
 82 |                 self.mos_id = self.bos_id
 83 |             elif self.use_for_mos == "eos":
 84 |                 self.mos_id = self.eos_id
 85 |             else:
 86 |                 raise ValueError(f"Invalid value for use_for_mos: {self.use_for_mos}")
 87 | 
 88 | 
 89 | def add_special_tokens(tokens: NDArray[np.uint32], options: PackOptions, bos=False, eos=False, mos=False):
 90 |     if not options.add_special_tokens:
 91 |         return tokens
 92 | 
 93 |     tokens_list = []
 94 | 
 95 |     # Add bos_id if not already present and if sequence is not empty
 96 |     if bos and options.bos_id is not None and len(tokens) > 0 and tokens[0] != options.bos_id:
 97 |         tokens_list.append(np.array([options.bos_id], dtype=tokens.dtype))
 98 |     elif mos and options.mos_id is not None and len(tokens) > 0 and tokens[0] != options.mos_id:
 99 |         tokens_list.append(np.array([options.mos_id], dtype=tokens.dtype))
100 | 
101 |     tokens_list.append(tokens)
102 | 
103 |     if eos and options.eos_id is not None and len(tokens) > 0 and tokens[-1] != options.eos_id:
104 |         tokens_list.append(np.array([options.eos_id], dtype=tokens.dtype))
105 | 
106 |     return np.concatenate(tokens_list, axis=0)
107 | 
108 | 
109 | class SingleBuffer:
110 |     def __init__(self, options: PackOptions, output_field: Optional[str] = None):
111 |         self.options = options
112 |         self.output_field = options.token_field if output_field is None else output_field
113 | 
114 |         self.token_buffer = []
115 | 
116 |         self.num_tokens = 0
117 | 
118 |     def process(self, tokens: NDArray):
119 |         self.token_buffer.append(tokens)
120 |         self.num_tokens += len(tokens)
121 | 
122 |         while self.num_tokens >= self.options.pack_length:
123 |             tokens = np.concatenate(self.token_buffer, 0)
124 |             item = {
125 |                 self.output_field: tokens[:self.options.pack_length]
126 |             }
127 |             if self.options.length_field:
128 |                 item[self.options.length_field] = self.options.pack_length
129 |             if self.options.indices_field:
130 |                 item[self.options.indices_field] = self.compute_indices(self.token_buffer)
131 | 
132 |             yield item
133 | 
134 |             self.token_buffer = []
135 |             self.num_tokens = 0
136 |             if not self.options.intact:
137 |                 tokens = add_special_tokens(tokens[self.options.pack_length - self.options.overlap:], self.options, mos=True)
138 | 
139 |                 if len(tokens) >= self.options.min_length:
140 |                     self.token_buffer.append(tokens)
141 |                     self.num_tokens += len(tokens)
142 | 
143 |     def compute_indices(self, token_buffer: List[NDArray]):
144 |         indices = []
145 |         start = 0
146 |         for seq in token_buffer:
147 |             indices.append((start, min(start+len(seq), self.options.pack_length)))
148 |             start += len(seq)
149 |             if start >= self.options.pack_length:
150 |                 break
151 |         return np.array(indices, dtype=np.uint32)
152 | 
153 |     @property
154 |     def available(self):
155 |         return self.options.pack_length - self.num_tokens
156 | 
157 | 
158 | # Best-fit-decreasing algorithm from "Fewer Truncations Improve Language Modeling"
159 | class BFDBuffer:
160 |     def __init__(self, options: PackOptions, output_field: Optional[str] = None):
161 |         self.buffers = [SingleBuffer(options, output_field) for _ in range(options.bfd_num_bins)]
162 | 
163 |     def add(self, tokens: NDArray):
164 |         available_buffers = [buffer for buffer in self.buffers if buffer.available >= len(tokens)]
165 | 
166 |         if available_buffers:
167 |             # Identify the smallest available buffer
168 |             buffer = min(available_buffers, key=lambda x: x.available)
169 |             yield from buffer.process(tokens)
170 |         else:
171 |             # Or if no buffer is available,
172 |             # identify the buffer with the largest available space
173 |             buffer = max(self.buffers, key=lambda x: x.available)
174 |             yield from buffer.process(tokens)
175 | 
176 | 
177 | def pack_fn(data: Array,
178 |             global_indices: Array,
179 |             process_id: int,
180 |             options: PackOptions):
181 | 
182 |     buffer_cls = BFDBuffer if options.bfd else SingleBuffer
183 |     buffers = defaultdict(partial(buffer_cls, options=options))
184 |     other_buffers = {
185 |         field: defaultdict(partial(buffer_cls, options=options, output_field=field))
186 |         for field in options.other_fields
187 |     }
188 | 
189 |     indices = list(range(len(data)))
190 |     if options.sort_by_length:
191 |         indices.sort(key=lambda x: len(data[x]['input_ids']), reverse=True)
192 | 
193 |     if options.split_by_lengths:
194 |         sorted_lengths = sorted(options.split_by_lengths, reverse=True)
195 | 
196 |     for i in tqdm(indices, disable=process_id != 0):
197 |         item = data[i]
198 | 
199 |         # Then identify the mapped subset
200 |         subset = Path()
201 |         if options.split_by_column:
202 |             if "." in options.split_by_column:
203 |                 dict_fields = options.split_by_column.split(".")
204 |                 subset = subset / reduce(dict.get, [item, *dict_fields])
205 |             else:
206 |                 subset = subset / item[options.split_by_column]
207 | 
208 |         input_ids = add_special_tokens(np.array(item[options.token_field], dtype=np.uint32), options, bos=True, eos=True)
209 |         other_seqs = {
210 |             field: add_special_tokens(np.array(item[field], dtype=np.uint32), options, bos=True, eos=True)
211 |             for field in options.other_fields
212 |         }
213 |         
214 |         if options.split_by_lengths:
215 |             while len(input_ids) >= sorted_lengths[-1]:
216 |                 # From longest to shortest
217 |                 target_len = next(target_len
218 |                                   for target_len in sorted_lengths
219 |                                   if len(input_ids) >= target_len)
220 | 
221 |                 target_subset = subset / f"{target_len}-{options.pack_length}"
222 |                 
223 |                 other_iterators = {
224 |                     field: iter(list(other_buffers[field][target_len].process(seq[:target_len])))
225 |                     for field, seq in other_seqs.items()
226 |                 }
227 |                 for item in buffers[target_len].process(input_ids[:target_len]):
228 |                     if options.domain_field:
229 |                         item.update({options.domain_field: str(target_subset)})
230 |                         
231 |                     for field, iterator in other_iterators.items():
232 |                         item[field] = next(iterator)[field]
233 |                         assert len(item[field]) == len(item[options.token_field])
234 |                     
235 |                     yield target_subset, item
236 | 
237 |                 if options.intact:
238 |                     break
239 | 
240 |                 input_ids = add_special_tokens(input_ids[target_len - options.overlap:], options, mos=True)
241 |                 
242 |                 for field, iterator in other_iterators.items():
243 |                     other_seqs[field] = add_special_tokens(other_seqs[field][target_len - options.overlap:], options, mos=True)
244 |         else:
245 |             other_iterators = {
246 |                 field: iter(list(other_buffers[field][subset].process(seq)))
247 |                 for field, seq in other_seqs.items()
248 |             }
249 |             
250 |             for item in buffers[subset].process(input_ids):
251 |                 if options.domain_field:
252 |                     item.update({options.domain_field: str(subset)})
253 |                 
254 |                 for field, iterator in other_iterators.items():
255 |                     item[field] = next(iterator)[field]
256 |                     assert len(item[field]) == len(item[options.token_field])
257 |                     
258 |                 yield subset, item
259 | 
260 | 
261 | def main():
262 |     parser = ArgumentParser()
263 | 
264 |     parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths")
265 |     parser.add_argument("output", type=Path, help="Output dataset path")
266 | 
267 |     parser.add_arguments(PackOptions, dest="pack_options")
268 |     parser.add_arguments(LoadOptions, dest="load_options")
269 |     parser.add_arguments(ProcessOptions, dest="process_options")
270 |     
271 |     parser.add_argument("-x", "--shuffle", action="store_true", help="Shuffle the dataset")
272 |     parser.add_argument("--seed", type=int, default=42, help="Shuffle seed")
273 | 
274 |     args = parser.parse_args()
275 | 
276 |     print("Arguments:", args)
277 |     dataset = load(*args.inputs, options=args.load_options)
278 |     N = len(dataset)
279 |     print(f"Loaded dataset with {N} samples")
280 |     
281 |     if args.shuffle:
282 |         indices = load_indices(args.process_options)
283 |         if indices is None:
284 |             indices = np.arange(N)
285 |         np.random.seed(args.seed)
286 |         args.process_options.indices = indices[np.random.permutation(len(indices))]
287 | 
288 |     process(dataset,
289 |             partial(pack_fn, options=args.pack_options),
290 |             args.output,
291 |             args.process_options)
292 | 
293 | 
294 | if __name__ == "__main__":
295 |     main()
296 | 


--------------------------------------------------------------------------------
/datatools/scripts/peek.py:
--------------------------------------------------------------------------------
  1 | from contextlib import contextmanager
  2 | 
  3 | from datasets import Dataset
  4 | from pathlib import Path
  5 | 
  6 | import pprint
  7 | 
  8 | import random
  9 | import json
 10 | import subprocess
 11 | import sys
 12 | 
 13 | from streaming import LocalDataset
 14 | 
 15 | from datatools.io_utils import LocalDatasets, DatetimeJsonEncoder
 16 | from datatools.load import load, LoadOptions
 17 | 
 18 | from simple_parsing import ArgumentParser
 19 | 
 20 | 
 21 | @contextmanager
 22 | def redirect_stdout_to_stderr():
 23 |     old_stdout = sys.stdout
 24 |     old_stdout.flush()
 25 |     sys.stdout = sys.stderr
 26 |     yield
 27 |     sys.stdout = old_stdout
 28 | 
 29 | 
 30 | @contextmanager
 31 | def jq_printer(compact=False):
 32 |     if compact:
 33 |         jq = subprocess.Popen(["jq", "-c", "."], stdin=subprocess.PIPE)
 34 |     else:
 35 |         jq = subprocess.Popen(["jq", "."], stdin=subprocess.PIPE)
 36 |     yield jq.stdin
 37 |     jq.stdin.close()
 38 |     jq.wait()
 39 | 
 40 | 
 41 | def dataset_summary(dataset):
 42 |     if isinstance(dataset, Dataset):
 43 |         features_dict = dataset._info.features.to_dict()
 44 |         dataset_type = "HuggingfaceDataset"
 45 |     elif isinstance(dataset, (LocalDataset, LocalDatasets)):
 46 |         features_dicts = [dict(zip(shard.column_names, shard.column_encodings)) for shard in dataset.shards]
 47 |         assert all(features_dicts[0] == features for features in features_dicts), "All shards must have the same features"
 48 |         features_dict = features_dicts[0]
 49 |         dataset_type = "MosaicDataset"
 50 |     else:
 51 |         features_dict = None
 52 |         dataset_type = "UnknownDataset"
 53 | 
 54 |     if features_dict is not None:
 55 |         features = pprint.pformat(features_dict, sort_dicts=False).replace("\n", "\n" + " "*14)
 56 |     else:
 57 |         features = "unknown"
 58 | 
 59 |     return f"{dataset_type}({{\n    length: {len(dataset)},\n    features: {features}\n}})"
 60 | 
 61 | 
 62 | def head(dataset, head):
 63 |     if head is not None:
 64 |         if head > 0:
 65 |             indices = range(0, min(head, len(dataset)))
 66 |         elif head < 0:
 67 |             indices = range(max(0, len(dataset) + head), len(dataset))
 68 |         else:
 69 |             indices = range(0, 0)
 70 |         return dataset.select(indices)
 71 |     else:
 72 |         return dataset
 73 | 
 74 | 
 75 | def main():
 76 |     parser = ArgumentParser()
 77 |     parser.add_argument("datasets",
 78 |                         type=Path,
 79 |                         nargs="+",
 80 |                         help="Dataset name or path to dataset. If multiple, will be concatenated.")
 81 | 
 82 |     parser.add_argument("-n",
 83 |                         type=int, default=None,
 84 |                         help="Keep top n examples (after shuffling), negative for bottom n")
 85 |     parser.add_argument("-r", "--raw",
 86 |                         action="store_true",
 87 |                         help="Print raw data instead of json")
 88 |     parser.add_argument("-c", "--compact",
 89 |                         action="store_true",
 90 |                         help="Print compact entries (jsonl)")
 91 | 
 92 |     parser.add_argument("-x", "--shuffle", "--randomize",
 93 |                         action="store_true",
 94 |                         help="Shuffle data")
 95 |     parser.add_argument("-s", "--seed",
 96 |                         default=42,
 97 |                         type=int,
 98 |                         help="Seed for shuffling data")
 99 |     parser.add_argument("--sort",
100 |                         type=str,
101 |                         default=None,
102 |                         help="Sort the dataset by the given columns, prefix a column with ^ to sort in descending order.")
103 | 
104 |     parser.add_arguments(LoadOptions, dest="load_options")
105 | 
106 |     with redirect_stdout_to_stderr():
107 |         args = parser.parse_args()
108 | 
109 |         dataset = load(*args.datasets, options=args.load_options)
110 | 
111 |         # Print dataset summary
112 |         dataset_desc = dataset_summary(dataset)
113 |         dataset_desc = dataset_desc.replace("\n", "\n| ")
114 |         print(f"\033[0;31m| {dataset_desc}\033[0m")
115 | 
116 |         # Sort by field
117 |         if args.sort is not None or args.shuffle:
118 |             indices = list(range(len(dataset)))
119 | 
120 |             if args.shuffle:
121 |                 assert args.sort is None, "Cannot both shuffle and sort"
122 |                 random.seed(args.seed)
123 |                 random.shuffle(indices)
124 |             else:
125 |                 column = args.sort.startswith("^") and args.sort[1:] or args.sort
126 |                 descending = args.sort.startswith("^")
127 |                 values = [dataset[i][column] for i in indices]
128 |                 indices.sort(key=lambda i: values[i], reverse=descending)
129 |         else:
130 |             indices = range(len(dataset))
131 | 
132 |         # Select head/tail
133 |         if args.n is not None:
134 |             if args.n > 0:
135 |                 indices = indices[:min(args.n, len(indices))]
136 |             elif args.n < 0:
137 |                 indices = indices[max(0, len(indices) + args.n):]
138 |             else:
139 |                 indices = indices[:0]
140 | 
141 |     if args.raw:
142 |         for i in indices:
143 |             print(dataset[i])
144 |     else:
145 |         with jq_printer(args.compact) as jq:
146 |             for i in indices:
147 |                 jq.write(json.dumps(dataset[i], cls=DatetimeJsonEncoder).encode("utf-8"))
148 | 
149 | 
150 | if __name__ == "__main__":
151 |     main()
152 | 


--------------------------------------------------------------------------------
/datatools/scripts/tokenize.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | 
  3 | from tqdm import tqdm
  4 | 
  5 | from typing import Optional
  6 | 
  7 | import numpy as np
  8 | from pathlib import Path
  9 | 
 10 | from simple_parsing import ArgumentParser, field
 11 | from dataclasses import dataclass
 12 | 
 13 | from datatools.load import load, LoadOptions
 14 | from datatools.process import process, ProcessOptions
 15 | from streaming.base.array import Array
 16 | 
 17 | 
 18 | @dataclass
 19 | class TokenizeOptions:
 20 |     """Options for tokenizing"""
 21 | 
 22 |     # Can use "llama2" or "llama3" for Llama models
 23 |     # Otherwise HF tokenizer name
 24 |     tokenizer: str = field(alias=["-T"], default="llama3")
 25 | 
 26 |     domain: str = ""
 27 |     domain_by: Optional[str] = None
 28 | 
 29 |     # Either specify a chat template or a template file
 30 |     chat_template: bool = False
 31 |     chat_messages_field: Optional[str] = "messages"
 32 |     chat_assistant_masking: bool = True
 33 | 
 34 |     template_file: Optional[Path] = None
 35 |     template: str = "{text}"
 36 |     truncate_bytes: int = 10_000_000_000    #  Only usedwithout chat template
 37 | 
 38 |     token_field: str = "input_ids"
 39 |     length_field: str = "length"
 40 |     domain_field: str = "domain"
 41 | 
 42 |     def __post_init__(self):
 43 |         if self.template_file is not None:
 44 |             with Path(self.template_file).open() as f:
 45 |                 self.template = f.read()
 46 | 
 47 | 
 48 | def load_tokenizer_encoder(options: TokenizeOptions):
 49 |     if options.tokenizer in ["llama2", "llama3"]:
 50 |         if options.tokenizer == "llama2":
 51 |             from datatools.scripts.tokenizers.llama2_tokenizer import Tokenizer
 52 |             tokenizer = Tokenizer(str(Path(__file__).parent / "tokenizers" / "llama2_tokenizer.model"))
 53 |         else:
 54 |             from datatools.scripts.tokenizers.llama3_tokenizer import Tokenizer
 55 |             tokenizer = Tokenizer(str(Path(__file__).parent / "tokenizers" / "llama3_tokenizer.model"))
 56 |         from datatools.scripts.tokenizers.llama3_tokenizer import ChatFormat
 57 |         
 58 |         if options.chat_template:
 59 |             chat_format = ChatFormat(tokenizer)
 60 |             def encode_fn(item):
 61 |                 dialog = item[options.chat_messages_field]
 62 |                 return chat_format.encode_dialog_prompt(dialog, return_assistant_masks=options.chat_assistant_masking)
 63 |             return encode_fn
 64 |         else:
 65 |             def encode_fn(item):
 66 |                 text = options.template.format(**item)
 67 |                 tokens = tokenizer.encode(text[:options.truncate_bytes], bos=False, eos=False)
 68 |                 return tokens
 69 |             return encode_fn
 70 |     else:
 71 |         from transformers import AutoTokenizer
 72 |         tokenizer = AutoTokenizer.from_pretrained(options.tokenizer)
 73 |         assert not options.chat_assistant_masking, "Chat masking is not supported for HF tokenizers (yet)"
 74 |         if options.chat_template:
 75 |             def encode_fn(item):
 76 |                 dialog = item[options.chat_messages_field]
 77 |                 tokens = chat_format.apply_chat_template(dialog, truncation=False, max_length=None)
 78 |                 return tokens
 79 |             return encode_fn
 80 |         else:
 81 |             def encode_fn(item):
 82 |                 text = options.template.format(**item)
 83 |                 tokens = tokenizer.encode(text[:options.truncate_bytes], add_special_tokens=False, truncation=False, max_length=None)
 84 |                 return tokens
 85 |     return encode_fn
 86 | 
 87 |         
 88 | 
 89 | 
 90 | def tokenize_fn(data: Array,
 91 |                 process_id: int,
 92 |                 options: TokenizeOptions):
 93 | 
 94 |     encode_fn = load_tokenizer_encoder(options)
 95 | 
 96 |     for i in tqdm(range(len(data)), desc=f"Process {process_id}"):
 97 |         item = data[i]
 98 |         domain = item[options.domain_by] if options.domain_by is not None else options.domain
 99 |         
100 |         if options.chat_template and options.chat_assistant_masking:
101 |             tokens, masks = encode_fn(item)
102 | 
103 |             output_item = {
104 |                 options.token_field: np.array(tokens, dtype=np.uint32),
105 |                 "mask": np.array(masks, dtype=np.uint8)
106 |             }
107 |         else:
108 |             tokens = encode_fn(item)
109 |             output_item = {
110 |                 options.token_field: np.array(tokens, dtype=np.uint32),
111 |             }
112 |         
113 |         if options.length_field:
114 |             output_item[options.length_field] = len(tokens)
115 |         if options.domain_field:
116 |             output_item[options.domain_field] = domain
117 | 
118 |         yield output_item
119 | 
120 | 
121 | def main():
122 |     parser = ArgumentParser()
123 | 
124 |     parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths")
125 |     parser.add_argument("output", type=Path, help="Output dataset path")
126 | 
127 |     parser.add_arguments(TokenizeOptions, dest="tokenize_options")
128 |     parser.add_arguments(LoadOptions, dest="load_options")
129 |     parser.add_arguments(ProcessOptions, dest="process_options")
130 | 
131 |     args = parser.parse_args()
132 | 
133 |     print("Arguments:", args)
134 |     dataset = load(*args.inputs, options=args.load_options)
135 |     N = len(dataset)
136 |     print(f"Loaded dataset with {N} samples")
137 | 
138 |     process(dataset,
139 |             partial(tokenize_fn, options=args.tokenize_options),
140 |             args.output,
141 |             args.process_options)
142 | 
143 | 
144 | if __name__ == "__main__":
145 |     main()
146 | 


--------------------------------------------------------------------------------
/datatools/scripts/tokenizers/llama2_tokenizer.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CodeCreator/datatools/81b94a71303b4868e09dfa2da984d81063e28c5c/datatools/scripts/tokenizers/llama2_tokenizer.model


--------------------------------------------------------------------------------
/datatools/scripts/tokenizers/llama2_tokenizer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 3 | 
 4 | import os
 5 | from logging import getLogger
 6 | from typing import List
 7 | 
 8 | from sentencepiece import SentencePieceProcessor
 9 | 
10 | 
11 | logger = getLogger()
12 | 
13 | 
14 | class Tokenizer:
15 |     """tokenizing and encoding/decoding text using SentencePiece."""
16 |     def __init__(self, model_path: str):
17 |         """
18 |         Initializes the Tokenizer with a SentencePiece model.
19 | 
20 |         Args:
21 |             model_path (str): The path to the SentencePiece model file.
22 |         """
23 |         # reload tokenizer
24 |         assert os.path.isfile(model_path), model_path
25 |         self.sp_model = SentencePieceProcessor(model_file=model_path)
26 |         logger.info(f"Reloaded SentencePiece model from {model_path}")
27 | 
28 |         # BOS / EOS token IDs
29 |         self.n_words: int = self.sp_model.vocab_size()
30 |         self.bos_id: int = self.sp_model.bos_id()
31 |         self.eos_id: int = self.sp_model.eos_id()
32 |         self.pad_id: int = self.sp_model.pad_id()
33 |         logger.info(
34 |             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
35 |         )
36 |         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
37 | 
38 |     def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
39 |         """
40 |         Encodes a string into a list of token IDs.
41 | 
42 |         Args:
43 |             s (str): The input string to be encoded.
44 |             bos (bool): Whether to prepend the beginning-of-sequence token.
45 |             eos (bool): Whether to append the end-of-sequence token.
46 | 
47 |         Returns:
48 |             List[int]: A list of token IDs.
49 |         """
50 |         assert type(s) is str
51 |         t = self.sp_model.encode(s)
52 |         if bos:
53 |             t = [self.bos_id] + t
54 |         if eos:
55 |             t = t + [self.eos_id]
56 |         return t
57 | 
58 |     def decode(self, t: List[int]) -> str:
59 |         """
60 |         Decodes a list of token IDs into a string.
61 | 
62 |         Args:
63 |             t (List[int]): The list of token IDs to be decoded.
64 | 
65 |         Returns:
66 |             str: The decoded string.
67 |         """
68 |         return self.sp_model.decode(t)


--------------------------------------------------------------------------------
/datatools/scripts/tokenizers/llama3_tokenizer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
  3 | 
  4 | import os
  5 | from logging import getLogger
  6 | from pathlib import Path
  7 | from typing import (
  8 |     AbstractSet,
  9 |     cast,
 10 |     Collection,
 11 |     Dict,
 12 |     Iterator,
 13 |     List,
 14 |     Literal,
 15 |     Sequence,
 16 |     TypedDict,
 17 |     Union,
 18 | )
 19 | 
 20 | import tiktoken
 21 | from tiktoken.load import load_tiktoken_bpe
 22 | 
 23 | 
 24 | logger = getLogger(__name__)
 25 | 
 26 | 
 27 | Role = Literal["system", "user", "assistant"]
 28 | 
 29 | 
 30 | class Message(TypedDict):
 31 |     role: Role
 32 |     content: str
 33 | 
 34 | 
 35 | Dialog = Sequence[Message]
 36 | 
 37 | 
 38 | class Tokenizer:
 39 |     """
 40 |     Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
 41 |     """
 42 | 
 43 |     special_tokens: Dict[str, int]
 44 | 
 45 |     num_reserved_special_tokens = 256
 46 | 
 47 |     pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501
 48 | 
 49 |     def __init__(self, model_path: str):
 50 |         """
 51 |         Initializes the Tokenizer with a Tiktoken model.
 52 | 
 53 |         Args:
 54 |             model_path (str): The path to the Tiktoken model file.
 55 |         """
 56 |         assert os.path.isfile(model_path), model_path
 57 | 
 58 |         mergeable_ranks = load_tiktoken_bpe(model_path)
 59 |         num_base_tokens = len(mergeable_ranks)
 60 |         special_tokens = [
 61 |             "<|begin_of_text|>",
 62 |             "<|end_of_text|>",
 63 |             "<|reserved_special_token_0|>",
 64 |             "<|reserved_special_token_1|>",
 65 |             "<|reserved_special_token_2|>",
 66 |             "<|reserved_special_token_3|>",
 67 |             "<|start_header_id|>",
 68 |             "<|end_header_id|>",
 69 |             "<|reserved_special_token_4|>",
 70 |             "<|eot_id|>",  # end of turn
 71 |         ] + [
 72 |             f"<|reserved_special_token_{i}|>"
 73 |             for i in range(5, self.num_reserved_special_tokens - 5)
 74 |         ]
 75 |         self.special_tokens = {
 76 |             token: num_base_tokens + i for i, token in enumerate(special_tokens)
 77 |         }
 78 |         self.model = tiktoken.Encoding(
 79 |             name=Path(model_path).name,
 80 |             pat_str=self.pat_str,
 81 |             mergeable_ranks=mergeable_ranks,
 82 |             special_tokens=self.special_tokens,
 83 |         )
 84 |         logger.info(f"Reloaded tiktoken model from {model_path}")
 85 | 
 86 |         self.n_words: int = self.model.n_vocab
 87 |         # BOS / EOS token IDs
 88 |         self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
 89 |         self.eos_id: int = self.special_tokens["<|end_of_text|>"]
 90 |         self.pad_id: int = -1
 91 |         self.stop_tokens = {
 92 |             self.special_tokens["<|end_of_text|>"],
 93 |             self.special_tokens["<|eot_id|>"],
 94 |         }
 95 |         logger.info(
 96 |             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
 97 |         )
 98 | 
 99 |     def encode(
100 |         self,
101 |         s: str,
102 |         *,
103 |         bos: bool,
104 |         eos: bool,
105 |         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
106 |         disallowed_special: Union[Literal["all"], Collection[str]] = (),
107 |     ) -> List[int]:
108 |         """
109 |         Encodes a string into a list of token IDs.
110 | 
111 |         Args:
112 |             s (str): The input string to be encoded.
113 |             bos (bool): Whether to prepend the beginning-of-sequence token.
114 |             eos (bool): Whether to append the end-of-sequence token.
115 |             allowed_tokens ("all"|set[str]): allowed special tokens in string
116 |             disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string
117 | 
118 |         Returns:
119 |             list[int]: A list of token IDs.
120 | 
121 |         By default, setting disallowed_special=() encodes a string by ignoring
122 |         special tokens. Specifically:
123 |         - Setting `disallowed_special` to () will cause all text corresponding
124 |           to special tokens to be encoded as natural text (insteading of raising
125 |           an error).
126 |         - Setting `allowed_special` to "all" will treat all text corresponding
127 |           to special tokens to be encoded as special tokens.
128 |         """
129 |         assert type(s) is str
130 | 
131 |         # The tiktoken tokenizer can handle <=400k chars without
132 |         # pyo3_runtime.PanicException.
133 |         TIKTOKEN_MAX_ENCODE_CHARS = 400_000
134 | 
135 |         # https://github.com/openai/tiktoken/issues/195
136 |         # Here we iterate over subsequences and split if we exceed the limit
137 |         # of max consecutive non-whitespace or whitespace characters.
138 |         MAX_NO_WHITESPACES_CHARS = 25_000
139 | 
140 |         substrs = (
141 |             substr
142 |             for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
143 |             for substr in self._split_whitespaces_or_nonwhitespaces(
144 |                 s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
145 |             )
146 |         )
147 |         t: List[int] = []
148 |         for substr in substrs:
149 |             t.extend(
150 |                 self.model.encode(
151 |                     substr,
152 |                     allowed_special=allowed_special,
153 |                     disallowed_special=disallowed_special,
154 |                 )
155 |             )
156 |         if bos:
157 |             t.insert(0, self.bos_id)
158 |         if eos:
159 |             t.append(self.eos_id)
160 |         return t
161 | 
162 |     def decode(self, t: Sequence[int]) -> str:
163 |         """
164 |         Decodes a list of token IDs into a string.
165 | 
166 |         Args:
167 |             t (List[int]): The list of token IDs to be decoded.
168 | 
169 |         Returns:
170 |             str: The decoded string.
171 |         """
172 |         # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
173 |         return self.model.decode(cast(List[int], t))
174 | 
175 |     @staticmethod
176 |     def _split_whitespaces_or_nonwhitespaces(
177 |         s: str, max_consecutive_slice_len: int
178 |     ) -> Iterator[str]:
179 |         """
180 |         Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
181 |         consecutive whitespaces or consecutive non-whitespaces.
182 |         """
183 |         current_slice_len = 0
184 |         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
185 |         slice_start = 0
186 | 
187 |         for i in range(len(s)):
188 |             is_now_space = s[i].isspace()
189 | 
190 |             if current_slice_is_space ^ is_now_space:
191 |                 current_slice_len = 1
192 |                 current_slice_is_space = is_now_space
193 |             else:
194 |                 current_slice_len += 1
195 |                 if current_slice_len > max_consecutive_slice_len:
196 |                     yield s[slice_start:i]
197 |                     slice_start = i
198 |                     current_slice_len = 1
199 |         yield s[slice_start:]
200 | 
201 | 
202 | class ChatFormat:
203 |     def __init__(self, tokenizer: Tokenizer):
204 |         self.tokenizer = tokenizer
205 | 
206 |     def encode_header(self, message: Message) -> List[int]:
207 |         tokens = []
208 |         tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
209 |         tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
210 |         tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
211 |         tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
212 |         return tokens
213 | 
214 |     def encode_message(self, message: Message) -> List[int]:
215 |         tokens = self.encode_header(message)
216 |         masks = [0] * len(tokens)
217 |         
218 |         assistant_mask = 1 if message["role"] == "assistant" else 0
219 |         
220 |         message = self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
221 |         tokens.extend(message)
222 |         masks.extend([assistant_mask] * len(message))
223 |         
224 |         tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
225 |         masks.append(assistant_mask)
226 |         
227 |         return tokens, masks
228 | 
229 |     def encode_dialog_prompt(self, dialog: Dialog, return_assistant_masks: bool = False) -> List[int]:
230 |         tokens = []
231 |         masks = []
232 |         
233 |         tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
234 |         masks.append(0)
235 |         
236 |         for message in dialog:
237 |             t, m = self.encode_message(message)
238 |             tokens.extend(t)
239 |             masks.extend(m)
240 |         
241 |         # # Add the start of an assistant message for the model to complete.
242 |         # tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
243 |         
244 |         if return_assistant_masks:
245 |             return tokens, masks
246 |         else:
247 |             return tokens
248 | 


--------------------------------------------------------------------------------
/datatools/scripts/wrangle.py:
--------------------------------------------------------------------------------
  1 | from tqdm import tqdm
  2 | 
  3 | from typing import Dict, Optional, List
  4 | from copy import copy
  5 | 
  6 | import numpy as np
  7 | from pathlib import Path
  8 | 
  9 | from functools import partial
 10 | 
 11 | from simple_parsing import ArgumentParser, field
 12 | 
 13 | from datatools import load, LoadOptions, process, ProcessOptions, load_indices, merge_index_recursively
 14 | from streaming.base.array import Array
 15 | 
 16 | from dataclasses import dataclass
 17 | 
 18 | 
 19 | @dataclass
 20 | class WrangleOptions:
 21 |     """Options for data wrangling"""
 22 | 
 23 |     splits: Optional[List[str]] = field(alias="-s", default=None)
 24 |     proportions: List[float] = field(alias="-p", default_factory=lambda: [1.0])
 25 |     examples: Optional[List[int]] = field(alias="-n", default=None)
 26 | 
 27 |     shuffle: bool = field(alias=["-x", "--randomize"], default=True)
 28 |     seed: int = 42
 29 | 
 30 |     join: List[List[Path]] = field(alias="-J", required=False, default_factory=lambda: [], action="append", type=Path, nargs="+") # Other input dataset paths that will be joined along the index
 31 |     prefix: List[str] = field(alias="-P", required=False, default_factory=lambda: [], action="append", type=str, nargs=None) # Prefix for the fields of the dataset
 32 |     suffix: List[str] = field(alias="-S", required=False, default_factory=lambda: [], action="append", type=str, nargs=None) # Suffix for the fields of the dataset
 33 | 
 34 |     def __post_init__(self):
 35 |         assert len(self.prefix) == 0 or len(self.prefix) == len(self.join), "Number of prefixes must match number of join datasets or be empty"
 36 |         assert len(self.suffix) == 0 or len(self.suffix) == len(self.join), "Number of suffixes must match number of join datasets or be empty"
 37 | 
 38 | 
 39 | def join_fn(data: Array,
 40 |             global_indices: Array,
 41 |             process_id: int,
 42 |             join_datasets: List[Array],
 43 |             prefixes: List[str],
 44 |             suffixes: List[str]):
 45 | 
 46 |     for i in tqdm(range(len(data)), disable=process_id != 0):
 47 |         item = data[i]
 48 |         for j, join_dataset in enumerate(join_datasets):
 49 |             join_item = join_dataset[global_indices[i]]
 50 |             for key, value in join_item.items():
 51 |                 item[prefixes[j] + key + suffixes[j]] = value
 52 |         yield item
 53 | 
 54 | 
 55 | def split_fn(data: Array,
 56 |              global_indices: Array,
 57 |              process_id: int,
 58 |              partitions: Dict[str, int],
 59 |              seed: int):
 60 | 
 61 |     # On each worker: split the entire dataset into train, validation, and test sets
 62 |     partition_id = np.repeat(np.arange(3), list(partitions.values()))
 63 |     partition_names = list(partitions.keys())
 64 |     np.random.seed(seed)
 65 |     np.random.shuffle(partition_id)
 66 | 
 67 |     for i in tqdm(range(len(data)), disable=process_id!=0):
 68 |         yield partition_names[partition_id[global_indices[i]]], data[i]
 69 | 
 70 | 
 71 | def main():
 72 |     parser = ArgumentParser()
 73 | 
 74 |     parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths")
 75 |     parser.add_argument("output", type=Path, help="Output dataset path")
 76 | 
 77 |     parser.add_arguments(WrangleOptions, dest="wrangle_options")
 78 |     parser.add_arguments(LoadOptions, dest="load_options")
 79 |     parser.add_arguments(ProcessOptions, dest="process_options")
 80 | 
 81 |     args = parser.parse_args()
 82 |     options = args.wrangle_options
 83 | 
 84 |     print("Arguments:", args)
 85 |     dataset = load(*args.inputs, options=args.load_options)
 86 |     print(f"Loaded dataset with {len(dataset)} samples")
 87 | 
 88 |     prefixes = options.prefix if options.prefix else [""]*len(options.join)
 89 |     suffixes = options.suffix if options.suffix else [""]*len(options.join)
 90 | 
 91 |     indices = load_indices(args.process_options)
 92 |     if indices is None:
 93 |         indices = np.arange(len(dataset))
 94 |     N = len(indices)
 95 | 
 96 |     join_datasets = [
 97 |         load(*join_paths, options=args.load_options)
 98 |         for join_paths in options.join
 99 |     ]
100 | 
101 |     if options.splits is not None:
102 |         if options.examples is not None:
103 |             assert len(options.examples) == len(options.splits), "Number of examples must match number of splits"
104 |             index_ranges = [0] + np.cumsum(options.examples).tolist()
105 |         else:
106 |             assert len(options.proportions) == len(options.splits)
107 |             proportions = np.array(options.proportions)
108 |             index_ranges = [0] + np.cumsum(np.round(proportions * N).astype(int)).tolist()
109 |         splits = options.splits
110 |     else:
111 |         if options.examples is not None:
112 |             assert len(options.examples) == 1, "Only one number of examples can be specified"
113 |             index_ranges = [0, options.examples[0]]
114 |         else:
115 |             assert len(options.proportions) == 1
116 |             index_ranges = [0, round(options.proportions[0] * N)]
117 |         splits = [""]
118 | 
119 |     if options.shuffle:
120 |         np.random.seed(options.seed)
121 |         indices = indices[np.random.permutation(len(indices))]
122 | 
123 |     for split, (start, end) in zip(splits, zip(index_ranges[:-1], index_ranges[1:])):
124 |         print(f"Processing split {split} with {end - start} examples")
125 | 
126 |         process_options = copy(args.process_options)
127 |         process_options.indices = indices[start:end]
128 |         process_options.index_path = None
129 |         process_options.index_range = None
130 | 
131 |         process(dataset,
132 |                 partial(join_fn,
133 |                         join_datasets=join_datasets,
134 |                         prefixes=prefixes,
135 |                         suffixes=suffixes),
136 |                 args.output / split,
137 |                 process_options)
138 | 
139 | 
140 |     if not process_options.jsonl and not process_options.ndarray:
141 |         merge_index_recursively(args.output)
142 | 
143 | 
144 | if __name__ == "__main__":
145 |     main()
146 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name='datatools-py',
 5 |     version='0.1',
 6 |     packages=['datatools'],
 7 |     install_requires=[
 8 |         'tqdm>=4.66.1',
 9 |         'numpy>=1.26.4',
10 |         'simple_parsing>=0.1.5',
11 |         'mosaicml-streaming>=0.7.5',
12 |         'datasets>=2.18.0',
13 |         'sentencepiece>=0.1.99',
14 |         'zstandard>=0.23.0'
15 |         # 'transformers==4.39.3',
16 |     ],
17 |     author='Alexander Wettig',
18 |     description='Library and scripts for common LM data utilities (tokenizing, splitting, packing, ...)',
19 |     long_description=open('README.md').read(),
20 |     long_description_content_type='text/markdown',
21 |     url='https://github.com/CodeCreator/datatools',
22 |     classifiers=[
23 |         'Programming Language :: Python :: 3',
24 |         'License :: OSI Approved :: MIT License',
25 |         'Operating System :: OS Independent',
26 |     ],
27 |     entry_points={
28 |         'console_scripts': [
29 |             'peek=datatools.scripts.peek:main',
30 |             'merge_index=datatools.scripts.merge_index:main',
31 |             'pack=datatools.scripts.pack:main',
32 |             'wrangle=datatools.scripts.wrangle:main',
33 |             'tokenize=datatools.scripts.tokenize:main',
34 |         ]
35 |     },
36 |     python_requires='>=3.6',
37 | )
38 | 


--------------------------------------------------------------------------------