├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── cityhash ├── city.cc ├── city.h └── citycrc.h ├── cpppath └── cpppath.h ├── include ├── compressor.hpp ├── dataloader.hpp ├── mapped_file.hpp ├── tokenized_data_loader.hpp ├── tokenized_data_prep.hpp ├── tokenized_data_verify.hpp ├── tools.hpp ├── uring_file.hpp ├── worker_pool.hpp └── yaml_parser.hpp ├── install.sh ├── provision ├── EXAMPLE_run_all_hosts.txt ├── README.md ├── ansible │ ├── README.md │ ├── ansible.cfg │ ├── install_conda.yml │ ├── install_cuda.yml │ ├── install_ssh_key.sh │ ├── inventory.ini │ ├── setup_repo.yml │ ├── test_install.py │ └── test_install.yml ├── hosts.txt ├── make_shard_script.py ├── shard_dataset.py ├── test_dataloader.py └── test_tiktoken.py ├── python_src └── catid_dataloader │ ├── __init__.py │ └── catid_dataloader_wrapper.py ├── requirements.txt ├── ryml ├── ryml.cpp └── ryml.hpp ├── setup.cfg ├── setup.py ├── src ├── compressor.cpp ├── mapped_file.cpp ├── python_module.cpp ├── tokenized_data_loader.cpp ├── tokenized_data_prep.cpp ├── tokenized_data_verify.cpp ├── tools.cpp ├── uring_file.cpp ├── worker_pool.cpp └── yaml_parser.cpp ├── test_catid_dataloader.py ├── tests ├── test_compressor.cpp ├── test_data_loader.cpp ├── test_data_prep.cpp ├── test_mapped_file.cpp ├── test_tools.cpp ├── test_uring_file.cpp └── test_worker_pool.cpp ├── update_pip.sh └── uring ├── arch ├── aarch64 │ ├── lib.h │ └── syscall.h ├── generic │ ├── lib.h │ └── syscall.h ├── riscv64 │ ├── lib.h │ └── syscall.h ├── syscall-defs.h └── x86 │ ├── lib.h │ └── syscall.h ├── include ├── uring.h └── uring │ ├── barrier.h │ └── io_uring.h ├── int_flags.h ├── lib.h ├── queue.c ├── register.c ├── setup.c ├── setup.h ├── syscall.c ├── syscall.h └── version.c /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | python_test_data 3 | build 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/.gitmodules -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/CMakeLists.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/README.md -------------------------------------------------------------------------------- /cityhash/city.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/cityhash/city.cc -------------------------------------------------------------------------------- /cityhash/city.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/cityhash/city.h -------------------------------------------------------------------------------- /cityhash/citycrc.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/cityhash/citycrc.h -------------------------------------------------------------------------------- /cpppath/cpppath.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/cpppath/cpppath.h -------------------------------------------------------------------------------- /include/compressor.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/compressor.hpp -------------------------------------------------------------------------------- /include/dataloader.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/dataloader.hpp -------------------------------------------------------------------------------- /include/mapped_file.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/mapped_file.hpp -------------------------------------------------------------------------------- /include/tokenized_data_loader.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/tokenized_data_loader.hpp -------------------------------------------------------------------------------- /include/tokenized_data_prep.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/tokenized_data_prep.hpp -------------------------------------------------------------------------------- /include/tokenized_data_verify.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/tokenized_data_verify.hpp -------------------------------------------------------------------------------- /include/tools.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/tools.hpp -------------------------------------------------------------------------------- /include/uring_file.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/uring_file.hpp -------------------------------------------------------------------------------- /include/worker_pool.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/worker_pool.hpp -------------------------------------------------------------------------------- /include/yaml_parser.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/include/yaml_parser.hpp -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/install.sh -------------------------------------------------------------------------------- /provision/EXAMPLE_run_all_hosts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/EXAMPLE_run_all_hosts.txt -------------------------------------------------------------------------------- /provision/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/README.md -------------------------------------------------------------------------------- /provision/ansible/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/README.md -------------------------------------------------------------------------------- /provision/ansible/ansible.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/ansible.cfg -------------------------------------------------------------------------------- /provision/ansible/install_conda.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/install_conda.yml -------------------------------------------------------------------------------- /provision/ansible/install_cuda.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/install_cuda.yml -------------------------------------------------------------------------------- /provision/ansible/install_ssh_key.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/install_ssh_key.sh -------------------------------------------------------------------------------- /provision/ansible/inventory.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/inventory.ini -------------------------------------------------------------------------------- /provision/ansible/setup_repo.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/setup_repo.yml -------------------------------------------------------------------------------- /provision/ansible/test_install.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/test_install.py -------------------------------------------------------------------------------- /provision/ansible/test_install.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/ansible/test_install.yml -------------------------------------------------------------------------------- /provision/hosts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/hosts.txt -------------------------------------------------------------------------------- /provision/make_shard_script.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/make_shard_script.py -------------------------------------------------------------------------------- /provision/shard_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/shard_dataset.py -------------------------------------------------------------------------------- /provision/test_dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/test_dataloader.py -------------------------------------------------------------------------------- /provision/test_tiktoken.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/provision/test_tiktoken.py -------------------------------------------------------------------------------- /python_src/catid_dataloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/python_src/catid_dataloader/__init__.py -------------------------------------------------------------------------------- /python_src/catid_dataloader/catid_dataloader_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/python_src/catid_dataloader/catid_dataloader_wrapper.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/requirements.txt -------------------------------------------------------------------------------- /ryml/ryml.cpp: -------------------------------------------------------------------------------- 1 | #define RYML_SINGLE_HDR_DEFINE_NOW 2 | #include "ryml.hpp" 3 | -------------------------------------------------------------------------------- /ryml/ryml.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/ryml/ryml.hpp -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/setup.cfg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/setup.py -------------------------------------------------------------------------------- /src/compressor.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/compressor.cpp -------------------------------------------------------------------------------- /src/mapped_file.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/mapped_file.cpp -------------------------------------------------------------------------------- /src/python_module.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/python_module.cpp -------------------------------------------------------------------------------- /src/tokenized_data_loader.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/tokenized_data_loader.cpp -------------------------------------------------------------------------------- /src/tokenized_data_prep.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/tokenized_data_prep.cpp -------------------------------------------------------------------------------- /src/tokenized_data_verify.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/tokenized_data_verify.cpp -------------------------------------------------------------------------------- /src/tools.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/tools.cpp -------------------------------------------------------------------------------- /src/uring_file.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/uring_file.cpp -------------------------------------------------------------------------------- /src/worker_pool.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/worker_pool.cpp -------------------------------------------------------------------------------- /src/yaml_parser.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/src/yaml_parser.cpp -------------------------------------------------------------------------------- /test_catid_dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/test_catid_dataloader.py -------------------------------------------------------------------------------- /tests/test_compressor.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_compressor.cpp -------------------------------------------------------------------------------- /tests/test_data_loader.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_data_loader.cpp -------------------------------------------------------------------------------- /tests/test_data_prep.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_data_prep.cpp -------------------------------------------------------------------------------- /tests/test_mapped_file.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_mapped_file.cpp -------------------------------------------------------------------------------- /tests/test_tools.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_tools.cpp -------------------------------------------------------------------------------- /tests/test_uring_file.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_uring_file.cpp -------------------------------------------------------------------------------- /tests/test_worker_pool.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/tests/test_worker_pool.cpp -------------------------------------------------------------------------------- /update_pip.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/update_pip.sh -------------------------------------------------------------------------------- /uring/arch/aarch64/lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/aarch64/lib.h -------------------------------------------------------------------------------- /uring/arch/aarch64/syscall.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/aarch64/syscall.h -------------------------------------------------------------------------------- /uring/arch/generic/lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/generic/lib.h -------------------------------------------------------------------------------- /uring/arch/generic/syscall.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/generic/syscall.h -------------------------------------------------------------------------------- /uring/arch/riscv64/lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/riscv64/lib.h -------------------------------------------------------------------------------- /uring/arch/riscv64/syscall.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/riscv64/syscall.h -------------------------------------------------------------------------------- /uring/arch/syscall-defs.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/syscall-defs.h -------------------------------------------------------------------------------- /uring/arch/x86/lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/x86/lib.h -------------------------------------------------------------------------------- /uring/arch/x86/syscall.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/arch/x86/syscall.h -------------------------------------------------------------------------------- /uring/include/uring.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/include/uring.h -------------------------------------------------------------------------------- /uring/include/uring/barrier.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/include/uring/barrier.h -------------------------------------------------------------------------------- /uring/include/uring/io_uring.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/include/uring/io_uring.h -------------------------------------------------------------------------------- /uring/int_flags.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/int_flags.h -------------------------------------------------------------------------------- /uring/lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/lib.h -------------------------------------------------------------------------------- /uring/queue.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/queue.c -------------------------------------------------------------------------------- /uring/register.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/register.c -------------------------------------------------------------------------------- /uring/setup.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/setup.c -------------------------------------------------------------------------------- /uring/setup.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/setup.h -------------------------------------------------------------------------------- /uring/syscall.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/syscall.c -------------------------------------------------------------------------------- /uring/syscall.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/syscall.h -------------------------------------------------------------------------------- /uring/version.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/dataloader/HEAD/uring/version.c --------------------------------------------------------------------------------