├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── Readme_streaming.md ├── cogdata ├── __init__.py ├── arguments.py ├── cli.py ├── data_manager.py ├── data_processor.py ├── data_savers │ ├── __init__.py │ ├── base_saver.py │ ├── binary_saver.py │ └── tar_saver.py ├── datasets │ ├── __init__.py │ ├── binary_dataset.py │ ├── parquet_dataset.py │ ├── rar_dataset.py │ ├── tar_dataset.py │ ├── txt_dataset.py │ └── zip_dataset.py ├── process_single_entry.py ├── streaming │ ├── __init__.py │ ├── fake_dataset.py │ ├── image_jsonl_dataset.py │ ├── instantiate.py │ ├── jsonl_dataset.py │ ├── merged_dataset.py │ ├── processfns.py │ ├── readme.md │ ├── reshard_states.py │ └── web_dataset.py ├── tasks │ ├── __init__.py │ ├── base_task.py │ ├── icetk_image_text_task.py │ ├── icetk_text_task.py │ └── image_text_tokenization_task.py ├── utils │ ├── __init__.py │ ├── cogview │ │ ├── __init__.py │ │ ├── api.py │ │ ├── chinese_sentencepiece │ │ │ ├── cog-pretrain.model │ │ │ └── cog-pretrain.vocab │ │ ├── sp_tokenizer.py │ │ ├── unified_tokenizer.py │ │ ├── vqvae_tokenizer.py │ │ └── vqvae_zc.py │ ├── eprogress.py │ ├── helpers.py │ ├── logger.py │ ├── progress_record.py │ └── register.py └── version.py ├── docs ├── .nojekyll ├── Makefile ├── build │ └── html │ │ ├── .buildinfo │ │ ├── Tutorial.html │ │ ├── _sources │ │ ├── Tutorial.rst.txt │ │ ├── cogdata.data_savers.rst.txt │ │ ├── cogdata.datasets.rst.txt │ │ ├── cogdata.rst.txt │ │ ├── cogdata.tasks.rst.txt │ │ ├── cogdata.utils.cogview.rst.txt │ │ ├── cogdata.utils.rst.txt │ │ └── index.rst.txt │ │ ├── _static │ │ ├── basic.css │ │ ├── css │ │ │ ├── badge_only.css │ │ │ ├── fonts │ │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ │ ├── fontawesome-webfont.eot │ │ │ │ ├── fontawesome-webfont.svg │ │ │ │ ├── fontawesome-webfont.ttf │ │ │ │ ├── fontawesome-webfont.woff │ │ │ │ ├── fontawesome-webfont.woff2 │ │ │ │ ├── lato-bold-italic.woff │ │ │ │ ├── lato-bold-italic.woff2 │ │ │ │ ├── lato-bold.woff │ │ │ │ ├── lato-bold.woff2 │ │ │ │ ├── lato-normal-italic.woff │ │ │ │ ├── lato-normal-italic.woff2 │ │ │ │ ├── lato-normal.woff │ │ │ │ └── lato-normal.woff2 │ │ │ └── theme.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── jquery-3.5.1.js │ │ ├── jquery.js │ │ ├── js │ │ │ ├── badge_only.js │ │ │ ├── html5shiv-printshiv.min.js │ │ │ ├── html5shiv.min.js │ │ │ └── theme.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── underscore-1.12.0.js │ │ └── underscore.js │ │ ├── cogdata.data_savers.html │ │ ├── cogdata.datasets.html │ │ ├── cogdata.html │ │ ├── cogdata.tasks.html │ │ ├── cogdata.utils.cogview.html │ │ ├── cogdata.utils.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── py-modindex.html │ │ ├── search.html │ │ └── searchindex.js ├── index.html ├── make.bat └── source │ ├── Tutorial.rst │ ├── cogdata.data_savers.rst │ ├── cogdata.datasets.rst │ ├── cogdata.rst │ ├── cogdata.tasks.rst │ ├── cogdata.utils.cogview.rst │ ├── cogdata.utils.rst │ ├── conf.py │ └── index.rst ├── downloads └── testcase │ ├── test_data_manager │ ├── testcase.json │ └── testcase.tar │ ├── test_image_text_tokenization_task │ ├── testcase.json │ └── testcase.tar │ ├── test_tar_saver │ └── testcase.zip │ ├── test_txt_tokenization_task │ └── wiki1k.txt │ └── test_zip_rar_tar_datasets │ ├── testcase.rar │ ├── testcase.tar │ └── testcase.zip ├── examples ├── cogdata_config.json.example ├── cogdata_info.json.example ├── convert2tar_helpers.py └── convert2tar_task.py ├── requirements.txt ├── scripts └── install_unrarlib.sh ├── setup.py └── tests ├── __init__.py ├── manual_test_logger.py ├── streaming ├── __init__.py ├── img_jsonl_sft_testcase.yaml ├── img_jsonl_sft_testcase_merge.yaml ├── img_txt_testcase.yaml ├── img_txt_testcase_dynamic.yaml ├── merge_testcase.yaml ├── merge_testcase_ordered.yaml ├── merge_testcase_yieldfn.yaml ├── test_customized_yield.py ├── test_dp.py ├── test_dynamic.py ├── test_img.py ├── test_itemize.py ├── test_merge.py ├── test_multiworker_reload.py ├── test_txt.py └── txt_en_testcase.yaml ├── test_binary_saver_dataset.py ├── test_data_manager.py ├── test_image_text_tokenization_task.py ├── test_monitor.py ├── test_parquet_dataset.py ├── test_register.py ├── test_tar_saver.py ├── test_txt_datasets.py ├── test_txt_tokenization_task.py └── test_zip_rar_tar_datasets.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/README.md -------------------------------------------------------------------------------- /Readme_streaming.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/Readme_streaming.md -------------------------------------------------------------------------------- /cogdata/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/__init__.py -------------------------------------------------------------------------------- /cogdata/arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/arguments.py -------------------------------------------------------------------------------- /cogdata/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/cli.py -------------------------------------------------------------------------------- /cogdata/data_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/data_manager.py -------------------------------------------------------------------------------- /cogdata/data_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/data_processor.py -------------------------------------------------------------------------------- /cogdata/data_savers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/data_savers/__init__.py -------------------------------------------------------------------------------- /cogdata/data_savers/base_saver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/data_savers/base_saver.py -------------------------------------------------------------------------------- /cogdata/data_savers/binary_saver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/data_savers/binary_saver.py -------------------------------------------------------------------------------- /cogdata/data_savers/tar_saver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/data_savers/tar_saver.py -------------------------------------------------------------------------------- /cogdata/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/__init__.py -------------------------------------------------------------------------------- /cogdata/datasets/binary_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/binary_dataset.py -------------------------------------------------------------------------------- /cogdata/datasets/parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/parquet_dataset.py -------------------------------------------------------------------------------- /cogdata/datasets/rar_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/rar_dataset.py -------------------------------------------------------------------------------- /cogdata/datasets/tar_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/tar_dataset.py -------------------------------------------------------------------------------- /cogdata/datasets/txt_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/txt_dataset.py -------------------------------------------------------------------------------- /cogdata/datasets/zip_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/datasets/zip_dataset.py -------------------------------------------------------------------------------- /cogdata/process_single_entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/process_single_entry.py -------------------------------------------------------------------------------- /cogdata/streaming/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/__init__.py -------------------------------------------------------------------------------- /cogdata/streaming/fake_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/fake_dataset.py -------------------------------------------------------------------------------- /cogdata/streaming/image_jsonl_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/image_jsonl_dataset.py -------------------------------------------------------------------------------- /cogdata/streaming/instantiate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/instantiate.py -------------------------------------------------------------------------------- /cogdata/streaming/jsonl_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/jsonl_dataset.py -------------------------------------------------------------------------------- /cogdata/streaming/merged_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/merged_dataset.py -------------------------------------------------------------------------------- /cogdata/streaming/processfns.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/processfns.py -------------------------------------------------------------------------------- /cogdata/streaming/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/readme.md -------------------------------------------------------------------------------- /cogdata/streaming/reshard_states.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/reshard_states.py -------------------------------------------------------------------------------- /cogdata/streaming/web_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/streaming/web_dataset.py -------------------------------------------------------------------------------- /cogdata/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/tasks/__init__.py -------------------------------------------------------------------------------- /cogdata/tasks/base_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/tasks/base_task.py -------------------------------------------------------------------------------- /cogdata/tasks/icetk_image_text_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/tasks/icetk_image_text_task.py -------------------------------------------------------------------------------- /cogdata/tasks/icetk_text_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/tasks/icetk_text_task.py -------------------------------------------------------------------------------- /cogdata/tasks/image_text_tokenization_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/tasks/image_text_tokenization_task.py -------------------------------------------------------------------------------- /cogdata/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cogdata/utils/cogview/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/__init__.py -------------------------------------------------------------------------------- /cogdata/utils/cogview/api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/api.py -------------------------------------------------------------------------------- /cogdata/utils/cogview/chinese_sentencepiece/cog-pretrain.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/chinese_sentencepiece/cog-pretrain.model -------------------------------------------------------------------------------- /cogdata/utils/cogview/chinese_sentencepiece/cog-pretrain.vocab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/chinese_sentencepiece/cog-pretrain.vocab -------------------------------------------------------------------------------- /cogdata/utils/cogview/sp_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/sp_tokenizer.py -------------------------------------------------------------------------------- /cogdata/utils/cogview/unified_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/unified_tokenizer.py -------------------------------------------------------------------------------- /cogdata/utils/cogview/vqvae_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/vqvae_tokenizer.py -------------------------------------------------------------------------------- /cogdata/utils/cogview/vqvae_zc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/cogview/vqvae_zc.py -------------------------------------------------------------------------------- /cogdata/utils/eprogress.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/eprogress.py -------------------------------------------------------------------------------- /cogdata/utils/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/helpers.py -------------------------------------------------------------------------------- /cogdata/utils/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/logger.py -------------------------------------------------------------------------------- /cogdata/utils/progress_record.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/progress_record.py -------------------------------------------------------------------------------- /cogdata/utils/register.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/cogdata/utils/register.py -------------------------------------------------------------------------------- /cogdata/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.8' # pragma: no cover 2 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/build/html/.buildinfo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/.buildinfo -------------------------------------------------------------------------------- /docs/build/html/Tutorial.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/Tutorial.html -------------------------------------------------------------------------------- /docs/build/html/_sources/Tutorial.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/Tutorial.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/cogdata.data_savers.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/cogdata.data_savers.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/cogdata.datasets.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/cogdata.datasets.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/cogdata.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/cogdata.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/cogdata.tasks.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/cogdata.tasks.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/cogdata.utils.cogview.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/cogdata.utils.cogview.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/cogdata.utils.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/cogdata.utils.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_sources/index.rst.txt -------------------------------------------------------------------------------- /docs/build/html/_static/basic.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/basic.css -------------------------------------------------------------------------------- /docs/build/html/_static/css/badge_only.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/badge_only.css -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.svg -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/theme.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/css/theme.css -------------------------------------------------------------------------------- /docs/build/html/_static/doctools.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/doctools.js -------------------------------------------------------------------------------- /docs/build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/documentation_options.js -------------------------------------------------------------------------------- /docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /docs/build/html/_static/jquery-3.5.1.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/jquery-3.5.1.js -------------------------------------------------------------------------------- /docs/build/html/_static/jquery.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/jquery.js -------------------------------------------------------------------------------- /docs/build/html/_static/js/badge_only.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/js/badge_only.js -------------------------------------------------------------------------------- /docs/build/html/_static/js/html5shiv-printshiv.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/js/html5shiv-printshiv.min.js -------------------------------------------------------------------------------- /docs/build/html/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/js/html5shiv.min.js -------------------------------------------------------------------------------- /docs/build/html/_static/js/theme.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/js/theme.js -------------------------------------------------------------------------------- /docs/build/html/_static/language_data.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/language_data.js -------------------------------------------------------------------------------- /docs/build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/build/html/_static/pygments.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/pygments.css -------------------------------------------------------------------------------- /docs/build/html/_static/searchtools.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/searchtools.js -------------------------------------------------------------------------------- /docs/build/html/_static/underscore-1.12.0.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/underscore-1.12.0.js -------------------------------------------------------------------------------- /docs/build/html/_static/underscore.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/_static/underscore.js -------------------------------------------------------------------------------- /docs/build/html/cogdata.data_savers.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/cogdata.data_savers.html -------------------------------------------------------------------------------- /docs/build/html/cogdata.datasets.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/cogdata.datasets.html -------------------------------------------------------------------------------- /docs/build/html/cogdata.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/cogdata.html -------------------------------------------------------------------------------- /docs/build/html/cogdata.tasks.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/cogdata.tasks.html -------------------------------------------------------------------------------- /docs/build/html/cogdata.utils.cogview.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/cogdata.utils.cogview.html -------------------------------------------------------------------------------- /docs/build/html/cogdata.utils.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/cogdata.utils.html -------------------------------------------------------------------------------- /docs/build/html/genindex.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/genindex.html -------------------------------------------------------------------------------- /docs/build/html/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/index.html -------------------------------------------------------------------------------- /docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/objects.inv -------------------------------------------------------------------------------- /docs/build/html/py-modindex.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/py-modindex.html -------------------------------------------------------------------------------- /docs/build/html/search.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/search.html -------------------------------------------------------------------------------- /docs/build/html/searchindex.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/build/html/searchindex.js -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/index.html -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/make.bat -------------------------------------------------------------------------------- /docs/source/Tutorial.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/Tutorial.rst -------------------------------------------------------------------------------- /docs/source/cogdata.data_savers.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/cogdata.data_savers.rst -------------------------------------------------------------------------------- /docs/source/cogdata.datasets.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/cogdata.datasets.rst -------------------------------------------------------------------------------- /docs/source/cogdata.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/cogdata.rst -------------------------------------------------------------------------------- /docs/source/cogdata.tasks.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/cogdata.tasks.rst -------------------------------------------------------------------------------- /docs/source/cogdata.utils.cogview.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/cogdata.utils.cogview.rst -------------------------------------------------------------------------------- /docs/source/cogdata.utils.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/cogdata.utils.rst -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/conf.py -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/docs/source/index.rst -------------------------------------------------------------------------------- /downloads/testcase/test_data_manager/testcase.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_data_manager/testcase.json -------------------------------------------------------------------------------- /downloads/testcase/test_data_manager/testcase.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_data_manager/testcase.tar -------------------------------------------------------------------------------- /downloads/testcase/test_image_text_tokenization_task/testcase.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_image_text_tokenization_task/testcase.json -------------------------------------------------------------------------------- /downloads/testcase/test_image_text_tokenization_task/testcase.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_image_text_tokenization_task/testcase.tar -------------------------------------------------------------------------------- /downloads/testcase/test_tar_saver/testcase.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_tar_saver/testcase.zip -------------------------------------------------------------------------------- /downloads/testcase/test_txt_tokenization_task/wiki1k.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_txt_tokenization_task/wiki1k.txt -------------------------------------------------------------------------------- /downloads/testcase/test_zip_rar_tar_datasets/testcase.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_zip_rar_tar_datasets/testcase.rar -------------------------------------------------------------------------------- /downloads/testcase/test_zip_rar_tar_datasets/testcase.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_zip_rar_tar_datasets/testcase.tar -------------------------------------------------------------------------------- /downloads/testcase/test_zip_rar_tar_datasets/testcase.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/downloads/testcase/test_zip_rar_tar_datasets/testcase.zip -------------------------------------------------------------------------------- /examples/cogdata_config.json.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/examples/cogdata_config.json.example -------------------------------------------------------------------------------- /examples/cogdata_info.json.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/examples/cogdata_info.json.example -------------------------------------------------------------------------------- /examples/convert2tar_helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/examples/convert2tar_helpers.py -------------------------------------------------------------------------------- /examples/convert2tar_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/examples/convert2tar_task.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/install_unrarlib.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/scripts/install_unrarlib.sh -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/setup.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/manual_test_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/manual_test_logger.py -------------------------------------------------------------------------------- /tests/streaming/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/streaming/img_jsonl_sft_testcase.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/img_jsonl_sft_testcase.yaml -------------------------------------------------------------------------------- /tests/streaming/img_jsonl_sft_testcase_merge.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/img_jsonl_sft_testcase_merge.yaml -------------------------------------------------------------------------------- /tests/streaming/img_txt_testcase.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/img_txt_testcase.yaml -------------------------------------------------------------------------------- /tests/streaming/img_txt_testcase_dynamic.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/img_txt_testcase_dynamic.yaml -------------------------------------------------------------------------------- /tests/streaming/merge_testcase.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/merge_testcase.yaml -------------------------------------------------------------------------------- /tests/streaming/merge_testcase_ordered.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/merge_testcase_ordered.yaml -------------------------------------------------------------------------------- /tests/streaming/merge_testcase_yieldfn.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/merge_testcase_yieldfn.yaml -------------------------------------------------------------------------------- /tests/streaming/test_customized_yield.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_customized_yield.py -------------------------------------------------------------------------------- /tests/streaming/test_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_dp.py -------------------------------------------------------------------------------- /tests/streaming/test_dynamic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_dynamic.py -------------------------------------------------------------------------------- /tests/streaming/test_img.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_img.py -------------------------------------------------------------------------------- /tests/streaming/test_itemize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_itemize.py -------------------------------------------------------------------------------- /tests/streaming/test_merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_merge.py -------------------------------------------------------------------------------- /tests/streaming/test_multiworker_reload.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_multiworker_reload.py -------------------------------------------------------------------------------- /tests/streaming/test_txt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/test_txt.py -------------------------------------------------------------------------------- /tests/streaming/txt_en_testcase.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/streaming/txt_en_testcase.yaml -------------------------------------------------------------------------------- /tests/test_binary_saver_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_binary_saver_dataset.py -------------------------------------------------------------------------------- /tests/test_data_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_data_manager.py -------------------------------------------------------------------------------- /tests/test_image_text_tokenization_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_image_text_tokenization_task.py -------------------------------------------------------------------------------- /tests/test_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_monitor.py -------------------------------------------------------------------------------- /tests/test_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_parquet_dataset.py -------------------------------------------------------------------------------- /tests/test_register.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_register.py -------------------------------------------------------------------------------- /tests/test_tar_saver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_tar_saver.py -------------------------------------------------------------------------------- /tests/test_txt_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_txt_datasets.py -------------------------------------------------------------------------------- /tests/test_txt_tokenization_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_txt_tokenization_task.py -------------------------------------------------------------------------------- /tests/test_zip_rar_tar_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sleepychord/cogdata/HEAD/tests/test_zip_rar_tar_datasets.py --------------------------------------------------------------------------------