├── .github └── FUNDING.yml ├── .gitignore ├── .gitmodules ├── README.md └── syncfolk_submodules.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: dsdanielpark 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: #minwoopark 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "langchain"] 2 | path = langchain 3 | url = https://github.com/hwchase17/langchain.git 4 | [submodule "dolly"] 5 | path = dolly 6 | url = https://github.com/databrickslabs/dolly.git 7 | [submodule "openai-cookbook"] 8 | path = openai-cookbook 9 | url = https://github.com/openai/openai-cookbook.git 10 | [submodule "lit-llama"] 11 | path = lit-llama 12 | url = https://github.com/Lightning-AI/lit-llama.git 13 | [submodule "open_llama"] 14 | path = open_llama 15 | url = https://github.com/openlm-research/open_llama.git 16 | [submodule "stanford_alpaca"] 17 | path = stanford_alpaca 18 | url = https://github.com/tatsu-lab/stanford_alpaca.git 19 | [submodule "LoRA"] 20 | path = LoRA 21 | url = https://github.com/microsoft/LoRA.git 22 | [submodule "privateGPT"] 23 | path = privateGPT 24 | url = https://github.com/imartinez/privateGPT.git 25 | [submodule "llama"] 26 | path = llama 27 | url = https://github.com/facebookresearch/llama.git 28 | [submodule "qlora"] 29 | path = qlora 30 | url = https://github.com/artidoro/qlora.git 31 | [submodule "FastChat"] 32 | path = FastChat 33 | url = https://github.com/lm-sys/FastChat.git 34 | [submodule "peft"] 35 | path = peft 36 | url = https://github.com/huggingface/peft.git 37 | [submodule "transformers"] 38 | path = transformers 39 | url = https://github.com/huggingface/transformers.git 40 | [submodule "hf-transllm"] 41 | path = hf-transllm 42 | url = https://github.com/dsdanielpark/hf-transllm.git 43 | [submodule "PaLM"] 44 | path = PaLM 45 | url = https://github.com/conceptofmind/PaLM.git 46 | [submodule "LMFlow"] 47 | path = LMFlow 48 | url = https://github.com/OptimalScale/LMFlow.git 49 | [submodule "Awesome-LLM"] 50 | path = Awesome-LLM 51 | url = 
https://github.com/Hannibal046/Awesome-LLM.git 52 | [submodule "StableLM"] 53 | path = StableLM 54 | url = https://github.com/Stability-AI/StableLM.git 55 | [submodule "gpt-neox"] 56 | path = gpt-neox 57 | url = https://github.com/EleutherAI/gpt-neox.git 58 | [submodule "metaseq"] 59 | path = metaseq 60 | url = https://github.com/facebookresearch/metaseq.git 61 | [submodule "fairseq"] 62 | path = fairseq 63 | url = https://github.com/facebookresearch/fairseq.git 64 | [submodule "alpaca-lora"] 65 | path = alpaca-lora 66 | url = https://github.com/tloen/alpaca-lora.git 67 | [submodule "trl"] 68 | path = trl 69 | url = https://github.com/lvwerra/trl.git 70 | [submodule "vllm"] 71 | path = vllm 72 | url = https://github.com/vllm-project/vllm.git 73 | [submodule "EasyLM"] 74 | path = EasyLM 75 | url = https://github.com/young-geng/EasyLM.git 76 | [submodule "gptq"] 77 | path = gptq 78 | url = https://github.com/IST-DASLab/gptq.git 79 | [submodule "axolotl"] 80 | path = axolotl 81 | url = https://github.com/OpenAccess-AI-Collective/axolotl.git 82 | [submodule "flash-attention"] 83 | path = flash-attention 84 | url = https://github.com/Dao-AILab/flash-attention.git 85 | [submodule "LLM-eval-survey"] 86 | path = LLM-eval-survey 87 | url = https://github.com/MLGroupJLU/LLM-eval-survey 88 | [submodule "llama.cpp"] 89 | path = llama.cpp 90 | url = https://github.com/ggerganov/llama.cpp 91 | [submodule "llama-cpp-python"] 92 | path = llama-cpp-python 93 | url = https://github.com/abetlen/llama-cpp-python 94 | [submodule "ggml"] 95 | path = ggml 96 | url = https://github.com/ggerganov/ggml 97 | [submodule "llama2.c"] 98 | path = llama2.c 99 | url = https://github.com/karpathy/llama2.c 100 | [submodule "RedPajama-Data"] 101 | path = RedPajama-Data 102 | url = https://github.com/togethercomputer/RedPajama-Data 103 | [submodule "LLaVA"] 104 | path = LLaVA 105 | url = https://github.com/haotian-liu/LLaVA 106 | [submodule "Qwen"] 107 | path = Qwen 108 | url = https://github.com/QwenLM/Qwen.git 109 | [submodule "mistral-src"] 110 | path = mistral-src 111 | url = https://github.com/mistralai/mistral-src.git 112 | [submodule "tiktoken"] 113 | path = tiktoken 114 | url = https://github.com/openai/tiktoken.git 115 | [submodule "open-llm-datasets"] 116 | path = open-llm-datasets 117 | url = https://github.com/dsdanielpark/open-llm-datasets.git 118 | [submodule "awesome"] 119 | path = awesome 120 | url = https://github.com/dsdanielpark/awesome.git 121 | [submodule "milvus"] 122 | path = milvus 123 | url = https://github.com/milvus-io/milvus.git 124 | [submodule "exllama"] 125 | path = exllama 126 | url = https://github.com/turboderp/exllama.git 127 | [submodule "autotrain-advanced"] 128 | path = autotrain-advanced 129 | url = https://github.com/huggingface/autotrain-advanced 130 | [submodule "alignment-handbook"] 131 | path = alignment-handbook 132 | url = https://github.com/huggingface/alignment-handbook.git 133 | [submodule "openai-python"] 134 | path = openai-python 135 | url = https://github.com/openai/openai-python 136 | [submodule "DeepSpeed-MII"] 137 | path = DeepSpeed-MII 138 | url = https://github.com/microsoft/DeepSpeed-MII.git 139 | [submodule "LLMDataHub"] 140 | path = LLMDataHub 141 | url = https://github.com/Zjh-819/LLMDataHub 142 | [submodule "attention_sinks"] 143 | path = attention_sinks 144 | url = https://github.com/tomaarsen/attention_sinks.git 145 | [submodule "Qwen-VL"] 146 | path = Qwen-VL 147 | url = https://github.com/QwenLM/Qwen-VL.git 148 | [submodule 
"Awesome-Multimodal-Large-Language-Models"] 149 | path = Awesome-Multimodal-Large-Language-Models 150 | url = https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.git 151 | [submodule "Chinese-LLaMA-Alpac"] 152 | path = Chinese-LLaMA-Alpac 153 | url = https://github.com/ymcui/Chinese-LLaMA-Alpaca.git 154 | [submodule "Video-LLaVA"] 155 | path = Video-LLaVA 156 | url = https://github.com/PKU-YuanGroup/Video-LLaVA.git 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![](https://img.shields.io/badge/Language-English-lightgrey)](https://github.com/dsdanielpark/all-about-llm) 4 | 5 | 6 | # All About LLM 7 | Curated the comments as a sub-module to see how active the activity is while syncing forks. Therefore, this repository serves the purpose of curating comments not only for some experiments but mostly for self-checking, where I can see on my own where and when commits and pull requests frequently occur. To allow for viewing a list of all submodules, I intentionally do not use folders for organizing the repository. Additionally, you can view the complete list in the [git submodule file.](https://github.com/dsdanielpark/all-about-llm/blob/main/.gitmodules) 8 | 9 | This repository contains only some of the models required for _personal_ research, so please refer to other repositories for detailed information and updates. 10 | 11 |
12 | 
13 | - [All About LLM](#all-about-llm)
14 | - [Quick start](#quick-start)
15 | - [Leaderboards](#leaderboards)
16 | - [Open LLM](#open-llm)
17 | - [LLM Model Evaluation](#llm-model-evaluation)
18 | - [Datasets](#datasets)
19 | 
20 | 
21 | <br>
22 | 
23 | 
24 | 
25 | ## Quick start
26 | ```
27 | $ git clone https://github.com/dsdanielpark/all-about-llm.git
28 | $ cd all-about-llm
29 | $ git submodule update --init --recursive
30 | $ python syncfolk_submodules.py
31 | ```
32 | The quick start clones every submodule at full depth; a lighter, shallow-clone variant is sketched right after the leaderboard table below.
33 | ## Leaderboards
34 | 
35 | | Leaderboard Name | Description |
36 | | --- | --- |
37 | | [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/) | Provides evaluation metrics for LLMs. |
38 | | [Chatbot Arena (LMSYS Org)](https://chat.lmsys.org/) | Offers resources and a leaderboard for LLM performance. |
39 | | [Open LLM Leaderboard (Hugging Face)](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) | Features a leaderboard for open LLMs. |
40 | | [The Big Benchmarks Collection](https://huggingface.co/collections/open-llm-leaderboard/the-big-benchmarks-collection-64faca6335a7fc7d4ffe974a) | Gathers benchmark spaces on the Hub (beyond the Open LLM Leaderboard). |
41 | | [MTEB Leaderboard](#) | Massive Text Embedding Benchmark (MTEB) leaderboard. |
42 | | [Chatbot Arena Leaderboard](#) | Based on Chatbot Arena, MT-Bench, and MMLU (5-shot). |
43 | | [LLM-Perf Leaderboard](#) | Benchmarks LLM performance (latency, throughput & memory) across different hardware setups and optimizations. |
44 | | [Big Code Models Leaderboard](#) | Compares the performance of base multilingual code generation models on benchmarks like HumanEval and MultiPL-E. |
45 | | [Open ASR Leaderboard](#) | Ranks and evaluates speech recognition models, reporting average WER and RTF. |
46 | | [MT Bench](#) | MT-Bench browser associated with Chatbot Arena. |
47 | | [Toolbench Leaderboard](#) | - |
48 | | [OpenCompass LLM Leaderboard](#) | - |
49 | | [OpenCompass MMBench Leaderboard](#) | - |
50 | | [Open Ko-LLM Leaderboard](#) | - |
51 | 
52 | 
53 | <br>
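As promised above: the quick start fetches every submodule with full history, which for dozens of active repositories takes considerable time and disk. A minimal sketch of a lighter alternative, assuming a reasonably recent Git (the `--shallow-submodules` and `--jobs` flags); this is an illustrative helper, not a file in this repo:

```python
import subprocess

# Hypothetical helper: clone the superproject with every submodule at
# depth 1, fetching up to 8 submodules in parallel.
REPO = "https://github.com/dsdanielpark/all-about-llm.git"

subprocess.run(
    [
        "git", "clone",
        "--recurse-submodules",   # initialize and check out every submodule
        "--shallow-submodules",   # clone each submodule with --depth 1
        "--jobs", "8",            # fetch submodules in parallel
        REPO,
    ],
    check=True,
)
```

A shallow checkout is enough for browsing the code; running `git fetch --unshallow` inside any individual submodule restores its full history when needed.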
54 | 
55 | ## Open LLM
56 | 
57 | | LLM | Initial Release | Developer | License |
58 | | --- | --- | --- | --- |
59 | | [GPT-J](#) | 2021-06-09 | EleutherAI | Apache 2.0 |
60 | | [GPT-Neo](#) | 2021-03-21 | EleutherAI, Together | Apache 2.0 |
61 | | [FLAN-T5](#) | 2022-12-06 | Google | Apache 2.0 |
62 | | [BLOOM](#) | 2022-07-06 | Hugging Face | Open RAIL-M v1 |
63 | | [OPT](#) | 2022-05-03 | Meta | NA |
64 | | [Pythia](#) | 2023-02-13 | EleutherAI, Together | Apache 2.0 |
65 | | [LLaMA](#) | 2023-02-24 | Meta | Noncommercial |
66 | | [FLAN-UL2](#) | 2023-03-03 | Google | Apache 2.0 |
67 | | [Alpaca](#) | 2023-03-13 | Stanford | Noncommercial |
68 | | [Cerebras-GPT](#) | 2023-03-28 | Cerebras | Apache 2.0 |
69 | | [Dolly](#) | 2023-03-24 | Databricks | MIT |
70 | | [Vicuna](#) | 2023-03-30 | UC Berkeley, CMU, Stanford, MBZUAI, UCSD | Noncommercial |
71 | | [GPT4All](#) | 2023-03-26 | Nomic AI | Varies |
72 | | [Koala](#) | 2023-04-03 | BAIR | Noncommercial |
73 | | [OpenAssistant](#) | 2023-04-15 | LAION | Varies |
74 | | [StableLM](#) | 2023-04-19 | Stability AI | CC BY-SA 4.0 |
75 | | [OpenLLaMA](#) | 2023-04-28 | OpenLM Research | Apache 2.0 |
76 | | [FastChat](#) | 2023-04-28 | LMSYS | Apache 2.0 |
77 | | [StableVicuna](#) | 2023-04-28 | Stability AI | Noncommercial |
78 | | [BLOOMChat](#) | 2023-05-19 | SambaNova | Apache 2.0 |
79 | | [MPT](https://www.mosaicml.com/blog/mpt-7b) | 2023-05-05 | MosaicML | Apache 2.0 |
80 | | [RedPajama](https://github.com/togethercomputer/RedPajama-Data) | 2023-05-05 | Together | Apache 2.0 |
81 | | [Falcon](https://falconllm.tii.ae/) | 2023-05-23 | TII | Apache 2.0 |
82 | | [Guanaco](https://guanaco-model.github.io/) | 2023-05-23 | UW NLP | Noncommercial |
83 | | [WizardLM](https://huggingface.co/WizardLM/WizardLM-70B-V1.0) | 2023-05-26 | WizardLM | Noncommercial |
84 | | [Orca](https://huggingface.co/Open-Orca/OpenOrca-Preview1-13B) | 2023-06-05 | Microsoft | Noncommercial |
85 | | [Llama 2](https://ai.meta.com/llama/) | 2023-07-18 | Meta | Custom (Commercial OK) |
86 | | [Platypus](https://arxiv.org/abs/2308.07317) | 2023-08-14 | - | Noncommercial |
87 | | [Qwen](https://github.com/QwenLM/Qwen) | 2023-08-28 | Alibaba Cloud | Commercial |
88 | | [Mistral](https://mistral.ai) | 2023-10-10 | Mistral AI | Apache 2.0 |
89 | | [Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) | 2023-10-25 | Hugging Face H4 | MIT |
90 | 
91 | 
92 | <br>
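Many of the models above publish weights on the Hugging Face Hub and load through the `transformers` library that is tracked as a submodule here. A minimal sketch; the checkpoint and generation settings are purely illustrative (Pythia is used only because it is small enough to run on CPU):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; any causal-LM entry from the table loads the same way.
model_id = "EleutherAI/pythia-1.4b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = "Open LLMs released in 2023 include"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```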
93 | 
94 | ## LLM Model Evaluation
95 | - [Harness Task Table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)
96 | - [Harness Task](https://github.com/EleutherAI/lm-evaluation-harness/tree/master/lm_eval/tasks)
97 | 
98 | | No. | Task | Description | Year | Few-shot Examples | Random Baseline Accuracy |
99 | | --- | --- | --- | --- | --- | --- |
100 | | 1 | [Jeopardy](https://github.com/aigoopy/llm-jeopardy) | Consists of 2,117 Jeopardy questions from the topics of Literature, American History, World History, Word Origins, and Science, where the model is expected to provide correct answers. | 2022 | 10 | 0% |
101 | | 2 | [MMLU](https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu) | Comprises 14,042 multiple-choice questions across 57 categories, with academic-standard test-style questions covering subjects like law, mathematics, ethics, and more. The model must choose among options A, B, C, or D. | 2020 | 10 | 25% |
102 | | 3 | [BIG-bench: wikidata](https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/qa_wikidata/README.md) | Consists of 20,321 questions regarding factual information derived from Wikipedia. The model is expected to complete sentences like "Barack Obama's nationality is..." | 2022 | 10 | ~0% |
103 | | 4 | [ARC easy](https://leaderboard.allenai.org/arc_easy/submissions/get-started) | Comprises 2,376 simple multiple-choice science questions extracted from 3rd to 9th-grade science exams, requiring the model to use basic scientific world knowledge. | 2018 | 10 | 25% |
104 | | 5 | [ARC challenge](https://paperswithcode.com/dataset/arc) | Contains 1,172 challenging multiple-choice science questions extracted from 3rd to 9th-grade science exams, involving scientific world knowledge and some procedural reasoning. | 2018 | 10 | 25% |
105 | | 6 | [BIG-bench misconceptions](https://paperswithcode.com/sota/misconceptions-on-big-bench) | Comprises 219 true/false questions about common misconceptions across various topics, and the model is expected to provide correct answers. | 2022 | 10 | 50% |
106 | | 7 | [BIG-bench: Strategy QA](https://github.com/google/BIG-bench) | Consists of 2,289 yes/no questions related to various common-sense topics, and the model is expected to select the correct answers. | 2022 | 10 | - |
107 | | 8 | [BIG-bench: Strange Stories](https://github.com/google/BIG-bench) | Comprises 174 short stories followed by 2-choice multiple-choice questions regarding characters, their emotions, and common-sense inferences about specific actions. | 2022 | 10 | 50% |
108 | | 9 | [BIG-bench: Novel Concepts](https://github.com/google/BIG-bench) | Contains 32 problems for finding common concepts, and the model is expected to choose the common concept among three given words. | 2022 | 10 | 25% |
109 | | 10 | [COPA](https://paperswithcode.com/sota/question-answering-on-copa) | Involves cause/effect multiple-choice questions where the model receives premises and must select the correct cause/effect between two options. | 2011 | 0 | 50% |
110 | | 11 | [PIQA](https://paperswithcode.com/paper/piqa-reasoning-about-physical-commonsense-in) | Comprises 1,838 2-choice multiple-choice questions about common-sense physics intuition, and the model is expected to select the correct answer. | 2019 | 10 | 50% |
111 | | 12 | [OpenBook QA](https://allenai.org/data/open-book-qa) | Consists of 500 multiple-choice questions about basic physics and scientific intuition for general objects and entities, and the model is expected to select the correct answers. | 2018 | 0 | 25% |
112 | | 13 | [LAMBADA](https://paperswithcode.com/sota/language-modelling-on-lambada) | Contains 5,153 text passages from books where the model reads the first N-1 words of each passage and predicts the last token. | 2016 | 0 | 0% |
113 | | 14 | [HellaSwag](https://paperswithcode.com/dataset/hellaswag) | Consists of 10,042 multiple-choice scenario-based questions where the model must choose the most plausible conclusion among four options. | 2019 | 10 | 25% |
114 | | 15 | [Winograd Schema Challenge](https://paperswithcode.com/dataset/wsc) | Contains 273 scenarios where the model must correctly resolve semantic coreferences in sentences. | 2012 | 0 | 50% |
115 | | 16 | [Winogrande](https://paperswithcode.com/paper/winogrande-an-adversarial-winograd-schema) | Comprises 1,267 scenarios with two possible starting sentences and a single ending; the model must select the semantically correct start. | 2019 | 0 | 50% |
116 | | 17 | [BIG bench language identification](https://github.com/google/BIG-bench) | Contains 10,000 multiple-choice questions where the model must recognize sentences written in languages other than English and identify the corresponding language. | 2022 | 10 | 25% |
117 | | 18 | [BIG bench conceptual combinations](https://github.com/google/BIG-bench) | Comprises 103 questions where the model answers multiple-choice questions about the meaning of defined neologisms and sentences using these neologisms. | 2022 | 10 | 25% |
118 | | 19 | [BIG bench conlang translation](https://github.com/google/BIG-bench) | Contains 164 problems where the model provides translations of simple sentences between English and a constructed language. | 2022 | 0 | 0% |
119 | | 20 | [BIG-bench elementary math QA](https://github.com/google/BIG-bench) | Consists of 38,160 multiple-choice arithmetic word problems, and the model is expected to select the correct answer. | 2022 | 10 | 25% |
120 | | 21 | [BIG-bench dyck languages](https://github.com/google/BIG-bench) | Involves 1,000 problems where the model must output the correct tokens required to complete a balanced expression of parentheses and curly braces. | 2022 | 10 | 0% |
121 | | 22 | [BIG-bench algorithms](https://github.com/google/BIG-bench) | Contains 1,320 problems where the model must determine the length of the longest common subsequence of two strings or check the balance of expressions consisting of parentheses and curly braces. | 2022 | 10 | 0% |
122 | | 23 | [BIG-bench logical deduction](https://github.com/google/BIG-bench) | Comprises 1,500 multiple-choice questions requiring the model to select the logically consistent unique proposition among multiple logical constraints describing the relative order of objects. | 2022 | 10 | 25% |
123 | | 24 | [BIG-bench operators](https://github.com/google/BIG-bench) | Contains 210 problems where the model must calculate the result of expressions using mathematical operators, testing the model's ability to apply mathematical concepts. | 2022 | 10 | 0% |
124 | | 25 | [BIG-bench repeat copy logic](https://github.com/google/BIG-bench) | Comprises 32 tasks where the model must repeatedly copy a series of words in a specific order and produce the correct output. | 2022 | 10 | 0% |
125 | | 26 | [Simple arithmetic with spaces](https://github.com/google/BIG-bench) | Contains 1,000 arithmetic problems with three-digit numbers and up to three operations, where the model must calculate the correct result using the right order of operations. | 2023 | 10 | 0% |
126 | | 27 | [Simple arithmetic without spaces](https://github.com/google/BIG-bench) | Comprises 1,000 arithmetic problems with three-digit numbers and up to three operations, where the model must calculate the correct result of expressions with no spaces between numbers and operators. | 2023 | 10 | 0% |
127 | | 28 | [Math QA](https://github.com/google/BIG-bench) | Contains 2,983 multiple-choice math word problems, requiring basic inference, language comprehension, and arithmetic/algebra skills. | 2021 | 10 | 25% |
128 | | 29 | [LogiQA](https://github.com/google/BIG-bench) | Comprises 651 multiple-choice logic word problems based on mathematical and symbolic problems, where the model must make logical conclusions. | 2020 | 10 | 25% |
129 | | 30 | [BIG-bench: Understanding fables](https://github.com/google/BIG-bench) | Consists of 189 short stories followed by 4-choice multiple-choice questions where the model must select the correct moral for the story. | 2022 | 10 | 25% |
130 | | 31 | [Pubmed QA Labeled](https://pubmedqa.github.io/) | Comprises 1,000 hand-labeled medical documents and related questions, where the model must respond with yes/no/maybe. | 2019 | 10 | ~0% |
131 | | 32 | [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) | Consists of 10,570 short documents followed by related questions on various topics, and the model is expected to output the exact correct answer. | 2016 | 10 | ~0% |
132 | | 33 | [BoolQ](https://paperswithcode.com/paper/boolq-exploring-the-surprising-difficulty-of) | Contains 3,270 short passages on a diverse range of subjects followed by yes/no questions in multiple-choice format. | 2019 | 10 | ~50% |
133 | | 34 | [HumanEval code generation](https://paperswithcode.com/sota/code-generation-on-humaneval) | Comprises 164 Python programming challenges where the model is presented with the method signature and docstring comment for a Python program and is expected to complete the program. The resulting code's functional correctness is tested on a number of input/output pairs. | 2021 | 0 | 0% |
134 | | 35 | [AI2 Reasoning Challenge (25-shot)](https://allenai.org/data/arc) | Consists of grade-school science questions. | / | 25 | / |
135 | | 36 | [TruthfulQA (0-shot)](https://github.com/sylinrl/TruthfulQA) | A test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minimally 6-shot task, as it is systematically prepended with 6 [examples](https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/finetune_truth.jsonl), even when launched using 0 for the number of few-shot examples. | / | 0 | / |
136 | | 37 | [AGIEval](https://github.com/ruixiangcui/AGIEval) | AGIEval is a new benchmark designed to assess foundation models on human-centric standardized exams, such as college entrance exams and law school admission tests. | / | / | / |
137 | 
138 | 
139 | <br>
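Most rows in this table can be reproduced locally with EleutherAI's lm-evaluation-harness linked at the top of this section. A minimal sketch of its Python entry point — hedged, since the harness API has changed across versions; this assumes the older master-branch interface, and the model type, checkpoint, and task names below are illustrative:

```python
# pip install lm-eval   (github.com/EleutherAI/lm-evaluation-harness)
from lm_eval import evaluator

# Illustrative run: score a small HF model on a few of the tasks above.
results = evaluator.simple_evaluate(
    model="hf-causal",                             # Hugging Face causal-LM backend
    model_args="pretrained=EleutherAI/pythia-1.4b",
    tasks=["hellaswag", "arc_easy", "boolq"],
    num_fewshot=10,
)
print(results["results"])  # per-task metrics
```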
140 | 
141 | ## Datasets
142 | - Source: [LLMDataHub](https://github.com/Zjh-819/LLMDataHub)
143 | - Curated by [Junhao Zhao](mailto:zhaol9555@gmail.com)
144 | 
145 | | Dataset name | Used by | Type | Language | Size | Description |
146 | | --- | --- | --- | --- | --- | --- |
147 | | [function_<br>calling_<br>extended](https://huggingface.co/datasets/Trelis/function_calling_extended) | / | Pairs | English<br>code | / | A high-quality, human-created dataset for enhancing LMs' API-calling ability. |
148 | | [AmericanStories](https://huggingface.co/datasets/dell-research-harvard/AmericanStories) | / | Pre-trained | English | / | A vast corpus scanned from the US Library of Congress. |
149 | | [dolma](https://huggingface.co/datasets/allenai/dolma) | OLMo | Pre-trained | / | 3T tokens | A large, diverse open-source corpus for LM pretraining. |
150 | | [Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) | Platypus2 | Pairs | English | 25K | A very high quality dataset for improving LMs' STEM reasoning ability. |
151 | | [Puffin](https://huggingface.co/datasets/LDJnr/Puffin) | Redmond-Puffin<br>Series | Dialog | English | ~3k entries | Conversations between real humans and GPT-4, featuring long contexts (over 1k tokens per conversation) and multi-turn dialogs. |
152 | | [tiny series](https://huggingface.co/datasets/nampdn-ai/tiny-codes) | / | Pairs | English | / | A series of short, concise code snippets and texts aimed at improving LMs' reasoning ability. |
153 | | [LongBench](https://huggingface.co/datasets/THUDM/LongBench) | / | Evaluation<br>Only | English<br>Chinese | 17 tasks | A benchmark for evaluating LLMs' long-context understanding capability. |
154 | | [orca-chat](https://huggingface.co/datasets/shahules786/orca-chat) | / | Dialog | English | 198,463 entries | An Orca-style dialog dataset aimed at improving LMs' long-context conversational ability. |
155 | | [DialogStudio](https://github.com/salesforce/DialogStudio) | / | Dialog | Multilingual | / | A collection of diverse datasets aimed at building conversational chatbots. |
156 | | [chatbot_arena<br>_conversations](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | / | RLHF<br>Dialog | Multilingual | 33k conversations | Cleaned conversations with pairwise human preferences collected on Chatbot Arena. |
157 | | [WebGLM-qa](https://huggingface.co/datasets/THUDM/webglm-qa) | WebGLM | Pairs | English | 43.6k entries | The dataset used by WebGLM, a QA system based on an LLM and the Internet. Each entry comprises a question, a response, and a reference; the response is grounded in the reference. |
158 | | [phi-1](https://huggingface.co/datasets/teleprint-me/phi-1) | phi-1 | Dialog | English | / | A dataset generated using the method in [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644). It focuses on math and CS problems. |
159 | | [Linly-<br>pretraining-<br>dataset](https://huggingface.co/datasets/Linly-AI/Chinese-pretraining-dataset) | Linly series | PT | Chinese | 3.4GB | The Chinese pretraining dataset used by the Linly series of models, comprising ClueCorpusSmall, CSL, news-crawl, etc. |
160 | | [FineGrainedRLHF](https://github.com/allenai/FineGrainedRLHF) | / | RLHF | English | ~5K examples | A repo that aims to develop a new framework for collecting human feedback. The data is collected to improve LLMs' factual correctness, topic relevance, and other abilities. |
161 | | [dolphin](https://huggingface.co/datasets/ehartford/dolphin) | / | Pairs | English | 4.5M entries | An attempt to replicate Microsoft's Orca. Based on FLANv2. |
162 | | [openchat_<br>sharegpt4_<br>dataset](https://huggingface.co/datasets/openchat/openchat_sharegpt4_dataset) | OpenChat | Dialog | English | 6k dialogs | A high quality dataset generated by using GPT-4 to complete refined ShareGPT prompts. |
163 | | [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca) | / | Pairs | English | 4.5M completions | A collection of augmented FLAN data, generated using the method in the Orca paper. |
164 | | [COIG-PC](https://huggingface.co/datasets/BAAI/COIG-PC)<br>[COIG-Lite](https://huggingface.co/datasets/BAAI/COIG-PC-Lite) | / | Pairs | Chinese | / | Enhanced version of COIG. |
165 | | [WizardLM_Orca](https://huggingface.co/datasets/psmathur/WizardLM_Orca) | orca_mini series | Pairs | English | 55K entries | Enhanced WizardLM data, generated using Orca's method. |
166 | | arxiv instruct datasets<br>[math](https://huggingface.co/datasets/ArtifactAI/arxiv-math-instruct-50k)<br>[CS](https://huggingface.co/datasets/ArtifactAI/arxiv-beir-cs-ml-generated-queries)<br>[Physics](https://huggingface.co/datasets/ArtifactAI/arxiv-physics-instruct-tune-30k) | / | Pairs | English | 50K/<br>50K/<br>30K entries | Question-answer pairs derived from ArXiv abstracts. Questions are generated using the t5-base model, while the answers are generated using the GPT-3.5-turbo model. |
167 | | [im-feeling-<br>curious](https://huggingface.co/datasets/xiyuez/im-feeling-curious) | / | Pairs | English | 2595 entries | Random questions and corresponding facts generated by Google's **I'm feeling curious** feature. |
168 | | [ign_clean<br>_instruct<br>_dataset_500k](https://huggingface.co/ignmilton) | / | Pairs | / | 509K entries | A large-scale SFT dataset synthetically created from a subset of Ultrachat prompts. ⚠ Lacks a detailed datacard. |
169 | | [WizardLM<br>evolve_instruct V2](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) | WizardLM | Dialog | English | 196k entries | The latest version of the Evol-Instruct dataset. |
170 | | [Dynosaur](https://github.com/WadeYin9712/Dynosaur) | / | Pairs | English | 800K entries | A dataset generated by applying the method in [this paper](https://dynosaur-it.github.io/); its highlight is generating high-quality data at low cost. |
171 | | [SlimPajama](https://huggingface.co/datasets/cerebras/SlimPajama-627B) | / | PT | Primarily<br>English | / | A cleaned and deduplicated version of RedPajama. |
172 | | [LIMA dataset](https://huggingface.co/datasets/GAIR/lima) | LIMA | Pairs | English | 1k entries | The high quality SFT dataset used by [LIMA: Less Is More for Alignment](https://arxiv.org/pdf/2305.11206.pdf). |
173 | | [TigerBot Series](https://github.com/TigerResearch/TigerBot#%E5%BC%80%E6%BA%90%E6%95%B0%E6%8D%AE%E9%9B%86) | TigerBot | PT<br>Pairs | Chinese<br>English | / | Datasets used to train TigerBot, including pretraining data, SFT data, and some domain-specific datasets such as financial research reports. |
174 | | [TSI-v0](https://huggingface.co/datasets/tasksource/tasksource-instruct-v0) | / | Pairs | English | 30k examples<br>per task | Multi-task instruction-tuning data recast from 475 of the tasksource datasets. Similar to the Flan dataset and Natural Instructions. |
175 | | [NMBVC](https://github.com/esbatmop/MNBVC) | / | PT | Chinese | / | A large-scale, continuously updated Chinese pretraining dataset. |
176 | | [StackOverflow<br>post](https://huggingface.co/datasets/mikex86/stackoverflow-posts) | / | PT | / | 35GB | Raw StackOverflow data in markdown format, for pretraining. |
177 | | [LaMini-Instruction](https://huggingface.co/datasets/MBZUAI/LaMini-instruction) | / | Pairs | English | 2.8M entries | A dataset distilled from the Flan collection, P3, and self-instruction. |
178 | | [ultraChat](https://huggingface.co/datasets/stingning/ultrachat) | / | Dialog | English | 1.57M dialogs | A large-scale dialog dataset created using two ChatGPT instances, one of which acts as the user while the other generates responses. |
179 | | [ShareGPT_<br>Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) | Vicuna | Pairs | Multilingual | 53K entries | Cleaned ShareGPT dataset. |
180 | | [pku-saferlhf-dataset](https://github.com/PKU-Alignment/safe-rlhf#pku-saferlhf-dataset) | Beaver | RLHF | English | 10K + 1M | The first dataset of its kind, containing 10k instances with safety preferences. |
181 | | RefGPT-Dataset<br>[nonofficial link](https://github.com/sufengniu/RefGPT) | RefGPT | Pairs, Dialog | Chinese | ~50K entries | A Chinese dialog dataset that aims to improve factual correctness in LLMs (mitigating hallucination). |
182 | | [Luotuo-QA-A<br>CoQA-Chinese](https://huggingface.co/datasets/silk-road/Luotuo-QA-A-CoQA-Chinese) | Luotuo project | Context | Chinese | 127K QA pairs | A dataset built upon translated CoQA, augmented using the OpenAI API. |
183 | | [Wizard-LM-Chinese<br>instruct-evol](https://huggingface.co/datasets/silk-road/Wizard-LM-Chinese-instruct-evol) | Luotuo project | Pairs | Chinese | ~70K entries | A Chinese version of WizardLM 70K. Answers are obtained by feeding translated questions to OpenAI's GPT API. |
184 | | [alpaca_chinese<br>dataset](https://github.com/hikariming/alpaca_chinese_dataset) | / | Pairs | Chinese | / | GPT-4-translated Alpaca data, including some complementary data (such as Chinese poetry, applications, etc.). Inspected by humans. |
185 | | [Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL) | Open Assistant | Pairs | Chinese | 1.5GB | QA data from Zhihu, a well-known Chinese QA platform. |
186 | | [Alpaca-GPT-4_zh-cn](https://huggingface.co/datasets/shibing624/alpaca-zh) | / | Pairs | Chinese | about 50K entries | A Chinese Alpaca-style dataset generated by GPT-4 directly in Chinese, not translated. |
187 | | [hh-rlhf](https://github.com/anthropics/hh-rlhf)<br>[on Huggingface](https://huggingface.co/datasets/Anthropic/hh-rlhf) | Koala | RLHF | English | 161k pairs<br>79.3MB | A pairwise dataset for training reward models in reinforcement learning, aimed at improving language models' harmlessness and helpfulness. |
188 | | [Panther-dataset_v1](https://huggingface.co/datasets/Rardilit/Panther-dataset_v1) | Panther | Pairs | English | 377 entries | A dataset derived from hh-rlhf, rewriting it into input-output pairs. |
189 | | [Baize Dataset](https://github.com/project-baize/baize-chatbot/tree/main/data) | Baize | Dialog | English | 100K dialogs | A dialog dataset generated by GPT-4 using self-talk. Questions and topics are collected from Quora, StackOverflow, and some medical knowledge sources. |
190 | | [h2ogpt-fortune2000<br>personalized](https://huggingface.co/datasets/h2oai/h2ogpt-fortune2000-personalized) | h2ogpt | Pairs | English | 11363 entries | An instruction-finetuning dataset developed by h2oai, covering various topics. |
191 | | [SHP](https://huggingface.co/datasets/stanfordnlp/SHP) | StableVicuna,<br>chat-opt,<br>SteamSHP | RLHF | English | 385K entries | An RLHF dataset that, unlike previously mentioned ones, uses scores plus timestamps to infer users' preferences. Covers 18 domains; collected by Stanford. |
192 | | [ELI5](https://huggingface.co/datasets/eli5#source-data) | MiniLM series | FT,<br>RLHF | English | 270K entries | Questions and answers collected from Reddit, including scores. Might be used for RLHF reward model training. |
193 | | [WizardLM<br>evol_instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k)<br>[V2](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) | WizardLM | Pairs | English | / | An instruction-finetuning dataset derived from Alpaca-52K, using the **evolution** method in [this paper](https://arxiv.org/pdf/2304.12244.pdf). |
194 | | [MOSS SFT data](https://github.com/OpenLMLab/MOSS/tree/main/SFT_data) | MOSS | Pairs,<br>Dialog | Chinese, English | 1.1M entries | A conversational dataset collected and developed by the MOSS team, with usefulness, loyalty, and harmlessness labels for every entry. |
195 | | [ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K) | Koala, Stable LLM | Pairs | Multilingual | 52K | Conversations collected from ShareGPT, with a specific focus on customized creative conversation. |
196 | | [GPT-4all Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) | GPT-4all | Pairs | English,<br>might have<br>a translated version | 400k entries | A combination of some subsets of OIG, P3, and StackOverflow. Covers topics like general QA and customized creative questions. |
197 | | [COIG](https://huggingface.co/datasets/BAAI/COIG) | / | Pairs | Chinese,<br>code | 200K entries | A Chinese-based dataset covering domains such as general-purpose QA, Chinese exams, and code. Its quality is checked by human annotators. |
198 | | [RedPajama-Data-1T](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T) | RedPajama | PT | Primarily English | 1.2T tokens<br>5TB | A fully open pretraining dataset that follows LLaMA's method. |
199 | | [OASST1](https://huggingface.co/datasets/OpenAssistant/oasst1) | OpenAssistant | Pairs,<br>Dialog | Multilingual<br>(English, Spanish, etc.) | 66,497 conversation trees | A large, human-written, human-annotated, high-quality conversation dataset that aims to make LLMs generate more natural responses. |
200 | | [Alpaca-COT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) | Phoenix | Pairs,<br>Dialog,<br>CoT | English | / | A mixture of many datasets, including the classic Alpaca dataset, OIG, Guanaco, and some CoT (Chain-of-Thought) datasets such as FLAN-CoT. May be handy to use. |
201 | | [Bactrian-X](https://huggingface.co/datasets/MBZUAI/Bactrian-X) | / | Pairs | Multilingual<br>(52 languages) | 67K entries per language | A multilingual version of **Alpaca** and **Dolly-15K**. |
202 | | [databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)<br>[zh-cn Ver](https://huggingface.co/datasets/jaja7744/dolly-15k-cn) | Dolly2.0 | Pairs | English | 15K+ entries | A dataset of **human-written** prompts and responses, featuring tasks such as open-domain question-answering, brainstorming, and summarization. |
203 | | [AlpacaDataCleaned](https://github.com/gururise/AlpacaDataCleaned) | Some Alpaca/ LLaMA-like models | Pairs | English | / | A cleaned version of Alpaca, GPT_LLM, and GPTeacher. |
204 | | [GPT-4-LLM Dataset](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | Some Alpaca-like models | Pairs,<br>RLHF | English,<br>Chinese | 52K entries each for English and Chinese<br>9K entries of unnatural-instruction | NOT the dataset used by GPT-4! It is generated by GPT-4 and some other LLMs to provide better Pairs and RLHF data, and includes instruction data as well as RLHF-style comparison data. |
205 | | [GPTeacher](https://github.com/teknium1/GPTeacher) | / | Pairs | English | 20k entries | A dataset of targets generated by GPT-4, including many of the same seed tasks as the Alpaca dataset plus some new tasks such as roleplay. |
206 | | [HC3](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection) | Koala | RLHF | English,<br>Chinese | 24322 English<br>12853 Chinese | A multi-domain, human-vs-ChatGPT comparison dataset. Can be used for reward model training or ChatGPT detector training. |
207 | | [Alpaca data](https://github.com/tatsu-lab/stanford_alpaca#data-release)<br>[Download](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) | Alpaca, ChatGLM-finetune-LoRA, Koala | Dialog,<br>Pairs | English | 52K entries<br>21.4MB | A dataset generated by text-davinci-003 to improve language models' ability to follow human instructions. |
208 | | [OIG](https://huggingface.co/datasets/laion/OIG)<br>[OIG-small-chip2](https://huggingface.co/datasets/0-hero/OIG-small-chip2) | Pythia-Chat-Base-7B, GPT-NeoXT-Chat-Base-20B, Koala | Dialog,<br>Pairs | English,<br>code | 44M entries | A large conversational instruction dataset with medium- and high-quality subsets *(OIG-small-chip2)* for multi-task learning. |
209 | | [ChatAlpaca data](https://github.com/cascip/ChatAlpaca) | / | Dialog,<br>Pairs | English,<br>Chinese version coming soon | 10k entries<br>39.5MB | A dataset that aims to help researchers develop models for instruction-following in multi-turn conversations. |
210 | | [InstructionWild](https://github.com/XueFuzhao/InstructionWild) | ColossalChat | Pairs | English, Chinese | 10K entries | An Alpaca-style dataset, but with seed tasks drawn from ChatGPT screenshots. |
211 | | [Firefly](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) | Firefly(流萤) | Pairs | Chinese | 1.1M entries<br>1.17GB | A Chinese instruction-tuning dataset with 1.1 million human-written examples across 23 tasks, but no conversation. |
212 | | [BELLE](https://github.com/LianjiaTech/BELLE)<br>[0.5M version](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)<br>[1M version](https://huggingface.co/datasets/BelleGroup/train_1M_CN)<br>[2M version](https://huggingface.co/datasets/BelleGroup/train_2M_CN) | BELLE series, Chunhua (春华) | Pairs | Chinese | 2.67B in total | A Chinese instruction dataset similar to *Alpaca data*, constructed by generating answers from seed tasks, but with no conversation. |
213 | | [GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset#guanacodataset) | Guanaco | Dialog,<br>Pairs | English,<br>Chinese,<br>Japanese | 534,530 entries | A multilingual instruction dataset for enhancing language models' capabilities in various linguistic tasks, such as natural language understanding and explicit content recognition. |
214 | | [OpenAI WebGPT](https://huggingface.co/datasets/openai/webgpt_comparisons) | WebGPT's reward model, Koala | RLHF | English | 19,578 pairs | The dataset used in the WebGPT paper; used for training the reward model in RLHF. |
215 | | [OpenAI<br>Summarization<br>Comparison](https://huggingface.co/datasets/openai/summarize_from_feedback) | Koala | RLHF | English | ~93K entries<br>420MB | A dataset of human feedback for training a reward model. The reward model was then used to train a summarization model to align with human preferences. |
216 | | [self-instruct](https://github.com/yizhongw/self-instruct) | / | Pairs | English | 82K entries | A dataset generated using the well-known [self-instruction method](https://arxiv.org/abs/2212.10560). |
217 | | [unnatural-instructions](https://github.com/orhonovich/unnatural-instructions) | / | Pairs | English | 240,670 examples | An early attempt to use a powerful model (text-davinci-002) to generate data. |
218 | | [xP3 (and some variant)](https://huggingface.co/datasets/bigscience/xP3) | BLOOMZ, mT0 | Pairs | Multilingual,<br>code | 79M entries<br>88GB | An instruction dataset for improving language models' generalization ability, similar to *Natural Instructions*. |
219 | | [Flan V2](https://github.com/google-research/FLAN/tree/main/flan/v2) | / | / | English | / | A collection that compiles Flan 2021, P3, Super-Natural Instructions, and dozens more datasets into one, formatted as a mix of zero-shot, few-shot, and chain-of-thought templates. |
220 | | [Natural Instruction](https://instructions.apps.allenai.org/)<br>[GitHub&Download](https://github.com/allenai/natural-instructions) | tk-instruct series | Pairs,<br>evaluation | Multilingual | / | A benchmark with over 1,600 tasks, each with instructions and definitions, for evaluating and improving language models' multi-task generalization under natural language instruction. |
221 | | [CrossWOZ](https://github.com/thu-coai/CrossWOZ) | / | Dialog | English,<br>Chinese | 6K dialogs | The dataset introduced by [this paper](https://arxiv.org/pdf/2002.11893.pdf), mainly about tourism in Beijing; answers are generated automatically by rules. |
222 | | [proof-pile](https://huggingface.co/datasets/hoskinson-center/proof-pile) | proof-GPT | PT | English<br>LaTeX | 13GB | A pretraining dataset similar to The Pile, but with a LaTeX corpus to enhance LMs' ability in proofs. |
223 | | [peS2o](https://huggingface.co/datasets/allenai/peS2o) | / | PT | English | 7.5GB | A high quality academic paper dataset for pretraining. |
224 | | [lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired/tree/main/data/rl) | Stack LLaMA 2 | PT | English | 6.3GB | A paired StackExchange human preference dataset. |
225 | | [falcon-refinedweb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) | tiiuae/falcon series | PT | English | / | A refined subset of CommonCrawl. |
226 | | [CBook-150K](https://github.com/FudanNLPLAB/CBook-150K) | / | PT,<br>building dataset | Chinese | 150K+ books | A raw Chinese books dataset that needs some preprocessing pipeline. |
227 | | [Common Crawl](https://commoncrawl.org/) | LLaMA (after some processing) | building datasets,<br>PT | / | / | The most well-known raw dataset; rarely used directly. One possible preprocessing pipeline is [CCNet](https://github.com/facebookresearch/cc_net). |
228 | | [nlp_Chinese_Corpus](https://github.com/brightmart/nlp_chinese_corpus) | / | PT,<br>TF | Chinese | / | A Chinese pretraining corpus. Includes Wikipedia, Baidu Baike, Baidu QA, some forum QA, and a news corpus. |
229 | | [The Pile (V1)](https://pile.eleuther.ai/) | GLM (partly), LLaMA (partly), GPT-J, GPT-NeoX-20B, Cerebras-GPT 6.7B, OPT-175b | PT | Multilingual,<br>code | 825GB | A diverse open-source language modeling dataset consisting of 22 smaller, high-quality datasets covering many domains and tasks. |
230 | | C4<br>[Huggingface dataset](https://huggingface.co/datasets/c4)<br>[TensorFlow dataset](https://www.tensorflow.org/datasets/catalog/c4) | Google T5 Series, LLaMA | PT | English | 305GB | A colossal, cleaned version of Common Crawl's web crawl corpus. Frequently used. |
231 | | [ROOTS](https://huggingface.co/bigscience-data) | BLOOM | PT | Multilingual,<br>code | 1.6TB | A diverse open-source dataset consisting of sub-datasets like Wikipedia and StackExchange for language modeling. |
232 | | [Pushshift reddit](https://files.pushshift.io/reddit/)<br>[paper](https://arxiv.org/pdf/2001.08435.pdf) | OPT-175b | PT | / | / | Raw Reddit data; one possible processing pipeline is in [this paper](https://aclanthology.org/2021.eacl-main.24.pdf). |
233 | | [Gutenberg project](https://www.gutenberg.org/policy/robot_access.html) | LLaMA | PT | Multilingual | / | A book dataset, mostly novels. Not preprocessed. |
234 | | [CLUECorpus](https://github.com/CLUEbenchmark/CLUE) | / | PT,<br>finetune,<br>evaluation | Chinese | 100GB | A Chinese pretraining corpus sourced from *Common Crawl*. |
235 | | [starcoderdata](https://huggingface.co/datasets/bigcode/starcoderdata) | starcoder<br>series | PT | code | 783GB | A large pretraining dataset for improving LMs' coding ability. |
236 | | [code_<br>instructions<br>_120k_alpaca](https://huggingface.co/datasets/iamtarun/code_instructions_120k_alpaca) | / | Pairs | English/code | 121,959 entries | [code_instruction](https://huggingface.co/datasets/sahil2801/code_instructions_120k) in instruction-finetuning format. |
237 | | [function-<br>invocations-25k](https://huggingface.co/datasets/unaidedelf87777/openapi-function-invocations-25k) | some MPT<br>variants | Pairs | English<br>code | 25K entries | A dataset that teaches AI models how to correctly invoke [APIsGuru](https://github.com/APIs-guru/openapi-directory) functions based on natural language prompts. |
238 | | [TheoremQA](https://huggingface.co/datasets/wenhu/TheoremQA) | / | Pairs | English | 800 | A high quality STEM theorem QA dataset. |
239 | | [FinNLP](https://github.com/AI4Finance-Foundation/FinNLP) | [FinGPT](https://github.com/AI4Finance-Foundation/FinGPT) | Raw data | English,<br>Chinese | / | Open-source raw financial text data, including news, social media, etc. |
240 | | [PRM800K](https://github.com/openai/prm800k) | A variant of<br>GPT-4 | Context | English | 800K entries | A process supervision dataset for mathematical problems. |
241 | | [MeChat data](https://github.com/qiuhuachuan/smile) ⚠️ | MeChat | Dialog | Chinese | 355733 utterances | A Chinese SFT dataset for training a mental healthcare chatbot. |
242 | | [ChatGPT-Jailbreak-Prompts](https://huggingface.co/datasets/rubend18/ChatGPT-Jailbreak-Prompts) ⚠️ | / | / | English | 163KB file size | Prompts for bypassing the safety regulations of ChatGPT. Can be used for probing the harmlessness of LLMs. |
243 | | [awesome chinese<br>legal resources](https://github.com/pengxiao-song/awesome-chinese-legal-resources) | LaWGPT | / | Chinese | / | A collection of Chinese legal data for LLM training. |
244 | | [Long Form](https://github.com/akoksal/LongForm) | / | Pairs | English | 23.7K entries | A dataset aimed at improving the long-text generation ability of LLMs. |
245 | | [symbolic-instruction-tuning](https://huggingface.co/datasets/sail/symbolic-instruction-tuning) | / | Pairs | English,<br>code | 796 | A dataset focused on "symbolic" tasks, such as SQL coding and mathematical computation. |
246 | | [Safety Prompt](https://github.com/thu-coai/Safety-Prompts) | / | Evaluation only | Chinese | 100k entries | Chinese safety prompts for evaluating and improving the safety of LLMs. |
247 | | [Tapir-Cleaned](https://huggingface.co/datasets/MattiaL/tapir-cleaned-116k) | / | Pairs | English | 116k entries | A revised version of the DAISLab dataset of IFTTT rules, thoroughly cleaned, scored, and adjusted for instruction-tuning. |
248 | | [instructional_<br>codesearchnet_python](https://huggingface.co/datasets/Nan-Do/instructional_codesearchnet_python) | / | Pairs | English &<br>Python | 192MB | A template-generated instructional Python dataset, created from an annotated version of the code-search-net dataset for the Open-Assistant project. |
249 | | [finance-alpaca](https://huggingface.co/datasets/gbharti/finance-alpaca) | / | Pairs | English | 1.3K entries | An Alpaca-style dataset focused on financial topics. |
250 | | [OBELICS](https://huggingface.co/datasets/HuggingFaceM4/OBELICS) | idefics<br>series | image-document | English | 141M documents | An open, massive, curated collection of interleaved image-text web documents. |
251 | | [JourneyDB](https://huggingface.co/datasets/JourneyDB/JourneyDB) | / | image-prompt-caption | English | 4M instances | A large-scale dataset comprising QA, caption, and text-prompting tasks, based on Midjourney images. |
252 | | [M3IT](https://huggingface.co/datasets/MMInstruction/M3IT) | Ying-VLM | instruction-image | Multilingual | 2.4M instances | A dataset comprising 40 tasks with 400 human-written instructions. |
253 | | [MIMIC-IT](https://github.com/Luodian/Otter/tree/main/mimic-it) | Otter | instruction-image | Multilingual | 2.2M instances | High quality multi-modal instruction-response pairs based on images and videos. |
254 | | [LLaVA Instruction](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | LLaVA | instruction-image | English | 158k samples | A multimodal dataset generated from the COCO dataset by prompting GPT-4 for instructions. |
255 | | WebText(Reddit links) | GPT-2 | PT | English | / | Data crawled from Reddit and filtered for GPT-2 pretraining. |
256 | | MassiveText | Gopher, Chinchilla | PT | 99% English, 1% other (including code) | / | A large private DeepMind corpus of web text, books, news, and code. |
257 | | WuDao Corpora | GLM | PT | Chinese | 200GB | A large-scale Chinese corpus. Some components were possibly open-sourced originally but are not available now. |
258 | 
259 | 
260 | 
--------------------------------------------------------------------------------
/syncfolk_submodules.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | def sync_submodules():
4 |     # Initialize and update submodules
5 |     subprocess.run(["git", "submodule", "update", "--init", "--recursive"], check=True)
6 |     # Pull the latest commits in every submodule. Not all submodules use
7 |     # `main` as their default branch, so fall back to `master` when needed.
8 |     subprocess.run(
9 |         ["git", "submodule", "foreach", "git pull origin main || git pull origin master"],
10 |         check=True,
11 |     )
12 | 
13 | def main():
14 |     try:
15 |         sync_submodules()
16 |         print("All submodules are synced to the latest version.")
17 |     except subprocess.CalledProcessError as e:
18 |         print(f"An error occurred while syncing submodules: {e}")
19 | 
20 | if __name__ == "__main__":
21 |     main()
--------------------------------------------------------------------------------
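Finally, since the whole point of tracking these projects as submodules is to see where commits land most often, here is a small companion sketch (a hypothetical helper, not a file in this repo) that ranks submodules by recent commit count after running `syncfolk_submodules.py`:

```python
import subprocess

def submodule_paths() -> list:
    """List submodule paths by reading .gitmodules through git itself."""
    out = subprocess.run(
        ["git", "config", "--file", ".gitmodules",
         "--get-regexp", r"submodule\..*\.path"],
        capture_output=True, text=True, check=True,
    )
    # Each output line looks like: "submodule.langchain.path langchain"
    return [line.split(" ", 1)[1] for line in out.stdout.splitlines()]

def recent_commit_counts(since: str = "30 days ago") -> dict:
    """Count commits made in each submodule since the given date."""
    counts = {}
    for path in submodule_paths():
        log = subprocess.run(
            ["git", "-C", path, "log", "--oneline", f"--since={since}"],
            capture_output=True, text=True,  # uninitialized submodules count as 0
        )
        counts[path] = len(log.stdout.splitlines())
    return counts

if __name__ == "__main__":
    for path, n in sorted(recent_commit_counts().items(),
                          key=lambda kv: kv[1], reverse=True):
        print(f"{n:5d}  {path}")
```

This reads the submodule list straight from `.gitmodules` via `git config`, so it stays in sync with the repository layout without any hardcoded paths.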