├── .github └── FUNDING.yml ├── .gitignore ├── .gitmodules ├── README.md └── syncfolk_submodules.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: dsdanielpark 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: #minwoopark 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "langchain"] 2 | path = langchain 3 | url = https://github.com/hwchase17/langchain.git 4 | [submodule "dolly"] 5 | path = dolly 6 | url = https://github.com/databrickslabs/dolly.git 7 | [submodule "openai-cookbook"] 8 | path = openai-cookbook 9 | url = https://github.com/openai/openai-cookbook.git 10 | [submodule "lit-llama"] 11 | path = lit-llama 12 | url = https://github.com/Lightning-AI/lit-llama.git 13 | [submodule "open_llama"] 14 | path = open_llama 15 | url = https://github.com/openlm-research/open_llama.git 16 | [submodule "stanford_alpaca"] 17 | path = stanford_alpaca 18 | url = https://github.com/tatsu-lab/stanford_alpaca.git 19 | [submodule "LoRA"] 20 | path = LoRA 21 | url = https://github.com/microsoft/LoRA.git 22 | [submodule "privateGPT"] 23 | path = privateGPT 24 | url = https://github.com/imartinez/privateGPT.git 25 | [submodule "llama"] 26 | path = llama 27 | url = https://github.com/facebookresearch/llama.git 28 | [submodule "qlora"] 29 | path = qlora 30 | url = https://github.com/artidoro/qlora.git 31 | [submodule "FastChat"] 32 | path = FastChat 33 | url = https://github.com/lm-sys/FastChat.git 34 | [submodule "peft"] 35 | path = peft 36 | url = https://github.com/huggingface/peft.git 37 | [submodule "transformers"] 38 | path = transformers 39 | url = https://github.com/huggingface/transformers.git 40 | [submodule "hf-transllm"] 41 | path = hf-transllm 42 | url = https://github.com/dsdanielpark/hf-transllm.git 43 | [submodule "PaLM"] 44 | path = PaLM 45 | url = https://github.com/conceptofmind/PaLM.git 46 | [submodule "LMFlow"] 47 | path = LMFlow 48 | url = https://github.com/OptimalScale/LMFlow.git 49 | [submodule "Awesome-LLM"] 50 | path = Awesome-LLM 51 | url = 
https://github.com/Hannibal046/Awesome-LLM.git 52 | [submodule "StableLM"] 53 | path = StableLM 54 | url = https://github.com/Stability-AI/StableLM.git 55 | [submodule "gpt-neox"] 56 | path = gpt-neox 57 | url = https://github.com/EleutherAI/gpt-neox.git 58 | [submodule "metaseq"] 59 | path = metaseq 60 | url = https://github.com/facebookresearch/metaseq.git 61 | [submodule "fairseq"] 62 | path = fairseq 63 | url = https://github.com/facebookresearch/fairseq.git 64 | [submodule "alpaca-lora"] 65 | path = alpaca-lora 66 | url = https://github.com/tloen/alpaca-lora.git 67 | [submodule "trl"] 68 | path = trl 69 | url = https://github.com/lvwerra/trl.git 70 | [submodule "vllm"] 71 | path = vllm 72 | url = https://github.com/vllm-project/vllm.git 73 | [submodule "EasyLM"] 74 | path = EasyLM 75 | url = https://github.com/young-geng/EasyLM.git 76 | [submodule "gptq"] 77 | path = gptq 78 | url = https://github.com/IST-DASLab/gptq.git 79 | [submodule "axolotl"] 80 | path = axolotl 81 | url = https://github.com/OpenAccess-AI-Collective/axolotl.git 82 | [submodule "flash-attention"] 83 | path = flash-attention 84 | url = https://github.com/Dao-AILab/flash-attention.git 85 | [submodule "LLM-eval-survey"] 86 | path = LLM-eval-survey 87 | url = https://github.com/MLGroupJLU/LLM-eval-survey 88 | [submodule "llama.cpp"] 89 | path = llama.cpp 90 | url = https://github.com/ggerganov/llama.cpp 91 | [submodule "llama-cpp-python"] 92 | path = llama-cpp-python 93 | url = https://github.com/abetlen/llama-cpp-python 94 | [submodule "ggml"] 95 | path = ggml 96 | url = https://github.com/ggerganov/ggml 97 | [submodule "llama2.c"] 98 | path = llama2.c 99 | url = https://github.com/karpathy/llama2.c 100 | [submodule "RedPajama-Data"] 101 | path = RedPajama-Data 102 | url = https://github.com/togethercomputer/RedPajama-Data 103 | [submodule "LLaVA"] 104 | path = LLaVA 105 | url = https://github.com/haotian-liu/LLaVA 106 | [submodule "Qwen"] 107 | path = Qwen 108 | url = https://github.com/QwenLM/Qwen.git 109 | [submodule "mistral-src"] 110 | path = mistral-src 111 | url = https://github.com/mistralai/mistral-src.git 112 | [submodule "tiktoken"] 113 | path = tiktoken 114 | url = https://github.com/openai/tiktoken.git 115 | [submodule "open-llm-datasets"] 116 | path = open-llm-datasets 117 | url = https://github.com/dsdanielpark/open-llm-datasets.git 118 | [submodule "awesome"] 119 | path = awesome 120 | url = https://github.com/dsdanielpark/awesome.git 121 | [submodule "milvus"] 122 | path = milvus 123 | url = https://github.com/milvus-io/milvus.git 124 | [submodule "exllama"] 125 | path = exllama 126 | url = https://github.com/turboderp/exllama.git 127 | [submodule "autotrain-advanced"] 128 | path = autotrain-advanced 129 | url = https://github.com/huggingface/autotrain-advanced 130 | [submodule "alignment-handbook"] 131 | path = alignment-handbook 132 | url = https://github.com/huggingface/alignment-handbook.git 133 | [submodule "openai-python"] 134 | path = openai-python 135 | url = https://github.com/openai/openai-python 136 | [submodule "DeepSpeed-MII"] 137 | path = DeepSpeed-MII 138 | url = https://github.com/microsoft/DeepSpeed-MII.git 139 | [submodule "LLMDataHub"] 140 | path = LLMDataHub 141 | url = https://github.com/Zjh-819/LLMDataHub 142 | [submodule "attention_sinks"] 143 | path = attention_sinks 144 | url = https://github.com/tomaarsen/attention_sinks.git 145 | [submodule "Qwen-VL"] 146 | path = Qwen-VL 147 | url = https://github.com/QwenLM/Qwen-VL.git 148 | [submodule 
"Awesome-Multimodal-Large-Language-Models"] 149 | path = Awesome-Multimodal-Large-Language-Models 150 | url = https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.git 151 | [submodule "Chinese-LLaMA-Alpac"] 152 | path = Chinese-LLaMA-Alpac 153 | url = https://github.com/ymcui/Chinese-LLaMA-Alpaca.git 154 | [submodule "Video-LLaVA"] 155 | path = Video-LLaVA 156 | url = https://github.com/PKU-YuanGroup/Video-LLaVA.git 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![](https://img.shields.io/badge/Language-English-lightgrey)](https://github.com/dsdanielpark/all-about-llm) 4 | 5 | 6 | # All About LLM 7 | Curated the comments as a sub-module to see how active the activity is while syncing forks. Therefore, this repository serves the purpose of curating comments not only for some experiments but mostly for self-checking, where I can see on my own where and when commits and pull requests frequently occur. To allow for viewing a list of all submodules, I intentionally do not use folders for organizing the repository. Additionally, you can view the complete list in the [git submodule file.](https://github.com/dsdanielpark/all-about-llm/blob/main/.gitmodules) 8 | 9 | This repository contains only some of the models required for _personal_ research, so please refer to other repositories for detailed information and updates. 10 | 11 |
12 | 
13 | - [All About LLM](#all-about-llm)
14 | - [Quick start](#quick-start)
15 | - [Leaderboards](#leaderboards)
16 | - [Open LLM](#open-llm)
17 | - [LLM Model Evaluation](#llm-model-evaluation)
18 | - [Datasets](#datasets)
19 | 
20 | 
21 | <br>
22 | 
23 | 
24 | 
25 | ## Quick start
26 | ```
27 | $ git clone https://github.com/dsdanielpark/all-about-llm.git
28 | $ cd all-about-llm
29 | $ git submodule update --init --recursive
30 | $ python syncfolk_submodules.py
31 | ```
32 | The quick start clones every submodule at full depth; a lighter, shallow-clone variant is sketched right after the leaderboard table below.
33 | ## Leaderboards
34 | 
35 | | Leaderboard Name | Description |
36 | | --- | --- |
37 | | [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/) | Provides evaluation metrics for LLMs. |
38 | | [Chatbot Arena (LMSYS Org)](https://chat.lmsys.org/) | Offers resources and a leaderboard for LLM performance. |
39 | | [Open LLM Leaderboard (Hugging Face)](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) | Features a leaderboard for open LLMs. |
40 | | [The Big Benchmarks Collection](https://huggingface.co/collections/open-llm-leaderboard/the-big-benchmarks-collection-64faca6335a7fc7d4ffe974a) | Gathers benchmark spaces on the Hub (beyond the Open LLM Leaderboard). |
41 | | [MTEB Leaderboard](#) | Massive Text Embedding Benchmark (MTEB) leaderboard. |
42 | | [Chatbot Arena Leaderboard](#) | Based on Chatbot Arena, MT-Bench, and MMLU (5-shot). |
43 | | [LLM-Perf Leaderboard](#) | Benchmarks LLM performance (latency, throughput & memory) across different hardware setups and optimizations. |
44 | | [Big Code Models Leaderboard](#) | Compares the performance of base multilingual code generation models on benchmarks like HumanEval and MultiPL-E. |
45 | | [Open ASR Leaderboard](#) | Ranks and evaluates speech recognition models, reporting average WER and RTF. |
46 | | [MT Bench](#) | MT-Bench browser associated with Chatbot Arena. |
47 | | [Toolbench Leaderboard](#) | - |
48 | | [OpenCompass LLM Leaderboard](#) | - |
49 | | [OpenCompass MMBench Leaderboard](#) | - |
50 | | [Open Ko-LLM Leaderboard](#) | - |
51 | 
52 | 
53 | <br>
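As promised above: the quick start fetches every submodule with full history, which for dozens of active repositories takes considerable time and disk. A minimal sketch of a lighter alternative, assuming a reasonably recent Git (the `--shallow-submodules` and `--jobs` flags); this is an illustrative helper, not a file in this repo:

```python
import subprocess

# Hypothetical helper: clone the superproject with every submodule at
# depth 1, fetching up to 8 submodules in parallel.
REPO = "https://github.com/dsdanielpark/all-about-llm.git"

subprocess.run(
    [
        "git", "clone",
        "--recurse-submodules",   # initialize and check out every submodule
        "--shallow-submodules",   # clone each submodule with --depth 1
        "--jobs", "8",            # fetch submodules in parallel
        REPO,
    ],
    check=True,
)
```

A shallow checkout is enough for browsing the code; running `git fetch --unshallow` inside any individual submodule restores its full history when needed.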
54 | 
55 | ## Open LLM
56 | 
57 | | LLM | Initial Release | Developer | License |
58 | | --- | --- | --- | --- |
59 | | [GPT-J](#) | 2021-06-09 | EleutherAI | Apache 2.0 |
60 | | [GPT-Neo](#) | 2021-03-21 | EleutherAI, Together | Apache 2.0 |
61 | | [FLAN-T5](#) | 2022-12-06 | Google | Apache 2.0 |
62 | | [BLOOM](#) | 2022-07-06 | Hugging Face | Open RAIL-M v1 |
63 | | [OPT](#) | 2022-05-03 | Meta | NA |
64 | | [Pythia](#) | 2023-02-13 | EleutherAI, Together | Apache 2.0 |
65 | | [LLaMA](#) | 2023-02-24 | Meta | Noncommercial |
66 | | [FLAN-UL2](#) | 2023-03-03 | Google | Apache 2.0 |
67 | | [Alpaca](#) | 2023-03-13 | Stanford | Noncommercial |
68 | | [Cerebras-GPT](#) | 2023-03-28 | Cerebras | Apache 2.0 |
69 | | [Dolly](#) | 2023-03-24 | Databricks | MIT |
70 | | [Vicuna](#) | 2023-03-30 | UC Berkeley, CMU, Stanford, MBZUAI, UCSD | Noncommercial |
71 | | [GPT4All](#) | 2023-03-26 | Nomic AI | Varies |
72 | | [Koala](#) | 2023-04-03 | BAIR | Noncommercial |
73 | | [OpenAssistant](#) | 2023-04-15 | LAION | Varies |
74 | | [StableLM](#) | 2023-04-19 | Stability AI | CC BY-SA 4.0 |
75 | | [OpenLLaMA](#) | 2023-04-28 | OpenLM Research | Apache 2.0 |
76 | | [FastChat](#) | 2023-04-28 | LMSYS | Apache 2.0 |
77 | | [StableVicuna](#) | 2023-04-28 | Stability AI | Noncommercial |
78 | | [BLOOMChat](#) | 2023-05-19 | SambaNova | Apache 2.0 |
79 | | [MPT](https://www.mosaicml.com/blog/mpt-7b) | 2023-05-05 | MosaicML | Apache 2.0 |
80 | | [RedPajama](https://github.com/togethercomputer/RedPajama-Data) | 2023-05-05 | Together | Apache 2.0 |
81 | | [Falcon](https://falconllm.tii.ae/) | 2023-05-23 | TII | Apache 2.0 |
82 | | [Guanaco](https://guanaco-model.github.io/) | 2023-05-23 | UW NLP | Noncommercial |
83 | | [WizardLM](https://huggingface.co/WizardLM/WizardLM-70B-V1.0) | 2023-05-26 | WizardLM | Noncommercial |
84 | | [Orca](https://huggingface.co/Open-Orca/OpenOrca-Preview1-13B) | 2023-06-05 | Microsoft | Noncommercial |
85 | | [Llama 2](https://ai.meta.com/llama/) | 2023-07-18 | Meta | Custom (Commercial OK) |
86 | | [Platypus](https://arxiv.org/abs/2308.07317) | 2023-08-14 | - | Noncommercial |
87 | | [Qwen](https://github.com/QwenLM/Qwen) | 2023-08-28 | Alibaba Cloud | Commercial |
88 | | [Mistral](https://mistral.ai) | 2023-10-10 | Mistral AI | Apache 2.0 |
89 | | [Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) | 2023-10-25 | Hugging Face H4 | MIT |
90 | 
91 | 
92 | <br>
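Many of the models above publish weights on the Hugging Face Hub and load through the `transformers` library that is tracked as a submodule here. A minimal sketch; the checkpoint and generation settings are purely illustrative (Pythia is used only because it is small enough to run on CPU):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; any causal-LM entry from the table loads the same way.
model_id = "EleutherAI/pythia-1.4b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = "Open LLMs released in 2023 include"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```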
93 | 
94 | ## LLM Model Evaluation
95 | - [Harness Task Table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)
96 | - [Harness Task](https://github.com/EleutherAI/lm-evaluation-harness/tree/master/lm_eval/tasks)
97 | 
98 | | No. | Task | Description | Year | Few-shot Examples | Random Baseline Accuracy |
99 | | --- | --- | --- | --- | --- | --- |
100 | | 1 | [Jeopardy](https://github.com/aigoopy/llm-jeopardy) | Consists of 2,117 Jeopardy questions from the topics of Literature, American History, World History, Word Origins, and Science, where the model is expected to provide correct answers. | 2022 | 10 | 0% |
101 | | 2 | [MMLU](https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu) | Comprises 14,042 multiple-choice questions across 57 categories, with academic-standard test-style questions covering subjects like law, mathematics, ethics, and more. The model must choose among options A, B, C, or D. | 2020 | 10 | 25% |
102 | | 3 | [BIG-bench: wikidata](https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/qa_wikidata/README.md) | Consists of 20,321 questions regarding factual information derived from Wikipedia. The model is expected to complete sentences like "Barack Obama's nationality is..." | 2022 | 10 | ~0% |
103 | | 4 | [ARC easy](https://leaderboard.allenai.org/arc_easy/submissions/get-started) | Comprises 2,376 simple multiple-choice science questions extracted from 3rd to 9th-grade science exams, requiring the model to use basic scientific world knowledge. | 2018 | 10 | 25% |
104 | | 5 | [ARC challenge](https://paperswithcode.com/dataset/arc) | Contains 1,172 challenging multiple-choice science questions extracted from 3rd to 9th-grade science exams, involving scientific world knowledge and some procedural reasoning. | 2018 | 10 | 25% |
105 | | 6 | [BIG-bench misconceptions](https://paperswithcode.com/sota/misconceptions-on-big-bench) | Comprises 219 true/false questions about common misconceptions across various topics, and the model is expected to provide correct answers. | 2022 | 10 | 50% |
106 | | 7 | [BIG-bench: Strategy QA](https://github.com/google/BIG-bench) | Consists of 2,289 yes/no questions related to various common-sense topics, and the model is expected to select the correct answers. | 2022 | 10 | - |
107 | | 8 | [BIG-bench: Strange Stories](https://github.com/google/BIG-bench) | Comprises 174 short stories followed by 2-choice multiple-choice questions regarding characters, their emotions, and common-sense inferences about specific actions. | 2022 | 10 | 50% |
108 | | 9 | [BIG-bench: Novel Concepts](https://github.com/google/BIG-bench) | Contains 32 problems for finding common concepts, and the model is expected to choose the common concept among three given words. | 2022 | 10 | 25% |
109 | | 10 | [COPA](https://paperswithcode.com/sota/question-answering-on-copa) | Involves cause/effect multiple-choice questions where the model receives premises and must select the correct cause/effect between two options. | 2011 | 0 | 50% |
110 | | 11 | [PIQA](https://paperswithcode.com/paper/piqa-reasoning-about-physical-commonsense-in) | Comprises 1,838 2-choice multiple-choice questions about common-sense physics intuition, and the model is expected to select the correct answer. | 2019 | 10 | 50% |
111 | | 12 | [OpenBook QA](https://allenai.org/data/open-book-qa) | Consists of 500 multiple-choice questions about basic physics and scientific intuition for general objects and entities, and the model is expected to select the correct answers. | 2018 | 0 | 25% |
112 | | 13 | [LAMBADA](https://paperswithcode.com/sota/language-modelling-on-lambada) | Contains 5,153 text passages from books where the model reads the first N-1 words of each passage and predicts the last token. | 2016 | 0 | 0% |
113 | | 14 | [HellaSwag](https://paperswithcode.com/dataset/hellaswag) | Consists of 10,042 multiple-choice scenario-based questions where the model must choose the most plausible conclusion among four options. | 2019 | 10 | 25% |
114 | | 15 | [Winograd Schema Challenge](https://paperswithcode.com/dataset/wsc) | Contains 273 scenarios where the model must correctly resolve semantic coreferences in sentences. | 2012 | 0 | 50% |
115 | | 16 | [Winogrande](https://paperswithcode.com/paper/winogrande-an-adversarial-winograd-schema) | Comprises 1,267 scenarios with two possible starting sentences and a single ending; the model must select the semantically correct start. | 2019 | 0 | 50% |
116 | | 17 | [BIG bench language identification](https://github.com/google/BIG-bench) | Contains 10,000 multiple-choice questions where the model must recognize sentences written in languages other than English and identify the corresponding language. | 2022 | 10 | 25% |
117 | | 18 | [BIG bench conceptual combinations](https://github.com/google/BIG-bench) | Comprises 103 questions where the model answers multiple-choice questions about the meaning of defined neologisms and sentences using these neologisms. | 2022 | 10 | 25% |
118 | | 19 | [BIG bench conlang translation](https://github.com/google/BIG-bench) | Contains 164 problems where the model provides translations of simple sentences between English and a constructed language. | 2022 | 0 | 0% |
119 | | 20 | [BIG-bench elementary math QA](https://github.com/google/BIG-bench) | Consists of 38,160 multiple-choice arithmetic word problems, and the model is expected to select the correct answer. | 2022 | 10 | 25% |
120 | | 21 | [BIG-bench dyck languages](https://github.com/google/BIG-bench) | Involves 1,000 problems where the model must output the correct tokens required to complete a balanced expression of parentheses and curly braces. | 2022 | 10 | 0% |
121 | | 22 | [BIG-bench algorithms](https://github.com/google/BIG-bench) | Contains 1,320 problems where the model must determine the length of the longest common subsequence of two strings or check the balance of expressions consisting of parentheses and curly braces. | 2022 | 10 | 0% |
122 | | 23 | [BIG-bench logical deduction](https://github.com/google/BIG-bench) | Comprises 1,500 multiple-choice questions requiring the model to select the logically consistent unique proposition among multiple logical constraints describing the relative order of objects. | 2022 | 10 | 25% |
123 | | 24 | [BIG-bench operators](https://github.com/google/BIG-bench) | Contains 210 problems where the model must calculate the result of expressions using mathematical operators, testing the model's ability to apply mathematical concepts. | 2022 | 10 | 0% |
124 | | 25 | [BIG-bench repeat copy logic](https://github.com/google/BIG-bench) | Comprises 32 tasks where the model must repeatedly copy a series of words in a specific order and produce the correct output. | 2022 | 10 | 0% |
125 | | 26 | [Simple arithmetic with spaces](https://github.com/google/BIG-bench) | Contains 1,000 arithmetic problems with three-digit numbers and up to three operations, where the model must calculate the correct result using the right order of operations. | 2023 | 10 | 0% |
126 | | 27 | [Simple arithmetic without spaces](https://github.com/google/BIG-bench) | Comprises 1,000 arithmetic problems with three-digit numbers and up to three operations, where the model must calculate the correct result of expressions with no spaces between numbers and operators. | 2023 | 10 | 0% |
127 | | 28 | [Math QA](https://github.com/google/BIG-bench) | Contains 2,983 multiple-choice math word problems, requiring basic inference, language comprehension, and arithmetic/algebra skills. | 2021 | 10 | 25% |
128 | | 29 | [LogiQA](https://github.com/google/BIG-bench) | Comprises 651 multiple-choice logic word problems based on mathematical and symbolic problems, where the model must make logical conclusions. | 2020 | 10 | 25% |
129 | | 30 | [BIG-bench: Understanding fables](https://github.com/google/BIG-bench) | Consists of 189 short stories followed by 4-choice multiple-choice questions where the model must select the correct moral for the story. | 2022 | 10 | 25% |
130 | | 31 | [Pubmed QA Labeled](https://pubmedqa.github.io/) | Comprises 1,000 hand-labeled medical documents and related questions, where the model must respond with yes/no/maybe. | 2019 | 10 | ~0% |
131 | | 32 | [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) | Consists of 10,570 short documents followed by related questions on various topics, and the model is expected to output the exact correct answer. | 2016 | 10 | ~0% |
132 | | 33 | [BoolQ](https://paperswithcode.com/paper/boolq-exploring-the-surprising-difficulty-of) | Contains 3,270 short passages on a diverse range of subjects followed by yes/no questions in multiple-choice format. | 2019 | 10 | ~50% |
133 | | 34 | [HumanEval code generation](https://paperswithcode.com/sota/code-generation-on-humaneval) | Comprises 164 Python programming challenges where the model is presented with the method signature and docstring comment for a Python program and is expected to complete the program. The resulting code's functional correctness is tested on a number of input/output pairs. | 2021 | 0 | 0% |
134 | | 35 | [AI2 Reasoning Challenge (25-shot)](https://allenai.org/data/arc) | Consists of grade-school science questions. | / | 25 | / |
135 | | 36 | [TruthfulQA (0-shot)](https://github.com/sylinrl/TruthfulQA) | A test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minimally 6-shot task, as it is systematically prepended with 6 [examples](https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/finetune_truth.jsonl), even when launched using 0 for the number of few-shot examples. | / | 0 | / |
136 | | 37 | [AGIEval](https://github.com/ruixiangcui/AGIEval) | AGIEval is a new benchmark designed to assess foundation models on human-centric standardized exams, such as college entrance exams and law school admission tests. | / | / | / |
137 | 
138 | 
139 | <br>
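Most rows in this table can be reproduced locally with EleutherAI's lm-evaluation-harness linked at the top of this section. A minimal sketch of its Python entry point — hedged, since the harness API has changed across versions; this assumes the older master-branch interface, and the model type, checkpoint, and task names below are illustrative:

```python
# pip install lm-eval   (github.com/EleutherAI/lm-evaluation-harness)
from lm_eval import evaluator

# Illustrative run: score a small HF model on a few of the tasks above.
results = evaluator.simple_evaluate(
    model="hf-causal",                             # Hugging Face causal-LM backend
    model_args="pretrained=EleutherAI/pythia-1.4b",
    tasks=["hellaswag", "arc_easy", "boolq"],
    num_fewshot=10,
)
print(results["results"])  # per-task metrics
```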
140 | 
141 | ## Datasets
142 | - Source: [LLMDataHub](https://github.com/Zjh-819/LLMDataHub)
143 | - Curated by [Junhao Zhao](mailto:zhaol9555@gmail.com)
144 | 
145 | | Dataset name | Used by | Type | Language | Size | Description |
146 | | --- | --- | --- | --- | --- | --- |
147 | | [function_<br>calling_<br>extended](https://huggingface.co/datasets/Trelis/function_calling_extended) | / | Pairs | English<br>code | / | A high-quality, human-created dataset for enhancing LMs' API-calling ability. |
148 | | [AmericanStories](https://huggingface.co/datasets/dell-research-harvard/AmericanStories) | / | Pre-trained | English | / | A vast corpus scanned from the US Library of Congress. |
149 | | [dolma](https://huggingface.co/datasets/allenai/dolma) | OLMo | Pre-trained | / | 3T tokens | A large, diverse open-source corpus for LM pretraining. |
150 | | [Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) | Platypus2 | Pairs | English | 25K | A very high quality dataset for improving LMs' STEM reasoning ability. |
151 | | [Puffin](https://huggingface.co/datasets/LDJnr/Puffin) | Redmond-Puffin<br>Series | Dialog | English | ~3k entries | Conversations between real humans and GPT-4, featuring long contexts (over 1k tokens per conversation) and multi-turn dialogs. |
152 | | [tiny series](https://huggingface.co/datasets/nampdn-ai/tiny-codes) | / | Pairs | English | / | A series of short, concise code snippets and texts aimed at improving LMs' reasoning ability. |
153 | | [LongBench](https://huggingface.co/datasets/THUDM/LongBench) | / | Evaluation<br>Only | English<br>Chinese | 17 tasks | A benchmark for evaluating LLMs' long-context understanding capability. |
154 | | [orca-chat](https://huggingface.co/datasets/shahules786/orca-chat) | / | Dialog | English | 198,463 entries | An Orca-style dialog dataset aimed at improving LMs' long-context conversational ability. |
155 | | [DialogStudio](https://github.com/salesforce/DialogStudio) | / | Dialog | Multilingual | / | A collection of diverse datasets aimed at building conversational chatbots. |
156 | | [chatbot_arena<br>_conversations](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | / | RLHF<br>Dialog | Multilingual | 33k conversations | Cleaned conversations with pairwise human preferences collected on Chatbot Arena. |
157 | | [WebGLM-qa](https://huggingface.co/datasets/THUDM/webglm-qa) | WebGLM | Pairs | English | 43.6k entries | The dataset used by WebGLM, a QA system based on an LLM and the Internet. Each entry comprises a question, a response, and a reference; the response is grounded in the reference. |
158 | | [phi-1](https://huggingface.co/datasets/teleprint-me/phi-1) | phi-1 | Dialog | English | / | A dataset generated using the method in [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644). It focuses on math and CS problems. |
159 | | [Linly-<br>pretraining-<br>dataset](https://huggingface.co/datasets/Linly-AI/Chinese-pretraining-dataset) | Linly series | PT | Chinese | 3.4GB | The Chinese pretraining dataset used by the Linly series of models, comprising ClueCorpusSmall, CSL, news-crawl, etc. |
160 | | [FineGrainedRLHF](https://github.com/allenai/FineGrainedRLHF) | / | RLHF | English | ~5K examples | A repo that aims to develop a new framework for collecting human feedback. The data is collected to improve LLMs' factual correctness, topic relevance, and other abilities. |
161 | | [dolphin](https://huggingface.co/datasets/ehartford/dolphin) | / | Pairs | English | 4.5M entries | An attempt to replicate Microsoft's Orca. Based on FLANv2. |
162 | | [openchat_<br>sharegpt4_<br>dataset](https://huggingface.co/datasets/openchat/openchat_sharegpt4_dataset) | OpenChat | Dialog | English | 6k dialogs | A high quality dataset generated by using GPT-4 to complete refined ShareGPT prompts. |
163 | | [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca) | / | Pairs | English | 4.5M completions | A collection of augmented FLAN data, generated using the method in the Orca paper. |
164 | | [COIG-PC](https://huggingface.co/datasets/BAAI/COIG-PC)<br>[COIG-Lite](https://huggingface.co/datasets/BAAI/COIG-PC-Lite) | / | Pairs | Chinese | / | Enhanced version of COIG. |
165 | | [WizardLM_Orca](https://huggingface.co/datasets/psmathur/WizardLM_Orca) | orca_mini series | Pairs | English | 55K entries | Enhanced WizardLM data, generated using Orca's method. |
166 | | arxiv instruct datasets<br>[math](https://huggingface.co/datasets/ArtifactAI/arxiv-math-instruct-50k)<br>[CS](https://huggingface.co/datasets/ArtifactAI/arxiv-beir-cs-ml-generated-queries)<br>[Physics](https://huggingface.co/datasets/ArtifactAI/arxiv-physics-instruct-tune-30k) | / | Pairs | English | 50K/<br>50K/<br>30K entries | Question-answer pairs derived from ArXiv abstracts. Questions are generated using the t5-base model, while the answers are generated using the GPT-3.5-turbo model. |
167 | | [im-feeling-<br>curious](https://huggingface.co/datasets/xiyuez/im-feeling-curious) | / | Pairs | English | 2595 entries | Random questions and corresponding facts generated by Google's **I'm feeling curious** feature. |
168 | | [ign_clean<br>_instruct<br>_dataset_500k](https://huggingface.co/ignmilton) | / | Pairs | / | 509K entries | A large-scale SFT dataset synthetically created from a subset of Ultrachat prompts. ⚠ Lacks a detailed datacard. |
169 | | [WizardLM<br>evolve_instruct V2](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) | WizardLM | Dialog | English | 196k entries | The latest version of the Evol-Instruct dataset. |
170 | | [Dynosaur](https://github.com/WadeYin9712/Dynosaur) | / | Pairs | English | 800K entries | A dataset generated by applying the method in [this paper](https://dynosaur-it.github.io/); its highlight is generating high-quality data at low cost. |
171 | | [SlimPajama](https://huggingface.co/datasets/cerebras/SlimPajama-627B) | / | PT | Primarily<br>English | / | A cleaned and deduplicated version of RedPajama. |
172 | | [LIMA dataset](https://huggingface.co/datasets/GAIR/lima) | LIMA | Pairs | English | 1k entries | The high quality SFT dataset used by [LIMA: Less Is More for Alignment](https://arxiv.org/pdf/2305.11206.pdf). |
173 | | [TigerBot Series](https://github.com/TigerResearch/TigerBot#%E5%BC%80%E6%BA%90%E6%95%B0%E6%8D%AE%E9%9B%86) | TigerBot | PT<br>Pairs | Chinese<br>English | / | Datasets used to train TigerBot, including pretraining data, SFT data, and some domain-specific datasets such as financial research reports. |
174 | | [TSI-v0](https://huggingface.co/datasets/tasksource/tasksource-instruct-v0) | / | Pairs | English | 30k examples<br>per task | Multi-task instruction-tuning data recast from 475 of the tasksource datasets. Similar to the Flan dataset and Natural Instructions. |
175 | | [NMBVC](https://github.com/esbatmop/MNBVC) | / | PT | Chinese | / | A large-scale, continuously updated Chinese pretraining dataset. |
176 | | [StackOverflow<br>post](https://huggingface.co/datasets/mikex86/stackoverflow-posts) | / | PT | / | 35GB | Raw StackOverflow data in markdown format, for pretraining. |
177 | | [LaMini-Instruction](https://huggingface.co/datasets/MBZUAI/LaMini-instruction) | / | Pairs | English | 2.8M entries | A dataset distilled from the Flan collection, P3, and self-instruction. |
178 | | [ultraChat](https://huggingface.co/datasets/stingning/ultrachat) | / | Dialog | English | 1.57M dialogs | A large-scale dialog dataset created using two ChatGPT instances, one of which acts as the user while the other generates responses. |
179 | | [ShareGPT_<br>Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) | Vicuna | Pairs | Multilingual | 53K entries | Cleaned ShareGPT dataset. |
180 | | [pku-saferlhf-dataset](https://github.com/PKU-Alignment/safe-rlhf#pku-saferlhf-dataset) | Beaver | RLHF | English | 10K + 1M | The first dataset of its kind, containing 10k instances with safety preferences. |
181 | | RefGPT-Dataset<br>[nonofficial link](https://github.com/sufengniu/RefGPT) | RefGPT | Pairs, Dialog | Chinese | ~50K entries | A Chinese dialog dataset that aims to improve factual correctness in LLMs (mitigating hallucination). |
182 | | [Luotuo-QA-A<br>CoQA-Chinese](https://huggingface.co/datasets/silk-road/Luotuo-QA-A-CoQA-Chinese) | Luotuo project | Context | Chinese | 127K QA pairs | A dataset built upon translated CoQA, augmented using the OpenAI API. |
183 | | [Wizard-LM-Chinese<br>instruct-evol](https://huggingface.co/datasets/silk-road/Wizard-LM-Chinese-instruct-evol) | Luotuo project | Pairs | Chinese | ~70K entries | A Chinese version of WizardLM 70K. Answers are obtained by feeding translated questions to OpenAI's GPT API. |
184 | | [alpaca_chinese<br>dataset](https://github.com/hikariming/alpaca_chinese_dataset) | / | Pairs | Chinese | / | GPT-4-translated Alpaca data, including some complementary data (such as Chinese poetry, applications, etc.). Inspected by humans. |
185 | | [Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL) | Open Assistant | Pairs | Chinese | 1.5GB | QA data from Zhihu, a well-known Chinese QA platform. |
186 | | [Alpaca-GPT-4_zh-cn](https://huggingface.co/datasets/shibing624/alpaca-zh) | / | Pairs | Chinese | about 50K entries | A Chinese Alpaca-style dataset generated by GPT-4 directly in Chinese, not translated. |
187 | | [hh-rlhf](https://github.com/anthropics/hh-rlhf)<br>[on Huggingface](https://huggingface.co/datasets/Anthropic/hh-rlhf) | Koala | RLHF | English | 161k pairs<br>79.3MB | A pairwise dataset for training reward models in reinforcement learning, aimed at improving language models' harmlessness and helpfulness. |
188 | | [Panther-dataset_v1](https://huggingface.co/datasets/Rardilit/Panther-dataset_v1) | Panther | Pairs | English | 377 entries | A dataset derived from hh-rlhf, rewriting it into input-output pairs. |
189 | | [Baize Dataset](https://github.com/project-baize/baize-chatbot/tree/main/data) | Baize | Dialog | English | 100K dialogs | A dialog dataset generated by GPT-4 using self-talk. Questions and topics are collected from Quora, StackOverflow, and some medical knowledge sources. |
190 | | [h2ogpt-fortune2000<br>personalized](https://huggingface.co/datasets/h2oai/h2ogpt-fortune2000-personalized) | h2ogpt | Pairs | English | 11363 entries | An instruction-finetuning dataset developed by h2oai, covering various topics. |
191 | | [SHP](https://huggingface.co/datasets/stanfordnlp/SHP) | StableVicuna,<br>chat-opt,<br>SteamSHP | RLHF | English | 385K entries | An RLHF dataset that, unlike previously mentioned ones, uses scores plus timestamps to infer users' preferences. Covers 18 domains; collected by Stanford. |
192 | | [ELI5](https://huggingface.co/datasets/eli5#source-data) | MiniLM series | FT,<br>RLHF | English | 270K entries | Questions and answers collected from Reddit, including scores. Might be used for RLHF reward model training. |
193 | | [WizardLM<br>evol_instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k)<br>[V2](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) | WizardLM | Pairs | English | / | An instruction-finetuning dataset derived from Alpaca-52K, using the **evolution** method in [this paper](https://arxiv.org/pdf/2304.12244.pdf). |
194 | | [MOSS SFT data](https://github.com/OpenLMLab/MOSS/tree/main/SFT_data) | MOSS | Pairs,<br>Dialog | Chinese, English | 1.1M entries | A conversational dataset collected and developed by the MOSS team, with usefulness, loyalty, and harmlessness labels for every entry. |
195 | | [ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K) | Koala, Stable LLM | Pairs | Multilingual | 52K | Conversations collected from ShareGPT, with a specific focus on customized creative conversation. |
196 | | [GPT-4all Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) | GPT-4all | Pairs | English,<br>might have<br>a translated version | 400k entries | A combination of some subsets of OIG, P3, and StackOverflow. Covers topics like general QA and customized creative questions. |
197 | | [COIG](https://huggingface.co/datasets/BAAI/COIG) | / | Pairs | Chinese,<br>code | 200K entries | A Chinese-based dataset covering domains such as general-purpose QA, Chinese exams, and code. Its quality is checked by human annotators. |
198 | | [RedPajama-Data-1T](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T) | RedPajama | PT | Primarily English | 1.2T tokens<br>5TB | A fully open pretraining dataset that follows LLaMA's method. |
199 | | [OASST1](https://huggingface.co/datasets/OpenAssistant/oasst1) | OpenAssistant | Pairs,<br>Dialog | Multilingual<br>(English, Spanish, etc.) | 66,497 conversation trees | A large, human-written, human-annotated, high-quality conversation dataset that aims to make LLMs generate more natural responses. |
200 | | [Alpaca-COT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) | Phoenix | Pairs,<br>Dialog,<br>CoT | English | / | A mixture of many datasets, including the classic Alpaca dataset, OIG, Guanaco, and some CoT (Chain-of-Thought) datasets such as FLAN-CoT. May be handy to use. |
201 | | [Bactrian-X](https://huggingface.co/datasets/MBZUAI/Bactrian-X) | / | Pairs | Multilingual<br>(52 languages) | 67K entries per language | A multilingual version of **Alpaca** and **Dolly-15K**. |
202 | | [databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k)<br>[zh-cn Ver](https://huggingface.co/datasets/jaja7744/dolly-15k-cn) | Dolly2.0 | Pairs | English | 15K+ entries | A dataset of **human-written** prompts and responses, featuring tasks such as open-domain question-answering, brainstorming, and summarization. |
203 | | [AlpacaDataCleaned](https://github.com/gururise/AlpacaDataCleaned) | Some Alpaca/ LLaMA-like models | Pairs | English | / | A cleaned version of Alpaca, GPT_LLM, and GPTeacher. |
204 | | [GPT-4-LLM Dataset](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | Some Alpaca-like models | Pairs,<br>RLHF | English,<br>Chinese | 52K entries each for English and Chinese<br>9K entries of unnatural-instruction | NOT the dataset used by GPT-4! It is generated by GPT-4 and some other LLMs to provide better Pairs and RLHF data, and includes instruction data as well as RLHF-style comparison data. |
205 | | [GPTeacher](https://github.com/teknium1/GPTeacher) | / | Pairs | English | 20k entries | A dataset of targets generated by GPT-4, including many of the same seed tasks as the Alpaca dataset plus some new tasks such as roleplay. |
206 | | [HC3](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection) | Koala | RLHF | English,<br>Chinese | 24322 English<br>12853 Chinese | A multi-domain, human-vs-ChatGPT comparison dataset. Can be used for reward model training or ChatGPT detector training. |
207 | | [Alpaca data](https://github.com/tatsu-lab/stanford_alpaca#data-release)<br>[Download](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) | Alpaca, ChatGLM-finetune-LoRA, Koala | Dialog,<br>Pairs | English | 52K entries<br>21.4MB | A dataset generated by text-davinci-003 to improve language models' ability to follow human instructions. |
208 | | [OIG](https://huggingface.co/datasets/laion/OIG)<br>[OIG-small-chip2](https://huggingface.co/datasets/0-hero/OIG-small-chip2) | Pythia-Chat-Base-7B, GPT-NeoXT-Chat-Base-20B, Koala | Dialog,<br>Pairs | English,<br>code | 44M entries | A large conversational instruction dataset with medium- and high-quality subsets *(OIG-small-chip2)* for multi-task learning. |
209 | | [ChatAlpaca data](https://github.com/cascip/ChatAlpaca) | / | Dialog,<br>Pairs | English,<br>Chinese version coming soon | 10k entries<br>39.5MB | A dataset that aims to help researchers develop models for instruction-following in multi-turn conversations. |
210 | | [InstructionWild](https://github.com/XueFuzhao/InstructionWild) | ColossalChat | Pairs | English, Chinese | 10K entries | An Alpaca-style dataset, but with seed tasks drawn from ChatGPT screenshots. |
211 | | [Firefly](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) | Firefly(流萤) | Pairs | Chinese | 1.1M entries<br>1.17GB | A Chinese instruction-tuning dataset with 1.1 million human-written examples across 23 tasks, but no conversation. |
212 | | [BELLE](https://github.com/LianjiaTech/BELLE)<br>[0.5M version](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)<br>[1M version](https://huggingface.co/datasets/BelleGroup/train_1M_CN)<br>[2M version](https://huggingface.co/datasets/BelleGroup/train_2M_CN) | BELLE series, Chunhua (春华) | Pairs | Chinese | 2.67B in total | A Chinese instruction dataset similar to *Alpaca data*, constructed by generating answers from seed tasks, but with no conversation. |
213 | | [GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset#guanacodataset) | Guanaco | Dialog,<br>Pairs | English,<br>Chinese,<br>Japanese | 534,530 entries | A multilingual instruction dataset for enhancing language models' capabilities in various linguistic tasks, such as natural language understanding and explicit content recognition. |
214 | | [OpenAI WebGPT](https://huggingface.co/datasets/openai/webgpt_comparisons) | WebGPT's reward model, Koala | RLHF | English | 19,578 pairs | The dataset used in the WebGPT paper; used for training the reward model in RLHF. |
215 | | [OpenAI<br>Summarization<br>Comparison](https://huggingface.co/datasets/openai/summarize_from_feedback) | Koala | RLHF | English | ~93K entries<br>420MB | A dataset of human feedback for training a reward model. The reward model was then used to train a summarization model to align with human preferences. |
216 | | [self-instruct](https://github.com/yizhongw/self-instruct) | / | Pairs | English | 82K entries | A dataset generated using the well-known [self-instruction method](https://arxiv.org/abs/2212.10560). |
217 | | [unnatural-instructions](https://github.com/orhonovich/unnatural-instructions) | / | Pairs | English | 240,670 examples | An early attempt to use a powerful model (text-davinci-002) to generate data. |
218 | | [xP3 (and some variant)](https://huggingface.co/datasets/bigscience/xP3) | BLOOMZ, mT0 | Pairs | Multilingual,<br>code | 79M entries<br>88GB | An instruction dataset for improving language models' generalization ability, similar to *Natural Instructions*. |
219 | | [Flan V2](https://github.com/google-research/FLAN/tree/main/flan/v2) | / | / | English | / | A collection that compiles Flan 2021, P3, Super-Natural Instructions, and dozens more datasets into one, formatted as a mix of zero-shot, few-shot, and chain-of-thought templates. |
220 | | [Natural Instruction](https://instructions.apps.allenai.org/)<br>[GitHub&Download](https://github.com/allenai/natural-instructions) | tk-instruct series | Pairs,<br>evaluation | Multilingual | / | A benchmark with over 1,600 tasks, each with instructions and definitions, for evaluating and improving language models' multi-task generalization under natural language instruction. |
221 | | [CrossWOZ](https://github.com/thu-coai/CrossWOZ) | / | Dialog | English,<br>Chinese | 6K dialogs | The dataset introduced by [this paper](https://arxiv.org/pdf/2002.11893.pdf), mainly about tourism in Beijing; answers are generated automatically by rules. |
222 | | [proof-pile](https://huggingface.co/datasets/hoskinson-center/proof-pile) | proof-GPT | PT | English<br>LaTeX | 13GB | A pretraining dataset similar to The Pile, but with a LaTeX corpus to enhance LMs' ability in proofs. |
223 | | [peS2o](https://huggingface.co/datasets/allenai/peS2o) | / | PT | English | 7.5GB | A high quality academic paper dataset for pretraining. |
224 | | [lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired/tree/main/data/rl) | Stack LLaMA 2 | PT | English | 6.3GB | A paired StackExchange human preference dataset. |
225 | | [falcon-refinedweb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) | tiiuae/falcon series | PT | English | / | A refined subset of CommonCrawl. |
226 | | [CBook-150K](https://github.com/FudanNLPLAB/CBook-150K) | / | PT,<br>building dataset | Chinese | 150K+ books | A raw Chinese books dataset that needs some preprocessing pipeline. |
227 | | [Common Crawl](https://commoncrawl.org/) | LLaMA (after some processing) | building datasets,<br>PT | / | / | The most well-known raw dataset; rarely used directly. One possible preprocessing pipeline is [CCNet](https://github.com/facebookresearch/cc_net). |
228 | | [nlp_Chinese_Corpus](https://github.com/brightmart/nlp_chinese_corpus) | / | PT,<br>TF | Chinese | / | A Chinese pretraining corpus. Includes Wikipedia, Baidu Baike, Baidu QA, some forum QA, and a news corpus. |
229 | | [The Pile (V1)](https://pile.eleuther.ai/) | GLM (partly), LLaMA (partly), GPT-J, GPT-NeoX-20B, Cerebras-GPT 6.7B, OPT-175b | PT | Multilingual,<br>code | 825GB | A diverse open-source language modeling dataset consisting of 22 smaller, high-quality datasets covering many domains and tasks. |
230 | | C4<br>[Huggingface dataset](https://huggingface.co/datasets/c4)<br>[TensorFlow dataset](https://www.tensorflow.org/datasets/catalog/c4) | Google T5 Series, LLaMA | PT | English | 305GB | A colossal, cleaned version of Common Crawl's web crawl corpus. Frequently used. |
231 | | [ROOTS](https://huggingface.co/bigscience-data) | BLOOM | PT | Multilingual,<br>code | 1.6TB | A diverse open-source dataset consisting of sub-datasets like Wikipedia and StackExchange for language modeling. |
232 | | [Pushshift reddit](https://files.pushshift.io/reddit/)<br>[paper](https://arxiv.org/pdf/2001.08435.pdf) | OPT-175b | PT | / | / | Raw Reddit data; one possible processing pipeline is in [this paper](https://aclanthology.org/2021.eacl-main.24.pdf). |
233 | | [Gutenberg project](https://www.gutenberg.org/policy/robot_access.html) | LLaMA | PT | Multilingual | / | A book dataset, mostly novels. Not preprocessed. |
234 | | [CLUECorpus](https://github.com/CLUEbenchmark/CLUE) | / | PT,<br>finetune,<br>evaluation | Chinese | 100GB | A Chinese pretraining corpus sourced from *Common Crawl*. |
235 | | [starcoderdata](https://huggingface.co/datasets/bigcode/starcoderdata) | starcoder<br>series | PT | code | 783GB | A large pretraining dataset for improving LMs' coding ability. |
236 | | [code_<br>instructions<br>_120k_alpaca](https://huggingface.co/datasets/iamtarun/code_instructions_120k_alpaca) | / | Pairs | English/code | 121,959 entries | [code_instruction](https://huggingface.co/datasets/sahil2801/code_instructions_120k) in instruction-finetuning format. |
237 | | [function-<br>invocations-25k](https://huggingface.co/datasets/unaidedelf87777/openapi-function-invocations-25k) | some MPT<br>variants | Pairs | English<br>code | 25K entries | A dataset that teaches AI models how to correctly invoke [APIsGuru](https://github.com/APIs-guru/openapi-directory) functions based on natural language prompts. |
238 | | [TheoremQA](https://huggingface.co/datasets/wenhu/TheoremQA) | / | Pairs | English | 800 | A high quality STEM theorem QA dataset. |
239 | | [FinNLP](https://github.com/AI4Finance-Foundation/FinNLP) | [FinGPT](https://github.com/AI4Finance-Foundation/FinGPT) | Raw data | English,<br>Chinese | / | Open-source raw financial text data, including news, social media, etc. |
240 | | [PRM800K](https://github.com/openai/prm800k) | A variant of<br>GPT-4 | Context | English | 800K entries | A process supervision dataset for mathematical problems. |
241 | | [MeChat data](https://github.com/qiuhuachuan/smile) ⚠️ | MeChat | Dialog | Chinese | 355733 utterances | A Chinese SFT dataset for training a mental healthcare chatbot. |
242 | | [ChatGPT-Jailbreak-Prompts](https://huggingface.co/datasets/rubend18/ChatGPT-Jailbreak-Prompts) ⚠️ | / | / | English | 163KB file size | Prompts for bypassing the safety regulations of ChatGPT. Can be used for probing the harmlessness of LLMs. |
243 | | [awesome chinese<br>legal resources](https://github.com/pengxiao-song/awesome-chinese-legal-resources) | LaWGPT | / | Chinese | / | A collection of Chinese legal data for LLM training. |
244 | | [Long Form](https://github.com/akoksal/LongForm) | / | Pairs | English | 23.7K entries | A dataset aimed at improving the long-text generation ability of LLMs. |
245 | | [symbolic-instruction-tuning](https://huggingface.co/datasets/sail/symbolic-instruction-tuning) | / | Pairs | English,<br>code | 796 | A dataset focused on "symbolic" tasks, such as SQL coding and mathematical computation. |
246 | | [Safety Prompt](https://github.com/thu-coai/Safety-Prompts) | / | Evaluation only | Chinese | 100k entries | Chinese safety prompts for evaluating and improving the safety of LLMs. |
247 | | [Tapir-Cleaned](https://huggingface.co/datasets/MattiaL/tapir-cleaned-116k) | / | Pairs | English | 116k entries | A revised version of the DAISLab dataset of IFTTT rules, thoroughly cleaned, scored, and adjusted for instruction-tuning. |
248 | | [instructional_<br>codesearchnet_python](https://huggingface.co/datasets/Nan-Do/instructional_codesearchnet_python) | / | Pairs | English &<br>Python | 192MB | A template-generated instructional Python dataset, created from an annotated version of the code-search-net dataset for the Open-Assistant project. |
249 | | [finance-alpaca](https://huggingface.co/datasets/gbharti/finance-alpaca) | / | Pairs | English | 1.3K entries | An Alpaca-style dataset focused on financial topics. |
250 | | [OBELICS](https://huggingface.co/datasets/HuggingFaceM4/OBELICS) | idefics<br>series | image-document | English | 141M documents | An open, massive, curated collection of interleaved image-text web documents. |
251 | | [JourneyDB](https://huggingface.co/datasets/JourneyDB/JourneyDB) | / | image-prompt-caption | English | 4M instances | A large-scale dataset comprising QA, caption, and text-prompting tasks, based on Midjourney images. |
252 | | [M3IT](https://huggingface.co/datasets/MMInstruction/M3IT) | Ying-VLM | instruction-image | Multilingual | 2.4M instances | A dataset comprising 40 tasks with 400 human-written instructions. |
253 | | [MIMIC-IT](https://github.com/Luodian/Otter/tree/main/mimic-it) | Otter | instruction-image | Multilingual | 2.2M instances | High quality multi-modal instruction-response pairs based on images and videos. |
254 | | [LLaVA Instruction](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | LLaVA | instruction-image | English | 158k samples | A multimodal dataset generated from the COCO dataset by prompting GPT-4 for instructions. |
255 | | WebText(Reddit links) | GPT-2 | PT | English | / | Data crawled from Reddit and filtered for GPT-2 pretraining. |
256 | | MassiveText | Gopher, Chinchilla | PT | 99% English, 1% other (including code) | / | A large private DeepMind corpus of web text, books, news, and code. |
257 | | WuDao Corpora | GLM | PT | Chinese | 200GB | A large-scale Chinese corpus. Some components were possibly open-sourced originally but are not available now. |
258 | 
259 | 
260 | 
--------------------------------------------------------------------------------
/syncfolk_submodules.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | def sync_submodules():
4 |     # Initialize and update submodules
5 |     subprocess.run(["git", "submodule", "update", "--init", "--recursive"], check=True)
6 |     # Pull the latest commits in every submodule. Not all submodules use
7 |     # `main` as their default branch, so fall back to `master` when needed.
8 |     subprocess.run(
9 |         ["git", "submodule", "foreach", "git pull origin main || git pull origin master"],
10 |         check=True,
11 |     )
12 | 
13 | def main():
14 |     try:
15 |         sync_submodules()
16 |         print("All submodules are synced to the latest version.")
17 |     except subprocess.CalledProcessError as e:
18 |         print(f"An error occurred while syncing submodules: {e}")
19 | 
20 | if __name__ == "__main__":
21 |     main()
--------------------------------------------------------------------------------
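Finally, since the whole point of tracking these projects as submodules is to see where commits land most often, here is a small companion sketch (a hypothetical helper, not a file in this repo) that ranks submodules by recent commit count after running `syncfolk_submodules.py`:

```python
import subprocess

def submodule_paths() -> list:
    """List submodule paths by reading .gitmodules through git itself."""
    out = subprocess.run(
        ["git", "config", "--file", ".gitmodules",
         "--get-regexp", r"submodule\..*\.path"],
        capture_output=True, text=True, check=True,
    )
    # Each output line looks like: "submodule.langchain.path langchain"
    return [line.split(" ", 1)[1] for line in out.stdout.splitlines()]

def recent_commit_counts(since: str = "30 days ago") -> dict:
    """Count commits made in each submodule since the given date."""
    counts = {}
    for path in submodule_paths():
        log = subprocess.run(
            ["git", "-C", path, "log", "--oneline", f"--since={since}"],
            capture_output=True, text=True,  # uninitialized submodules count as 0
        )
        counts[path] = len(log.stdout.splitlines())
    return counts

if __name__ == "__main__":
    for path, n in sorted(recent_commit_counts().items(),
                          key=lambda kv: kv[1], reverse=True):
        print(f"{n:5d}  {path}")
```

This reads the submodule list straight from `.gitmodules` via `git config`, so it stays in sync with the repository layout without any hardcoded paths.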