├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── LICENSE ├── README.md ├── README_ja.md ├── README_zh.md ├── assets ├── YuLan-logo.jpg ├── YuLan-logo.png ├── data-pipeline.png ├── data-preview.png ├── data_distribution_for_every_phase.png ├── main.png └── training-stability.png ├── post_train ├── README.md └── img │ └── result.png └── pretrain ├── README.md ├── configuration_yulanmini.py ├── datasets ├── README.md ├── data_mix │ ├── 01_20241017_013512.json │ ├── 02_20241017_013401.json │ ├── 03_20241020_001556.json │ ├── 04_20241021_170901.json │ ├── 05_20241022_221453.json │ ├── 06_20241024_013137.json │ ├── 07_20241025_022032.json │ ├── 08_20241026_151354.json │ ├── 09_20241027_190948.json │ ├── 10_20241028_225112.json │ ├── 11_20241030_124814.json │ ├── 12_20241101_002827.json │ ├── 13_20241102_160534.json │ ├── 14_20241104_000454.json │ ├── 15_20241105_023029.json │ ├── 16_20241106_180613.json │ ├── 17_20241108_004951.json │ ├── 18_20241113_034017.json │ ├── 19_20241114_115241.json │ ├── 20_20241115_234357.json │ ├── 21_20241117_021115.json │ ├── 22_20241118_155407.json │ ├── 23_20241120_033942.json │ ├── 24_20241121_133110.json │ ├── 25_20241123_030124.json │ ├── 26_20241127_205447.json │ ├── 26_20241211_015209.json │ └── 27_20241213_051741.json ├── download_datasets_step1.sh ├── download_datasets_step3.sh └── final.pdf ├── ds2_config_adamw.json ├── modeling_yulanmini.py ├── preprocess ├── README.md ├── convert_hf_datasets_to_megatron.py ├── mix │ └── update_metadata_from_clipboard.py └── tokenize │ ├── run_tokenize.sh │ ├── split_data.py │ └── tokenize_text.py ├── scripts ├── calc_norm.py ├── convert_yulanmini_to_llama.py └── estimate_mfu.py ├── setup.sh ├── synthesis ├── README.md ├── gen_lean_reasoning.py ├── gen_qwq.py └── gen_vllm.py ├── torchrun_wrapper.sh ├── train.py ├── train.sh ├── train_utils.py ├── yulanmini-2B-final-phase25.sh ├── yulanmini-2B-s25d-decay80-1sqrt-long-28k-final-phase26.sh └── yulanmini_trainer.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Additional context** 23 | Add any other context about the problem here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Yiwen Hu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README_ja.md: -------------------------------------------------------------------------------- 1 |

2 |

3 | 中文 | English | 日本語 4 |

5 |

6 | 7 |
8 | 9 |

YuLan-Mini: データ効率の高いオープンな言語モデル

10 | license 11 | 12 | Static Badge 13 | 14 |
15 | 16 | YuLan-Miniは2.4Bパラメータの軽量な言語モデルです。1.08Tトークンのみを使用して事前トレーニングを行い、特に**数学**と**コード**の分野で、より多くのデータを使用した業界トップのモデルと同等の性能を達成しています。再現性を高めるために、関連する事前トレーニングリソースをオープンソース化します。 17 | 18 | --- 19 | 20 | ## モデルのダウンロード 🔗 21 | 22 | | Model | Context Length | SFT | 🤗 Hugging Face | Wise Model | 23 | |---------|----------------|-----|-----------------|------------| 24 | | YuLan-Mini (Recommended) | 28K | ❎ | [`YuLan-Mini`](https://huggingface.co/yulan-team/YuLan-Mini) | [`YuLan-Mini`](https://wisemodel.cn/models/yulan-team/YuLan-Mini) | 25 | | YuLan-Mini-2.4B-4K | 4K | ❎ | | | 26 | | YuLan-Mini-Instruct | Coming soon | ✅ | | | 27 | 28 | --- 29 | 30 | ## 特徴 🌟 31 | 32 |
33 | 34 |
35 | 36 | 私たちの事前トレーニング方法は、以下の3つの重要な技術革新によりトレーニング効率を向上させます: 37 | 38 | 1. データクリーニングとデータスケジュール戦略を組み合わせた精巧な**データパイプライン**。 39 | 2. トレーニングの不安定性を効果的に緩和する体系的な**最適化方法**。 40 | 3. ターゲットデータ選択と長いコンテキストトレーニングを統合した効果的な**アニーリングアプローチ**。 41 | 42 | --- 43 | ## ベンチマーク 🌟 44 | 45 | | モデル | モデルサイズ | トレイントークン数 | コンテキスト長 | MATH 500 | GSM 8K | Human Eval | MBPP | RACE Middle | RACE High | RULER | 46 | |:----------------|----------:|--------------:|--------------:|:--------|:------|:----------|:------|:-----------|:---------|:------| 47 | | MiniCPM | 2.6B | 1.06T | 4K | 15.00 | 53.83 | 50.00* | 47.31 | 56.61 | 44.27 | N/A | 48 | | Qwen-2 | 1.5B | 7T | 128K | 22.60 | 46.90* | 34.80* | 46.90* | 55.77 | 43.69 | 60.16 | 49 | | Qwen2.5 | 0.5B | 18T | 128K | 23.60 | 41.60* | 30.50* | 39.30* | 52.36 | 40.31 | 49.23 | 50 | | Qwen2.5 | 1.5B | 18T | 128K | **45.40** | **68.50\*** | 37.20* | 60.20* | **58.77** | 44.33 | 68.26 | 51 | | Gemma2 | 2.6B | 2T | 8K | 18.30* | 30.30* | 19.50* | 42.10* | - | - | N/A | 52 | | StableLM2 | 1.7B | 2T | 4K | - | 20.62 | 8.50* | 17.50 | 56.33 | **45.06** | N/A | 53 | | SmolLM2 | 1.7B | 11T | 8K | 11.80 | - | 23.35 | 45.00 | 55.77 | 43.06 | N/A | 54 | | Llama3.2 | 3.2B | 9T | 128K | 7.40 | - | 29.30 | 49.70 | 55.29 | 43.34 | **77.06** | 55 | | YuLan-Mini | 2.4B | 1.04T | 4K | 32.60 | 66.65 | 61.60 | **66.70** | 55.71 | 43.58 | N/A | 56 | | YuLan-Mini | 2.4B | 1.08T | 28K | 37.80 | 68.46 | **64.00** | 65.90| 57.18 | 44.57 | 51.48 | 57 | 58 | 59 | | モデル | LAMBADA | MMLU | CMMLU | CEval | HellaSwag | WinoGrande | StoryCloze | ARC-e | ARC-c | 60 | |:----------------|:-------|:-----|:-----|:-----|:----------|:-----------|:-----------|:-----|:-----| 61 | | MiniCPM-2.6B | 61.91 | 53.37 | 48.97 | 48.24 | 67.92 | 65.74 | 78.51 | 55.51 | 43.86 | 62 | | Qwen2-1.5B | 64.68 | 55.90 | **70.76** | **71.94** | 66.11 | 66.14 | 77.60 | 62.21 | 42.92 | 63 | | Qwen2.5-0.5B | 52.00 | 47.50 | 52.17 | 54.27 | 50.54 | 55.88 | 71.67 | 56.10 | 39.51 | 64 | | Qwen2.5-1.5B | 62.12 | 60.71 | 67.82 | 69.05 | 67.18 | 64.48 | 76.80 | **71.51** | 53.41 | 65 | | Gemma2-2.6B | - | 52.20*| - | 28.00*| 74.60* | **71.50\*** | - | - | **55.70\***| 66 | | StableLM2-1.7B | 66.15 | 40.37 | 29.29 | 26.99 | 69.79 | 64.64 | 78.56 | 54.00 | 40.78 | 67 | | SmolLM2-1.7B | 67.42 | 51.91 | 33.46 | 35.10 | 72.96 | 67.40 | **79.32** | 44.82 | 35.49 | 68 | | Llama3.2-3B | **69.08** | **63.40** | 44.44 | 44.49 | **75.62** | 67.48 | 76.80 | 70.12 | 48.81 | 69 | | YuLan-Mini | 64.72 | 51.79 | 48.35 | 51.47 | 68.65 | 67.09 | 76.37 | 69.87 | 50.51 | 70 | | YuLan-Mini | 65.67 | 49.10 | 45.45 | 48.23 | 67.22 | 67.24 | 75.89 | 67.47 | 49.32 | 71 | 72 | --- 73 | 74 | ## 事前トレーニングリソース 🔧 75 | 76 | 研究の透明性と再現性を高めるために、関連する[事前トレーニングリソース](https://github.com/RUC-GSAI/YuLan-Mini/blob/main/pretrain)をオープンソース化します: 77 | 78 |
1. 事前トレーニングと評価コード 79 | 80 | 事前トレーニングと評価コードは今後のアップデートで公開されます。 81 |
82 | 83 | 84 | 85 |
2. 中間段階のチェックポイント 86 | 中間段階のチェックポイントはYuLan-Miniで公開されています。 87 | 88 |
89 | 90 |
3. アニーリング前のオプティマイザーステート 91 | 92 | アニーリング前のオプティマイザーステートは今後のアップデートで公開されます。 93 |
94 | 95 | 96 |
4. 使用したオープンソースデータセット 97 | 98 | 使用データセットリスト 99 | 100 |
101 | 102 |
5. 各フェーズのデータ分布 103 | 104 | 105 |
106 | 107 |
108 |
109 | 110 |
111 | 112 |
6. 合成データ 113 | 114 | データクリーニングと合成パイプライン: 115 |
116 | 117 |
118 | 119 | 私たちが使用している合成データはYuLan-Mini-Datasetsで公開されています。 120 | 121 |
122 | 123 |
7. 中間オプティマイザーステート 124 | 125 | 中間オプティマイザーステートは今後のアップデートで公開されます。 126 |
127 | 128 | 129 | ### これらの事前トレーニングリソースでできること 130 | 131 | 1. **独自のLLMを事前トレーニング**。私たちのデータとカリキュラムを使用して、YuLan-Miniと同等の強力なモデルをトレーニングできます。 132 | 2. **学習率アニーリング**を独自に実行。アニーリングフェーズ中、YuLan-Miniの学習能力はピークに達します。アニーリング前のチェックポイントからトレーニングを再開し、独自のデータセットを使用して学習率アニーリングを行うことができます。 133 | 3. **LLMのInstructバージョンを微調整**。YuLan-Miniベースモデルを使用して、独自のInstructバージョンをトレーニングできます。 134 | 4. **トレーニングダイナミクス**の研究。YuLan-Miniの中間チェックポイントを使用して、事前トレーニングプロセス中の内部変化を探ることができます。 135 | 5. **独自のデータを合成**。YuLan-Miniのデータパイプラインを使用して、独自のデータセットをクリーニングおよび生成できます。 136 | 137 | --- 138 | 139 | ## クイックスタート 💻 140 | 141 | 以下はHuggingfaceを使用した簡単な推論コードの例です: 142 | 143 | **Huggingface推論例** 144 | ```python 145 | import torch 146 | from transformers import AutoTokenizer, AutoModelForCausalLM 147 | 148 | # モデルとトークナイザーをロード 149 | tokenizer = AutoTokenizer.from_pretrained("yulan-team/YuLan-Mini") 150 | model = AutoModelForCausalLM.from_pretrained("yulan-team/YuLan-Mini", torch_dtype=torch.bfloat16) 151 | 152 | # 入力テキスト 153 | input_text = "Renmin University of China is" 154 | inputs = tokenizer(input_text, return_tensors="pt") 155 | 156 | # 推論 157 | output = model.generate(inputs["input_ids"], max_new_tokens=100) 158 | print(tokenizer.decode(output[0], skip_special_tokens=True)) 159 | ``` 160 | 161 | **vLLMサーブ例** 162 | ```bash 163 | vllm serve yulan-team/YuLan-Mini --dtype bfloat16 164 | ``` 165 | 166 | **SGLangサーブ例** 167 | ```bash 168 | python -m sglang.launch_server --model-path yulan-team/YuLan-Mini --port 30000 --host 0.0.0.0 169 | ``` 170 | 171 | --- 172 | 173 | ## チーム 174 | 175 | YuLan-Miniは[中国人民大学AIボックス](http://aibox.ruc.edu.cn/)によって開発および維持されています。 176 | 177 | ## ライセンス 178 | 179 | - このリポジトリのコードは[MITライセンス](./LICENSE)の下で公開されています。 180 | - モデルの重み、中間オプティマイザーステート、およびトレーニングデータの使用に関するポリシーは今後のアップデートで発表されます。 181 | - 制限事項:安全性の懸念を緩和し、倫理的かつ合法的なテキストの生成を奨励するために努力していますが、言語モデルの確率的な性質により、予期しない出力が発生する可能性があります。たとえば、応答には偏見、差別、またはその他の有害な内容が含まれることがあります。このような内容を広めないでください。有害な情報の拡散によって生じるいかなる結果についても責任を負いません。 182 | 183 | ## 引用 184 | 185 | YuLan-Miniが研究や開発に役立つ場合は、[技術報告書](https://arxiv.org/abs/2412.17743)を引用してください: 186 | 187 | ``` 188 | @misc{hu2024yulanmini, 189 | title={YuLan-Mini: An Open Data-efficient Language Model}, 190 | author={Yiwen Hu and Huatong Song and Jia Deng and Jiapeng Wang and Jie Chen and Kun Zhou and Yutao Zhu and Jinhao Jiang and Zican Dong and Wayne Xin Zhao and Ji-Rong Wen}, 191 | year={2024}, 192 | eprint={2412.17743}, 193 | archivePrefix={arXiv}, 194 | primaryClass={cs.CL}, 195 | url={https://arxiv.org/abs/2412.17743}, 196 | } 197 | ``` 198 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 |

2 |

3 | 中文 | English | 日本語 4 |

5 |

6 | 7 |
8 | 9 |

YuLan-Mini: 数据高效的开源语言模型

10 | license 11 | 12 | Static Badge 13 | 14 |
15 | 16 | YuLan-Mini 是一个 2.4B 参数量的轻量化语言模型。仅使用 1.08T Tokens 进行预训练,却达到了与使用更多数据的行业领先模型相媲美的性能,尤其是 **数学** 和 **代码** 两个领域。为方便复现,我们将开源相关预训练资源。 17 | 18 | --- 19 | 20 | ## 新闻 21 | 22 | - [2025.01.29] YuLan-Mini-Instruct-v1 发布 23 | - [2024.12.23] YuLan-Mini 及预训练资源发布 24 | 25 | ## 模型下载 🔗 26 | 27 | > YuLan-Mini 是 [YuLan 系列](https://github.com/RUC-GSAI/YuLan-Chat) 的一部分,该系列还包括更大规模和不同训练策略的模型。 28 | 29 | | 模型 | 上下文长度 | SFT | 🤗 Hugging Face | ModelScope | Wise Model | 30 | |---------|----------------|-----|-----------------|------------|------------| 31 | | YuLan-Mini | 28K | ❎ | [`Base`](https://huggingface.co/yulan-team/YuLan-Mini) | [`Base`](https://modelscope.cn/models/yulan-team/YuLan-Mini) | [`Base`](https://wisemodel.cn/models/yulan-team/YuLan-Mini) | 32 | | YuLan-Mini-Instruct | 28K | ✅ | [`Instruct`](https://huggingface.co/yulan-team/YuLan-Mini-Instruct) | | | 33 | 34 | > 中间检查点可以在[这里](#%E9%A2%84%E8%AE%AD%E7%BB%83%E8%B5%84%E6%BA%90-)找到。 35 | 36 | --- 37 | 38 | ## 能力介绍 🌟 39 | 40 |
41 | 42 |
43 | 44 | 我们的预训练方法通过以下三项关键技术改进提升了训练效率: 45 | 46 | 1. 精细的数据处理流程,将数据清洗与数据课程策略相结合; 47 | 2. 稳定的优化方法,有效缓解预训练中的不稳定性; 48 | 3. 高效的退火策略,融合了目标数据选择和长上下文训练。 49 | 50 | 最终,使用我们的高效预训练策略,仅 1T 的数据量便可在数学和代码等领域,媲美 Qwen2.5-1.5B 在 18T 数据上的效果。我们将开源使用到的 1T 数据,其中指令数据仅占 3.5%。 51 | 52 | --- 53 | ## 基准测试 🌟 54 | 55 | | Models | MMLU | CEVAL | GSM8K | ARC_CHALLENGE | GPQA | MATH | HUMANEVAL@1 | MBPP@1 | 56 | |-------------------------|-------|-------|-------|---------------|------|------|-------------|--------| 57 | | Qwen-2.5-1.5B-Instruct | 57.5 | 65.4 | 73.2 | 47.8 | 29.8 | 55.2 | 61.6 | 88.1 | 58 | | Llama3.2-3B-Instruct | 60 | 45.9 | 43.4 | 78.6 | 38.6 | 48 | 51.5 | 80.4 | 59 | | YuLan-Mini-Instruct | 53.6 | 50.5 | 82.3 | 51.8 | 30.1 | 55.2 | 67.7 | 85.7 | 60 | 61 | > 注意:模型大小的计算包含了嵌入层(embedding)的大小。 62 | 63 | | Models | Model Size | # Train Tokens | Context Length | MATH 500 | GSM 8K | Human Eval | MBPP | RACE Middle | RACE High | RULER | 64 | |:----------------|----------:|--------------:|--------------:|:--------|:------|:----------|:------|:-----------|:---------|:------| 65 | | MiniCPM | 2.6B | 1.06T | 4K | 15.00 | 53.83 | 50.00* | 47.31 | 56.61 | 44.27 | N/A | 66 | | Qwen-2 | 1.5B | 7T | 128K | 22.60 | 46.90* | 34.80* | 46.90* | 55.77 | 43.69 | 60.16 | 67 | | Qwen2.5 | 0.5B | 18T | 128K | 23.60 | 41.60* | 30.50* | 39.30* | 52.36 | 40.31 | 49.23 | 68 | | Qwen2.5 | 1.5B | 18T | 128K | **45.40** | **68.50\*** | 37.20* | 60.20* | **58.77** | 44.33 | 68.26 | 69 | | Gemma2 | 2.6B | 2T | 8K | 18.30* | 30.30* | 19.50* | 42.10* | - | - | N/A | 70 | | StableLM2 | 1.7B | 2T | 4K | - | 20.62 | 8.50* | 17.50 | 56.33 | **45.06** | N/A | 71 | | SmolLM2 | 1.7B | 11T | 8K | 11.80 | - | 23.35 | 45.00 | 55.77 | 43.06 | N/A | 72 | | Llama3.2 | 3.2B | 9T | 128K | 7.40 | - | 29.30 | 49.70 | 55.29 | 43.34 | **77.06** | 73 | | YuLan-Mini | 2.4B | 1.04T | 4K | 32.60 | 66.65 | 61.60 | **66.70** | 55.71 | 43.58 | N/A | 74 | | YuLan-Mini | 2.4B | 1.08T | 28K | 37.80 | 68.46 | **64.00** | 65.90| 57.18 | 44.57 | 51.48 | 75 | 76 | 77 | | Models | LAMBADA | MMLU | CMMLU | CEval | HellaSwag | WinoGrande | StoryCloze | ARC-e | ARC-c | 78 | |:----------------|:-------|:-----|:-----|:-----|:----------|:-----------|:-----------|:-----|:-----| 79 | | MiniCPM-2.6B | 61.91 | 53.37 | 48.97 | 48.24 | 67.92 | 65.74 | 78.51 | 55.51 | 43.86 | 80 | | Qwen2-1.5B | 64.68 | 55.90 | **70.76** | **71.94** | 66.11 | 66.14 | 77.60 | 62.21 | 42.92 | 81 | | Qwen2.5-0.5B | 52.00 | 47.50 | 52.17 | 54.27 | 50.54 | 55.88 | 71.67 | 56.10 | 39.51 | 82 | | Qwen2.5-1.5B | 62.12 | 60.71 | 67.82 | 69.05 | 67.18 | 64.48 | 76.80 | **71.51** | 53.41 | 83 | | Gemma2-2.6B | - | 52.20*| - | 28.00*| 74.60* | **71.50\*** | - | - | **55.70\***| 84 | | StableLM2-1.7B | 66.15 | 40.37 | 29.29 | 26.99 | 69.79 | 64.64 | 78.56 | 54.00 | 40.78 | 85 | | SmolLM2-1.7B | 67.42 | 51.91 | 33.46 | 35.10 | 72.96 | 67.40 | **79.32** | 44.82 | 35.49 | 86 | | Llama3.2-3B | **69.08** | **63.40** | 44.44 | 44.49 | **75.62** | 67.48 | 76.80 | 70.12 | 48.81 | 87 | | YuLan-Mini | 64.72 | 51.79 | 48.35 | 51.47 | 68.65 | 67.09 | 76.37 | 69.87 | 50.51 | 88 | | YuLan-Mini | 65.67 | 49.10 | 45.45 | 48.23 | 67.22 | 67.24 | 75.89 | 67.47 | 49.32 | 89 | 90 | --- 91 | 92 | ## 预训练资源 🔧 93 | 94 | 为了提高研究的透明度和可复现性,我们开源了相关的[预训练资源](https://github.com/RUC-GSAI/YuLan-Mini/blob/main/pretrain): 95 | 96 | ### 预训练 97 | 98 |
1. 预训练和评估代码 99 | 100 | 预训练代码可以在[这里](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain)找到。请注意,由于后续的代码修改,此代码可能无法直接运行,可能需要进行一些调整。 101 | 102 |

步骤 1:修改 trainer_state.json

103 |

由于 Hugging Face Trainer 的实现,某些参数存储在 trainer_state.json 文件中,无法通过 Trainer 的命令行参数进行修改。因此,您需要首先更新 trainer_state.json 文件中的这些参数,特别是:

104 | 108 |

以下是一个正确配置的 trainer_state.json 文件示例:

109 |
{
110 |   "best_metric": null,
111 |   "best_model_checkpoint": null,
112 |   "epoch": 0.0,
113 |   "eval_steps": 500,
114 |   "global_step": 0,
115 |   "is_hyper_param_search": false,
116 |   "is_local_process_zero": true,
117 |   "is_world_process_zero": true,
118 |   "log_history": [],
119 |   "logging_steps": 3,
120 |   "max_steps": 0,
121 |   "num_input_tokens_seen": 0,
122 |   "num_train_epochs": 0,
123 |   "save_steps": 250,
124 |   "stateful_callbacks": {
125 |     "TrainerControl": {
126 |       "args": {
127 |         "should_epoch_stop": false,
128 |         "should_evaluate": false,
129 |         "should_log": false,
130 |         "should_save": true,
131 |         "should_training_stop": true
132 |       },
133 |       "attributes": {}
134 |     }
135 |   },
136 |   "total_flos": 0,
137 |   "train_batch_size": 3,
138 |   "trial_name": null,
139 |   "trial_params": null
140 | }
141 | 
142 |
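作为参考,下面给出一段按上述示例以编程方式更新 trainer_state.json 的最小示意代码(检查点路径为占位符,各字段取值请按您自己的训练任务调整):

```python
import json
import pathlib

# 占位符路径:请替换为您自己的检查点目录
state_path = pathlib.Path("out/checkpoint-xxx/trainer_state.json")

state = json.loads(state_path.read_text())
state["logging_steps"] = 3
state["save_steps"] = 250
state["train_batch_size"] = 3
# 与上面的示例保持一致
state["stateful_callbacks"]["TrainerControl"]["args"]["should_training_stop"] = True

state_path.write_text(json.dumps(state, indent=2, ensure_ascii=False))
```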

步骤 2:在 DeepSpeed 配置中启用通用检查点

143 |

为了确保 DeepSpeed 集成加载通用检查点,您需要在 DeepSpeed 配置 JSON 文件中启用此功能。

144 |

以下是一个启用了通用检查点的 ZeRO2 配置示例:

145 |
{
146 |   "bf16": {
147 |     "enabled": "auto"
148 |   },
149 |   "zero_optimization": {
150 |     "stage": 2,
151 |     "allgather_partitions": true,
152 |     "allgather_bucket_size": 8e8,
153 |     "overlap_comm": true,
154 |     "reduce_scatter": true,
155 |     "reduce_bucket_size": 8e8,
156 |     "contiguous_gradients": true
157 |   },
158 |   "gradient_accumulation_steps": "auto",
159 |   "gradient_clipping": "auto",
160 |   "steps_per_print": 16,
161 |   "train_batch_size": "auto",
162 |   "train_micro_batch_size_per_gpu": "auto",
163 |   "wall_clock_breakdown": false,
164 |   "dump_state": true,
165 |   "optimizer": {
166 |     "type": "AdamW",
167 |     "params": {
168 |       "lr": "auto",
169 |       "betas": "auto",
170 |       "eps": "auto",
171 |       "weight_decay": "auto"
172 |     }
173 |   },
174 |   "checkpoint": {
175 |     "load_universal": true
176 |   }
177 | }
178 | 
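如果手头只有普通的 ZeRO 检查点,可以先用 DeepSpeed 自带的 ds_to_universal 脚本将其转换为通用检查点格式。下面是一个调用示意(脚本入口与参数在不同 DeepSpeed 版本中可能略有差异,目录名均为占位符):

```python
import subprocess

subprocess.run([
    "python", "-m", "deepspeed.checkpoint.ds_to_universal",
    "--input_folder", "out/checkpoint-xxx/global_stepXXX",             # 原始 ZeRO 检查点(占位符)
    "--output_folder", "out/checkpoint-xxx/global_stepXXX_universal",  # 转换后的通用检查点(占位符)
], check=True)
```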
179 |

步骤 3:恢复训练

180 |

调用 trainer.train 时,包含 resume_from_checkpoint 参数以从通用检查点加载分布式优化器状态并恢复训练。

181 |
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
182 | 
183 |

我们提供了一个内部训练框架供您参考,但您可以自由选择其他框架。
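把上述三步串联起来,一个最小的恢复训练示意如下(模型、玩具数据集与各项超参数均为演示用的假定取值,并非我们的实际配置;脚本需要通过 torchrun/deepspeed 启动,DeepSpeed 集成才会生效):

```python
from datasets import Dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# 演示用玩具数据集;实际训练请替换为您自己的已分词语料
train_dataset = Dataset.from_dict({
    "input_ids": [[1, 2, 3, 4]] * 8,
    "labels": [[1, 2, 3, 4]] * 8,
})

training_args = TrainingArguments(
    output_dir="out",
    deepspeed="ds2_config_adamw.json",  # 步骤 2 中启用了 load_universal 的配置
    bf16=True,
    logging_steps=3,
    save_steps=250,
    resume_from_checkpoint="out/checkpoint-xxx",  # 占位符:通用检查点目录
)

model = AutoModelForCausalLM.from_pretrained("yulan-team/YuLan-Mini", trust_remote_code=True)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# 从通用检查点加载分布式优化器状态并恢复训练
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
```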

184 | 185 |
186 | 187 |
2. 中间阶段检查点 188 | 中间阶段检查点发布在 YuLan-Mini 中。 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 |
| 阶段 | 课程阶段 | 4K 上下文 | 28K 上下文 | 优化器 | 推理架构 | LAMBADA Acc | GSM8K Acc | HumanEval pass@1 |
|------|----------|-----------|------------|--------|----------|-------------|-----------|------------------|
| 稳定 | 5 | YuLan-Mini-Phase5 | | | yulanmini | 53.85 | 3.41 | 12.26 |
| 稳定 | 10 | YuLan-Mini-Phase10 | | | yulanmini | 55.00 | 9.57 | 15.95 |
| 稳定 | 15 | YuLan-Mini-Phase15 | | | yulanmini | 55.81 | 13.81 | 16.99 |
| 稳定 | 20 | YuLan-Mini-Phase20 | | | yulanmini | 55.81 | 21.39 | 20.79 |
| 稳定 | 25 (1T tokens) | YuLan-Mini-Before-Annealing | | | yulanmini | 55.67 | 29.94 | 34.06 |
| 退火 | 26 | YuLan-Mini-4K | | | llama* | 64.72 | 66.65 | 61.60 |
| 退火 | 27 | | YuLan-Mini | | llama* | 65.67 | 68.46 | 64.00 |
295 | 296 | \*:为了更容易推理和部署,我们将重新参数化的附加参数和缩放因子合并到最终发布的模型中 ([**YuLan-Mini**](https://huggingface.co/yulan-team/YuLan-Mini) 和 **YuLan-Mini-Intermediate-4K**),使其能够在 Llama 架构上运行。但是,这些参数仍然保留在训练过程的中间检查点中。 297 | 298 |
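脚注 \* 中提到的合并在概念上很直接:把每个重参数化的缩放因子折叠进它所乘的权重矩阵即可。下面是一个玩具示意(并非 scripts/convert_yulanmini_to_llama.py 的实际实现,细节可能有所不同):

```python
import torch

def fold_scale(weight: torch.Tensor, alpha: float) -> torch.Tensor:
    """把 WeSaR 式的标量缩放因子折叠进对应的线性层权重,
    使该层在推理时表现为普通的 Linear,从而可按 Llama 架构加载。"""
    return weight * alpha

w = torch.randn(1920, 1920)          # 演示用权重(hidden_size=1920)
w_merged = fold_scale(w, alpha=0.5)  # alpha 仅为演示取值
```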
299 | 300 |
3. 退火前的优化器状态 301 | 302 | 🤗 YuLan-Mini-Before-Annealing 303 |
304 | 305 | ### 数据集 306 | 307 | 308 |
4. 使用的开源数据集 309 | 310 | 使用的开源数据集列表 311 | 312 |
313 | 314 |
5. 每个阶段的数据分布 315 | 316 | ⬇️ 点击查看更多详情: 317 | 318 |
319 | 320 |
321 |
322 | 323 |
324 | 325 |
6. 合成数据 326 | 327 | 数据清洗合成 流程: 328 | 329 |
330 | 331 |
332 | 333 | 我们使用的合成数据发布在 🤗 YuLan-Mini-Datasets 334 | 335 |
336 | 337 | 338 | ### 您可以使用这些预训练资源做什么 339 | 340 | 1. **预训练**您自己的 LLM。您可以使用[我们的数据](https://huggingface.co/yulan-team/YuLan-Mini-Datasets)和课程来训练一个与 YuLan-Mini 一样强大的模型。 341 | 2. 执行您自己的**学习率退火**。在退火阶段,YuLan-Mini 的学习能力达到顶峰。您可以从[退火前的检查点](https://huggingface.co/yulan-team/YuLan-Mini-Before-Annealing)恢复训练,并使用您自己的数据集进行学习率退火。 342 | 3. **微调** LLM 的 Instruct 版本。您可以使用 [YuLan-Mini](https://huggingface.co/yulan-team/YuLan-Mini) 基础模型来训练您自己的 Instruct 版本。 343 | 4. **训练动态**研究。您可以使用 YuLan-Mini 的[中间检查点](https://huggingface.co/collections/yulan-team/yulan-mini-676d214b24376739b00d95f3)来探索预训练过程中的内部变化。 344 | 5. **合成**您自己的数据。您可以使用 YuLan-Mini 的[数据流程](https://github.com/RUC-GSAI/YuLan-Mini)来清理和生成您自己的数据集。 345 | --- 346 | 347 | ## 快速开始 💻 348 | 349 | 以下是使用 Huggingface 的简单推理代码示例: 350 | 351 | **Huggingface 推理示例** 352 | ```python 353 | import torch 354 | from transformers import AutoTokenizer, AutoModelForCausalLM 355 | 356 | # Load model and tokenizer 357 | tokenizer = AutoTokenizer.from_pretrained("yulan-team/YuLan-Mini-Instruct") 358 | model = AutoModelForCausalLM.from_pretrained("yulan-team/YuLan-Mini-Instruct", torch_dtype=torch.bfloat16) 359 | 360 | # Input text 361 | chat = [ 362 | {"role": "system", "content": "You are YuLan-Mini, created by RUC AI Box. You are a helpful assistant."}, 363 | {"role": "user", "content": "What is Renmin University of China?"} 364 | ] 365 | formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) 366 | inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False) 367 | 368 | # Completion 369 | output = model.generate(inputs["input_ids"], max_new_tokens=100, temperature=0.5) 370 | print(tokenizer.decode(output[0][inputs['input_ids'].size(1):], skip_special_tokens=True)) 371 | ``` 372 | 373 | **vLLM部署示例** 374 | ```bash 375 | vllm serve yulan-team/YuLan-Mini-Instruct --dtype bfloat16 376 | ``` 377 | 378 | **SGLang部署示例** 379 | ```bash 380 | python -m sglang.launch_server --model-path yulan-team/YuLan-Mini-Instruct --port 30000 --host 0.0.0.0 381 | ``` 382 | 383 | **Ollama部署示例** 384 | ```bash 385 | ollama run hf.co/mradermacher/YuLan-Mini-Instruct-GGUF:IQ4_XS 386 | ``` 387 | 388 | --- 389 | 390 | ## 贡献 391 | 392 | 我们欢迎任何形式的贡献,包括模型错误案例的反馈、功能建议和示例贡献。您可以通过提交[issue](https://github.com/RUC-GSAI/YuLan-Mini/issues)来贡献。 393 | 394 | ## 许可协议 395 | 396 | - 本仓库代码使用 [MIT License](./LICENSE)。 397 | - 局限性:尽管我们尝试减少模型在使用中可能出现的安全性问题,并鼓励模型生成符合道德和法律要求的文本,但由于语言模型基于概率生成的范式,模型仍然可能会产生意外的输出。例如,生成的响应可能包含偏见、歧视或其他有害内容。请不要传播此类内容。我们对因传播有害信息而造成的任何后果不承担任何责任。 398 | 399 | ## 引用 400 | 401 | 如果您发现 YuLan-Mini 对您的研究或开发有帮助,请引用我们的[技术报告](https://arxiv.org/abs/2412.17743): 402 | 403 | ```BibTex 404 | @article{hu2024yulan, 405 | title={YuLan-Mini: An Open Data-efficient Language Model}, 406 | author={Hu, Yiwen and Song, Huatong and Deng, Jia and Wang, Jiapeng and Chen, Jie and Zhou, Kun and Zhu, Yutao and Jiang, Jinhao and Dong, Zican and Zhao, Wayne Xin and others}, 407 | journal={arXiv preprint arXiv:2412.17743}, 408 | year={2024} 409 | } 410 | ``` 411 | -------------------------------------------------------------------------------- /assets/YuLan-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/YuLan-logo.jpg -------------------------------------------------------------------------------- /assets/YuLan-logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/YuLan-logo.png -------------------------------------------------------------------------------- /assets/data-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/data-pipeline.png -------------------------------------------------------------------------------- /assets/data-preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/data-preview.png -------------------------------------------------------------------------------- /assets/data_distribution_for_every_phase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/data_distribution_for_every_phase.png -------------------------------------------------------------------------------- /assets/main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/main.png -------------------------------------------------------------------------------- /assets/training-stability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/assets/training-stability.png -------------------------------------------------------------------------------- /post_train/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 1. Introduction 5 | 6 | We introduce **YuLan-Mini-Instruct**, a compact yet powerful model with 2.4 billion parameters. YuLan-Mini-Instruct represents a post-training adaptation of [YuLan-Mini](https://arxiv.org/abs/2412.17743) base model. By leveraging efficient training on both open and synthetic data, the model achieves performance comparable to mainstream models (such as Qwen-1.5b and LLaMA-3B), without compromising its core capabilities in math and code. 7 | YuLan-Mini-Instruct is pre-trained on 1.08 trillion tokens and further enhanced through a post-training pipeline that incorporates Supervised Fine-Tuning, Preference Finetuning, and Reinforcement Learning to maximize its capabilities. 8 | Extensive evaluations demonstrate that YuLan-Mini-Instruct achieves state-of-the-art performance, demonstrating competitive capabilities with leading industrial counterparts across multiple domains including instruction-following, math, code, and reasoning tasks. The model is available at https://huggingface.co/yulan-team/YuLan-Mini-Instruct. 9 | 10 | ## 2. Model Overview 11 | 12 | **YuLan-Mini-Instruct** exhibits competitive performance compared to similarly sized models, particularly in reasoning tasks involving math and code. 13 | We evaluate YuLan-Mini-Instruct against other models from the frontier series with similar parameter sizes, including Qwen2.5-1.5B-Instruct and Llama3.2-3B-Instruct. 14 | 15 | ![result](img/result.png) 16 | 17 | ## 3. 
Supervised Finetuning 18 | 19 | ### Data 20 | #### Data Source 21 | We meticulously select two primary data sources: 1) open-source data and 2) synthetic data, ensuring that our model achieves comprehensive knowledge coverage and robust domain-specific expertise. 22 | 23 | ##### Open-source data 24 | We curate a collection of high-quality open-source datasets spanning multiple fields and subjects. These datasets are either constructed by powerful proprietary models like GPT-4 or crafted by human experts, providing a robust foundation for SFT. 25 | 26 | ##### Synthetic data 27 | Given the scarcity of high-quality domain-specific data, synthetic data plays a crucial role in improving the model's professional capabilities. Consequently, we also synthesize a substantial corpus of targeted data across domains such as mathematics, science, and Chinese commonsense reasoning. 28 | 29 | * **Mathematics**. Distilling from powerful models provides an efficient approach for data construction, especially in the domain of mathematics. However, in addition to selecting an appropriate teacher model, the choice of high-quality and domain-specific initial instructions is equally crucial. Consequently, we adopt Qwen2.5-Math-7B-Instruct, known for its strong mathematical capabilities, as the teacher model. For the initial distillation samples, we select questions of varying difficulty from OpenMathInstruct2, a dataset containing over 10 million high-quality math question-answer pairs. Each distilled response is matched against the ground truth to ensure the precision of the answer. 30 | 31 | * **Science**. To develop high-quality scientific datasets, we first collect scientific texts from a diverse array of reputable sources, such as Wiki, Reddit, and arXiv. These texts cover a broad spectrum of disciplines, including physics, chemistry, biology, and psychology. After the collection phase, we use Qwen2.5-32B-Instruct to extract questions from the texts and generate appropriate answers. Finally, the generated question-answer pairs are reclassified using a topic classifier to ensure accurate alignment with the respective scientific categories. 32 | 33 | * **Chinese commonsense reasoning**. For the construction of Chinese commonsense reasoning data, we collect data from the authoritative source Baidu Baike and authentic user interactions from prominent platforms such as WildChat and SmolTalk-Chinese. Using TF-IDF and keyword matching, we identify and filter content with high semantic relevance, from which we generate multiple question-answer pairs grounded in knowledge-intensive documents. To maintain rigorous quality standards, we employ the CMMLU classifier for topic categorization and LLM-based evaluation to assess response quality. To prevent hallucination, we cross-reference entity-based queries with encyclopedia entries and maintain a mapping list for consistent answers to similar questions. 34 | 35 | 36 | 37 | #### Data Filtering 38 | During the SFT phase, the quality of training data plays a pivotal role in model performance. Therefore, we design a comprehensive data filtering pipeline incorporating multiple quality control strategies, ensuring that the refined dataset maximally contributes to the model's effectiveness and reliability. 39 | 40 | ##### De-duplication 41 | During the data filtration process, deduplication represents a critical preprocessing step, as the presence of redundant data samples can significantly constrain the model's ability to learn diverse patterns, thereby compromising training efficiency. To address this challenge, we adopt the MinHash and Locality-Sensitive Hashing (LSH) algorithms, which enable efficient identification and removal of near-duplicate content. 42 | 43 |
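As an illustration, a minimal near-duplicate filter along these lines can be built with the `datasketch` library (a sketch under assumed settings: 128 permutations and a 0.8 Jaccard threshold are illustrative, not our production parameters):

```python
from datasketch import MinHash, MinHashLSH

def minhash_of(text: str, num_perm: int = 128) -> MinHash:
    m = MinHash(num_perm=num_perm)
    for token in set(text.split()):  # word shingles; real pipelines often use n-grams
        m.update(token.encode("utf-8"))
    return m

samples = [
    "Explain the Pythagorean theorem in simple terms.",
    "Explain the Pythagorean theorem in simple terms.",  # exact repeat, dropped below
    "Write a Python function that reverses a string.",
]

lsh = MinHashLSH(threshold=0.8, num_perm=128)  # 0.8 ~ near-duplicate Jaccard similarity
kept = []
for i, text in enumerate(samples):
    m = minhash_of(text)
    if not lsh.query(m):            # no sufficiently similar earlier sample found
        lsh.insert(f"doc-{i}", m)
        kept.append(text)

print(kept)  # the repeated instruction appears only once
```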
##### Heuristic filtering 44 | Anomalous patterns in training data, including but not limited to repetitive punctuation or excessive garbled text, have been empirically demonstrated to adversely affect model performance. To avoid this, we develop a comprehensive suite of rule-based filtration heuristics to eliminate low-quality samples that could potentially harm the model. 45 | 46 | ##### Quality-based scoring 47 | LLM-as-a-Judge is an effective method for evaluating data quality. Specifically, we use Qwen2.5-32B-Instruct as the judge to score each piece of data on three dimensions: Instruction Following, Informativeness, and Truthfulness, with scores ranging from 1 to 5. We then sort the data according to the average score across these three dimensions and discard samples from the lower percentile. 48 | 49 | ##### Complexity-based selection 50 | Besides low data quality, excessive data complexity can significantly impair learning efficacy. This phenomenon is particularly pronounced in smaller-scale language models, which demonstrate limited capacity to effectively process and internalize highly complex patterns. Consequently, we develop an approach to identify and filter training samples that exceed the model's optimal learning capacity (a code sketch of this scoring appears at the end of this section). The complexity of each instruction is measured using the following equation: 51 | 52 | ```math 53 | \text{C}(x,y) = \lambda_1 \cdot L_{\text{length}} + \lambda_2 \cdot \text{Loss}_{\text{it}}(x, y), 54 | ``` 55 | 56 | where $\lambda_1$, $\lambda_2$ are hyperparameters, $L_{\text{length}}$ denotes the length of the instruction, and $\text{Loss}_{\text{it}}(x, y)$ is the negative log-likelihood loss calculated by the base model: 57 | 58 | ```math 59 | \text{Loss}_{\text{it}}(x,y)=-\sum\limits_{i=1}^{|y|} \log P(y_i|x,y_{1:i-1}), 60 | ``` 61 | where $y_i$ represents the $i$-th token in the output $y$, and $y_{1:i-1}$ denotes the first $i-1$ tokens of the output. 62 | We implement a complexity-based stratification protocol followed by selective pruning of samples exceeding empirically determined complexity thresholds. 63 | 64 | 65 | #### Data Mix 66 | The mere aggregation of diverse data types during the SFT phase may lead to data conflicts, potentially degrading model performance in specific domains. To mitigate this issue, we allocate data proportions based on each source's characteristics, balancing general and domain-specific data. Moreover, through extensive incremental experiments, we also dynamically adjust the data ratio in real time according to training performance and feedback to achieve optimal results. The detailed data proportions utilized are listed in the following table. 67 | 68 | | **Category** | **Count** | **Ratio** | 69 | |---------------|-----------|-----------| 70 | | General English | 3.2M | 39% | 71 | | General Chinese | 3M | 36% | 72 | | Math | 1.8M | 22% | 73 | | Code | 0.2M | 3% | 74 | | Total | 8.2M | 100% | 75 | 76 | 77 | 78 | ### Recipe 79 | #### Training Settings 80 | The model training is conducted on a distributed computing infrastructure comprising four nodes, each equipped with eight NVIDIA A800 GPUs interconnected through a high-speed network. We use an effective batch size of 512 and a maximum sequence length of 28K tokens. The optimization process is carried out over 2 epochs with a learning rate of 1e-5. 81 | 82 |
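As referenced under complexity-based selection above, here is a minimal sketch of the scoring rule $C(x,y)$ (the $\lambda$ values and the cutoff are illustrative placeholders, and for simplicity the Hugging Face mean token loss stands in for the summed NLL):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("yulan-team/YuLan-Mini")
lm = AutoModelForCausalLM.from_pretrained("yulan-team/YuLan-Mini",
                                          torch_dtype=torch.bfloat16)
lm.eval()

def complexity(x: str, y: str, lam1: float = 0.001, lam2: float = 1.0) -> float:
    """C(x, y) = lam1 * L_length + lam2 * Loss_it(x, y), as defined above."""
    prompt_len = tok(x, return_tensors="pt").input_ids.shape[1]
    ids = tok(x + y, return_tensors="pt").input_ids
    labels = ids.clone()
    labels[:, :prompt_len] = -100          # score only the response tokens
    with torch.no_grad():
        nll = lm(ids, labels=labels).loss  # mean NLL over response tokens
    return lam1 * prompt_len + lam2 * nll.item()

pairs = [("What is 2 + 2?", " 4."),
         ("Prove that there are infinitely many primes.", " Suppose not...")]
kept = [p for p in pairs if complexity(*p) < 5.0]  # 5.0 is a placeholder threshold
```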
## 4. Preference Finetuning 83 | 84 | ### Preference Data 85 | 86 | Our preference data comprises a combination of off-policy and on-policy data sources. 87 | 1) **Off-policy data**: We aggregate a collection of high-quality publicly available datasets to form this component. 88 | 2) **On-policy data**: We systematically curate specialized datasets encompassing diverse domains, including scientific problems, mathematics, programming, Chinese common knowledge, hallucinations, and self-awareness evaluation. 89 | 90 | Following the collection of instruction data, we sample completions from the SFT model. For instructions with objectively verifiable answers, we extract responses using an LLM, where correct responses are designated as chosen responses, while incorrect responses are classified as rejected responses. For the remaining instructions, we employ the reward model *Skywork/Skywork-Reward-Llama-3.1-8B-v0.2* to assign scores. High-scoring responses are marked as chosen, while low-scoring ones are classified as rejected. 91 | 92 | The final composition of the constructed preference dataset is as follows: 93 | 94 | | Category | Count | Ratio | 95 | |---------------|-----------|-----------| 96 | | Off-policy | 258K | 69\% | 97 | | On-policy | 115K | 31\% | 98 | | Total | 373K | 100\% | 99 | 100 | 101 | ### Preference Tuning Recipe 102 | 103 | Based on the SFT model, we employ Direct Preference Optimization (DPO) to align the model with human preferences. The model is trained for 2 epochs with a batch size of 128 and a learning rate of 5e-7, with $\beta$ set to 0.1. This configuration demonstrates effective convergence during training while maintaining a balance between training efficiency and computational resource consumption. 104 | 105 | 106 | 107 | ## 5. Reinforcement Learning 108 | 109 | Building upon the DPO model, we further enhance our model's performance and alignment with human preferences through Proximal Policy Optimization (PPO). 110 | 111 | ### Data Preparation 112 | 113 | For PPO training, we extract 10,000 challenging instructions from the DPO dataset. These instructions, representing diverse task categories, effectively optimize the model's performance across various scenarios. 114 | 115 | ### Reward Model 116 | 117 | Considering the relatively small parameter count of our model and the substantial computational resources required for training a high-performance reward model, we opt to leverage an open-source reward model to provide reward signals during the training process. Specifically, we use the Skywork-Reward-Llama-3.1-8B-v0.2 model, which has demonstrated robust performance in reward modeling. 118 | 119 | ### Training Recipe 120 | 121 | The training process is conducted on 8xA800 GPUs for four epochs. We conduct training with the OpenRLHF framework, which provides a flexible environment for reinforcement learning tasks. During the training phase, we encounter several technical challenges, such as reward hacking phenomena that manifest as training instability and output length collapse. To address these issues, we implement a series of mitigation strategies: 122 | 123 | - **Critic Model Initialization and Actor Parameter Freezing.** The critic model is initialized from the reward model. Additionally, the actor model's parameters are kept frozen during the initial 15 training steps, allowing the critic model to learn accurate value estimates.
124 | - **Adaptive KL Controller.** We adopt an Adaptive KL Controller that dynamically adjusts the KL coefficient (beta) based on the KL divergence target, balancing exploration and exploitation during training. 125 | - **Rule-based Penalties for Reward Hacking.** We apply rule-based penalties to address reward hacking patterns, such as incomplete sentences (which also helps penalize truncated responses, which often contain redundant patterns in our smaller model), mixed-language responses (e.g., English prompts answered with Chinese characters), and single-sentence responses like "I hope this helps,". These identified patterns represent a subset of the systematic anomalies observed during our extensive evaluation process, which received high reward scores despite their lack of substantive content across diverse queries. 126 | 127 | The specific hyperparameters used in PPO training are detailed in the table below. 128 | 129 | | **Hyperparameters** | **Value** | 130 | |-----------------------------------------|------------------------| 131 | | Actor Learning Rate | $1 \times 10^{-6}$ | 132 | | Critic Learning Rate | $1 \times 10^{-5}$ | 133 | | Training Batch Size | 128 | 134 | | Freezing Actor Steps | 15 | 135 | | Number of Episodes | 4 | 136 | | Samples per Prompt | 4 | 137 | | Prompt Maximum Length | 2048 | 138 | | Generate Maximum Length | 2048 | 139 | | Initial KL Coefficient | 0.01 | 140 | | KL Target | 0.06 | 141 | | Generalized Advantage Estimation λ | 0.95 | 142 | | Discount Factor γ | 1.0 | 143 | | Generation Temperature | 1.0 | 144 | | Reward Clip Range | (-20, 10) | 145 | | Learning Rate Warmup Ratio | 0 | 146 | | Learning Rate Scheduler | constant | 147 | 148 | 149 | ## 6. Evaluation 150 | 151 | We evaluate the post-trained YuLan-Mini-Instruct models on various benchmark tasks, keeping the setup consistent with our pre-training evaluations, and release the generated evaluation data. 152 | 153 | To comprehensively evaluate the performance of YuLan-Mini-Instruct, we conducted a rigorous comparative analysis against other models with similar scales and capabilities. For each benchmark task, we consistently select the optimal performance score (either from our empirical evaluations or published results) for comparison. 154 | 155 | We utilize the metrics outlined in the following table, where higher scores are consistently preferred. 156 | 157 | | **Core Skill** | **Benchmark** | 158 | | -------------- | ------------------------------ | 159 | | **Knowledge** | MMLU (0 shot, CoT) | 160 | | | MMLU (5 shot) | 161 | | | TruthfulQA(0 shot) | 162 | | **Reasoning** | ARC(0 shot) | 163 | | | GPQA(5 shot) | 164 | | **Math** | MATH(0 shot, CoT) | 165 | | | GSM8K(8 shot) | 166 | | | GSM8K(0 shot, CoT) | 167 | | **Code** | HumanEval(pass@1) | 168 | | | HumanEval+(pass@1) | 169 | | | HumanEval(pass@10) | 170 | | | HumanEval+(pass@10) | 171 | | | MBPP(pass@1) | 172 | | | MBPP+(pass@1) | 173 | | | MBPP(pass@10) | 174 | | | MBPP+(pass@10) | 175 | 176 | 177 | ### General Knowledge 178 | 179 | We adopt MMLU and TruthfulQA as benchmarks to assess YuLan-Mini-Instruct's performance in knowledge-based question answering. For MMLU, we report the macro average of subtask accuracy under the 5-shot standard setting without CoT and the 0-shot standard setting with CoT. For TruthfulQA, we report the macro average of subtask accuracy under the 0-shot standard setting without CoT. As shown in the table, YuLan-Mini-Instruct demonstrates comparable performance to both the Qwen2.5 and Llama3.2 models across these knowledge-oriented tasks.
180 | 181 | ### Reasoning 182 | 183 | We evaluate the reasoning capabilities of YuLan-Mini-Instruct using the ARC benchmark. The experimental results demonstrate that YuLan-Mini-Instruct achieves superior performance compared to Qwen2.5, while maintaining intermediate performance levels among the three evaluated models. 184 | 185 | ### Math 186 | 187 | We evaluate YuLan-Mini-Instruct's mathematical reasoning capabilities using the MATH and GSM8K benchmarks. For MATH, we report macro-averaged subtask accuracy under the 0-shot setting with CoT. For GSM8K, we report macro-averaged accuracy under both 0-shot (with CoT) and 8-shot settings. Experimental results demonstrate that YuLan-Mini-Instruct achieves superior mathematical performance compared to Llama3.2, despite its significantly smaller model size. 188 | 189 | ### Code 190 | 191 | We evaluate code generation capabilities across four established benchmarks: HumanEval, HumanEvalPlus, MBPP, and MBPPPlus. Experimental results demonstrate that YuLan-Mini-Instruct achieves superior performance across all benchmarks, outperforming comparable models in code generation tasks. 192 | 193 | 194 | 195 | | **Benchmarks** | **YuLan-Mini-Instruct** | **Llama3.2-3B-Instruct** | **Qwen-2.5-1.5B-Instruct** | 196 | | ------------------------------ | ----------------------- | ------------------------ | -------------------------- | 197 | | MMLU (0 shot, CoT) | 53.6 | **60** | 57.4 | 198 | | MMLU (5 shot) | 52.7 | 63.4 | **66.5** | 199 | | TruthfulQA(0 shot) | 50.1 | 49.7 | **58.8** | 200 | | ARC(0 shot) | 51.8 | **78.6** | 47.8 | 201 | | GPQA(5 shot) | 30.1 | **32.6** | 29.8 | 202 | | MATH(0 shot, CoT) | **55.2** | 48.0 | **55.2** | 203 | | GSM8K(8 shot) | **81.8** | 43.4 | 73.2 | 204 | | GSM8K(0 shot, CoT) | **71.7** | 66.0 | 69.4 | 205 | | HumanEval(pass@1) | **67.7** | 51.5 | 61.6 | 206 | | HumanEval+(pass@1) | **61.6** | 45.2 | 47.0 | 207 | | HumanEval(pass@10) | **86.6** | 78.7 | 84.1 | 208 | | HumanEval+(pass@10) | **80.5** | 72.0 | 78.0 | 209 | | MBPP(pass@1) | **66.7** | 57.4 | 63.2 | 210 | | MBPP+(pass@1) | **56.6** | 47.8 | 52.0 | 211 | | MBPP(pass@10) | 85.7 | 80.4 | **88.1** | 212 | | MBPP+(pass@10) | 75.4 | 71.2 | **77.5** | 213 | 214 | 215 | ## 7. Conclusion, Limitation, and Future Work 216 | 217 | 218 | 219 | We propose YuLan-Mini-Instruct, a powerful small-scale language model with 2.4 billion parameters, refined through a complete post-training process combining SFT, DPO, and PPO. Although YuLan-Mini-Instruct demonstrates limitations on knowledge-intensive benchmarks such as MMLU, our experimental results indicate that it exhibits competitive performance on several general-purpose tasks. We anticipate that our empirical findings will contribute to the development of more robust and generalizable LLMs. Future research directions will focus on enhancing dataset diversity and comprehensiveness through expanded training data collection to improve reasoning capabilities. 220 | 221 | 222 | ## Contributors 223 | 224 | ### YuLan-Mini-Instruct Team 225 | 226 | Authors are listed in alphabetical order. 227 | 228 | **Core Contributors**: 229 | 230 | Fei Bai, Yanzipeng Gao, Yukai Gu, Yihong Liu, Shuang Sun, Chenghao Wu 231 | 232 | **Contributors**: 233 | 234 | Zhipeng Chen, Yiwen Hu, Yingqian Min, Ruiyang Ren, Huatong Song, Ji-Rong Wen, Xin Zhao, Kun Zhou, Yutao Zhu 235 | 236 | ## Reference 237 | 238 | Please kindly cite our reports if they are helpful for your research.
239 | 240 | ``` 241 | @article{YuLan-Mini-Instruct, 242 | title={YuLan-Mini-Instruct Technical Report}, 243 | author={RUCAIBox YuLan-Mini-Instruct Team}, 244 | url={https://github.com/RUC-GSAI/YuLan-Mini}, 245 | year={2025} 246 | } 247 | ``` 248 | -------------------------------------------------------------------------------- /post_train/img/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/post_train/img/result.png -------------------------------------------------------------------------------- /pretrain/README.md: -------------------------------------------------------------------------------- 1 | # Pre-Training Resources 🔧 2 | 3 | To enhance research transparency and reproducibility, we are open-sourcing relevant pre-training resources: 4 | 5 | ### Pre-Training 6 | 7 | 8 |
1. Pre-training and Evaluation Code 9 | 10 | The pre-training code can be found [here](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain). Note that due to subsequent code modifications, this code may not run directly and may require some adjustments. 11 | 12 |

Key Features:

13 |
    14 |
  1. Stability: We adopted muP initialization and scaling factors, as well as the re-parameterization method of WeSaR, achieving training stability without significantly increasing training time. For details, see our technical report.
 15 |   2. Training efficiency: By using the flash_attn and liger_kernel libraries, we achieved 51% MFU (for comparison, Megatron reaches only about 41% MFU on small models of the same scale; a rough MFU estimation sketch follows the directory tree below).
 16 |   3. Data curriculum: We modified the HF Trainer to make it suitable for training across successive curriculum phases and the different decay functions of WSD.
 17 |   4. Other features: support for automatic restarts of torchrun training, wandb logging of hidden states to monitor training stability, and other attempts (such as QK-LayerNorm, Embedding Gradient Shrink, etc.).
 18 | 
19 | 20 |
├── train.py  # 👈🏻 The main training script
 21 | ├── train.sh  # 👈🏻 The main training script for each curriculum phase
 22 | ├── yulanmini-2B-final-phase25.sh  # 👈🏻 example script for phase 25
 23 | ├── yulanmini-2B-s25d-decay80-1sqrt-long-28k-final-phase26.sh  # 👈🏻 example script for phase 26
 24 | ├── ds2_config_adamw.json  # The DeepSpeed configuration file
 25 | ├── setup.sh  # The setup script for the training environment
 26 | ├── torchrun_wrapper.sh  # The wrapper script for torchrun
 27 | ├── train_utils.py  # The training utility functions
 28 | └── yulanmini_trainer.py  # 👈🏻 The Trainer class for training
 29 | 
30 | 31 |
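For reference, MFU can be estimated from throughput alone, in the spirit of `scripts/estimate_mfu.py` (a minimal sketch; the token throughput and GPU count below are illustrative placeholders, not our measured values):

```python
def estimate_mfu(n_params: float, tokens_per_sec: float, n_gpus: int,
                 peak_flops_per_gpu: float = 312e12) -> float:
    """MFU = achieved FLOP/s over peak FLOP/s, using the ~6N training
    FLOPs-per-token approximation (A800 bf16 peak is ~312 TFLOP/s)."""
    achieved_flops = 6 * n_params * tokens_per_sec
    return achieved_flops / (n_gpus * peak_flops_per_gpu)

# e.g. a 2.4B-parameter model pushing 550k tokens/s across 48 GPUs
print(f"{estimate_mfu(2.4e9, 5.5e5, 48):.1%}")  # ~52.9%
```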

Continual Training Tutorial:

32 |

Step 1: Modify the trainer_state.json

33 |

Due to the implementation of Hugging Face Trainer, certain parameters are stored in the trainer_state.json file and cannot be modified through the Trainer's command-line arguments. Therefore, you need to update these parameters in the trainer_state.json file first, particularly:

34 | 38 |

Below is an example of a properly configured trainer_state.json file:

39 |
{
 40 |   "best_metric": null,
 41 |   "best_model_checkpoint": null,
 42 |   "epoch": 0.0,
 43 |   "eval_steps": 500,
 44 |   "global_step": 0,
 45 |   "is_hyper_param_search": false,
 46 |   "is_local_process_zero": true,
 47 |   "is_world_process_zero": true,
 48 |   "log_history": [],
 49 |   "logging_steps": 3,
 50 |   "max_steps": 0,
 51 |   "num_input_tokens_seen": 0,
 52 |   "num_train_epochs": 0,
 53 |   "save_steps": 250,
 54 |   "stateful_callbacks": {
 55 |     "TrainerControl": {
 56 |       "args": {
 57 |         "should_epoch_stop": false,
 58 |         "should_evaluate": false,
 59 |         "should_log": false,
 60 |         "should_save": true,
 61 |         "should_training_stop": true
 62 |       },
 63 |       "attributes": {}
 64 |     }
 65 |   },
 66 |   "total_flos": 0,
 67 |   "train_batch_size": 3,
 68 |   "trial_name": null,
 69 |   "trial_params": null
 70 | }
 71 | 
72 |
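For convenience, the edits above can also be applied programmatically; a minimal sketch (the checkpoint path is a placeholder, and the values should be adjusted to your own run):

```python
import json
import pathlib

state_path = pathlib.Path("out/checkpoint-xxx/trainer_state.json")  # placeholder path

state = json.loads(state_path.read_text())
state["logging_steps"] = 3
state["save_steps"] = 250
state["train_batch_size"] = 3
# matches the example above
state["stateful_callbacks"]["TrainerControl"]["args"]["should_training_stop"] = True

state_path.write_text(json.dumps(state, indent=2))
```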

Step 2: Enable Universal Checkpointing in the DeepSpeed Configuration

73 |

To ensure DeepSpeed Integration loads the Universal Checkpoint, you need to enable this feature in the DeepSpeed configuration JSON file.

74 |

Here is an example of a ZeRO2 configuration with Universal Checkpointing enabled:

75 |
{
 76 |   "bf16": {
 77 |     "enabled": "auto"
 78 |   },
 79 |   "zero_optimization": {
 80 |     "stage": 2,
 81 |     "allgather_partitions": true,
 82 |     "allgather_bucket_size": 8e8,
 83 |     "overlap_comm": true,
 84 |     "reduce_scatter": true,
 85 |     "reduce_bucket_size": 8e8,
 86 |     "contiguous_gradients": true
 87 |   },
 88 |   "gradient_accumulation_steps": "auto",
 89 |   "gradient_clipping": "auto",
 90 |   "steps_per_print": 16,
 91 |   "train_batch_size": "auto",
 92 |   "train_micro_batch_size_per_gpu": "auto",
 93 |   "wall_clock_breakdown": false,
 94 |   "dump_state": true,
 95 |   "optimizer": {
 96 |     "type": "AdamW",
 97 |     "params": {
 98 |       "lr": "auto",
 99 |       "betas": "auto",
100 |       "eps": "auto",
101 |       "weight_decay": "auto"
102 |     }
103 |   },
104 |   "checkpoint": {
105 |     "load_universal": true
106 |   }
107 | }
108 | 
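If you only have a regular ZeRO checkpoint, DeepSpeed ships a `ds_to_universal` conversion script that produces the universal format this option expects. A call sketch (the script entry point and folder names are assumptions and may vary across DeepSpeed versions):

```python
import subprocess

subprocess.run([
    "python", "-m", "deepspeed.checkpoint.ds_to_universal",
    "--input_folder", "out/checkpoint-xxx/global_stepXXX",             # original ZeRO checkpoint (placeholder)
    "--output_folder", "out/checkpoint-xxx/global_stepXXX_universal",  # converted universal checkpoint (placeholder)
], check=True)
```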
109 |

Step 3: Resume Training

110 |

When calling trainer.train, include the resume_from_checkpoint argument to load the distributed optimizer state from the Universal Checkpoint and resume training.

111 |
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
112 | 
113 |

We provide an internal training framework for your reference, but you are free to choose other frameworks.
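Putting the three steps together, a minimal resume sketch looks like this (the toy dataset and hyperparameters are illustrative placeholders rather than our actual setup, and the script must be launched via `torchrun`/`deepspeed` for the DeepSpeed integration to take effect):

```python
from datasets import Dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Toy dataset for illustration; substitute your own tokenized corpus
train_dataset = Dataset.from_dict({
    "input_ids": [[1, 2, 3, 4]] * 8,
    "labels": [[1, 2, 3, 4]] * 8,
})

training_args = TrainingArguments(
    output_dir="out",
    deepspeed="ds2_config_adamw.json",  # the Step 2 config with load_universal enabled
    bf16=True,
    logging_steps=3,
    save_steps=250,
    resume_from_checkpoint="out/checkpoint-xxx",  # placeholder universal checkpoint dir
)

model = AutoModelForCausalLM.from_pretrained("yulan-team/YuLan-Mini", trust_remote_code=True)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Loads the distributed optimizer state from the universal checkpoint and resumes
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
```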

114 | 115 |
116 | 117 |
2. Intermediate Stage Checkpoints 118 | The intermediate stage checkpoints are released in YuLan-Mini. 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 |
| Stage | Curriculum Phase | 4K Context | 28K Context | Optimizer | Inference Architecture | LAMBADA Acc | GSM8K Acc | HumanEval pass@1 |
|-------|------------------|------------|-------------|-----------|------------------------|-------------|-----------|------------------|
| Stable | 5 | YuLan-Mini-Phase5 | | | yulanmini | 53.85 | 3.41 | 12.26 |
| Stable | 10 | YuLan-Mini-Phase10 | | | yulanmini | 55.00 | 9.57 | 15.95 |
| Stable | 15 | YuLan-Mini-Phase15 | | | yulanmini | 55.81 | 13.81 | 16.99 |
| Stable | 20 | YuLan-Mini-Phase20 | | | yulanmini | 55.81 | 21.39 | 20.79 |
| Stable | 25 (1T tokens) | YuLan-Mini-Before-Annealing | | | yulanmini | 55.67 | 29.94 | 34.06 |
| Annealing | 26 | YuLan-Mini-4K | | | llama* | 64.72 | 66.65 | 61.60 |
| Annealing | 27 | | YuLan-Mini | | llama* | 65.67 | 68.46 | 64.00 |
225 | 226 | \*: For easier inference and deployment, we merged the re-parameterized added parameters and scaling factors into the final released models ([**YuLan-Mini**](https://huggingface.co/yulan-team/YuLan-Mini) and **YuLan-Mini-Intermediate-4K**), enabling it to run on the Llama architecture. However, these parameters are still retained in the intermediate checkpoints from the training process. 227 | 228 |
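The merge described in \* is conceptually straightforward: each re-parameterized scaling factor is folded into the weight matrix it multiplies. A toy illustration (not the actual logic of `scripts/convert_yulanmini_to_llama.py`, whose details may differ):

```python
import torch

def fold_scale(weight: torch.Tensor, alpha: float) -> torch.Tensor:
    """Fold a WeSaR-style scalar multiplier into the linear weight it scales,
    so the layer behaves like a plain Linear and loads under the Llama arch."""
    return weight * alpha

w = torch.randn(1920, 1920)          # demo weight (hidden_size = 1920)
w_merged = fold_scale(w, alpha=0.5)  # illustrative alpha value
```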
229 | 230 |
3. Optimizer States Before Annealing 231 | 232 | YuLan-Mini-Before-Annealing 233 |
234 | 235 | ### Datasets 236 | 237 | 238 |
4. The Used Open-Source Datasets 239 | 240 | Used-Datasets-List 241 | 242 |
243 | 244 |
5. Data Distribution for every phase 245 | 246 | 247 |
248 | 249 |
250 |
251 | 252 |
253 | 254 |
6. Data Preprocessing and Synthesis Pipeline 255 | 256 | The synthetic data we are using is released in YuLan-Mini-Datasets 257 | 258 | 259 | We also released the data preprocessing (including data formatting, filtering, tokenization, and mixing) and synthesis pipeline for your reference. 260 | 261 | 262 |
263 | 264 |
265 | 266 |
267 | 268 | 269 | ### What you can do with these pre-training resources 270 | 271 | 1. **Pre-train** your own LLM. You can use [our data](https://huggingface.co/yulan-team/YuLan-Mini-Datasets) and curriculum to train a model that's just as powerful as YuLan-Mini. 272 | 2. Perform your own **learning rate annealing**. During the annealing phase, YuLan-Mini's learning ability is at its peak. You can resume training from [the checkpoint before annealing](https://huggingface.co/yulan-team/YuLan-Mini-Before-Annealing) and use your own dataset for learning rate annealing. 273 | 3. **Fine-tune** the Instruct version of the LLM. You can use the [YuLan-Mini](https://huggingface.co/yulan-team/YuLan-Mini) base model to train your own Instruct version. 274 | 4. **Training dynamics** research. You can use YuLan-Mini's [intermediate checkpoints](https://huggingface.co/collections/yulan-team/yulan-mini-676d214b24376739b00d95f3) to explore internal changes during the pre-training process. 275 | 5. **Synthesize** your own data. You can use YuLan-Mini's [data pipeline](https://github.com/RUC-GSAI/YuLan-Mini) to clean and generate your own dataset. 276 | 277 | -------------------------------------------------------------------------------- /pretrain/configuration_yulanmini.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | #     http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """YuLanMini model configuration""" 21 | 22 | import math 23 | 24 | from transformers.configuration_utils import PretrainedConfig 25 | from transformers.utils import logging 26 | 27 | logger = logging.get_logger(__name__) 28 | 29 | YULANMINI_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 30 | 31 | 32 | class YuLanMiniConfig(PretrainedConfig): 33 | r""" 34 | This is the configuration class to store the configuration of a [`YuLanMiniModel`]. It is used to instantiate a YuLanMini 35 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the 36 | defaults will yield a configuration similar to that of YuLan-Mini (2.4B). 37 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 38 | documentation from [`PretrainedConfig`] for more information. 39 | Args: 40 | vocab_size (`int`, *optional*, defaults to 99000): 41 | Vocabulary size of the YuLanMini model.
Defines the number of different tokens that can be represented by the 42 | `inputs_ids` passed when calling [`YuLanMiniModel`] 43 | hidden_size (`int`, *optional*, defaults to 1920): 44 | Dimension of the hidden representations. 45 | intermediate_size (`int`, *optional*, defaults to 4800): 46 | Dimension of the MLP representations. 47 | num_hidden_layers (`int`, *optional*, defaults to 56): 48 | Number of hidden layers in the Transformer decoder. 49 | num_attention_heads (`int`, *optional*, defaults to 30): 50 | Number of attention heads for each attention layer in the Transformer decoder. 51 | num_key_value_heads (`int`, *optional*, defaults to 6): 52 | This is the number of key_value heads that should be used to implement Grouped Query Attention. If 53 | `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if 54 | `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When 55 | converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed 56 | by meanpooling all the original heads within that group. For more details check out [this 57 | paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to 58 | `num_attention_heads`. 59 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 60 | The non-linear activation function (function or string) in the decoder. 61 | max_position_embeddings (`int`, *optional*, defaults to 4096): 62 | The maximum sequence length that this model might ever be used with. YuLan-Mini supports up to 4096 tokens 63 | during stable training; the released 28K-context model extends this window in the annealing phase. 64 | initializer_range (`float`, *optional*, defaults to 0.014434): 65 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 66 | rms_norm_eps (`float`, *optional*, defaults to 1e-06): 67 | The epsilon used by the rms normalization layers. 68 | use_cache (`bool`, *optional*, defaults to `True`): 69 | Whether or not the model should return the last key/values attentions (not used by all models). Only 70 | relevant if `config.is_decoder=True`. 71 | pad_token_id (`int`, *optional*): 72 | Padding token id. 73 | bos_token_id (`int`, *optional*, defaults to 1): 74 | Beginning of stream token id. 75 | eos_token_id (`int`, *optional*, defaults to 2): 76 | End of stream token id. 77 | pretraining_tp (`int`, *optional*, defaults to 1): 78 | Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this 79 | document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is 80 | necessary to ensure exact reproducibility of the pretraining results. Please refer to [this 81 | issue](https://github.com/pytorch/pytorch/issues/76232). 82 | tie_word_embeddings (`bool`, *optional*, defaults to `False`): 83 | Whether to tie weight embeddings 84 | rope_theta (`float`, *optional*, defaults to 10000.0): 85 | The base period of the RoPE embeddings. 86 | rope_scaling (`Dict`, *optional*): 87 | Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling 88 | strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is 89 | `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update 90 | `max_position_embeddings` to the expected new maximum.
91 |             these scaling strategies behave:
92 |             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
93 |             experimental feature, subject to breaking API changes in future versions.
94 |         attention_bias (`bool`, *optional*, defaults to `True`):
95 |             Whether to use a bias in the query, key, value and output projection layers during self-attention.
96 |         attention_dropout (`float`, *optional*, defaults to 0.0):
97 |             The dropout ratio for the attention probabilities.
98 |     ```python
99 |     >>> from transformers import YuLanMiniModel, YuLanMiniConfig
100 |     >>> # Initializing a YuLan-Mini style configuration
101 |     >>> configuration = YuLanMiniConfig()
102 |     >>> # Initializing a model from the YuLan-Mini style configuration
103 |     >>> model = YuLanMiniModel(configuration)
104 |     >>> # Accessing the model configuration
105 |     >>> configuration = model.config
106 |     ```"""
107 | 
108 |     model_type = "yulanmini"
109 |     keys_to_ignore_at_inference = ["past_key_values"]
110 | 
111 |     def __init__(
112 |         self,
113 |         vocab_size=99000,
114 |         hidden_size=1920,
115 |         intermediate_size=4800,
116 |         num_hidden_layers=56,
117 |         num_attention_heads=30,
118 |         num_key_value_heads=6,
119 |         # less frequently changed arguments
120 |         hidden_act="silu",
121 |         max_position_embeddings=4096,
122 |         rms_norm_eps=1e-6,
123 |         use_cache=True,
124 |         pad_token_id=None,  # /home/u20140041/pretrain-mini/preprocess/modify_tokenizer/1731
125 |         bos_token_id=1,
126 |         eos_token_id=2,
127 |         tie_word_embeddings=False,
128 |         rope_theta=10000.0,
129 |         use_sliding_window=False,
130 |         sliding_window=4096,
131 |         rope_scaling=None,
132 |         attention_bias=True,  # bias in the attention projections, as in Qwen
133 |         attention_dropout=0.0,
134 |         # scaling of the embedding gradient
135 |         shrink_alpha=1,
136 |         shrink_alpha2=1,
137 |         use_liger=False,
138 |         # initialization
139 |         initializer_range=0.014434,
140 |         init_scale_o=10.582218,
141 |         model_reproduce="transformer",
142 |         # muParam-related arguments; the defaults must correspond to not using muParam at all
143 |         hidden_states_shrink=1,
144 |         dim_model_base=None,
145 |         dim_ffn_base_init=None,  # no longer used in the new muParam version
146 |         dim_model_base_init=None,
147 |         dim_model_base_attn=None,
148 |         dim_model_base_lmh=None,
149 |         dim_model_base_logits=None,
150 |         dim_model_base_lr=None,
151 |         scale_emb=1,
152 |         # qk_layernorm
153 |         qk_layernorm=False,
154 |         layer_norm_eps=1e-6,
155 |         embedding_ln=False,
156 |         embedding_rmsln=False,
157 |         ln_scale=1.,
158 |         z_loss=0.0001,
159 |         # wesar
160 |         wesar_weights=True,
161 |         embed_tokens_alpha=1,
162 |         q_proj_alpha=1,
163 |         k_proj_alpha=1,
164 |         v_proj_alpha=1,
165 |         o_proj_alpha=1,
166 |         down_proj_alpha=1,
167 |         gate_up_proj_alpha=1,
168 |         input_layernorm_alpha=1,
169 |         post_attention_layernorm_alpha=1,
170 |         norm_alpha=1,
171 |         lm_head_alpha=1,
172 |         use_norm_alpha=True,
173 |         use_emb_alpha=False,
174 |         rms_type="llama",
175 |         num_steps_trained_before_this_epoch=0,
176 |         num_epochs_trained_before_this_epoch=0,
177 |         # speedup
178 |         gradient_checkpointing_step=7,
179 |         **kwargs,
180 |     ):
181 |         # Training state, updated once per epoch and fixed within an epoch. For example, while training on the 4th pass over the data, both values keep the values from the last step of the 3rd pass (epochs=3, steps=xxx), no matter how many steps into the 4th pass training is; update_trained_steps_and_epochs controls whether they are updated.
182 |         self.num_steps_trained_before_this_epoch = num_steps_trained_before_this_epoch
183 |         self.num_epochs_trained_before_this_epoch = num_epochs_trained_before_this_epoch
184 | 
185 |         self.vocab_size = vocab_size
186 |         self.max_position_embeddings = max_position_embeddings
187 |         self.hidden_size = hidden_size
188 |         self.intermediate_size = intermediate_size
189 |         self.num_hidden_layers = num_hidden_layers
190 |         self.num_attention_heads = num_attention_heads
191 |         self.use_sliding_window = use_sliding_window
192 |         self.sliding_window = sliding_window if use_sliding_window else None
193 | 
194 |         # for backward compatibility
195 |         if num_key_value_heads is None:
196 |             num_key_value_heads = num_attention_heads
197 | 
198 |         self.num_key_value_heads = num_key_value_heads
199 |         self.hidden_act = hidden_act
200 |         self.initializer_range = initializer_range
201 |         self.rms_norm_eps = rms_norm_eps
202 |         self.use_cache = use_cache
203 |         self.rope_theta = rope_theta
204 |         self.rope_scaling = rope_scaling
205 |         self._rope_scaling_validation()
206 |         self.attention_bias = attention_bias
207 |         self.attention_dropout = attention_dropout
208 |         self.shrink_alpha = shrink_alpha
209 |         self.use_liger = use_liger
210 |         self.init_scale_o = init_scale_o
211 |         self.hidden_states_shrink = 1 / math.sqrt(num_hidden_layers) if hidden_states_shrink == "muparam" else hidden_states_shrink
212 |         self.dim_model_base = dim_model_base if dim_model_base is not None else hidden_size
213 |         self.dim_model_base_init = dim_model_base_init
214 |         self.dim_model_base_attn = dim_model_base_attn if dim_model_base_attn is not None else (hidden_size // num_attention_heads)  # if set to 1, attention uses 1/head_dim scaling
215 |         self.dim_model_base_lmh = dim_model_base_lmh if dim_model_base_lmh is not None else 1  # if set to 1, the lm_head init is not rescaled
216 |         self.scale_emb = scale_emb if scale_emb is not None else 1
217 |         self.model_reproduce = model_reproduce if model_reproduce is not None else "transformer"
218 |         self.dim_model_base_logits = dim_model_base_logits if dim_model_base_logits is not None else hidden_size
219 |         self.dim_model_base_lr = dim_model_base_lr if dim_model_base_lr is not None else hidden_size
220 | 
221 |         self.qk_layernorm = qk_layernorm
222 |         self.layer_norm_eps = layer_norm_eps
223 |         self.embedding_ln = embedding_ln
224 |         self.embedding_rmsln = embedding_rmsln
225 |         self.ln_scale = ln_scale
226 |         self.z_loss = z_loss
227 | 
228 |         if embedding_ln and embedding_rmsln:
229 |             raise ValueError("Only one of embedding_ln and embedding_rmsln should be True")
230 | 
231 |         self.wesar_weights = wesar_weights
232 |         self.embed_tokens_alpha = embed_tokens_alpha
233 |         self.q_proj_alpha = q_proj_alpha
234 |         self.k_proj_alpha = k_proj_alpha
235 |         self.v_proj_alpha = v_proj_alpha
236 |         self.o_proj_alpha = o_proj_alpha
237 |         self.down_proj_alpha = down_proj_alpha
238 |         self.gate_up_proj_alpha = gate_up_proj_alpha
239 |         self.input_layernorm_alpha = input_layernorm_alpha
240 |         self.post_attention_layernorm_alpha = post_attention_layernorm_alpha
241 |         self.norm_alpha = norm_alpha
242 |         self.lm_head_alpha = lm_head_alpha
243 |         self.use_norm_alpha = use_norm_alpha
244 |         self.use_emb_alpha = use_emb_alpha
245 |         self.rms_type = rms_type
246 | 
247 |         self.gradient_checkpointing_step = gradient_checkpointing_step
248 | 
249 |         if self.dim_model_base != hidden_size or self.dim_model_base_init is not None or self.dim_model_base_attn != (hidden_size // num_attention_heads) or self.dim_model_base_lmh != 1:
250 |             if init_scale_o != 1:
251 |                 raise ValueError("When using muparam, init_scale_o should be 1")
252 | 
253 |         # multiplier
254 |         print("Attention scaling:", math.sqrt(self.dim_model_base_attn) / (hidden_size // num_attention_heads))
255 |         print("Hidden states scaling at residual connections:", hidden_states_shrink)
256 |         print("Logits scaling:", 1 / (hidden_size / self.dim_model_base))
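        # In plain terms, the muParam-style multipliers printed above are:
        #   * attention logits are scaled by sqrt(dim_model_base_attn) / head_dim
        #     (instead of the usual 1 / sqrt(head_dim)),
        #   * hidden states entering residual connections are multiplied by
        #     hidden_states_shrink (1 / sqrt(num_hidden_layers) in "muparam" mode),
        #   * output logits are multiplied by dim_model_base / hidden_size.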
print("o_proj,down_proj初始化STD:", initializer_range / math.sqrt(2 * (hidden_size / dim_model_base_init) * num_hidden_layers)) 261 | print("gate_proj,up_proj,q_proj,k_proj,v_proj初始化STD:", initializer_range / math.sqrt(self.hidden_size / self.dim_model_base_init)) 262 | else: 263 | print("o_proj,down_proj初始化STD:", initializer_range / init_scale_o) 264 | print("gate_proj,up_proj,q_proj,k_proj,v_proj初始化STD:", initializer_range) 265 | print("lm_head初始化STD:", initializer_range / math.sqrt(self.dim_model_base_lmh)) 266 | 267 | if not tie_word_embeddings and self.scale_emb != 1: 268 | raise ValueError("When using scale_emb, tie_word_embeddings should be False") 269 | 270 | super().__init__( 271 | pad_token_id=pad_token_id, 272 | bos_token_id=bos_token_id, 273 | eos_token_id=eos_token_id, 274 | tie_word_embeddings=tie_word_embeddings, 275 | **kwargs, 276 | ) 277 | try: 278 | import flash_attn 279 | self._attn_implementation = "flash_attention_2" 280 | except: 281 | pass 282 | 283 | def _rope_scaling_validation(self): 284 | """ 285 | Validate the `rope_scaling` configuration. 286 | """ 287 | if self.rope_scaling is None: 288 | return 289 | 290 | if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: 291 | raise ValueError( 292 | "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " 293 | f"got {self.rope_scaling}" 294 | ) 295 | rope_scaling_type = self.rope_scaling.get("type", None) 296 | rope_scaling_factor = self.rope_scaling.get("factor", None) 297 | if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: 298 | raise ValueError( 299 | f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" 300 | ) 301 | if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: 302 | raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") 303 | -------------------------------------------------------------------------------- /pretrain/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Pretraining Data Resources 2 | 3 | We primarily used open-source data and synthetic data generated by ourselves. You can find our synthetic dataset at [here](https://huggingface.co/collections/yulan-team/yulan-mini-676d214b24376739b00d95f3) and synthesis pipeline at [here](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/synthesis). The open-source datasets used are listed [below](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets#list-of-open-source-datasets-used). The data curriculum is also included in the list [below](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets#data-curriculum). 4 | 5 | ## Data Curriculum 6 | 7 | The JSON files in this folder show the **real data ratio** for each curriculum phase. 
8 | 9 | | Phase | Data Mix (4K) | Data Mix (28K) | 10 | |------------|---------------|----------------| 11 | |`1`|[01_20241017_013512.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/01_20241017_013512.json)|| 12 | |`2`|[02_20241017_013401.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/02_20241017_013401.json)|| 13 | |`3`|[03_20241020_001556.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/03_20241020_001556.json)|| 14 | |`4`|[04_20241021_170901.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/04_20241021_170901.json)|| 15 | |`5`|[05_20241022_221453.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/05_20241022_221453.json)|| 16 | |`6`|[06_20241024_013137.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/06_20241024_013137.json)|| 17 | |`7`|[07_20241025_022032.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/07_20241025_022032.json)|| 18 | |`8`|[08_20241026_151354.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/08_20241026_151354.json)|| 19 | |`9`|[09_20241027_190948.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/09_20241027_190948.json)|| 20 | |`10`|[10_20241028_225112.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/10_20241028_225112.json)|| 21 | |`11`|[11_20241030_124814.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/11_20241030_124814.json)|| 22 | |`12`|[12_20241101_002827.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/12_20241101_002827.json)|| 23 | |`13`|[13_20241102_160534.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/13_20241102_160534.json)|| 24 | |`14`|[14_20241104_000454.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/14_20241104_000454.json)|| 25 | |`15`|[15_20241105_023029.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/15_20241105_023029.json)|| 26 | |`16`|[16_20241106_180613.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/16_20241106_180613.json)|| 27 | |`17`|[17_20241108_004951.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/17_20241108_004951.json)|| 28 | |`18`|[18_20241113_034017.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/18_20241113_034017.json)|| 29 | |`19`|[19_20241114_115241.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/19_20241114_115241.json)|| 30 | |`20`|[20_20241115_234357.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/20_20241115_234357.json)|| 31 | |`21`|[21_20241117_021115.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/21_20241117_021115.json)|| 32 | |`22`|[22_20241118_155407.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/22_20241118_155407.json)|| 33 | |`23`|[23_20241120_033942.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/23_20241120_033942.json)|| 34 | |`24`|[24_20241121_133110.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/24_20241121_133110.json)|| 35 | 
|`25`|[25_20241123_030124.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/25_20241123_030124.json)||
36 | |`26`|[26_20241127_205447.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/26_20241127_205447.json)|[26_20241211_015209.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/26_20241211_015209.json) (Tokenized: [🤗 YuLan-Mini-Datasets-Phasae-26](https://huggingface.co/datasets/yulan-team/YuLan-Mini-Datasets-Phasae-26))|
37 | |`27`||[27_20241213_051741.json](https://github.com/RUC-GSAI/YuLan-Mini/tree/main/pretrain/datasets/data_mix/27_20241213_051741.json) (Tokenized: [🤗 YuLan-Mini-Datasets-Phasae-27](https://huggingface.co/datasets/yulan-team/YuLan-Mini-Datasets-Phasae-27))|
38 | 
39 | ## How to Download the Data
40 | 
41 | For some extremely large corpora (e.g., `fineweb-edu`), we only use a subset of the data, and we provide scripts to help you achieve this. For speed, we recommend using `aria2` to download the data in parallel.
42 | 
43 | ```bash
44 | # Install aria2p (install aria2 itself and git-lfs with your system package manager)
45 | pip install aria2p
46 | git lfs install
47 | 
48 | # Step 1: Get the LFS links (this will not download the actual data)
49 | bash download_datasets_step1.sh HuggingFaceFW/fineweb-edu --dataset
50 | 
51 | # Step 2: Select the subsets you want to download, then start the aria2 RPC server
52 | cd fineweb-edu
53 | grep -E "CC-MAIN-2024" urls.txt > urls_selected.txt
54 | aria2c --console-log-level=error -s 4 -c --auto-file-renaming=false --check-certificate=false --file-allocation=none --enable-rpc
55 | 
56 | # Step 3: (Open another terminal) Download the selected subsets
57 | bash download_datasets_step3.sh urls_selected.txt
58 | ```
59 | 
60 | ## List of Open Source Datasets Used
61 | 
62 | - **🔥 Updated (April 11, 2025): For a clearer presentation of the information, see the table at this link: [link](https://docs.google.com/spreadsheets/d/1YP8-loVUxgxo36UEpOwflR3GRHLieBnLlCy8g10g8RU/edit?gid=0#gid=0).**
63 | 
64 | This section provides links to the datasets used.
65 | 
66 | | Dataset | Link |
67 | |----------------|---------------------------------|
68 | YuLan-Mini-Datasets: | [https://huggingface.co/datasets/yulan-team/YuLan-Mini-Datasets](https://huggingface.co/datasets/yulan-team/YuLan-Mini-Datasets)
69 | [1] Classified data using [`python-edu-scorer`](https://huggingface.co/HuggingFaceTB/python-edu-scorer) and [`fineweb-edu-classifier`](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) |
70 | [2] Synthesized data (math, code, instruction, ...) |
71 | [3] Retrieved data using [`math`](https://huggingface.co/yulan-team/math-classifier), [`code`](https://huggingface.co/yulan-team/code-classifier), and [`reasoning-classifier`](https://huggingface.co/yulan-team/reasoning-classifier) |
72 | chinese-fineweb-edu | [https://huggingface.co/datasets/opencsg/chinese-fineweb-edu](https://huggingface.co/datasets/opencsg/chinese-fineweb-edu)
73 | baidu-baike | [https://huggingface.co/datasets/TMZN/baidubaike](https://huggingface.co/datasets/TMZN/baidubaike)
74 | wikipedia-cn | [https://huggingface.co/datasets/TigerResearch/pretrain-zh](https://huggingface.co/datasets/TigerResearch/pretrain-zh)
75 | LLM360-TxT360 | [https://huggingface.co/datasets/LLM360/TxT360](https://huggingface.co/datasets/LLM360/TxT360)
76 | wanjuan | [https://opendatalab.org.cn/OpenDataLab/WanJuan1-dot-0](https://opendatalab.org.cn/OpenDataLab/WanJuan1-dot-0)
77 | MNBVC | [https://huggingface.co/datasets/liwu/MNBVC](https://huggingface.co/datasets/liwu/MNBVC)
78 | cbooks | [https://github.com/FudanNLPLAB/CBook-150K](https://github.com/FudanNLPLAB/CBook-150K)
79 | legal-case-law-cn | [https://wenshu.court.gov.cn/](https://wenshu.court.gov.cn/)
80 | textbook-quality-programming | [https://huggingface.co/datasets/vikp/textbook-quality-programming](https://huggingface.co/datasets/vikp/textbook-quality-programming)
81 | the-stack-v2 | [https://huggingface.co/datasets/bigcode/the-stack-v2](https://huggingface.co/datasets/bigcode/the-stack-v2)
82 | data-starcoderdata | [https://huggingface.co/datasets/bigcode/starcoderdata](https://huggingface.co/datasets/bigcode/starcoderdata)
83 | ioccc | [https://www.ioccc.org](https://www.ioccc.org)
84 | smollm-dedup | [https://huggingface.co/datasets/HuggingFaceTB/smollm-corpus](https://huggingface.co/datasets/HuggingFaceTB/smollm-corpus)
85 | opencoder-sft-stage1-en | [https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage1](https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage1)
86 | opencoder-sft-stage2-text | [https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage2](https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage2)
87 | javascript | [https://huggingface.co/datasets/OpenCoder-LLM/opc-annealing-corpus](https://huggingface.co/datasets/OpenCoder-LLM/opc-annealing-corpus)
88 | longwanjuan-others | [https://opendatalab.com/OpenLMLab/LongWanjuan](https://opendatalab.com/OpenLMLab/LongWanjuan)
89 | algebraic-stack | [https://huggingface.co/datasets/EleutherAI/proof-pile-2](https://huggingface.co/datasets/EleutherAI/proof-pile-2)
90 | arxiv | [https://huggingface.co/datasets/math-ai/AutoMathText](https://huggingface.co/datasets/math-ai/AutoMathText)
91 | open-web-math-pro | [https://huggingface.co/datasets/gair-prox/open-web-math-pro](https://huggingface.co/datasets/gair-prox/open-web-math-pro)
92 | AMPS-mathematica | [https://huggingface.co/datasets/XinyaoHu/AMPS-mathematica](https://huggingface.co/datasets/XinyaoHu/AMPS-mathematica)
93 | AMPS-khan | [https://huggingface.co/datasets/XinyaoHu/AMPS-khan](https://huggingface.co/datasets/XinyaoHu/AMPS-khan)
94 | arithmetic--add-or-sub-steps | [https://huggingface.co/datasets/deepmind/math-dataset](https://huggingface.co/datasets/deepmind/math-dataset)
95 | basic-math-10m | [https://huggingface.co/datasets/mrfakename/basic-math-10m](https://huggingface.co/datasets/mrfakename/basic-math-10m)
96 | orca | [https://huggingface.co/datasets/microsoft/orca-math-word-problems-200k](https://huggingface.co/datasets/microsoft/orca-math-word-problems-200k)
97 | MetaMathQA |
[https://huggingface.co/datasets/meta-math/MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) 98 | numina | [https://huggingface.co/datasets/AI-MO/NuminaMath-CoT](https://huggingface.co/datasets/AI-MO/NuminaMath-CoT) 99 | ScaleQuest-Math | [https://huggingface.co/datasets/dyyyyyyyy/ScaleQuest-Math](https://huggingface.co/datasets/dyyyyyyyy/ScaleQuest-Math) 100 | auto-math-text | [https://huggingface.co/datasets/HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) 101 | score-neg1 | [https://huggingface.co/datasets/Infi-MM/InfiMM-WebMath-40B](https://huggingface.co/datasets/Infi-MM/InfiMM-WebMath-40B) 102 | split-1 | [https://huggingface.co/datasets/OpenCoder-LLM/opc-fineweb-math-corpus](https://huggingface.co/datasets/OpenCoder-LLM/opc-fineweb-math-corpus) 103 | Lean-CoT-base | [https://huggingface.co/ScalableMath/Lean-CoT-base](https://huggingface.co/ScalableMath/Lean-CoT-base) 104 | Lean-CoT-plus | [https://huggingface.co/datasets/ScalableMath/Lean-CoT-plus](https://huggingface.co/datasets/ScalableMath/Lean-CoT-plus) 105 | Lean-STaR-base | [https://huggingface.co/datasets/ScalableMath/Lean-STaR-base](https://huggingface.co/datasets/ScalableMath/Lean-STaR-base) 106 | Lean-STaR-plus | [https://huggingface.co/datasets/ScalableMath/Lean-STaR-plus](https://huggingface.co/datasets/ScalableMath/Lean-STaR-plus) 107 | lean-workbook | [https://huggingface.co/datasets/internlm/Lean-Workbook](https://huggingface.co/datasets/internlm/Lean-Workbook) 108 | lean-deepseek-v1 | [https://huggingface.co/datasets/deepseek-ai/DeepSeek-Prover-V1?row=0](https://huggingface.co/datasets/deepseek-ai/DeepSeek-Prover-V1?row=0) 109 | Lean-Github-processed | [https://huggingface.co/datasets/internlm/Lean-Github](https://huggingface.co/datasets/internlm/Lean-Github) 110 | ape210k | [https://huggingface.co/datasets/MU-NLPC/Calc-ape210k](https://huggingface.co/datasets/MU-NLPC/Calc-ape210k) 111 | TAL-SCQ5K | [https://huggingface.co/datasets/math-eval/TAL-SCQ5K](https://huggingface.co/datasets/math-eval/TAL-SCQ5K) 112 | Polytope | [https://huggingface.co/datasets/sequelbox/Polytope](https://huggingface.co/datasets/sequelbox/Polytope) 113 | WizardLM-evol-instruct-V2-196k | [https://huggingface.co/datasets/WizardLMTeam/WizardLM-evol-instruct-V2-196k](https://huggingface.co/datasets/WizardLMTeam/WizardLM-evol-instruct-V2-196k) 114 | Code-290k-ShareGPT | [https://huggingface.co/datasets/ajibawa-2023/Code-290k-ShareGPT](https://huggingface.co/datasets/ajibawa-2023/Code-290k-ShareGPT) 115 | evol-codealpaca-v1 | [https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) 116 | Magicoder-Evol-Instruct-110K | [https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K](https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K) 117 | MAmmoTHMathInstruct | [https://huggingface.co/datasets/TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct) 118 | MathCodeInstruct | [https://huggingface.co/datasets/MathLLMs/MathCodeInstruct-Plus](https://huggingface.co/datasets/MathLLMs/MathCodeInstruct-Plus) 119 | CodeFeedback-Filtered-Instruction | [https://huggingface.co/datasets/m-a-p/CodeFeedback-Filtered-Instruction](https://huggingface.co/datasets/m-a-p/CodeFeedback-Filtered-Instruction) 120 | OpenMathInstruct-1 | [https://huggingface.co/datasets/nvidia/OpenMathInstruct-1](https://huggingface.co/datasets/nvidia/OpenMathInstruct-1) 121 | less-data | 
[https://huggingface.co/datasets/princeton-nlp/less-data](https://huggingface.co/datasets/princeton-nlp/less-data) 122 | Claude-3-Opus-Claude-3.5-Sonnnet-9k | [https://huggingface.co/datasets/QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k](https://huggingface.co/datasets/QuietImpostor/Claude-3-Opus-Claude-3.5-Sonnnet-9k) 123 | SlimOrca | [https://huggingface.co/datasets/Open-Orca/SlimOrca](https://huggingface.co/datasets/Open-Orca/SlimOrca) 124 | Python-Code-23k-ShareGPT | [https://huggingface.co/datasets?search=Python-Code-23k-ShareGPT](https://huggingface.co/datasets?search=Python-Code-23k-ShareGPT) 125 | tulu-v3.1-mix-preview-4096-OLMoE | [https://huggingface.co/datasets/allenai/tulu-v3.1-mix-preview-4096-OLMoE](https://huggingface.co/datasets/allenai/tulu-v3.1-mix-preview-4096-OLMoE) 126 | EvolKit-20k | [https://huggingface.co/datasets/arcee-ai/EvolKit-20k](https://huggingface.co/datasets/arcee-ai/EvolKit-20k) 127 | Supernova | [https://huggingface.co/datasets/sequelbox/Supernova](https://huggingface.co/datasets/sequelbox/Supernova) 128 | Evol-Instruct-Code-80k-v1 | [https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) 129 | Tachibana | [https://huggingface.co/datasets/sequelbox/Tachibana](https://huggingface.co/datasets/sequelbox/Tachibana) 130 | self-oss-instruct-sc2-exec-filter-50k | [https://huggingface.co/datasets/bigcode/self-oss-instruct-sc2-exec-filter-50k](https://huggingface.co/datasets/bigcode/self-oss-instruct-sc2-exec-filter-50k) 131 | leetcode-solution-python | [https://huggingface.co/datasets/richardodliu/leetcode-solution-python](https://huggingface.co/datasets/richardodliu/leetcode-solution-python) 132 | Magicoder-OSS-Instruct-75K | [https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K](https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K) 133 | Spurline | [https://huggingface.co/datasets/sequelbox/Spurline](https://huggingface.co/datasets/sequelbox/Spurline) 134 | Celestia | [https://huggingface.co/datasets/sequelbox/Celestia](https://huggingface.co/datasets/sequelbox/Celestia) 135 | Titanium | [https://huggingface.co/datasets/sequelbox/Titanium](https://huggingface.co/datasets/sequelbox/Titanium) 136 | CodeExercise-Python-27k | [https://huggingface.co/datasets/codefuse-ai/CodeExercise-Python-27k](https://huggingface.co/datasets/codefuse-ai/CodeExercise-Python-27k) 137 | XCoder-80K | [https://huggingface.co/datasets/banksy235/XCoder-80K](https://huggingface.co/datasets/banksy235/XCoder-80K) 138 | Codefuse-Evol-Instruct-Clean | [https://huggingface.co/datasets/banksy235/Codefuse-Evol-Instruct-Clean](https://huggingface.co/datasets/banksy235/Codefuse-Evol-Instruct-Clean) 139 | ruozhiba | [https://huggingface.co/datasets/hfl/ruozhiba-gpt4](https://huggingface.co/datasets/hfl/ruozhiba-gpt4) 140 | chinese-poetry | [https://github.com/chinese-poetry/chinese-poetry](https://github.com/chinese-poetry/chinese-poetry) 141 | MathScaleQA-2M | [https://huggingface.co/datasets/fdqerq22ds/MathScaleQA-2M](https://huggingface.co/datasets/fdqerq22ds/MathScaleQA-2M) 142 | tulu-code | [https://huggingface.co/datasets/allenai/tulu-3-sft-personas-code](https://huggingface.co/datasets/allenai/tulu-3-sft-personas-code) 143 | tulu-math | [https://huggingface.co/datasets/allenai/tulu-3-sft-personas-math](https://huggingface.co/datasets/allenai/tulu-3-sft-personas-math) 144 | tulu-math-grade | 
[https://huggingface.co/datasets/allenai/tulu-3-sft-personas-math-grade](https://huggingface.co/datasets/allenai/tulu-3-sft-personas-math-grade) 145 | tulu-algebra | [https://huggingface.co/datasets/allenai/tulu-3-sft-personas-algebra](https://huggingface.co/datasets/allenai/tulu-3-sft-personas-algebra) 146 | orca-agentinstruct-code-cot | [https://huggingface.co/datasets/microsoft/orca-agentinstruct-1M-v1](https://huggingface.co/datasets/microsoft/orca-agentinstruct-1M-v1) 147 | gretel-math-gsm8k-v1 | [https://huggingface.co/datasets/gretelai/gretel-math-gsm8k-v1](https://huggingface.co/datasets/gretelai/gretel-math-gsm8k-v1) 148 | FOL-nli-v1 | [https://huggingface.co/datasets/tasksource/FOL-nli](https://huggingface.co/datasets/tasksource/FOL-nli) 149 | data-jsonl | [https://huggingface.co/datasets/garage-bAInd/Open-Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) 150 | reasoning-0.01-data-jsonl | [https://huggingface.co/datasets/SkunkworksAI/reasoning-0.01](https://huggingface.co/datasets/SkunkworksAI/reasoning-0.01) 151 | Magpie-Reasoning-150K-data-jsonl | [https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K](https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K) 152 | dclm-baseline | [https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0](https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0) 153 | fineweb-edu | [https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) 154 | books3-book-en-v1 | [https://open.umn.edu/opentextbooks/](https://open.umn.edu/opentextbooks/) 155 | gutenberg-book-en-v2 | [https://huggingface.co/datasets/manu/project-gutenberg](https://huggingface.co/datasets/manu/project-gutenberg) 156 | LoC-PD-Books | [https://huggingface.co/datasets/storytracer/LoC-PD-Books](https://huggingface.co/datasets/storytracer/LoC-PD-Books) 157 | smollm-corpus-cosmopedia-v2 | [https://huggingface.co/datasets/HuggingFaceTB/smollm-corpus](https://huggingface.co/datasets/HuggingFaceTB/smollm-corpus) 158 | pes2o-paper-en-v1 | [https://huggingface.co/datasets/allenai/peS2o](https://huggingface.co/datasets/allenai/peS2o) 159 | arxiv-paper-en-v2 | [https://huggingface.co/datasets/arxiv-community/arxiv-dataset](https://huggingface.co/datasets/arxiv-community/arxiv-dataset) 160 | wiki-baike-en-v3 | [https://dumps.wikimedia.org/backup-index.html](https://dumps.wikimedia.org/backup-index.html) 161 | dolma | [https://huggingface.co/datasets/allenai/dolma](https://huggingface.co/datasets/allenai/dolma) 162 | OpenCoder-LLM-fineweb-code | [https://huggingface.co/datasets/OpenCoder-LLM/opc-fineweb-code-corpus](https://huggingface.co/datasets/OpenCoder-LLM/opc-fineweb-code-corpus) 163 | -------------------------------------------------------------------------------- /pretrain/datasets/download_datasets_step1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Source: https://gist.github.com/padeoe/697678ab8e528b85a2a7bddafea1fa4f 3 | # Color definitions 4 | RED='\033[0;31m' 5 | GREEN='\033[0;32m' 6 | YELLOW='\033[1;33m' 7 | NC='\033[0m' # No Color 8 | 9 | trap 'printf "${YELLOW}\nDownload interrupted. 
If you re-run the command, you can resume the download from the breakpoint.\n${NC}"; exit 1' INT 10 | 11 | display_help() { 12 | cat < [--include include_pattern] [--exclude exclude_pattern] [--hf_username username] [--hf_token token] [--tool aria2c|wget] [-x threads] [--dataset] [--local-dir path] 15 | 16 | Description: 17 | Downloads a model or dataset from Hugging Face using the provided repo ID. 18 | 19 | Parameters: 20 | repo_id The Hugging Face repo ID in the format 'org/repo_name'. 21 | --include (Optional) Flag to specify a string pattern to include files for downloading. 22 | --exclude (Optional) Flag to specify a string pattern to exclude files from downloading. 23 | include/exclude_pattern The pattern to match against filenames, supports wildcard characters. e.g., '--exclude *.safetensor', '--include vae/*'. 24 | --hf_username (Optional) Hugging Face username for authentication. **NOT EMAIL**. 25 | --hf_token (Optional) Hugging Face token for authentication. 26 | --tool (Optional) Download tool to use. Can be wget (default) or aria2c. 27 | -x (Optional) Number of download threads for aria2c. Defaults to 4. 28 | --dataset (Optional) Flag to indicate downloading a dataset. 29 | --local-dir (Optional) Local directory path where the model or dataset will be stored. 30 | --mirror (Optional) Force use hf-mirror 31 | 32 | Example: 33 | hfd bigscience/bloom-560m --exclude *.safetensors 34 | hfd meta-llama/Llama-2-7b --hf_username myuser --hf_token mytoken -x 4 35 | hfd lavita/medical-qa-shared-task-v1-toy --dataset 36 | EOF 37 | exit 1 38 | } 39 | 40 | MODEL_ID=$1 41 | shift 42 | 43 | # Default values 44 | TOOL="wget" 45 | THREADS=4 46 | HF_ENDPOINT="https://huggingface.co" 47 | 48 | while [[ $# -gt 0 ]]; do 49 | case $1 in 50 | --include) 51 | INCLUDE_PATTERN="$2" 52 | shift 2 53 | ;; 54 | --exclude) 55 | EXCLUDE_PATTERN="$2" 56 | shift 2 57 | ;; 58 | --hf_username) 59 | HF_USERNAME="$2" 60 | shift 2 61 | ;; 62 | --hf_token) 63 | HF_TOKEN="$2" 64 | shift 2 65 | ;; 66 | --tool) 67 | TOOL="$2" 68 | shift 2 69 | ;; 70 | -x) 71 | THREADS="$2" 72 | shift 2 73 | ;; 74 | --dataset) 75 | DATASET=1 76 | shift 77 | ;; 78 | --local-dir) 79 | LOCAL_DIR="$2" 80 | shift 2 81 | ;; 82 | --mirror) 83 | HF_ENDPOINT="https://hf-mirror.com" 84 | shift 85 | ;; 86 | *) shift ;; 87 | esac 88 | done 89 | 90 | # Check if aria2, wget, curl, git, and git-lfs are installed 91 | check_command() { 92 | if ! command -v $1 &>/dev/null; then 93 | echo -e "${RED}$1 is not installed. 
Please install it first.${NC}" 94 | exit 1 95 | fi 96 | } 97 | 98 | # Mark current repo safe when using shared file system like samba or nfs 99 | ensure_ownership() { 100 | if git status 2>&1 | grep "fatal: detected dubious ownership in repository at" >/dev/null; then 101 | git config --global --add safe.directory "${PWD}" 102 | printf "${YELLOW}Detected dubious ownership in repository, mark ${PWD} safe using git, edit ~/.gitconfig if you want to reverse this.\n${NC}" 103 | fi 104 | } 105 | 106 | [[ "$TOOL" == "aria2c" ]] && check_command aria2c 107 | [[ "$TOOL" == "wget" ]] && check_command wget 108 | check_command curl 109 | check_command git 110 | check_command git-lfs 111 | 112 | [[ -z "$MODEL_ID" || "$MODEL_ID" =~ ^-h ]] && display_help 113 | 114 | if [[ -z "$LOCAL_DIR" ]]; then 115 | LOCAL_DIR="${MODEL_ID#*/}" 116 | fi 117 | 118 | if [[ "$DATASET" == 1 ]]; then 119 | MODEL_ID="datasets/$MODEL_ID" 120 | fi 121 | echo "Downloading to $LOCAL_DIR" 122 | 123 | if [ -d "$LOCAL_DIR/.git" ]; then 124 | printf "${YELLOW}%s exists, Skip Clone.\n${NC}" "$LOCAL_DIR" 125 | cd "$LOCAL_DIR" && ensure_ownership && GIT_LFS_SKIP_SMUDGE=1 git pull || { 126 | printf "${RED}Git pull failed.${NC}\n" 127 | exit 1 128 | } 129 | else 130 | REPO_URL="$HF_ENDPOINT/$MODEL_ID" 131 | GIT_REFS_URL="${REPO_URL}/info/refs?service=git-upload-pack" 132 | echo "Testing GIT_REFS_URL: $GIT_REFS_URL" 133 | response=$(curl -s -o /dev/null -w "%{http_code}" "$GIT_REFS_URL") 134 | if [ "$response" == "401" ] || [ "$response" == "403" ]; then 135 | if [[ -z "$HF_USERNAME" || -z "$HF_TOKEN" ]]; then 136 | printf "${RED}HTTP Status Code: $response.\nThe repository requires authentication, but --hf_username and --hf_token is not passed. Please get token from https://huggingface.co/settings/tokens.\nExiting.\n${NC}" 137 | exit 1 138 | fi 139 | REPO_URL="https://$HF_USERNAME:$HF_TOKEN@${HF_ENDPOINT#https://}/$MODEL_ID" 140 | elif [ "$response" != "200" ]; then 141 | printf "${RED}Unexpected HTTP Status Code: $response\n${NC}" 142 | printf "${YELLOW}Executing debug command: curl -v %s\nOutput:${NC}\n" "$GIT_REFS_URL" 143 | curl -v "$GIT_REFS_URL" 144 | printf "\n${RED}Git clone failed.\n${NC}" 145 | exit 1 146 | fi 147 | echo "GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL $LOCAL_DIR" 148 | 149 | GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL $LOCAL_DIR && cd "$LOCAL_DIR" || { 150 | printf "${RED}Git clone failed.\n${NC}" 151 | exit 1 152 | } 153 | 154 | ensure_ownership 155 | 156 | while IFS= read -r file; do 157 | truncate -s 0 "$file" 158 | done <<<$(git lfs ls-files | cut -d ' ' -f 3-) 159 | fi 160 | 161 | printf "\nStart Downloading lfs files, bash script:\ncd $LOCAL_DIR\n" 162 | files=$(git lfs ls-files | cut -d ' ' -f 3-) 163 | declare -a urls 164 | 165 | while IFS= read -r file; do 166 | if [ -z "$file" ]; then 167 | echo "Empty file path, skipping..." 
168 | continue 169 | fi 170 | url="$HF_ENDPOINT/$MODEL_ID/resolve/main/$file" 171 | file_dir=$(dirname "$file") 172 | mkdir -p "$file_dir" 173 | if [[ "$TOOL" == "wget" ]]; then 174 | download_cmd="wget -c \"$url\" -O \"$file\"" 175 | [[ -n "$HF_TOKEN" ]] && download_cmd="wget --header=\"Authorization: Bearer ${HF_TOKEN}\" -c \"$url\" -O \"$file\"" 176 | else 177 | download_cmd="aria2c --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\"" 178 | [[ -n "$HF_TOKEN" ]] && download_cmd="aria2c --header=\"Authorization: Bearer ${HF_TOKEN}\" --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\"" 179 | fi 180 | [[ -n "$INCLUDE_PATTERN" && ! "$file" == $INCLUDE_PATTERN ]] && printf "# %s\n" "$download_cmd" && continue 181 | [[ -n "$EXCLUDE_PATTERN" && "$file" == $EXCLUDE_PATTERN ]] && printf "# %s\n" "$download_cmd" && continue 182 | printf "%s\n" "$download_cmd" 183 | urls+=("$url|$file") 184 | echo "$url|$file" >>urls.txt 185 | done <<<"$files" 186 | -------------------------------------------------------------------------------- /pretrain/datasets/download_datasets_step3.sh: -------------------------------------------------------------------------------- 1 | URLS_PATH=$1 2 | 3 | for url_file in $(cat $URLS_PATH); do 4 | IFS='|' read -r url file <<< "$url_file" 5 | if [[ -z "$file" ]]; then 6 | continue 7 | fi 8 | echo "Downloading $file from $url" 9 | python -m aria2p add "$url" -o out="$file" & 10 | done 11 | -------------------------------------------------------------------------------- /pretrain/datasets/final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RUC-GSAI/YuLan-Mini/d9b546224bc8539db4482bccc73df50e61125af7/pretrain/datasets/final.pdf -------------------------------------------------------------------------------- /pretrain/ds2_config_adamw.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "zero_optimization": { 6 | "stage": 2, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 8e8, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 8e8, 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_accumulation_steps": "auto", 15 | "gradient_clipping": "auto", 16 | "steps_per_print": 16, 17 | "train_batch_size": "auto", 18 | "train_micro_batch_size_per_gpu": "auto", 19 | "wall_clock_breakdown": false, 20 | "dump_state": true, 21 | "optimizer": { 22 | "type": "AdamW", 23 | "params": { 24 | "lr": "auto", 25 | "betas": "auto", 26 | "eps": "auto", 27 | "weight_decay": "auto" 28 | } 29 | }, 30 | "checkpoint": { 31 | "load_universal": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /pretrain/preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Data Preprocess 2 | 3 | This part introduces the data preprocessing methods, including text data preprocessing and then index data preprocessing. 4 | 5 | ## I. Text Data Preprocessing 6 | 7 | ### 1. Text Formatting 8 | 9 | #### Jupyter Notebook 10 | 11 | 1. Collect data from **the-stack-v2**. 12 | 2. Perform file-level de-duplication. 13 | 3. 
Split data based on repository stars and forks: 14 | - [Sum > 0 (prefixed with `md2`)](https://huggingface.co/datasets/yulan-team/YuLan-Mini-Text-Datasets/viewer/code-the-stack-v2-Jupyter_Notebook-md2_scored_classifier-score_4). 15 | - [Sum = 0 (prefixed with `md`)](https://huggingface.co/datasets/yulan-team/YuLan-Mini-Text-Datasets/viewer/code-the-stack-v2-Jupyter_Notebook-md_scored_classified-score_5?views%5B%5D=code_the_stack_v2_jupyter_notebook_md_scored_classified_score_5). 16 | 4. Add Python version as a Markdown annotation (to enable conditioned learning). 17 | 5. Perform line-level de-duplication using a text-distance algorithm to filter out repetitive content, such as progress bars, CSV tables, etc. 18 | 6. Randomly format a subset of code snippets using **yapf** with random styles. 19 | 7. Convert Markdown blocks, Python blocks, and execution results into Markdown format. 20 | 8. Score data using [python-edu-scorer](https://huggingface.co/HuggingFaceTB/python-edu-scorer). 21 | - High-scoring examples resemble **tutorial-style notebooks**. 22 | - Low-scoring examples resemble **spreadsheet processing**. 23 | 24 | #### Python 25 | 26 | 1. Collect data from **the-stack-v2**, **MNBVC**, **SmolLM**, **StarCoder**, **OpenCoder**. 27 | 2. Additionally, synthesize pre-train and sft data using Qwen2.5 series models. 28 | 3. Perform file-level de-duplication. 29 | 4. Apply text metrics filtering (following **DeepSeek Coder** guidelines). 30 | 5. Score data using [python-edu-scorer](https://huggingface.co/HuggingFaceTB/python-edu-scorer). 31 | 32 | #### C, C++, Rust, HTML, etc. 33 | 34 | 1. Collect data from **the-stack-v2**, **MNBVC**, **OpenCoder**, etc. 35 | 2. Apply text metrics filtering (following **DeepSeek Coder** guidelines). 36 | 37 | > Hugging Face has recently open-sourced their [scorers for other languages](https://huggingface.co/collections/HuggingFaceTB/the-ultimate-collection-of-code-classifiers-67b5aa3eb8994a4b71453005), and readers are encouraged to check them out. 38 | 39 | #### Math 40 | 41 | 1. Collect data from **ProofPile2**, **AutoMathText**, and **OpenWebMath-Pro**. 42 | 2. Additionally, retrieve math content from **fineweb-edu** and **dclm** using the [math-classifier](https://huggingface.co/yulan-team/math-classifier), and synthesize pre-train and sft data using Qwen2.5 series models. 43 | 3. Perform file-level de-duplication. 44 | 4. Score data using [fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier). 45 | 46 | ### 2. Data Filtering Pipeline 47 | 48 |
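The figure below illustrates the full filtering pipeline. As a concrete reference for the scoring steps above, here is a minimal sketch of applying the `fineweb-edu-classifier` to a single document, following the usage from its model card (batching and score thresholds are up to your own pipeline):

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceFW/fineweb-edu-classifier")
model = AutoModelForSequenceClassification.from_pretrained("HuggingFaceFW/fineweb-edu-classifier")

text = "Photosynthesis is the process by which plants convert light energy into chemical energy."
inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# The classifier regresses an educational-quality score (roughly 0-5);
# higher scores indicate more tutorial-like, educational content.
score = outputs.logits.squeeze(-1).float().item()
print(f"edu score: {score:.2f}")
```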
49 | 50 |
51 | 52 | ## II. Index Data Preprocessing 53 | 54 | This part introduces the process of tokenization, data mixing, packing, etc. 55 | 56 | ### 1. Preliminary 57 | 58 | We store the data before tokenization and after tokenization in two different folders, e.g., `/data/raw` and `/data/input_ids`. 59 | 60 | ```txt 61 | /data 62 | ├── raw 63 | │ ├── dataset-collection-1 64 | │ │ ├── dataset-1 65 | │ │ │ ├── file-1.jsonl 66 | │ │ │ ├── file-2.jsonl 67 | │ │ │ └── ... 68 | │ │ ├── dataset-2 69 | │ │ │ └── file-1.parquet 70 | │ │ └── ... 71 | │ ├── dataset-3 72 | │ └── dataset-collection-2 73 | │ ├── dataset-4 74 | │ │ └── file-1.jsonl 75 | │ └── ... 76 | └── input_ids 77 | └── dataset-collection-2 78 | ├── dataset-4 79 | │ └── file-1 80 | │ └── wo_ppl 81 | │ ├── splitted_part-0001.parquet 82 | │ ├── splitted_part-0001-metadata.json 83 | │ └── ... 84 | └── ... 85 | ``` 86 | 87 | The data before tokenization is stored in `parquet` (recommended) or `jsonl` format, which look like: 88 | 89 | ```json 90 | {"text": "This is a sentence."} 91 | {"text": "This is another sentence."} 92 | ``` 93 | 94 | As for data mixing, we mainly use an online spreadsheet to manage the mixing ratio of different course stages. We provide an [example google spreadsheet](https://docs.google.com/spreadsheets/d/1WJTJuZqSr9kVFqVyNwsOHqvgLwDCjBcW3Pz3d6vwHZs/edit?usp=sharing) for reference. 95 | 96 | ### 2. Tokenization 97 | 98 | The following script will tokenize the files in the target path (including all subfolders), and then split the tokenized data into multiple files according to 0.01B Tokens for more fine-grained data mixing. 99 | 100 | ```bash 101 | cd tokenize 102 | bash run_tokenize.sh /data/raw/dataset-collection-2/dataset-4 103 | ``` 104 | 105 | > [!NOTE] 106 | > Please modify the script to set: 107 | > 1. default key for text dataset, e.g. "text", 108 | > 2. tokenizer path, 109 | > 3. the save location of the tokenized files, 110 | > 4. context window size, e.g. 4096, 111 | > 5. the number of threads. 112 | 113 | 114 | ### 3. Data Mixing 115 | 116 | Now we can mix the data for each curriculum phase. The following steps are required: 117 | 118 | 1. Update metadata from the google spreadsheet (required only when metadata changes): 119 | 120 | ```bash 121 | # Copy & paste corresponding columns from spreadsheet following the instruction 122 | cd mix 123 | python update_metadata_from_clipboard.py 124 | 125 | # Check the updated metadata 126 | cat datasets.txt 127 | cat subsets.txt 128 | cat sfts.txt 129 | ``` 130 | 131 | 2. Generate data mix recipe file: 132 | 133 | ```bash 134 | # Copy & paste corresponding column from spreadsheet (e.g. column F for phase 1) following the instruction 135 | python mix_from_clipboard.py 1 136 | cat 01_xxxx_xxxx.json # check the generated recipe file 137 | ``` 138 | 139 | 3. 
Save and pack the mixed data: 140 | 141 | ```bash 142 | python save_from_recipe.py 01_xxxx_xxxx 143 | ``` 144 | -------------------------------------------------------------------------------- /pretrain/preprocess/convert_hf_datasets_to_megatron.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from typing import Type, List, Optional 3 | from enum import Enum 4 | import os 5 | import logging 6 | import shutil 7 | import struct 8 | import time 9 | from tqdm import tqdm 10 | from functools import lru_cache 11 | from types import TracebackType 12 | from typing import List, Optional, Tuple, Type, Union, Any 13 | import datasets 14 | from concurrent.futures import ProcessPoolExecutor 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | _INDEX_HEADER = b"MMIDIDX\x00\x00" 19 | VOCAB_SIZE = 99000 20 | 21 | 22 | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): 23 | """If torch distributed is initialized, log only on rank 24 | 25 | Args: 26 | logger (logging.Logger): The logger to write the logs 27 | 28 | args (Tuple[Any]): All logging.Logger.log positional arguments 29 | 30 | rank (int, optional): The rank to write on. Defaults to 0. 31 | 32 | kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments 33 | """ 34 | # if torch.distributed.is_initialized(): 35 | # if torch.distributed.get_rank() == rank: 36 | # logger.log(*args, **kwargs) 37 | # else: 38 | logger.log(*args, **kwargs) 39 | 40 | 41 | def get_idx_path(path_prefix: str) -> str: 42 | """Get the path to the index file from the prefix 43 | 44 | Args: 45 | path_prefix (str): The prefix 46 | 47 | Returns: 48 | str: The path to the index file 49 | """ 50 | return path_prefix + ".idx" 51 | 52 | 53 | def get_bin_path(path_prefix: str) -> str: 54 | """Get the path to the data file from the prefix 55 | 56 | Args: 57 | path_prefix (str): The prefix 58 | 59 | Returns: 60 | str: The path to the data file 61 | """ 62 | return path_prefix + ".bin" 63 | 64 | 65 | class DType(Enum): 66 | """The NumPy data type Enum for writing/reading the IndexedDataset indices""" 67 | 68 | uint8 = 1 69 | int8 = 2 70 | int16 = 3 71 | int32 = 4 72 | int64 = 5 73 | float64 = 6 74 | float32 = 7 75 | uint16 = 8 76 | 77 | @classmethod 78 | def code_from_dtype(cls, value: Type[numpy.number]) -> int: 79 | """Get the code from the dtype 80 | 81 | Args: 82 | value (Type[numpy.number]): The dtype 83 | 84 | Returns: 85 | int: The code 86 | """ 87 | return cls[value.__name__].value 88 | 89 | @classmethod 90 | def dtype_from_code(cls, value: int) -> Type[numpy.number]: 91 | """Get the dtype from the code 92 | 93 | Args: 94 | value (int): The code 95 | 96 | Returns: 97 | Type[numpy.number]: The dtype 98 | """ 99 | return getattr(numpy, cls(value).name) 100 | 101 | @staticmethod 102 | def size(key: Union[int, Type[numpy.number]]) -> int: 103 | """Get the size of the dtype/code in bytes 104 | 105 | Args: 106 | key (Union[int, Type[numpy.number]]): The dtype or code 107 | 108 | Raises: 109 | ValueError: If the key is neither dtype nor integer code 110 | 111 | Returns: 112 | int: The size of the dtype/code in in bytes 113 | """ 114 | if isinstance(key, int): 115 | return DType.dtype_from_code(key)().itemsize 116 | elif numpy.number in key.__mro__: 117 | return key().itemsize 118 | else: 119 | raise ValueError 120 | 121 | @staticmethod 122 | def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: 123 | """Get the dtype to use for an index of a certain cardinality 124 | 125 
|         Args:
126 |             cardinality (Optional[int]): The number of elements to be indexed
127 | 
128 |         Returns:
129 |             Type[numpy.number]: The dtype to use for the index
130 |         """
131 |         if cardinality is not None and cardinality < 65500:
132 |             return numpy.uint16
133 |         else:
134 |             return numpy.int32
135 | 
136 | 
137 | class _IndexWriter(object):
138 |     """Object class to write the index (.idx) file
139 | 
140 |     Args:
141 |         idx_path (str): The path to the index file
142 | 
143 |         dtype (Type[numpy.number]): The dtype of the index file
144 |     """
145 | 
146 |     def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None:
147 |         self.idx_path = idx_path
148 |         self.dtype = dtype
149 | 
150 |     def __enter__(self) -> "_IndexWriter":
151 |         """Enter the context introduced by the 'with' keyword
152 | 
153 |         Returns:
154 |             _IndexWriter: The instance
155 |         """
156 |         self.idx_writer = open(self.idx_path, "wb")
157 |         # fixed, vestigial practice
158 |         self.idx_writer.write(_INDEX_HEADER)
159 |         # fixed, vestigial practice
160 |         self.idx_writer.write(struct.pack("<Q", 1))
161 |         # the numeric code for the dtype
162 |         self.idx_writer.write(struct.pack("<B", DType.code_from_dtype(self.dtype)))
163 |         return self
164 | 
165 |     def __exit__(
166 |         self,
167 |         exc_type: Optional[Type[BaseException]],
168 |         exc_val: Optional[BaseException],
169 |         exc_tb: Optional[TracebackType],
170 |     ) -> Optional[bool]:
171 |         """Exit the context introduced by the 'with' keyword
172 | 
173 |         Args:
174 |             exc_type (Optional[Type[BaseException]]): Exception type
175 | 
176 |             exc_val (Optional[BaseException]): Exception value
177 | 
178 |             exc_tb (Optional[TracebackType]): Exception traceback object
179 | 
180 |         Returns:
181 |             Optional[bool]: Whether to silence the exception
182 |         """
183 |         self.idx_writer.close()
184 | 
185 |     def write(
186 |         self,
187 |         sequence_lengths: List[int],
188 |         sequence_modes: Optional[List[int]],
189 |         document_indices: List[int],
190 |     ) -> None:
191 |         """Write the index (.idx) file
192 | 
193 |         Args:
194 |             sequence_lengths (List[int]): The length of each sequence
195 | 
196 |             sequence_modes (Optional[List[int]]): The mode of each sequence
197 | 
198 |             document_indices (List[int]): The sequence indices demarcating the end of each document
199 |         """
200 |         sequence_pointers = self._sequence_pointers(sequence_lengths)
201 | 
202 |         # the number of sequences in the dataset
203 |         sequence_count = len(sequence_lengths)
204 |         self.idx_writer.write(struct.pack("<Q", sequence_count))
205 | 
206 |         # the number of documents in the dataset
207 |         document_count = len(document_indices)
208 |         self.idx_writer.write(struct.pack("<Q", document_count))
209 | 
210 |         # write the length of each sequence
211 |         sequence_lengths = numpy.array(sequence_lengths, dtype=numpy.int32)
212 |         self.idx_writer.write(sequence_lengths.tobytes(order="C"))
213 |         del sequence_lengths
214 | 
215 |         # write the pointer to the beginning of each sequence
216 |         sequence_pointers = numpy.array(sequence_pointers, dtype=numpy.int64)
217 |         self.idx_writer.write(sequence_pointers.tobytes(order="C"))
218 |         del sequence_pointers
219 | 
220 |         # write the sequence indices marking the end of each document
221 |         document_indices = numpy.array(document_indices, dtype=numpy.int64)
222 |         self.idx_writer.write(document_indices.tobytes(order="C"))
223 | 
224 |         # write the mode of each sequence, if present
225 |         if sequence_modes is not None:
226 |             sequence_modes = numpy.array(sequence_modes, dtype=numpy.int8)
227 |             self.idx_writer.write(sequence_modes.tobytes(order="C"))
228 |             del sequence_modes
229 | 
230 |     def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
231 |         """Build the sequence pointers per the sequence lengths and dtype size
232 | 
233 |         Args:
234 |             sequence_lengths (List[int]): The length of each sequence
235 | 
236 |         Returns:
237 |             List[int]: The pointer to the beginning of each sequence
238 |         """
239 |         itemsize = DType.size(self.dtype)
240 |         curr_ptr = 0
241 |         list_ptr = []
242 |         for length in sequence_lengths:
243 |             list_ptr.append(curr_ptr)
244 |             curr_ptr += length * itemsize
245 |         return list_ptr
246 | 
247 | 
248 | class _IndexReader(object):
249 |     """Object class to read the index (.idx) file
250 | 
251 |     Args:
252 |         idx_path (str): The path to the index file
253 | 
254 |         multimodal (bool): Whether the dataset is multimodal
255 |     """
256 | 
257 |     def __init__(self, idx_path: str, multimodal: bool) -> None:
258 | 
259 |         log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}")
260 | 
261 |         with open(idx_path, "rb") as stream:
262 |             header = stream.read(9)
263 |             assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}"
264 | 
265 |             version = struct.unpack("<Q", stream.read(8))[0]
266 |             assert version == 1, f"bad version, cannot read: {idx_path}"
267 | 
268 |             code = struct.unpack("<B", stream.read(1))[0]
269 |             self.dtype = DType.dtype_from_code(code)
270 |             self.dtype_size = DType.size(self.dtype)
271 | 
272 |             self.sequence_count = struct.unpack("<Q", stream.read(8))[0]
273 |             self.document_count = struct.unpack("<Q", stream.read(8))[0]
274 | 
275 |             offset = stream.tell()
276 | 
277 |         self.bin_buffer_mmap = numpy.memmap(idx_path, mode="r", order="C")
278 |         self.bin_buffer = memoryview(self.bin_buffer_mmap)
279 | 
280 |         log_single_rank(logger, logging.INFO, f"\tExtract the sequence lengths")
281 |         t_beg = time.time()
282 |         self.sequence_lengths = numpy.frombuffer(
283 |             self.bin_buffer, dtype=numpy.int32, count=self.sequence_count, offset=offset
284 |         )
285 |         t_end = time.time()
286 |         log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
287 | 
288 |         log_single_rank(logger, logging.INFO, f"\tExtract the sequence pointers")
289 |         t_beg = time.time()
290 |         self.sequence_pointers = numpy.frombuffer(
291 |             self.bin_buffer,
292 |             dtype=numpy.int64,
293 |             count=self.sequence_count,
294 |             offset=offset + self.sequence_lengths.nbytes,
295 |         )
296 |         t_end = time.time()
297 |         log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
298 | 
299 |         log_single_rank(logger, logging.INFO, f"\tExtract the document indices")
300 |         t_beg = time.time()
301 |         self.document_indices = numpy.frombuffer(
302 |             self.bin_buffer,
303 |             dtype=numpy.int64,
304 |             count=self.document_count,
305 |             offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes,
306 |         )
307 |         t_end = time.time()
308 |         log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
309 | 
310 |         self.sequence_modes = None
311 |         if multimodal:
312 |             log_single_rank(logger, logging.INFO, f"\tExtract the sequence modes")
313 |             t_beg = time.time()
314 |             self.sequence_modes = numpy.frombuffer(
315 |                 self.bin_buffer,
316 |                 dtype=numpy.int8,
317 |                 count=self.sequence_count,
318 |                 offset=offset
319 |                 + self.sequence_lengths.nbytes
320 |                 + self.sequence_pointers.nbytes
321 |                 + self.document_indices.nbytes,
322 |             )
323 |             t_end = time.time()
324 |             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
325 | 
326 |         assert self.sequence_lengths.shape[0] == len(self)
327 |         assert self.sequence_lengths.shape[0] == self.sequence_count
328 |         assert self.sequence_lengths.shape[0] == self.document_indices[-1]
329 | 
330 |         log_single_rank(logger, logging.INFO, f"> total number of sequences: {len(self)}")
331 |         log_single_rank(
332 |             logger,
333 |             logging.INFO,
334 |             f"> total number of documents: {self.document_indices.shape[0] - 1}",
335 |         )
336 | 
337 |     def __del__(self) -> None:
338 |         """Clean up the object"""
339 |         if hasattr(self, "bin_buffer_mmap"):
340 |             self.bin_buffer_mmap._mmap.close()
341 |             del self.bin_buffer_mmap
342 | 
343 |     def __len__(self) -> int:
344 |         """Return the length of the dataset
345 | 
346 |         Returns:
347 |             int: The length of the dataset
348 |         """
349 |         return self.sequence_count
350 | 
351 |     @lru_cache(maxsize=8)
352 |     def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]:
353 |         """Return the pointer, length, and mode at the index
354 | 
355 |         Args:
356 |             idx (int): The index into the dataset
357 | 
358 |         Returns:
359 |             Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: The pointer, length and mode at the index
360 |         """
361 |         return (
362 |             self.sequence_pointers[idx],
363 |             self.sequence_lengths[idx],
364 |             self.sequence_modes[idx] if self.sequence_modes is not None else None,
365 |         )
366 | 
367 | 
368 | class IndexedDatasetBuilder(object):
369 |     """Builder class for the IndexedDataset class
370 | 
371 |     Args:
372 |         bin_path (str): The path to the data (.bin) file
373 | 
374 |         dtype (Type[numpy.number], optional): The dtype of the index file.
375 |             Defaults to numpy.int32.
376 |     """
377 | 
378 | 
379 |     def __init__(
380 |         self, bin_path: str, dtype: Type[numpy.number] = numpy.int32
381 |     ) -> None:
382 |         self.data_file = open(bin_path, "wb")
383 |         self.dtype = dtype
384 | 
385 |         self.sequence_lengths = []
386 |         self.document_indices = [0]
387 | 
388 |     def add_document(
389 |         self, input_ids: List[int], token_length: int, modes: Optional[List[int]] = None
390 |     ) -> None:
391 |         """Add an entire document to the dataset
392 | 
393 |         Args:
394 |             input_ids (List[int]): The token ids of the document to add
395 | 
396 |             token_length (int): The number of tokens in the document
397 | 
398 |             modes (Optional[List[int]], optional): The modes for each item in the document.
Defaults to None. 399 | """ 400 | np_array = numpy.array(input_ids, dtype=self.dtype) 401 | self.data_file.write(np_array.tobytes(order="C")) 402 | self.sequence_lengths.extend([token_length]) 403 | self.document_indices.append(len(self.sequence_lengths)) 404 | 405 | def add_index(self, path_prefix: str) -> None: 406 | """Add an entire IndexedDataset to the dataset 407 | 408 | Args: 409 | path_prefix (str): The index (.idx) and data (.bin) prefix 410 | """ 411 | # Concatenate index 412 | index = _IndexReader(get_idx_path(path_prefix), multimodal=False) 413 | assert index.dtype == self.dtype 414 | 415 | offset = len(self.sequence_lengths) 416 | self.sequence_lengths.extend(index.sequence_lengths) 417 | self.document_indices.extend((offset + index.document_indices)[1:]) 418 | 419 | # Concatenate data 420 | with open(get_bin_path(path_prefix), "rb") as f: 421 | shutil.copyfileobj(f, self.data_file) 422 | 423 | def finalize(self, idx_path: str) -> None: 424 | """Clean up and write the index (.idx) file 425 | 426 | Args: 427 | idx_path (str): The path to the index file 428 | """ 429 | self.data_file.close() 430 | with _IndexWriter(idx_path, self.dtype) as writer: 431 | writer.write(self.sequence_lengths, None, self.document_indices) 432 | 433 | 434 | def process_partition(kwargs): 435 | 436 | partition: str = kwargs['partition'] 437 | output_prefix: str = kwargs['output_prefix'] 438 | json_keys: List[str] = kwargs['json_keys'] 439 | 440 | output_bin_files = {} 441 | output_idx_files = {} 442 | builders = {} 443 | level = 'document' 444 | for key in json_keys: 445 | output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, 446 | key, level) 447 | output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, 448 | key, level) 449 | builders[key] = IndexedDatasetBuilder( 450 | output_bin_files[key], 451 | dtype=DType.optimal_dtype(VOCAB_SIZE), 452 | ) 453 | 454 | encoded_docs = datasets.load_dataset(partition, split='train', streaming=True) 455 | for json_dict in encoded_docs: 456 | for key in json_keys: 457 | content = json_dict[key] 458 | builders[key].add_document(content, len(content)) 459 | for key in json_keys: 460 | builders[key].finalize(output_idx_files[key]) 461 | 462 | 463 | def convert_hf_dataset(dataset_path: str, json_keys: List[str], output_prefix: str, num_workers: int = 16): 464 | 465 | output_bin_files = {} 466 | output_idx_files = {} 467 | builders = {} 468 | level = 'document' 469 | 470 | dataset_names = sorted(os.listdir(dataset_path)) 471 | dataset_names = [d for d in dataset_names if not d.startswith('.')] 472 | in_ss_out_names = [{'partition': os.path.join(dataset_path, d), 'output_prefix': d, 'json_keys': json_keys} for d in dataset_names] 473 | 474 | # process the dataset in parallel 475 | with ProcessPoolExecutor(num_workers) as executor: 476 | p = executor.map(process_partition, in_ss_out_names) 477 | for _ in tqdm(p, total=len(in_ss_out_names), desc="Processing dataset"): 478 | pass 479 | 480 | # collect different subsets into the same bin file 481 | for key in json_keys: 482 | output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, 483 | key, level) 484 | output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, 485 | key, level) 486 | builders[key] = IndexedDatasetBuilder( 487 | output_bin_files[key], 488 | dtype=DType.optimal_dtype(VOCAB_SIZE), 489 | ) 490 | 491 | for name in in_ss_out_names: 492 | parition_output_prefix = name['output_prefix'] 493 | full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, 494 | key, level) 495 | 
builders[key].add_index(full_partition_output_prefix) 496 | builders[key].finalize(output_idx_files[key]) 497 | 498 | 499 | if __name__ == "__main__": 500 | convert_hf_dataset('/data/hf_dataset/myl_new_no_math/17_20241108_004951', ['input_ids'], '17_20241108_004951') 501 | -------------------------------------------------------------------------------- /pretrain/preprocess/mix/update_metadata_from_clipboard.py: -------------------------------------------------------------------------------- 1 | datasets = [] 2 | during_dataset = False 3 | DATASET_COLUMN = 'DatasetName' 4 | 5 | subsets = [] 6 | during_subset = False 7 | SUBSET_COLUMN = 'SubsetName' 8 | 9 | sfts = [] 10 | during_sft = False 11 | ISSFT_COLUMN = 'IsSFT' 12 | 13 | print(f"Paste the '{DATASET_COLUMN}' column") 14 | 15 | while True: 16 | 17 | dataset = input() 18 | 19 | if dataset == "END_OF_DATASET": 20 | datasets.append(dataset) 21 | with open("datasets.txt", "w") as f: 22 | f.write("\n".join(datasets)) 23 | print(f"'{DATASET_COLUMN}' column saved. Then you can paste the '{SUBSET_COLUMN}' column.") 24 | 25 | elif dataset == "END_OF_SUBSET": 26 | subsets.append(dataset) 27 | with open("subsets.txt", "w") as f: 28 | f.write("\n".join(subsets)) 29 | print(f"'{SUBSET_COLUMN}' column saved. Then you can paste the '{ISSFT_COLUMN}' column.") 30 | 31 | elif dataset == "END_OF_SFT": 32 | sfts.append(dataset) 33 | with open("sfts.txt", "w") as f: 34 | f.write("\n".join(sfts)) 35 | print("'{ISSFT_COLUMN}' column saved. then you can press Ctrl+C to exit") 36 | 37 | elif dataset == DATASET_COLUMN: 38 | during_dataset = True 39 | continue 40 | 41 | elif dataset == SUBSET_COLUMN: 42 | during_subset = True 43 | continue 44 | 45 | elif dataset == ISSFT_COLUMN: 46 | during_sft = True 47 | continue 48 | 49 | if during_dataset: 50 | datasets.append(dataset.strip()) 51 | 52 | if during_subset: 53 | subsets.append(dataset.strip()) 54 | 55 | if during_sft: 56 | sfts.append(dataset.strip()) 57 | -------------------------------------------------------------------------------- /pretrain/preprocess/tokenize/run_tokenize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | data_path=$1 4 | 5 | tokenizer_path= 6 | num_file=10000 7 | num_worker=8 8 | # num_file means how many jsonl/json/parquet files to tokenize at once (to avoid memory overflow). If num_file < actual number of files, simply run the script multiple times to tokenize all files. 9 | 10 | export RAW_DATA_PREFIX="/data/raw" 11 | export INPUT_IDS_PREFIX="/data/input_ids" 12 | # target save path for tokenized data. The tokenized data will retain the same directory structure as the raw data. 13 | 14 | echo num_file=$num_file num_worker=$num_worker 15 | 16 | # check if data_path exists 17 | if [ ! -d "$data_path" ]; then 18 | echo "$data_path does not exist." 19 | exit 20 | else 21 | echo $data_path 22 | fi 23 | 24 | 25 | python tokenize/tokenize_text.py \ 26 | --tokenizer_path $tokenizer_path \ 27 | --data_path $data_path \ 28 | --model_name mini \ 29 | --num_file $num_file \ 30 | --text_key text \ 31 | --num_worker $num_worker \ 32 | --skip_exist True 33 | 34 | # split data by 0.01B tokens 35 | python tokenize/split_data.py $data_path 36 | 37 | # delete intermediate tokenization files 38 | cat datasets_to_delete.txt | xargs -I {} rm {} 39 | rm datasets_to_delete.txt 40 | 41 | # incase of missing deletion 42 | if [ -n "$(find . -type f -regex '.*part-[0-9]+\.jsonl')" ]; then 43 | find . 
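To make the `.bin`/`.idx` layout concrete, here is a minimal round-trip sketch of the builder and reader defined in this file. It assumes the `_IndexReader`, `get_idx_path`, and `get_bin_path` helpers from earlier in the file are in scope; the file names are placeholders, and the fixed `numpy.int32` dtype stands in for `DType.optimal_dtype(VOCAB_SIZE)`.

```python
import numpy

# Build a tiny .bin/.idx pair containing two toy documents.
builder = IndexedDatasetBuilder("toy_input_ids_document.bin", dtype=numpy.int32)
builder.add_document([101, 7, 42, 102], token_length=4)
builder.add_document([101, 13, 102], token_length=3)
builder.finalize("toy_input_ids_document.idx")

# Read it back: the index exposes per-sequence pointers and lengths.
index = _IndexReader("toy_input_ids_document.idx", multimodal=False)
print(len(index))              # 2 sequences
print(index[0])                # (byte pointer, length, mode) of the first document
print(index.document_indices)  # roughly array([0, 1, 2])
```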
--------------------------------------------------------------------------------
/pretrain/preprocess/mix/update_metadata_from_clipboard.py:
--------------------------------------------------------------------------------
 1 | datasets = []
 2 | during_dataset = False
 3 | DATASET_COLUMN = 'DatasetName'
 4 | 
 5 | subsets = []
 6 | during_subset = False
 7 | SUBSET_COLUMN = 'SubsetName'
 8 | 
 9 | sfts = []
10 | during_sft = False
11 | ISSFT_COLUMN = 'IsSFT'
12 | 
13 | print(f"Paste the '{DATASET_COLUMN}' column")
14 | 
15 | while True:
16 | 
17 |     dataset = input()
18 | 
19 |     if dataset == "END_OF_DATASET":
20 |         during_dataset = False  # stop collecting so the sentinel is not appended twice below
21 |         datasets.append(dataset)
22 |         with open("datasets.txt", "w") as f:
23 |             f.write("\n".join(datasets))
24 |         print(f"'{DATASET_COLUMN}' column saved. Then you can paste the '{SUBSET_COLUMN}' column.")
25 | 
26 |     elif dataset == "END_OF_SUBSET":
27 |         during_subset = False
28 |         subsets.append(dataset)
29 |         with open("subsets.txt", "w") as f:
30 |             f.write("\n".join(subsets))
31 |         print(f"'{SUBSET_COLUMN}' column saved. Then you can paste the '{ISSFT_COLUMN}' column.")
32 | 
33 |     elif dataset == "END_OF_SFT":
34 |         during_sft = False
35 |         sfts.append(dataset)
36 |         with open("sfts.txt", "w") as f:
37 |             f.write("\n".join(sfts))
38 |         print(f"'{ISSFT_COLUMN}' column saved. Then you can press Ctrl+C to exit.")
39 | 
40 |     elif dataset == DATASET_COLUMN:
41 |         during_dataset = True
42 |         continue
43 | 
44 |     elif dataset == SUBSET_COLUMN:
45 |         during_subset = True
46 |         continue
47 | 
48 |     elif dataset == ISSFT_COLUMN:
49 |         during_sft = True
50 |         continue
51 | 
52 |     if during_dataset:
53 |         datasets.append(dataset.strip())
54 | 
55 |     if during_subset:
56 |         subsets.append(dataset.strip())
57 | 
58 |     if during_sft:
59 |         sfts.append(dataset.strip())
--------------------------------------------------------------------------------
/pretrain/preprocess/tokenize/run_tokenize.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | data_path=$1
 4 | 
 5 | tokenizer_path=  # fill in the path to your tokenizer
 6 | num_file=10000
 7 | num_worker=8
 8 | # num_file is how many jsonl/json/parquet files to tokenize at once (to avoid memory overflow).
 9 | # If num_file < the actual number of files, simply run the script multiple times to tokenize all files.
10 | 
11 | export RAW_DATA_PREFIX="/data/raw"
12 | export INPUT_IDS_PREFIX="/data/input_ids"
13 | # target save path for the tokenized data; it keeps the same directory structure as the raw data
14 | 
15 | echo num_file=$num_file num_worker=$num_worker
16 | 
17 | # check that data_path exists
18 | if [ ! -d "$data_path" ]; then
19 |     echo "$data_path does not exist."
20 |     exit 1
21 | else
22 |     echo $data_path
23 | fi
24 | 
25 | 
26 | python tokenize/tokenize_text.py \
27 |     --tokenizer_path $tokenizer_path \
28 |     --data_path $data_path \
29 |     --model_name mini \
30 |     --num_files $num_file \
31 |     --text_key text \
32 |     --num_worker $num_worker \
33 |     --skip_exist True
34 | 
35 | # split the data into ~0.01B-token shards
36 | python tokenize/split_data.py $data_path
37 | 
38 | # delete intermediate tokenization files
39 | cat datasets_to_delete.txt | xargs -I {} rm {}
40 | rm datasets_to_delete.txt
41 | 
42 | # in case any intermediate files were missed during deletion
43 | if [ -n "$(find . -type f -regex '.*part-[0-9]+\.jsonl')" ]; then
44 |     find . -type f -regex '.*part-[0-9]+\.jsonl'
45 |     echo "Please check the intermediate part-xx.jsonl files listed above and delete them manually."
46 | fi
--------------------------------------------------------------------------------
/pretrain/preprocess/tokenize/split_data.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import glob
  4 | import pathlib
  5 | import numpy as np
  6 | import sys
  7 | import threading
  8 | from tqdm import tqdm
  9 | import multiprocessing as mp
 10 | import pyarrow
 11 | from pyarrow import parquet as pq
 12 | 
 13 | # split the data into shards of ~0.01B tokens (this is a soft limit)
 14 | MAX_TOKEN = int(0.01 * 1000 * 1000 * 1000)
 15 | 
 16 | if len(sys.argv) >= 3:
 17 |     father_datasets = list(sys.argv)[2:]
 18 |     datasets_to_delete = sys.argv[1]
 19 | else:
 20 |     father_datasets = list(sys.argv)[1:]
 21 |     datasets_to_delete = "datasets_to_delete.txt"
 22 | print("father_datasets", father_datasets)
 23 | print("datasets_to_delete", datasets_to_delete)
 24 | 
 25 | metadata_columns = ["source"]
 26 | 
 27 | # replace the raw data path with the input_ids path
 28 | raw_data_prefix = os.environ["RAW_DATA_PREFIX"]
 29 | input_ids_prefix = os.environ["INPUT_IDS_PREFIX"]
 30 | 
 31 | father_datasets = [
 32 |     i.replace(raw_data_prefix, input_ids_prefix)
 33 |     for i in father_datasets
 34 | ]
 35 | 
 36 | 
 37 | def warn(msg):
 38 |     print("\033[0;33m" + msg + "\033[0m")
 39 | 
 40 | 
 41 | def process_file(src_folder, src_file, print_id, write_format="parquet", is_last=False, last_states=None, max_part=-1):
 42 | 
 43 |     def write_to_file(all_data, num_tokens: int, cur_idx: int, metadata: list):
 44 |         """Write one split shard and its metadata."""
 45 | 
 46 |         cur_idx = f"{cur_idx:04d}"
 47 |         if write_format == "jsonl":
 48 |             tgt_path = os.path.join(src_folder, "splitted_part-{}.jsonl".format(cur_idx))
 49 |             print(print_id, "updating", tgt_path, num_tokens)
 50 |             with open(tgt_path, "w") as fout:
 51 |                 for tmp_data in all_data:
 52 |                     fout.write(json.dumps({"input_ids": tmp_data}, ensure_ascii=False) + "\n")
 53 | 
 54 |         elif write_format == "parquet":
 55 |             tgt_path = os.path.join(src_folder, "splitted_part-{}.parquet".format(cur_idx))
 56 |             print(print_id, "updating", tgt_path, num_tokens)
 57 |             arr = pyarrow.array(all_data)
 58 |             pq.write_table(pyarrow.Table.from_arrays([arr], ["input_ids"]), tgt_path)
 59 | 
 60 |         tokens_num_tgt_path = os.path.join(src_folder, "splitted_part-{}-metadata.json".format(cur_idx))
 61 |         with open(tokens_num_tgt_path, "w") as fout:
 62 |             json.dump({"total_tokens_num": num_tokens, "metadata": metadata}, fout, indent=2)
 63 | 
 64 |     def load_data_jsonl(fin):
 65 |         """Read one line and return it as parsed json data."""
 66 |         data = fin.readline().strip()
 67 |         if not data:
 68 |             return None
 69 |         else:
 70 |             json_data = json.loads(data)
 71 |             input_ids = json_data["input_ids"]
 72 |             meta_data = {col: json_data[col] for col in metadata_columns if col in json_data}
 73 |             meta_data["num_tokens"] = len(input_ids)
 74 |             return (input_ids, meta_data)
 75 | 
 76 |     all_data = []
 77 |     metadata = []
 78 |     num_tokens = 0
 79 |     cur_idx = max_part + 1
 80 |     if last_states is not None:
 81 |         all_data, num_tokens, cur_idx, metadata = last_states
 82 | 
 83 |     if src_file.endswith(".parquet"):
 84 | 
 85 |         warn("Deprecated: parquet files as an intermediate format are deprecated. Please use the jsonl format instead.")
 86 | 
 87 |         # parquet read
 88 |         table = pq.read_table(src_file)
 89 |         all_all_data = table["input_ids"].to_pylist()
 90 |         for ids in all_all_data:
 91 |             all_data.append(ids)
 92 |             metadata.append({"num_tokens": len(ids)})
 93 |             num_tokens += len(ids)
 94 |             if num_tokens > MAX_TOKEN:
 95 |                 # flush the new split shard to file
 96 |                 write_to_file(all_data, num_tokens, cur_idx, metadata)
 97 |                 all_data = []
 98 |                 metadata = []
 99 |                 num_tokens = 0
100 |                 cur_idx += 1
101 |                 print(print_id, "next split", cur_idx)
102 | 
103 |         # trailing lines
104 |         if len(all_data) > 0 and is_last:
105 |             write_to_file(all_data, num_tokens, cur_idx, metadata)
106 | 
107 |     elif src_file.endswith(".jsonl"):
108 | 
109 |         # jsonl: read line by line
110 |         with open(src_file) as fin:
111 |             while True:
112 |                 data = load_data_jsonl(fin)
113 |                 if data is None:
114 |                     break
115 | 
116 |                 # add data
117 |                 all_data.append(data[0])
118 |                 metadata.append(data[1])
119 |                 num_tokens += data[1]["num_tokens"]
120 |                 if num_tokens > MAX_TOKEN:
121 |                     # flush the new split shard to file
122 |                     write_to_file(all_data, num_tokens, cur_idx, metadata)
123 |                     all_data = []
124 |                     metadata = []
125 |                     num_tokens = 0
126 |                     cur_idx += 1
127 | 
128 |         # trailing lines of the whole wo_ppl folder
129 |         if len(all_data) > 0 and is_last:
130 |             write_to_file(all_data, num_tokens, cur_idx, metadata)
131 | 
132 |     with open(datasets_to_delete, "a") as f:
133 |         f.write(src_file + "\n")
134 |         print(print_id, src_file, "added to delete list")
135 | 
136 |     # pass the states on to the next file
137 |     return all_data, num_tokens, cur_idx, metadata
138 | 
139 | 
140 | def do_parts(src_folder, src_files, max_part: int):
141 |     """Process all parts (each part is a tokenized dataset generated by ONE thread in `tokenize_text.py`) in one folder."""
142 | 
143 |     last_states = None
144 |     sort_files = sorted(src_files, key=lambda x: int(x.split("-")[-1].split(".")[0]))
145 |     length = len(sort_files)
146 |     for idx, src_file in enumerate(sort_files):
147 |         last_states = process_file(src_folder, src_file, last_states=last_states, is_last=(idx == length - 1), max_part=max_part, print_id=os.getpid())
148 | 
149 | 
150 | def process_dataset(fd):
151 |     datasets = os.listdir(fd)
152 |     folder2file = {}
153 |     for dataset_name in tqdm(datasets):
154 |         raw_src_folder = os.path.join(fd, dataset_name)
155 |         print("Finding intermediate results in {} ...".format(raw_src_folder))
156 | 
157 |         try:
158 |             for root_dir, _, files in os.walk(raw_src_folder, topdown=False):
159 |                 max_part = max([int(fp.split("-")[-1].split(".")[0]) for fp in files if "splitted_part" in fp and "metadata" not in fp], default=-1)
160 |                 for fp in files:
161 |                     if "sort" in fp or "splitted_part" in fp:
162 |                         continue
163 |                     if not fp.endswith(".jsonl") and not fp.endswith(".parquet"):
164 |                         continue
165 |                     if root_dir not in folder2file:
166 |                         folder2file[root_dir] = ([], max_part)
167 |                     folder2file[root_dir][0].append(os.path.join(root_dir, fp))
168 | 
169 |         except FileNotFoundError as e:
170 |             print("Error Dataset: {} ({})".format(dataset_name, e))
171 |             continue
172 |         except NotADirectoryError as e:
173 |             print("Error Dataset: {} ({})".format(dataset_name, e))
174 |             continue
175 | 
176 |         if len(folder2file) == 0:
177 |             print("Error Dataset: {} (len(folder2file) == 0)".format(dataset_name))
178 |             continue
179 | 
180 |         # process all files in parallel
181 |         folder_n = len(folder2file)
182 |         p = mp.Pool(32)
183 |         for idx, (src_folder, (src_files, max_part)) in enumerate(folder2file.items()):
184 |             print(f"Splitting {idx + 1} / {folder_n}", src_folder, len(src_files))
185 |             p.apply_async(do_parts, args=(src_folder, src_files, max_part))
186 |         p.close()
187 |         p.join()
188 | 
189 |         warn(f"finished {raw_src_folder}")
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     try:
194 |         for fd in father_datasets:
195 |             process_dataset(fd)
196 |     except (Exception, KeyboardInterrupt) as e:
197 |         warn(f"Early abort. Please manually delete the files listed in {datasets_to_delete}")
198 |         raise e
--------------------------------------------------------------------------------
/pretrain/preprocess/tokenize/tokenize_text.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import multiprocessing as mp
  4 | import os
  5 | import pathlib
  6 | import random
  7 | import re
  8 | import signal
  9 | from copy import deepcopy
 10 | 
 11 | import numpy as np
 12 | import pyarrow
 13 | from pyarrow import parquet as pq
 14 | from tqdm import tqdm, trange
 15 | 
 16 | from transformers import AutoTokenizer
 17 | 
 18 | random.seed(42)
 19 | 
 20 | # Max number of lines per file to tokenize at once, to avoid OOM
 21 | MAX_DATA = int(1e7)
 22 | 
 23 | # replace the raw data path with the input_ids path
 24 | raw_data_prefix = os.environ["RAW_DATA_PREFIX"]
 25 | input_ids_prefix = os.environ["INPUT_IDS_PREFIX"]
 26 | 
 27 | SKIP_TOKENIZATION_EXTENSIONS = {".py", ".git", ".md", ".png", ".jpg"}
 28 | 
 29 | 
 30 | def get_tgt_folder(file_path, model_name):
 31 |     """Each jsonl or parquet file will generate a folder with the same name."""
 32 | 
 33 |     # token id folder directory
 34 |     file_path = file_path.replace(raw_data_prefix,
 35 |                                   input_ids_prefix)
 36 | 
 37 |     # remove the file extension
 38 |     tgt_folder = file_path[:file_path.rfind(".")]
 39 |     tgt_folder = os.path.join(tgt_folder, "wo_ppl")
 40 |     is_exists = os.path.exists(tgt_folder)
 41 |     pathlib.Path(tgt_folder).mkdir(parents=True, exist_ok=True)
 42 |     return tgt_folder, is_exists
 43 | 
 44 | 
 45 | def warn(msg):
 46 |     print("\033[0;33m" + str(msg) + "\033[0m")
 47 | 
 48 | 
 49 | def clean_fn(text: str) -> str:
 50 |     """Data cleaning function. Important notice: this function applies to ALL the text data."""
 51 |     if not isinstance(text, str):
 52 |         warn(f"Type Error: {type(text)} {str(text)[:10]}...")
 53 |         text = str(text)
 54 | 
 55 |     text = text.replace("\u3000", " ")  # remove wide spaces
 56 | 
 57 |     return text
 58 | 
 59 | 
 60 | def tokenize_text(dataset,
 61 |                   src_folder,
 62 |                   file_nos,
 63 |                   tgt_folder,
 64 |                   idx,
 65 |                   text_key,
 66 |                   is_first,
 67 |                   skip_exists: bool = False):
 68 |     tgt_path = os.path.join(tgt_folder, "part-{}.jsonl".format(idx))
 69 |     if not is_first:
 70 |         write_mode = "a"
 71 |     else:
 72 |         if skip_exists and os.path.exists(tgt_path):
 73 |             warn(f"skip tokenizing {tgt_path}")
 74 |             return
 75 |         write_mode = "w"
 76 | 
 77 |     batch_size = 1000
 78 |     with open(tgt_path, write_mode) as fout:
 79 |         for batch_st in tqdm(range(0, len(dataset), batch_size)):
 80 |             batch_data = dataset[batch_st:batch_st + batch_size]
 81 |             batch_file_nos = file_nos[batch_st:batch_st + batch_size]
 82 |             input_ids = tokenizer([clean_fn(data[text_key]) for data in batch_data],
 83 |                                   add_special_tokens=False)["input_ids"]
 84 |             for ipts, no in zip(input_ids, batch_file_nos):
 85 |                 new_data = {"input_ids": ipts, "source": f"{src_folder}:{no}"}
 86 |                 fout.write(json.dumps(new_data, ensure_ascii=False) + "\n")
 87 | 
 88 | 
 89 | wanna_exit = False
 90 | 
 91 | 
 92 | def interrupt_handler(signum, frame, ask=True):
 93 |     print("Ctrl+C pressed. Waiting for the current process to finish.")
 94 |     global wanna_exit
 95 |     wanna_exit = True
 96 | 
 97 | 
 98 | def start_mp(dataset, is_first, src_folder, file_nos):
 99 |     """dataset: List[Dict[str, str]]"""
100 | 
101 |     if len(dataset) == 0:
102 |         warn("len(dataset) == 0")
103 |         return
104 |     if not isinstance(dataset, list):
105 |         warn("not isinstance(dataset, list)")
106 |         return
107 |     try:
108 |         assert args.text_key in dataset[0]
109 |         text_key = args.text_key
110 |     except AssertionError:
111 |         warn(f"Available Keys: {dataset[0].keys()}")
112 |         raise Exception("Unknown Key!")
113 | 
114 |     # shuffle again, keeping dataset and file_nos aligned
115 |     # (shuffling the two lists independently would break the source line mapping)
116 |     perm = list(range(len(dataset)))
117 |     random.shuffle(perm)
118 |     dataset = [dataset[i] for i in perm]
119 |     file_nos = [file_nos[i] for i in perm]
120 | 
121 |     part_num = args.num_worker
122 |     slice_idx = np.linspace(0, len(dataset), part_num + 1).astype("int")
123 |     p = mp.Pool(part_num)
124 |     for start_id in range(part_num):
125 |         start, end = slice_idx[start_id], slice_idx[start_id + 1]
126 |         new_lines = dataset[start:end]
127 |         # pass the matching slice of file_nos so sources stay aligned with the data
128 |         p.apply_async(tokenize_text,
129 |                       args=(new_lines, src_folder, file_nos[start:end], tgt_folder, start_id, text_key,
130 |                             is_first))
131 |     p.close()
132 |     p.join()
133 |     print("All of the child processes over!")
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     parser = argparse.ArgumentParser()
138 |     parser.add_argument("--tokenizer_path", type=str)
139 |     parser.add_argument("--model_name", type=str)
140 |     parser.add_argument("--data_path", type=str)
141 |     parser.add_argument("--num_files", type=int)
142 |     parser.add_argument("--text_key", type=str)
143 |     parser.add_argument("--num_worker", type=int)
144 |     # note: argparse's type=bool treats any non-empty string as True; omit the flag to keep False
145 |     parser.add_argument("--skip_exist", type=bool, default=False)
146 |     args = parser.parse_args()
147 | 
148 |     # load the tokenizer
149 |     kwargs = {}
150 |     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, **kwargs)
151 | 
152 |     # register the signal handler
153 |     signal.signal(signal.SIGINT, interrupt_handler)
154 | 
155 |     # start tokenization
156 |     for root, _, files in os.walk(args.data_path, topdown=False):
157 |         step = 0
158 |         random.shuffle(files)
159 |         for fp in tqdm(files):
160 |             if wanna_exit:
161 |                 print("Tokenization done.")
162 |                 break
163 | 
164 |             file_path = os.path.join(root, fp)
165 | 
166 |             # check the file extension
167 |             skip_tokenization = False
168 |             for ext in SKIP_TOKENIZATION_EXTENSIONS:
169 |                 if file_path.endswith(ext):
170 |                     skip_tokenization = True
171 |                     break
172 |             if skip_tokenization:
173 |                 continue
174 | 
175 |             # check whether the target folder already exists
176 |             tgt_folder, is_exists = get_tgt_folder(file_path, args.model_name)
177 |             if is_exists and args.skip_exist:
178 |                 warn(f"skip {fp}")
179 |                 continue
180 | 
181 |             print("Tokenizing {}".format(file_path))
182 |             print("Target Folder: {}".format(tgt_folder))
183 | 
184 |             # TODO: refactor this ad-hoc input-format dispatch
185 |             if os.path.exists(file_path + "/dataset_info.json"):
186 |                 import datasets
187 |                 ds = datasets.load_from_disk(file_path, streaming=True)
188 |                 started = 0
189 |                 for i in trange(MAX_DATA, desc="Reading Data"):
190 |                     try:
191 |                         # get the dataset & line numbers
192 |                         dataset = [next(ds) for _ in range(320000)]
193 |                         file_nos = [started + i for i in range(len(dataset))]
194 | 
195 |                         start_mp(dataset, True, file_path, file_nos)
196 | 
197 |                         started += len(dataset)
198 |                         step = step + 1
199 |                         if step >= args.num_files:
200 |                             break
201 |                     except StopIteration:
202 |                         break
203 | 
204 |             fin = open(file_path, "r")
205 |             is_jsonl = False
206 | 
207 |             if file_path.endswith(".json"):
208 |                 try:
209 |                     # get the dataset & line numbers
210 |                     dataset = json.load(fin)
211 |                     file_nos = [i for i in range(len(dataset))]
212 | 
213 |                     start_mp(dataset, True, file_path, file_nos)
214 |                     step = step + 1
215 |                     if step >= args.num_files:
216 |                         break
217 |                     continue
218 |                 except json.decoder.JSONDecodeError:
219 |                     is_jsonl = True
220 |                     fin.close()
221 |                     # reopen for jsonl
222 |                     fin = open(file_path, "r")
223 | 
224 |             if file_path.endswith(".jsonl") or is_jsonl:
225 |                 is_finish = False
226 |                 is_first = True
227 |                 started = 0
228 |                 while True:
229 |                     # get the dataset
230 |                     dataset = []
231 |                     for i in trange(MAX_DATA, desc="Reading Data"):
232 |                         tmp_data = fin.readline()
233 |                         if not tmp_data:
234 |                             is_finish = True
235 |                             break
236 |                         try:
237 |                             tmp_data = json.loads(tmp_data)
238 |                             dataset.append(tmp_data)
239 |                         except json.decoder.JSONDecodeError as e:
240 |                             warn(str(e) + tmp_data)
241 |                             continue
242 | 
243 |                     # get the line numbers, then advance the offset for the next chunk
244 |                     file_nos = [started + i for i in range(len(dataset))]
245 |                     start_mp(dataset, is_first, file_path, file_nos)
246 |                     started += len(dataset)
247 |                     is_first = False  # append mode
248 |                     if is_finish:
249 |                         break
250 |             elif file_path.endswith(".parquet"):
251 |                 try:
252 |                     # get the dataset & line numbers
253 |                     table = pq.read_table(file_path)
254 |                     file_nos = [i for i in range(len(table))]
255 |                     start_mp(table.to_pylist(), True, file_path, file_nos)
256 |                 except pyarrow.lib.ArrowInvalid as e:
257 |                     warn(str(e))
258 |                     continue
259 |             else:
260 |                 continue
261 | 
262 |             fin.close()
263 |             step = step + 1
264 |             if step >= args.num_files:
265 |                 break
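The raw-to-tokenized path mapping performed by `get_tgt_folder` is easiest to see with a concrete example. A minimal sketch with a hypothetical input file, mirroring the prefix swap, extension drop, and `wo_ppl` suffix above:

```python
import os

# Hypothetical configuration matching run_tokenize.sh.
os.environ["RAW_DATA_PREFIX"] = "/data/raw"
os.environ["INPUT_IDS_PREFIX"] = "/data/input_ids"

raw_file = "/data/raw/fineweb/shard-00.jsonl"  # hypothetical input file

# Same steps as get_tgt_folder: swap the prefix, drop the extension, append wo_ppl.
tgt = raw_file.replace(os.environ["RAW_DATA_PREFIX"], os.environ["INPUT_IDS_PREFIX"])
tgt = os.path.join(tgt[: tgt.rfind(".")], "wo_ppl")
print(tgt)  # /data/input_ids/fineweb/shard-00/wo_ppl
```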
"/model-00002-of-00002.safetensors") 27 | 28 | target_config = LlamaConfig( 29 | attention_bias=True, 30 | attention_dropout=source_model.config.attention_dropout, 31 | bos_token_id=source_model.config.bos_token_id, 32 | eos_token_id=source_model.config.eos_token_id, 33 | head_dim=source_model.config.hidden_size // source_model.config.num_attention_heads, 34 | hidden_act=source_model.config.hidden_act, 35 | hidden_size=source_model.config.hidden_size, 36 | initializer_range=source_model.config.initializer_range, 37 | intermediate_size=source_model.config.intermediate_size, 38 | max_position_embeddings=source_model.config.max_position_embeddings, 39 | mlp_bias=False, 40 | num_attention_heads=source_model.config.num_attention_heads, 41 | num_hidden_layers=source_model.config.num_hidden_layers, 42 | num_key_value_heads=source_model.config.num_key_value_heads, 43 | pretraining_tp=1, 44 | rms_norm_eps=source_model.config.rms_norm_eps, 45 | rope_scaling=None, 46 | rope_theta=source_model.config.rope_theta, 47 | tie_word_embeddings=False, 48 | torch_dtype=torch.float32, 49 | use_cache=True, 50 | vocab_size=source_model.config.vocab_size, 51 | ) 52 | 53 | state_dict = source_model.state_dict() 54 | state_dict["model.embed_tokens.weight"] = state_dict["model.embed_tokens.weight"] * source_model.config.scale_emb 55 | for i in range(source_model.config.num_hidden_layers): 56 | state_dict[f"model.layers.{i}.self_attn.o_proj.bias"] = torch.zeros((source_model.config.hidden_size,), dtype=state_dict[f"model.layers.{i}.mlp.down_proj.weight"].dtype) 57 | state_dict[f"model.layers.{i}.self_attn.o_proj.weight"] = state_dict[f"model.layers.{i}.self_attn.o_proj.weight"] * source_model.config.scale_depth / math.sqrt(source_model.config.num_hidden_layers) 58 | state_dict[f"model.layers.{i}.mlp.down_proj.weight"] = state_dict[f"model.layers.{i}.mlp.down_proj.weight"] * source_model.config.scale_depth / math.sqrt(source_model.config.num_hidden_layers) 59 | 60 | target_model = LlamaForCausalLM(target_config) 61 | # target_model = source_model 62 | target_model.load_state_dict(state_dict) 63 | 64 | target_model = target_model.to(torch.bfloat16) 65 | target_model.save_pretrained(target_model_path) 66 | print(target_model_path) 67 | 68 | if __name__ == "__main__": 69 | rebalance_weights2(sys.argv[1], method="llama") 70 | -------------------------------------------------------------------------------- /pretrain/scripts/estimate_mfu.py: -------------------------------------------------------------------------------- 1 | # Estimate Model FLOPs Utilization of YuLan-Mini stable training stage 2 | 3 | D = 25 * 40 * 10 ** 9 4 | 5 | N1 = 56 6 | t1 = 10 * 28 * 60 * 60 # 10 stages, 23 hours/stage 7 | 8 | N2 = 48 # shrink the cluster size 9 | t2 = 15 * 32 * 60 * 60 # 15 stages, 32 hours/stage 10 | 11 | T = D / (N1 * t1 + N2 * t2) 12 | print("T =", T) 13 | 14 | C = 312 * 10 ** 12 # A800 GPU chips 15 | B = 1008 # = 56 * 18 = 46 * 21 16 | s = 4096 # seq length 17 | l = 56 # num hidden layers 18 | h = 1920 # hidden size 19 | f = 4800 # intermediate size 20 | V = 99000 # vocab size 21 | 22 | E = 8 * B * s * l * h ** 2 + 6 * B * s * l * h * f + 4 * B * s ** 2 * l * h 23 | F = 3 * E + 4 * B * s ** 2 * l * h + 6 * B * s * h * V 24 | 25 | print("F =", F) 26 | 27 | MFU = F * T / B / s / C 28 | print("MFU =", MFU) 29 | -------------------------------------------------------------------------------- /pretrain/setup.sh: -------------------------------------------------------------------------------- 1 | # setup env on each node in the slurm 
--------------------------------------------------------------------------------
/pretrain/scripts/estimate_mfu.py:
--------------------------------------------------------------------------------
 1 | # Estimate the Model FLOPs Utilization (MFU) of the YuLan-Mini stable training stage
 2 | 
 3 | D = 25 * 40 * 10 ** 9  # tokens processed in the stable stage: 25 phases x 40B tokens
 4 | 
 5 | N1 = 56  # GPUs (7 nodes x 8)
 6 | t1 = 10 * 28 * 60 * 60  # 10 stages, 28 hours/stage
 7 | 
 8 | N2 = 48  # shrink the cluster size (6 nodes x 8)
 9 | t2 = 15 * 32 * 60 * 60  # 15 stages, 32 hours/stage
10 | 
11 | T = D / (N1 * t1 + N2 * t2)  # token throughput per GPU-second
12 | print("T =", T)
13 | 
14 | C = 312 * 10 ** 12  # peak BF16 FLOPS of an A800 GPU
15 | B = 1008  # global batch size in sequences; = 56 * 18 = 48 * 21
16 | s = 4096  # seq length
17 | l = 56  # num hidden layers
18 | h = 1920  # hidden size
19 | f = 4800  # intermediate size
20 | V = 99000  # vocab size
21 | 
22 | E = 8 * B * s * l * h ** 2 + 6 * B * s * l * h * f + 4 * B * s ** 2 * l * h
23 | F = 3 * E + 4 * B * s ** 2 * l * h + 6 * B * s * h * V
24 | 
25 | print("F =", F)
26 | 
27 | MFU = F * T / B / s / C
28 | print("MFU =", MFU)
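Written out, the script follows the standard dense-transformer FLOPs accounting. This is a restatement of the formulas above (the annotations, e.g. reading the factor of 3 as forward plus backward and the extra attention term as recomputation, are an interpretation of the code rather than something the script states):

```latex
\begin{aligned}
E &= 8 B s l h^{2} + 6 B s l h f + 4 B s^{2} l h
    && \text{(per-batch forward FLOPs)} \\
F &= 3E + 4 B s^{2} l h + 6 B s h V
    && \text{(fwd+bwd, extra attention term, LM head)} \\
T &= \frac{D}{N_{1} t_{1} + N_{2} t_{2}}, \qquad
\mathrm{MFU} = \frac{F \, T}{B \, s \, C}
\end{aligned}
```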
--------------------------------------------------------------------------------
/pretrain/setup.sh:
--------------------------------------------------------------------------------
 1 | # setup env on each node in the slurm job
 2 | 
 3 | LOG_PREFIX=log/"$SLURM_JOB_NAME-$SLURM_JOB_ID"
 4 | LOG_DIR=/home/u20140041/pretrain-mini/${LOG_PREFIX}
 5 | LOG_FILE=/home/u20140041/pretrain-mini/${LOG_PREFIX}/part0.log
 6 | echo $(date +%Y-%m-%d-%H:%M:%S) > $LOG_FILE
 7 | echo Setup hostname: $(hostname) >> $LOG_FILE
 8 | echo "========================" >> $LOG_FILE
 9 | FILES_TO_LOG=($0 train.py train_utils.py model/modeling_miniyulan.py model/configuration_miniyulan.py torchrun_wrapper.sh)
10 | mkdir -p $LOG_DIR/artifacts
11 | for file in ${FILES_TO_LOG[@]}; do
12 |     echo $file >> $LOG_FILE
13 |     cat $file >> $LOG_FILE
14 |     cat $file >> $LOG_DIR/artifacts/$(echo $file | tr '/' '-')
15 |     echo "========================" >> $LOG_FILE
16 | done
17 | 
18 | set -x
19 | 
20 | source ~/.bashrc
21 | source .venv/bin/activate # activate the project venv
22 | 
23 | # Positional arguments
24 | FETCH_TIME=$1 # no default; must be set in submit_to_slurm.sh
25 | PER_DEVICE_TRAIN_BATCH_SIZE=$2 # defaults to 18 (for 7 nodes)
26 | DATASET_MODEL_NAME=$3 # defaults to myl
27 | 
28 | # Derived environment variables
29 | NNODES=$SLURM_JOB_NUM_NODES
30 | export WORLD_SIZE=$(expr $NNODES \* 8)
31 | hostnames=$(scontrol show hostnames $SLURM_JOB_NODELIST)
32 | comma_hostnames=$(echo $hostnames | tr ' ' ',')
33 | export MASTER_ADDR=$(echo $hostnames | cut -d ' ' -f 1) # the MASTER node is RANK 0
34 | MASTER_ADDR=$(getent ahosts $MASTER_ADDR | awk '{ print $1 }' | tail -n 1)
35 | JOB_NAME=$SLURM_JOB_NAME
36 | JOB_ID=$SLURM_JOB_ID
37 | export MASTER_PORT=$(expr 11450 + $(expr $RANDOM % 10000)) # pick a random port
38 | 
39 | trap 'cleanup' SIGTERM # handle scancel gracefully
40 | 
41 | # cleanup: on SIGTERM, clean up all remote processes started via pdsh
42 | cleanup() {
43 |     echo "Received SIGTERM at $(date +%Y-%m-%d-%H:%M:%S), cleaning up remote processes..."
44 |     pdsh -w $comma_hostnames "kill \$(ps aux | grep '$SLURM_JOB_NAME-$SLURM_JOB_ID' | grep -v grep | awk '{print \$2}')"
45 |     kill $(ps aux | grep '$SLURM_JOB_NAME-$SLURM_JOB_ID' | grep -v grep | awk '{print $2}')
46 |     kill $(ps aux | grep '$SLURM_JOB_NAME $SLURM_JOB_ID' | grep -v grep | awk '{print $2}')
47 |     curl -H "Content-Type: application/json" -X POST https://wxpusher.zjiecode.com/api/send/message --data '{"appToken": "xxx", "content": "canceled job '$SLURM_JOB_NAME-$SLURM_JOB_ID'", "topicIds": [32270]}'
48 |     exit 15
49 | }
50 | 
51 | ############################### Nothing above needs to be changed ###############################
--------------------------------------------------------------------------------
/pretrain/synthesis/README.md:
--------------------------------------------------------------------------------
 1 | # Data Synthesis
 2 | 
 3 | This directory contains the scripts and prompts for data synthesis.
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | ## Preliminaries
10 | 
11 | ### SGLang
12 | 
13 | We primarily use the [`sglang`](https://docs.sglang.ai/start/install.html) package to generate synthetic data.
14 | 
15 | Then, choose the model you want to use for data synthesis. For example, we use `DeepSeek-Prover-V1.5` and `Qwen2.5-Math-Instruct-7B` to augment the Lean theorem proving dataset.
16 | 
17 | ```bash
18 | CUDA_VISIBLE_DEVICES=0,1 python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-Prover-V1.5-RL --port 30000 --trust-remote-code --dp 2
19 | ```
20 | 
21 | If you run the model on a large cluster, you can install the [`sglang_router`](https://docs.sglang.ai/router/router.html) package to improve data-parallel scheduling efficiency.
22 | 
23 | ```bash
24 | pip install sglang-router
25 | ```
26 | 
27 | ### vLLM
28 | 
29 | We also use the [`vLLM`](https://docs.vllm.ai/) package to generate synthetic data (on Ascend 910B NPUs).
30 | 
31 | ```bash
32 | python gen_vllm.py --input_file_path input.jsonl --output_path output.jsonl
33 | ```
34 | 
35 | ## Prompts
36 | 
37 | We have published the prompts we used for data synthesis in our technical report. We will organize and release the synthesis pipeline soon.
--------------------------------------------------------------------------------
/pretrain/synthesis/gen_lean_reasoning.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | import time
  5 | from random import random, sample
  6 | from typing import Tuple
  7 | 
  8 | import datasets
  9 | import pandas as pd
 10 | import sglang as sgl
 11 | from sglang import (RuntimeEndpoint, assistant, function, gen,
 12 |                     set_default_backend, system, user)
 13 | from tqdm import tqdm
 14 | 
 15 | # Set the default runtime endpoint
 16 | set_default_backend(RuntimeEndpoint("http://localhost:30000"))
 17 | 
 18 | 
 19 | # Deepseek-Prover-V1
 20 | @function
 21 | def analyze_deepseek(s, natural_language_statement, formal_statement, state_before, state_after, tactic, explanation="", **kwargs) -> str:
 22 |     if os.path.exists("/home/huyiwen/monorepo/projects/stop_signal"):
 23 |         return None
 24 | 
 25 |     input_template = """I am a mathematician unfamiliar with Lean. Please explain the tactics used in a proof, as if you are in the process of trying to prove a theorem and haven't yet completed it. Explain the reasoning and logic behind choosing those specific tactics.
 26 | 
 27 | **Statement:**
 28 | {natural_language_statement}
 29 | ```lean4
 30 | {formal_statement}
 31 | ```
 32 | 
 33 | **Current state:**
 34 | ```lean4
 35 | {state_before}
 36 | ```
 37 | 
 38 | **Proof:**
 39 | ```lean4
 40 | {tactic}
 41 | ```
 42 | """
 43 | 
 44 |     assistant_prefix = """**Reasoning:**
 45 | {explanation}"""
 46 | 
 47 |     s += user(input_template.format(
 48 |         natural_language_statement=natural_language_statement,
 49 |         formal_statement=formal_statement,
 50 |         state_before=state_before,
 51 |         tactic=tactic,
 52 |     ))
 53 | 
 54 |     s += assistant(assistant_prefix.format(explanation="") + gen("explanation", max_tokens=600))
 55 |     return None
 56 | 
 57 | 
 58 | # Lean-Github
 59 | @function
 60 | def analyze_github(s, state_before, tactic, state_after, **kwargs) -> str:
 61 |     if os.path.exists("/home/huyiwen/monorepo/projects/stop_signal"):
 62 |         return None
 63 | 
 64 |     input_template = """I am a mathematician unfamiliar with Lean. Please explain the tactics used in a proof, as if you are in the process of trying to prove a theorem and haven't yet completed it. Explain the reasoning and logic behind choosing those specific tactics.
 65 | 
 66 | **Current state:**
 67 | ```lean4
 68 | {state_before}
 69 | ```
 70 | 
 71 | **Proof:**
 72 | ```lean4
 73 | {tactic}
 74 | ```
 75 | """
 76 | 
 77 |     assistant_prefix = """**State after:**
 78 | {state_after}
 79 | 
 80 | **Reasoning:**
 81 | """
 82 | 
 83 |     s += user(input_template.format(
 84 |         state_before=state_before,
 85 |         tactic=tactic,
 86 |     ))
 87 | 
 88 |     s += assistant(assistant_prefix.format(state_after=state_after) + gen("explanation", max_tokens=600))
 89 |     return None
 90 | 
 91 | 
 92 | # Lean-Workbook
 93 | # State Before + Tactic -> State After
 94 | @function
 95 | def analyze_workbook_a(s, natural_language_statement, formal_statement, state_before, state_after, tactic, explanation="", **kwargs) -> str:
 96 |     if os.path.exists("/home/huyiwen/monorepo/projects/stop_signal"):
 97 |         return None
 98 | 
 99 |     input_template = """Given a Lean tactic at an intermediate step in a proof and the goal state before the tactic, predict the resulting goal state after the tactic's application and provide a detailed explanation. You do not need to consider whether the tactic is sufficient to complete the proof; simply explain why the goal state changes to your predicted state after the tactic's execution.
100 | 
101 | **Statement:**
102 | {natural_language_statement}
103 | ```lean4
104 | {formal_statement}
105 | ```
106 | 
107 | **Goal state before:**
108 | ```lean4
109 | {state_before}
110 | ```
111 | 
112 | **Tactic to execute:**
113 | ```lean4
114 | {tactic}
115 | ```
116 | """
117 | 
118 |     assistant_prefix = """**State after:**
119 | {state_after}
120 | 
121 | **Explanation:**
122 | """
123 | 
124 |     s += user(input_template.format(
125 |         natural_language_statement=natural_language_statement,
126 |         formal_statement=formal_statement,
127 |         state_before=state_before,
128 |         tactic=tactic,
129 |     ))
130 | 
131 |     s += assistant(assistant_prefix.format(state_after=state_after) + gen("explanation", max_tokens=600))
132 |     return None
133 | 
134 | 
135 | # State After + Tactic -> State Before
136 | @function
137 | def analyze_workbook_b(s, natural_language_statement, formal_statement, state_before, state_after, tactic, explanation="", **kwargs) -> str:
138 |     if os.path.exists("/home/huyiwen/monorepo/projects/stop_signal"):
139 |         return None
140 | 
141 |     input_template = """Given a tactic applied at an intermediate step of a Lean proof and the resulting goal state **after** applying the tactic, predict one possible goal state **before** the tactic was applied, and provide a detailed explanation. You don't need to consider whether the tactic is sufficient to complete the proof; simply explain why the **pre-tactic goal state** would have resulted in the given post-tactic state.
142 | 
143 | **Statement:**
144 | {natural_language_statement}
145 | ```lean4
146 | {formal_statement}
147 | ```
148 | 
149 | **Tactic applied:**
150 | ```lean4
151 | {tactic}
152 | ```
153 | 
154 | **Resulting state after:**
155 | ```lean4
156 | {state_after}
157 | ```
158 | """
159 | 
160 |     assistant_prefix = """**Goal state before:**
161 | {state_before}
162 | 
163 | **Explanation:**
164 | """
165 | 
166 |     s += user(input_template.format(
167 |         natural_language_statement=natural_language_statement,
168 |         formal_statement=formal_statement,
169 |         state_after=state_after,
170 |         tactic=tactic,
171 |     ))
172 | 
173 |     s += assistant(assistant_prefix.format(state_before=state_before) + gen("explanation", max_tokens=600))
174 |     return None
175 | 
176 | 
177 | 
178 | @function
179 | def analyze_workbook(s, natural_language_statement, formal_statement, state_before, tactic, state_after, **kwargs) -> str:
180 |     if os.path.exists("/home/huyiwen/monorepo/projects/stop_signal"):
181 |         return None
182 | 
183 |     input_template = """Give the next tactic in the proof with explanatory comments.
184 | 
185 | Statement: {natural_language_statement}
186 | 
187 | ```lean4
188 | {formal_statement}
189 | ```
190 | 
191 | **Current state:**
192 | 
193 | {state_before}
194 | """
195 | 
196 |     assistant_prefix = """**Next tactic:**
197 | {tactic}
198 | /-State:
199 | {state_after}-/
200 | 
201 | **Explanatory comments:**
202 | """
203 | 
204 |     s += user(input_template.format(
205 |         natural_language_statement=natural_language_statement,
206 |         formal_statement=formal_statement,
207 |         state_before=state_before,
208 |         tactic=tactic,
209 |         state_after=state_after,
210 |     ))
211 | 
212 |     s += assistant(assistant_prefix.format(tactic=tactic, state_after=state_after) + gen("explanation", max_tokens=400))
213 |     return None
214 | 
215 | 
216 | def analyze(lines, analyze_fn, name):
217 | 
218 |     # Process multiple texts in a batch
219 |     states = analyze_fn.run_batch(
220 |         lines,
221 |         progress_bar=True,
222 |         num_threads=256,
223 |         temperature=0.3,
224 |         top_p=0.4,
225 |     )
226 | 
227 |     answers = []
228 |     for line, state in zip(lines, states):
229 |         # extract the explanation from the state
230 |         try:
231 |             line["explanation"] = state["explanation"]
232 |         except Exception:
233 |             line["explanation"] = ""
234 |         # extract the stop reason from the state
235 |         try:
236 |             line["stop_reason"] = state.get_meta_info("explanation").get("finish_reason", {}).get("type", "")
237 |         except Exception:
238 |             line["stop_reason"] = ""
239 |         answers.append(line)
240 | 
241 |     print(f"/home/huyiwen/monorepo/projects/miniyulan/gen_lean/lean_explain_{name}.jsonl")
242 |     with open(f"/home/huyiwen/monorepo/projects/miniyulan/gen_lean/lean_explain_{name}.jsonl", "w") as f:
243 |         for line in answers:
244 |             f.write(json.dumps(line) + "\n")
245 | 
246 | 
247 | def get_data(repo="workbook"):
248 |     if repo == "workbook":  # Not used
249 |         lines = datasets.load_dataset("/home/huyiwen/lean-tactics/Lean-Workbook", split="train").to_list()
250 |         return lines
251 |     elif repo == "github":  # Not used
252 |         lean_github = pd.read_parquet('/home/huyiwen/lean-tactics/Lean-Github/lean-github.parquet')
253 | 
254 |         # dedup
255 |         lean_github = lean_github.drop_duplicates(subset=['url', 'commit', 'file_path', 'start', 'end', 'tactic', 'state_before', 'state_after'])
256 | 
257 |         # convert the string representation back to a real tuple
258 |         lean_github['start'] = lean_github['start'].apply(lambda x: tuple(map(int, x[1:-1].split(','))))
259 |         lean_github['end'] = lean_github['end'].apply(lambda x: tuple(map(int, x[1:-1].split(','))))
260 |         return lean_github.to_dict(orient='records')
261 |     elif repo == "deepseek":
262 |         lines = datasets.load_dataset("/home/huyiwen/lean-tactics/DeepSeek-Prover-V1", split="train").to_list()
263 |         return lines
264 |     elif repo == "workbook-c":
265 |         with open("/home/huyiwen/lean-tactics/Lean-Workbook/c.jsonl") as f:
266 |             lines = [json.loads(line) for line in f]
267 |         return lines
268 |     elif repo == "workbook-a":
269 |         with open("/home/huyiwen/lean-tactics/Lean-Workbook/a.jsonl") as f:
270 |             lines = [json.loads(line) for line in f]
271 |         return lines
272 | 
273 | 
274 | lines = get_data("github")
275 | analyze(lines, analyze_github, "github-" + time.strftime("%Y%m%d-%H%M%S"))
--------------------------------------------------------------------------------
/pretrain/synthesis/gen_qwq.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import re
 4 | import sys
 5 | import time
 6 | from copy import copy
 7 | from random import random, sample
 8 | from typing import Tuple
 9 | 
10 | import sglang as sgl
11 | from sglang import (RuntimeEndpoint, assistant, function, gen,
12 |                     set_default_backend, system, user)
13 | from tqdm import tqdm
14 | 
15 | set_default_backend(RuntimeEndpoint("http://localhost:30000"))
16 | 
17 | 
18 | @function
19 | def analyze_text(s, problem: str, **kwargs) -> str:
20 | 
21 |     if os.path.exists("/home/huyiwen/miniyulan-ckpts/qwq_gen/stop_signal"):
22 |         return "Stop signal detected."
23 | 
24 |     sys_prompt = "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."
25 | 
26 |     prompt = """Please think step by step to solve the following question, and put your final answer within \\boxed{{}}.
27 | 
28 | {question}"""
29 | 
30 |     s += system(sys_prompt)
31 |     s += user(prompt.format(question=problem))
32 |     s += assistant(gen("qwq_gen", max_tokens=16000, stop=['Human:']))
33 | 
34 | 
35 | def analyze(origin_jsonl_path):
36 | 
37 |     lines = []
38 |     with open(origin_jsonl_path, 'r') as file:
39 |         for line in file:
40 |             lines.append(json.loads(line))
41 |     # lines = lines[16:]
42 |     print(len(lines))
43 | 
44 |     # Process multiple texts in a batch
45 |     states = analyze_text.run_batch(
46 |         lines,
47 |         progress_bar=True,
48 |         num_threads=16,
49 |         temperature=0,
50 |     )
51 | 
52 |     llama_classify_file = origin_jsonl_path.replace(".jsonl", f"-qwq_generated-{time.strftime('%Y%m%d%H%M%S')}.jsonl")
53 |     with open(llama_classify_file, "a") as f:
54 |         for line, state in zip(lines, states):
55 |             obj = copy(line)
56 | 
57 |             try:
58 |                 obj["qwq_gen"] = state["qwq_gen"]
59 |             except Exception as e:
60 |                 # print(e)
61 |                 obj["qwq_gen"] = ""
62 | 
63 |             try:
64 |                 obj["qwq_gen_answer"] = state["qwq_gen_answer"]
65 |             except Exception as e:
66 |                 # print(e)
67 |                 obj["qwq_gen_answer"] = ""
68 | 
69 |             try:
70 |                 obj["stop_reason"] = state.get_meta_info("qwq_gen").get("finish_reason", {}).get("type", "")
71 |             except Exception as e:
72 |                 obj["stop_reason"] = str(e)
73 | 
74 |             f.write(json.dumps(obj) + "\n")
75 | 
76 |     return True
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     analyze(sys.argv[1])
--------------------------------------------------------------------------------
/pretrain/synthesis/gen_vllm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import json
 4 | from vllm import LLM, SamplingParams
 5 | from datasets import Dataset
 6 | from transformers import AutoTokenizer
 7 | 
 8 | 
 9 | def parse_args():
10 |     parse = argparse.ArgumentParser(description="gen")
parse.add_argument("--input_file_path", type=str, default="", help="input_path") 12 | parse.add_argument("--output_path", type=str, default="", help="output_path") 13 | parse.add_argument("--start_index", type=int, default=None) 14 | parse.add_argument("--end_index", type=int, default=None) 15 | return parse.parse_args() 16 | 17 | def main(): 18 | 19 | args = parse_args() 20 | 21 | # Load JSONL file 22 | input_file_path = args.input_file_path 23 | output_path = args.output_path 24 | start_index = args.start_index 25 | end_index = args.end_index 26 | 27 | data = [] 28 | with open(input_file_path, "r", encoding="utf-8") as file: 29 | for line in file: 30 | data.append(json.loads(line)) 31 | 32 | # faciliate data parallelism 33 | if start_index is not None and end_index is not None: 34 | data = data[start_index:end_index] 35 | elif end_index is not None: 36 | data = data[:end_index] 37 | elif start_index is not None: 38 | data = data[start_index:] 39 | 40 | template = ( 41 | "## Instruction\nPlease gain inspiration from the following content to create a high-quality problem and solution. Present your output in two distinct sections: [Problem] and [Solution].\n\n" 42 | "## Content\n{text}\n" 43 | "## Guidelines \n[Problem]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem.\n\n[Solution]: Present a comprehensive, step-by-step solution that solves the problem **correctly** and educates the student, around 250-350 words long. Clearly articulate the reasoning and methods used at each step, providing insight into the problem-solving process. Take care to format any equations properly using LaTeX or appropriate notation." 44 | ) 45 | 46 | prompts = [] 47 | for item in data: 48 | prompts.append(template.format(text=item["text"]) + " Please generate only one Problem and only one Solution, and when you finish generating the solution, end with the signal ''.") 49 | 50 | stop_tokens = [""] 51 | sampling_params = SamplingParams(temperature=0.7, top_p=1.0, max_tokens=2048, stop=stop_tokens) 52 | 53 | llm = LLM(model="/data/Qwen2.5-7B-Instruct", tensor_parallel_size=1, gpu_memory_utilization=0.95, trust_remote_code=True) 54 | outputs = llm.generate(prompts, sampling_params) 55 | 56 | generated_texts = [] 57 | for i, output in enumerate(outputs): 58 | prompt = output.prompt 59 | generated_text = output.outputs[0].text 60 | generated_texts.append({"prompt":prompt,"output":generated_text}) 61 | 62 | 63 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 64 | with open(output_path, "w", encoding="utf-8") as json_file: 65 | json.dump(generated_texts, json_file, ensure_ascii=False, indent=4) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() -------------------------------------------------------------------------------- /pretrain/torchrun_wrapper.sh: -------------------------------------------------------------------------------- 1 | # 将本脚本所有输出重定向到文件log/$SLURM_JOB_NAME-$SLURM_JOB_ID/part$SLURM_PROCID.log: 2 | cd xxx 3 | comma_hostnames=$1 4 | shift 5 | PROCID=$(expr $(echo $comma_hostnames | tr "," "\n" | grep -n `hostname` | cut -c-1) - 1) # 仅适用9个节点以内 6 | SLURM_JOB_NAME=$1 7 | shift 8 | SLURM_JOB_ID=$1 9 | shift 10 | if [ -z "$PROCID" ]; then 11 | echo "torchrun_wrapper.sh: PROCID is empty, exit" 12 | exit 1 13 | fi 14 | if [ -z "$SLURM_JOB_NAME" ]; then 15 | echo "torchrun_wrapper.sh: SLURM_JOB_NAME is empty, exit" 16 | exit 1 17 | fi 18 | if [ -z "$SLURM_JOB_ID" ]; then 19 | echo "torchrun_wrapper.sh: SLURM_JOB_ID is 
empty, exit" 20 | exit 1 21 | fi 22 | echo "$(date +%Y-%m-%d %H:%M:%S) torchrun_wrapper.sh: SLURM_JOB_NAME=$SLURM_JOB_NAME, SLURM_JOB_ID=$SLURM_JOB_ID, PROCID=$PROCID; hostname=`hostname`" >> log/$SLURM_JOB_NAME-$SLURM_JOB_ID/part$PROCID.log 23 | exec &>> log/$SLURM_JOB_NAME-$SLURM_JOB_ID/part$PROCID.log 24 | 25 | export NCCL_NSOCKS_PERTHREAD=4 26 | export NCCL_SOCKET_NTHREADS=2 27 | export NCCL_MIN_CHANNELS=32 28 | 29 | source ~/.bashrc 30 | 31 | module load /opt/app/spack/share/spack/modules/gcc/11.3.0 32 | module load /opt/app/spack/share/spack/modules/cuda/12.5.1 33 | module load /opt/app/spack/share/spack/modules/libaio/0.3.113-gcc_13.1.0 34 | 35 | source .venv/bin/activate # venv 36 | 37 | # export NCCL_SOCKET_IFNAME=vpapvn # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html 38 | # export NCCL_IB_DISABLE=1 # https://github.com/NVIDIA/nccl/issues/451 39 | export LDFLAGS="-L/usr/lib64" 40 | export CFLAGS="-I/usr/include" 41 | export PYTHONPATH=. 42 | export CUTLASS_PATH=~/cutlass 43 | export LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 # https://stackoverflow.com/questions/74367207/segmentation-fault-core-dumped-when-launching-python-in-anaconda 44 | export OPENBLAS_NUM_THREADS=24 # https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable 45 | export OMP_NUM_THREADS=24 # https://stackoverflow.com/questions/53351194/openmp-libgomp-thread-creation-failed-resource-temporarily-unavailable-when 46 | 47 | export WANDB_MODE=disabled 48 | export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 49 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 50 | 51 | # DEBUG 52 | export TRANSFORMERS_VERBOSITY=debug 53 | export NCCL_DEBUG=DEBUG # https://stackoverflow.com/questions/61075390/pytorch-nccl-error-unhandled-system-error-nccl-version-2-4-8 54 | export NCCL_DEBUG_SUBSYS=GRAPH # https://pytorch.org/docs/stable/distributed.html 55 | # export TORCH_LOGS=+all 56 | # export TORCH_DISTRIBUTED_DEBUG=DETAIL 57 | # export TORCH_CPP_LOG_LEVEL=INFO 58 | 59 | 60 | CACHE_PATH='/fs/archive/share/yulan/data/aa_hf_cache' 61 | export TMPDIR=${CACHE_PATH}/tmp 62 | export HF_DATASETS_CACHE=${CACHE_PATH}/hf_datasets_cache 63 | export HF_HOME=${CACHE_PATH}/hf_home 64 | mkdir -p ${CACHE_PATH} 65 | mkdir -p ${TMPDIR} 66 | mkdir -p ${HF_DATASETS_CACHE} 67 | mkdir -p ${HF_HOME} 68 | 69 | # 打印所有环境变量 70 | env 71 | 72 | # 输出 73 | echo "torchrun_wrapper.sh: SLURM_JOB_NAME=$SLURM_JOB_NAME, SLURM_JOB_ID=$SLURM_JOB_ID, PROCID=$PROCID; hostname=`hostname`" 74 | echo -e "torchrun_wrapper.sh: torchrun --node_rank $PROCID $@\n" 75 | 76 | # 设置 -e 选项,这会使脚本在任何命令失败时立即退出 77 | set -e 78 | 79 | # 设置 -o pipefail,这确保管道中的任何命令失败都会导致整个管道失败 80 | set -o pipefail 81 | 82 | torchrun --node_rank $PROCID $@ 83 | 84 | if [ $PROCID -eq 0 ]; then 85 | curl -H "Content-Type: application/json" -X POST https://wxpusher.zjiecode.com/api/send/message --data '{"appToken": "xxx", "content": "'$SLURM_JOB_NAME-$SLURM_JOB_ID' done ", "topicIds": [32270]}' 86 | fi 87 | -------------------------------------------------------------------------------- /pretrain/train.sh: -------------------------------------------------------------------------------- 1 | source ~/.bashrc 2 | 3 | # 将作业提交给SLURM 4 | 5 | # 参数:--time=30:00:00 最大运行时间24小时 6 | # 参数:--job-name=xxx 作业名称 7 | # 参数:--nodes=1 使用1个节点(注意调节batch size!!!) 
8 | 9 | function decay_train() { 10 | # 保存数据集并启动训练 11 | SCRIPT=$1 12 | FETCH_TIME=$2 13 | if [[ ${#FETCH_TIME} -ne 18 ]]; then 14 | echo "FETCH_TIME格式错误:$FETCH_TIME" 15 | exit 1 16 | fi 17 | RUN_REASON=$3 18 | if [[ ${#RUN_REASON} -lt 10 ]]; then 19 | echo "RUN_REASON 至少大于10个字:$RUN_REASON" 20 | exit 1 21 | fi 22 | PER_DEVICE_TRAIN_BATCH_SIZE=${4:-18} 23 | NNODES=${5:-7} 24 | MODEL_NAME=${6:-"myl_new_no_math"} 25 | JOB_NAME=$(basename $SCRIPT .sh)-$FETCH_TIME 26 | if [ -z /fs/archive/share/yulan/data/aa_mini/output/${JOB_NAME} ]; then 27 | echo "已有checkpoint!请注意是否会覆盖:/fs/archive/share/yulan/data/aa_mini/output/${JOB_NAME}" 28 | exit 1 29 | fi 30 | echo "JOB_NAME: $JOB_NAME" 31 | echo "请检查总BATCH_SIZE: $PER_DEVICE_TRAIN_BATCH_SIZE * $NNODES * 8 = $((PER_DEVICE_TRAIN_BATCH_SIZE * NNODES * 8))" 32 | echo "等价于BATCH_SIZE:$((PER_DEVICE_TRAIN_BATCH_SIZE * NNODES * 8 * 4096)) Tokens" 33 | if [ -d /fs/archive/share/yulan/data/aa_mini/hf_dataset/$MODEL_NAME/$FETCH_TIME ]; then 34 | echo "数据集已存在 /fs/archive/share/yulan/data/aa_mini/hf_dataset/$MODEL_NAME/$FETCH_TIME" 35 | else 36 | python preprocess/fetch_data/distributed_save.py $FETCH_TIME $MODEL_NAME 37 | fi 38 | 39 | JOB_ID=$(sbatch --time=36:00:00 --job-name=$JOB_NAME --nodes=$NNODES $SCRIPT $FETCH_TIME $PER_DEVICE_TRAIN_BATCH_SIZE $MODEL_NAME | grep -o -P '\d+') 40 | echo "JOB_ID: $JOB_ID" 41 | if [ -z $JOB_ID ]; then 42 | echo "启动失败" 43 | exit 1 44 | fi 45 | mkdir -p "log/$JOB_NAME-$JOB_ID" 46 | touch "log/$JOB_NAME-$JOB_ID/reason-$RUN_REASON" 47 | 48 | sleep 5 49 | nohup new_start_monitor $JOB_NAME $JOB_ID > "log/$JOB_NAME-$JOB_ID/monitor.log" 2>&1 & 50 | LOF_FILE="log/$JOB_NAME-$JOB_ID/part0.log" 51 | squeue -o "%.6i %.35j %t %8M %.R" 52 | exit 0 53 | } 54 | 55 | 56 | function main_train() { 57 | # 保存数据集并启动训练 58 | SCRIPT=$1 59 | FETCH_TIME=$2 60 | if [[ ${#FETCH_TIME} -ne 18 ]]; then 61 | echo "FETCH_TIME格式错误:$FETCH_TIME" 62 | exit 1 63 | fi 64 | RUN_REASON=$3 65 | if [[ ${#RUN_REASON} -lt 10 ]]; then 66 | echo "RUN_REASON 至少大于10个字:$RUN_REASON" 67 | exit 1 68 | fi 69 | PER_DEVICE_TRAIN_BATCH_SIZE=${4:-18} 70 | NNODES=${5:-7} 71 | MODEL_NAME=${6:-"myl_new_no_math"} 72 | JOB_NAME=$(basename $SCRIPT .sh) 73 | if [ -z /fs/archive/share/yulan/data/aa_mini/output/${JOB_NAME} ]; then 74 | echo "已有checkpoint!请注意是否会覆盖:/fs/archive/share/yulan/data/aa_mini/output/${JOB_NAME}" 75 | exit 1 76 | fi 77 | echo "JOB_NAME: $JOB_NAME" 78 | echo "请检查总BATCH_SIZE: $PER_DEVICE_TRAIN_BATCH_SIZE * $NNODES * 8 = $((PER_DEVICE_TRAIN_BATCH_SIZE * NNODES * 8))" 79 | echo "等价于BATCH_SIZE:$((PER_DEVICE_TRAIN_BATCH_SIZE * NNODES * 8 * 4096)) Tokens" 80 | if [ -d /fs/archive/share/yulan/data/aa_mini/hf_dataset/$MODEL_NAME/$FETCH_TIME ]; then 81 | echo "数据集已存在 /fs/archive/share/yulan/data/aa_mini/hf_dataset/$MODEL_NAME/$FETCH_TIME" 82 | else 83 | python preprocess/fetch_data/distributed_save.py $FETCH_TIME $MODEL_NAME 84 | fi 85 | 86 | JOB_ID=$(sbatch --time=36:00:00 --job-name=$JOB_NAME --nodes=$NNODES $SCRIPT $FETCH_TIME $PER_DEVICE_TRAIN_BATCH_SIZE $MODEL_NAME | grep -o -P '\d+') 87 | echo "JOB_ID: $JOB_ID" 88 | if [ -z $JOB_ID ]; then 89 | echo "启动失败" 90 | exit 1 91 | fi 92 | mkdir -p "log/$JOB_NAME-$JOB_ID" 93 | touch "log/$JOB_NAME-$JOB_ID/reason-$RUN_REASON" 94 | 95 | sleep 5 96 | nohup new_start_monitor $JOB_NAME $JOB_ID > "log/$JOB_NAME-$JOB_ID/monitor.log" 2>&1 & 97 | LOF_FILE="log/$JOB_NAME-$JOB_ID/part0.log" 98 | squeue -o "%.6i %.35j %t %8M %.R" 99 | exit 0 100 | } 101 | 102 | 103 | # Note: Due to subsequent modifications to the training code, this launch script 
may require re-adaptation. 104 | 105 | main_train yulanmini-2B-final-phase1.sh 20241017_013512 "2B-model-phase1,lm_head_alpha=1+deepspeed1+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 106 | 107 | main_train yulanmini-2B-final-phase2.sh 02_20241017_013401 "2B-model-phase2,lm_head_alpha=1+deepspeed1+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 108 | 109 | main_train yulanmini-2B-final-phase3.sh 03_20241020_001556 "2B-model-phase3,lm_head_alpha=1+deepspeed1+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 110 | 111 | main_train yulanmini-2B-final-phase4.sh 04_20241021_170901 "2B-model-phase4,lm_head_alpha=1+deepspeed1+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 112 | 113 | main_train yulanmini-2B-final-phase5.sh 05_20241022_221453 "2B-model-phase5,lm_head_alpha=1+deepspeed1+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 114 | 115 | main_train yulanmini-2B-final-phase6.sh 06_20241024_013137 "2B-model-phase6,lm_head_alpha=1+deepspeed1+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 116 | 117 | main_train yulanmini-2B-final-phase7-dp2.sh 07_20241025_022032 "2B-model-phase7,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 118 | 119 | main_train yulanmini-2B-final-phase8.sh 08_20241026_151354 "2B-model-phase8,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 120 | 121 | main_train yulanmini-2B-final-phase9.sh 09_20241027_190948 "2B-model-phase9,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 122 | 123 | main_train yulanmini-2B-final-phase10.sh 10_20241028_225112 "2B-model-phase10,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 18 7 myl_new_no_math 124 | 125 | main_train yulanmini-2B-final-phase11.sh 11_20241030_124814 "2B-model-phase11,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 126 | 127 | main_train yulanmini-2B-final-phase12.sh 12_20241101_002827 "2B-model-phase12,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 128 | 129 | main_train yulanmini-2B-final-phase13.sh 13_20241102_160534 "2B-model-phase13,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 130 | 131 | main_train yulanmini-2B-final-phase14.sh 14_20241104_000454 "2B-model-phase14,lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 132 | 133 | main_train yulanmini-2B-final-phase15.sh 15_20241105_023029 "2B-model-phase15, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 134 | 135 | main_train yulanmini-2B-final-phase16.sh 16_20241106_180613 "2B-model-phase16, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 136 | 137 | main_train yulanmini-2B-final-phase17.sh 17_20241108_004951 "2B-model-phase17, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_new_no_math 138 | 139 | main_train yulanmini-2B-final-phase18-hyw.sh 18_20241113_034017 "2B-model-phase18-remake, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 140 | 141 | main_train yulanmini-2B-final-phase19-hyw.sh 19_20241114_115241 "2B-model-phase19-remake, 
lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 142 | 143 | main_train yulanmini-2B-final-phase20-remake.sh 20_20241115_234357 "2B-model-phase20-remake, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 144 | 145 | main_train yulanmini-2B-final-phase21.sh 21_20241117_021115 "2B-model-phase21, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 146 | 147 | main_train yulanmini-2B-final-phase22.sh 22_20241118_155407 "2B-model-phase22, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 148 | 149 | main_train yulanmini-2B-final-phase23.sh 23_20241120_033942 "2B-model-phase23, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 150 | 151 | main_train yulanmini-2B-final-phase24.sh 24_20241121_133110 "2B-model-phase23, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 152 | 153 | main_train yulanmini-2B-final-phase25.sh 25_20241123_030124 "2B-model-phase23, lm_head_alpha=1+deepspeed2+norm_alpha=True+rms_type=llama+emb_alpha=False, " 21 6 myl_mix890 154 | 155 | decay_train yulanmini-2B-s25d-decay80-1sqrt-long-28k-final-phase26.sh 26_20241211_015209 "decay-80B-phase26 " 26 5 myl_mix890_long_28k 156 | 157 | decay_train yulanmini-2B-s25d-decay80-1sqrt-long-28k-final-phase27.sh 27_20241213_051741 "decay-80B-phase27 " 26 5 myl_mix890_long_28k 158 | -------------------------------------------------------------------------------- /pretrain/train_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | from typing import Dict, Union 5 | 6 | import datasets 7 | import torch 8 | import transformers 9 | import wandb 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from torch.utils.data import DataLoader, SequentialSampler 12 | from transformers import Trainer, TrainerCallback 13 | from transformers.trainer_utils import seed_worker 14 | from transformers.utils import is_datasets_available 15 | 16 | LOCAL_RANK = int(os.getenv("LOCAL_RANK", "0")) 17 | RANK = int(os.getenv("RANK", "0")) 18 | WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1")) 19 | 20 | 21 | def print_rank0(*arg): 22 | if RANK == 0: 23 | print(*arg) 24 | 25 | 26 | class LogCallback(TrainerCallback): 27 | 28 | def on_log(self, args, state, control, model, logs=None, **kwargs): 29 | logs["train/global_step"] = state.global_step 30 | logs["train/epoch"] = state.epoch 31 | logs['train/total_flos'] = state.total_flos 32 | wandb.config.update({'global_step': state.global_step}, 33 | allow_val_change=True) 34 | 35 | 36 | class PyTorchProfilerCallback(TrainerCallback): 37 | 38 | def on_train_begin(self, args, state, control, logs=None, **kwargs): 39 | # only one epoch will be trained 40 | self.prof = torch.profiler.profile( 41 | activities=[ 42 | torch.profiler.ProfilerActivity.CPU, 43 | torch.profiler.ProfilerActivity.CUDA 44 | ], 45 | schedule=torch.profiler.schedule(wait=20, warmup=0, active=8), 46 | on_trace_ready=torch.profiler.tensorboard_trace_handler( 47 | args.log_dir), 48 | record_shapes=True, 49 | profile_memory=True, 50 | #with_stack=True, 51 | with_flops=True, 52 | #with_modules=True 53 | ) 54 | 55 | def on_step_begin(self, args, state, control, logs=None, **kwargs): 56 | self.prof.step() 57 | 58 | def on_train_end(self, args, state, control, logs=None, **kwargs): 59 | self.prof.stop() 60 | 61 | 62 | 63 | 
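The two `echo` checks in the launch functions make the effective batch explicit: sequences = per-device batch x nodes x 8 GPUs, and tokens = sequences x 4096. A quick check of the arithmetic for the per-device/node pairs used in the calls above:

```python
# effective batch = per-device batch x nodes x 8 GPUs per node (x 4096 tokens/seq)
for per_device, nnodes in [(18, 7), (21, 6), (26, 5)]:
    seqs = per_device * nnodes * 8
    print(f"{per_device} x {nnodes} x 8 = {seqs} sequences = {seqs * 4096:,} tokens")
# 1008 sequences for the stable phases (matching B in estimate_mfu.py); 1040 for the decay phases
```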
63 | def get_wsd_scheduler(optimizer,
64 |                       num_warmup_steps,
65 |                       num_training_steps,
66 |                       last_epoch=-1,
67 |                       stable_ratio=1.0,
68 |                       start_lambda=0,
69 |                       end_lambda=1,
70 |                       start_global_step=None,
71 |                       end_global_step=None,
72 |                       wsd_style="cos"):
73 |     # Note: Due to subsequent modifications to the training code, this function may require re-adaptation.
74 | 
75 |     if wsd_style == "cos":
76 |         def lr_lambda(current_step):
77 |             if start_global_step is not None and end_global_step is not None and start_global_step <= current_step <= end_global_step:
78 |                 return (1 - math.cos(
79 |                     math.pi * float(current_step - start_global_step) /
80 |                     float(max(1, end_global_step - start_global_step)) / 2)) * (
81 |                         end_lambda - start_lambda) + start_lambda
82 |             if current_step < num_warmup_steps:
83 |                 return (float(current_step) / float(max(1, num_warmup_steps))) * (
84 |                     end_lambda - start_lambda) + start_lambda
85 |             num_stable_steps = stable_ratio * num_training_steps
86 |             if stable_ratio == 1.0 or current_step <= num_stable_steps:
87 |                 return 1.0
88 |             return max(
89 |                 0.1,
90 |                 float(num_training_steps - current_step) /
91 |                 float(max(1, num_training_steps - num_stable_steps)),
92 |             )
93 | 
94 |     elif wsd_style == "linear":
95 |         def lr_lambda(current_step):
96 |             if start_global_step is not None and end_global_step is not None and start_global_step <= current_step <= end_global_step:
97 |                 return (float(current_step - start_global_step) /
98 |                         float(max(1, end_global_step - start_global_step))) * (
99 |                             end_lambda - start_lambda) + start_lambda
100 |             if current_step < num_warmup_steps:
101 |                 return (float(current_step) / float(max(1, num_warmup_steps))) * (
102 |                     end_lambda - start_lambda) + start_lambda
103 |             num_stable_steps = stable_ratio * num_training_steps
104 |             if stable_ratio == 1.0 or current_step <= num_stable_steps:
105 |                 return 1.0
106 |             return max(
107 |                 0.1,
108 |                 float(num_training_steps - current_step) /
109 |                 float(max(1, num_training_steps - num_stable_steps)),
110 |             )
111 |     elif wsd_style == "cos2":
112 | 
113 |         def lr_lambda(current_step):
114 |             if start_global_step is not None and end_global_step is not None and start_global_step <= current_step <= end_global_step:
115 |                 return (1 - math.cos(
116 |                     math.pi * float(current_step - start_global_step) /
117 |                     float(max(1, end_global_step - start_global_step)))) * (
118 |                         end_lambda - start_lambda) / 2 + start_lambda
119 |             if current_step < num_warmup_steps:
120 |                 return (float(current_step) / float(max(1, num_warmup_steps))
121 |                         ) * (end_lambda - start_lambda) + start_lambda
122 |             num_stable_steps = stable_ratio * num_training_steps
123 |             if stable_ratio == 1.0 or current_step <= num_stable_steps:
124 |                 return 1.0
125 |             return max(
126 |                 0.1,
127 |                 float(num_training_steps - current_step) /
128 |                 float(max(1, num_training_steps - num_stable_steps)),
129 |             )
130 |     elif wsd_style == "1sqrt":
131 | 
132 |         def lr_lambda(current_step):
133 |             if current_step > 262000:  # small hack for remaining steps
134 |                 current_step = 262000
135 |             if start_global_step is not None and end_global_step is not None and start_global_step <= current_step <= end_global_step:
136 |                 return (1 - math.sqrt(
137 |                     (current_step - start_global_step) /
138 |                     (end_global_step - start_global_step))) * (
139 |                         start_lambda - end_lambda) + end_lambda
140 |             if current_step < num_warmup_steps:
141 |                 return (float(current_step) / float(max(1, num_warmup_steps))
142 |                         ) * (end_lambda - start_lambda) + start_lambda
143 |             num_stable_steps = stable_ratio * num_training_steps
144 |             if stable_ratio == 1.0 or current_step <= num_stable_steps:
145 |                 return 1.0
146 |             return max(
147 |                 0.1,
148 |                 float(num_training_steps - current_step) /
149 |                 float(max(1, num_training_steps - num_stable_steps)),
150 |             )
151 |     else:
152 |         raise ValueError(f"Unknown wsd_style: {wsd_style}")
153 | 
154 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
155 | 
156 | 
--------------------------------------------------------------------------------
/pretrain/yulanmini-2B-final-phase25.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #SBATCH --comment=joint_project
4 | 
5 | #SBATCH --job-name=xxxx
6 | 
7 | #SBATCH --ntasks-per-node=1
8 | 
9 | #SBATCH --gres=gpu:a800:8
10 | 
11 | #SBATCH --partition=debug
12 | 
13 | #SBATCH --output=log/%x-%j/part0.log
14 | 
15 | #SBATCH --error=log/%x-%j/part0.log
16 | 
17 | ### The lines above are Slurm defaults; do not change them
18 | 
19 | source setup.sh
20 | 
21 | ############################### Nothing above needs to be changed ###############################
22 | 
23 | 
24 | # ========== RESUME: only this block needs editing ==========
25 | last_stage_job_name=miniyulan-2B-final-phase24
26 | STAGE=25
27 | # ========================================
28 | 
29 | CONTINUE=false
30 | if [ "$CONTINUE" = false ]; then
31 |     DO_RMS_NORM=true
32 |     ALLOW_0_CHECKPOINT=false
33 |     UPDATE_TRAINED_STEPS_AND_EPOCHS=true
34 | elif [ "$CONTINUE" = true ]; then
35 |     DO_RMS_NORM=false
36 |     ALLOW_0_CHECKPOINT=true
37 |     UPDATE_TRAINED_STEPS_AND_EPOCHS=false
38 | fi
39 | 
40 | MODIFY_TRAINER_STATE=false
41 | 
42 | # Find the latest checkpoint of the previous stage
43 | last_stage_latest_checkpoint=$(ls output_soft_link/$last_stage_job_name | grep checkpoint | grep -v rebalanced | grep -v rms_norm | sort -r | head -n 1)
44 | 
45 | # If ALLOW_0_CHECKPOINT=false, the selected checkpoint must not end in 000
46 | if [ "$ALLOW_0_CHECKPOINT" = false ] && [[ "$last_stage_latest_checkpoint" == *000 ]]; then
47 |     echo "last_stage_latest_checkpoint is 000, exit"
48 |     exit 1
49 | fi
50 | 
51 | # If there is no rms_norm checkpoint yet, rebalance the weights
52 | if [ ! -d "output_soft_link/$last_stage_job_name/$last_stage_latest_checkpoint-rms_norm" ] && [ "$DO_RMS_NORM" = true ]; then
53 |     python scripts/rebalance_weight.py output_soft_link/$last_stage_job_name/$last_stage_latest_checkpoint
54 | fi
55 | 
56 | # dataset path
57 | # FETCH_TIME=""  # Note: FETCH_TIME is now passed in automatically at launch, so set it in submit_to_slurm.sh
58 | DATA_PATH=hf_dataset/$DATASET_MODEL_NAME/$FETCH_TIME
59 | 
60 | MODEL_PATH=output/$last_stage_job_name
61 | 
62 | # model max length
63 | MODEL_MAX_LENGTH=4096
64 | 
65 | # batch size
66 | # The BS below depends on the node count, GPU count, and context size
67 | # PER_DEVICE_TRAIN_BATCH_SIZE=18
68 | 
69 | # gradient accumulation steps
70 | GRADIENT_ACCUMULATION_STEPS=1
71 | 
72 | # learning rate
73 | LEARNING_RATE=1e-2
74 | 
75 | # warmup ratio
76 | WARMUP_RATIO=0.0 # <----- change this for the second stage
77 | 
78 | # weight decay
79 | WEIGHT_DECAY=0.1
80 | 
81 | # deepspeed config path
82 | DEEPSPEED_CONFIG_PATH='ds2_config_adamw_kd.json'
83 | 
84 | OUTPUT_DIR=output/${JOB_NAME}
85 | mkdir -p ${OUTPUT_DIR}
86 | 
87 | /usr/bin/pdsh -w $comma_hostnames bash torchrun_wrapper.sh $comma_hostnames $SLURM_JOB_NAME $SLURM_JOB_ID \
88 |     --nnodes $NNODES \
89 |     --nproc_per_node 8 \
90 |     --rdzv_backend static \
91 |     --rdzv_id $JOB_ID \
92 |     --master_addr $MASTER_ADDR \
93 |     --master_port $MASTER_PORT \
94 |     --max_restarts 3 \
95 |     train.py \
96 |     --model_name_or_path ${MODEL_PATH} \
97 |     --data_path ${DATA_PATH} \
98 |     --output_dir ${OUTPUT_DIR} \
99 |     --bf16 True \
100 |     --num_train_epochs ${STAGE} \
101 |     --model_max_length $MODEL_MAX_LENGTH \
102 |     --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
103 |     --per_device_eval_batch_size 4 \
104 |     --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
105 |     --eval_strategy "no" \
106 |     --save_strategy "steps" \
107 |     --save_steps 250 \
108 |     --save_total_limit 25 \
109 |     --learning_rate $LEARNING_RATE \
110 |     --warmup_ratio $WARMUP_RATIO \
111 |     --weight_decay $WEIGHT_DECAY \
112 |     --logging_steps 3 \
113 |     --deepspeed ${DEEPSPEED_CONFIG_PATH} \
114 |     --gradient_checkpointing True \
115 |     --deepspeed_gradient_checkpointing False \
116 |     --report_to tensorboard \
117 |     --tf32 True \
118 |     --lr_scheduler_type "linear" \
119 |     --flash_attention \
120 |     --use_wsd \
121 |     --log_dir $LOG_DIR \
122 |     --profile False \
123 |     --torch_compile \
124 |     --max_grad_norm 1 \
125 |     --hyper_param_decay_rate 0 \
126 |     --logging_dir ${LOG_DIR} \
127 |     --ddp_timeout 3600 \
128 |     --adam_beta1 0.9 \
129 |     --adam_beta2 0.95 \
130 |     --run_name $LOG_PREFIX \
131 |     --adam_epsilon 1e-15 \
132 |     --dataloader_num_workers 4 \
133 |     --dataloader_prefetch_factor 2 \
134 |     --shrink_alpha 1 \
135 |     --init_scale_o 1 \
136 |     --qk_layernorm False \
137 |     --hidden_size 1920 \
138 |     --intermediate_size 4800 \
139 |     --num_hidden_layers 56 \
140 |     --num_attention_heads 30 \
141 |     --num_key_value_heads 6 \
142 |     --model_reproduce cerebras \
143 |     --scale_emb 10 \
144 |     --tie_word_embeddings True \
145 |     --attention_bias True \
146 |     --z_loss 0.0001 \
147 |     --gradient_checkpointing_step 12 \
148 |     --use_muparam_lr True \
149 |     --initializer_range 0.00005 \
150 |     --q_proj_alpha 0.3651483716701107 \
151 |     --k_proj_alpha 0.3651483716701107 \
152 |     --v_proj_alpha 0.3651483716701107 \
153 |     --gate_up_proj_alpha 0.3651483716701107 \
154 |     --o_proj_alpha 0.03450327796711771 \
155 |     --down_proj_alpha 0.03450327796711771 \
156 |     --input_layernorm_alpha 1 \
157 |     --post_attention_layernorm_alpha 1 \
158 |     --norm_alpha 1 \
159 |     --lm_head_alpha 1 \
160 |     --dim_model_base_lr 256 \
161 |     --dim_model_base_logits 1920 \
162 |     --vi_residual_alpha 1.4 \
163 |     --wesar_weights True \
164 |     --use_norm_alpha True \
165 |     --use_emb_alpha False \
166 |     --resume_from_checkpoint $MODEL_PATH \
167 |     --add_rms_norm $DO_RMS_NORM \
168 |     --modify_trainer_state $MODIFY_TRAINER_STATE \
169 |     --update_trained_steps_and_epochs $UPDATE_TRAINED_STEPS_AND_EPOCHS \
170 | 
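# Resume flow recap (a sketch; the checkpoint name below is illustrative, not
# from a real run): a directory such as
#   output_soft_link/miniyulan-2B-final-phase24/checkpoint-231250
# would be picked above — newest by `sort -r`, not ending in 000, and carrying
# no -rebalanced or -rms_norm suffix. Note that `sort -r` is lexicographic, so
# this assumes checkpoint step numbers of equal digit width (for example,
# checkpoint-99999 would sort above checkpoint-100000).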
--------------------------------------------------------------------------------
/pretrain/yulanmini-2B-s25d-decay80-1sqrt-long-28k-final-phase26.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #SBATCH --comment=joint_project
4 | 
5 | #SBATCH --job-name=xxxx
6 | 
7 | #SBATCH --ntasks-per-node=1
8 | 
9 | #SBATCH --gres=gpu:a800:8
10 | 
11 | #SBATCH --partition=debug
12 | 
13 | #SBATCH --output=log/%x-%j/part0.log
14 | 
15 | #SBATCH --error=log/%x-%j/part0.log
16 | 
17 | 
18 | source setup.sh
19 | 
20 | # ========== RESUME: only this block needs editing ==========
21 | last_stage_job_name=miniyulan-2B-final-phase25
22 | STAGE=26
23 | START_GLOBAL_STEP=243198
24 | DECAY_STEPS=19000 # annealing steps; note this depends on the batch size
25 | START_LAMBDA=1
26 | END_LAMBDA=0. # anneal from 0.01 down to 0
27 | # ========================================
28 | 
29 | CONTINUE=false
30 | if [ "$CONTINUE" = false ]; then
31 |     DO_RMS_NORM=true
32 |     ALLOW_0_CHECKPOINT=false
33 |     UPDATE_TRAINED_STEPS_AND_EPOCHS=true
34 | elif [ "$CONTINUE" = true ]; then
35 |     DO_RMS_NORM=false
36 |     ALLOW_0_CHECKPOINT=true
37 |     UPDATE_TRAINED_STEPS_AND_EPOCHS=false
38 | fi
39 | 
40 | MODIFY_TRAINER_STATE=false
41 | 
42 | # Find the latest checkpoint of the previous stage
43 | last_stage_latest_checkpoint=$(ls output_soft_link/$last_stage_job_name | grep checkpoint | grep -v rebalanced | grep -v rms_norm | sort -r | head -n 1)
44 | 
45 | # If ALLOW_0_CHECKPOINT=false, the selected checkpoint must not end in 000
46 | if [ "$ALLOW_0_CHECKPOINT" = false ] && [[ "$last_stage_latest_checkpoint" == *000 ]]; then
47 |     echo "last_stage_latest_checkpoint is 000, exit"
48 |     exit 1
49 | fi
50 | 
51 | # If there is no rms_norm checkpoint yet, rebalance the weights
52 | if [ ! -d "output_soft_link/$last_stage_job_name/$last_stage_latest_checkpoint-rms_norm" ] && [ "$DO_RMS_NORM" = true ]; then
53 |     python scripts/rebalance_weight.py output_soft_link/$last_stage_job_name/$last_stage_latest_checkpoint
54 | fi
55 | 
56 | # dataset path
57 | # FETCH_TIME=""  # Note: FETCH_TIME is now passed in automatically at launch, so set it in submit_to_slurm.sh
58 | DATA_PATH=hf_dataset/$DATASET_MODEL_NAME/$FETCH_TIME
59 | 
60 | MODEL_PATH=output/$last_stage_job_name
61 | 
62 | # model max length
63 | MODEL_MAX_LENGTH=28672
64 | 
65 | # batch size
66 | # The BS below depends on the node count, GPU count, and context size
67 | # PER_DEVICE_TRAIN_BATCH_SIZE=18
68 | 
69 | # gradient accumulation steps
70 | GRADIENT_ACCUMULATION_STEPS=1
71 | 
72 | # learning rate
73 | LEARNING_RATE=1e-2
74 | 
75 | # warmup ratio
76 | WARMUP_RATIO=0.0
77 | END_GLOBAL_STEP=$(expr $START_GLOBAL_STEP + $DECAY_STEPS)
78 | 
79 | # weight decay
80 | WEIGHT_DECAY=0.1
81 | 
82 | # deepspeed config path
83 | DEEPSPEED_CONFIG_PATH='ds2_config_adamw.json'
84 | 
85 | OUTPUT_DIR=output/${JOB_NAME}
86 | mkdir -p ${OUTPUT_DIR}
87 | 
88 | /usr/bin/pdsh -w $comma_hostnames bash torchrun_wrapper.sh $comma_hostnames $SLURM_JOB_NAME $SLURM_JOB_ID \
89 |     --nnodes $NNODES \
90 |     --nproc_per_node 8 \
91 |     --rdzv_backend static \
92 |     --rdzv_id $JOB_ID \
93 |     --master_addr $MASTER_ADDR \
94 |     --master_port $MASTER_PORT \
95 |     --max_restarts 3 \
96 |     train.py \
97 |     --model_name_or_path ${MODEL_PATH} \
98 |     --data_path ${DATA_PATH} \
99 |     --output_dir ${OUTPUT_DIR} \
100 |     --bf16 True \
101 |     --num_train_epochs ${STAGE} \
102 |     --model_max_length $MODEL_MAX_LENGTH \
103 |     --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
104 |     --per_device_eval_batch_size 4 \
105 |     --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
106 |     --eval_strategy "no" \
107 |     --save_strategy "steps" \
108 |     --save_steps 250 \
109 |     --save_total_limit 25 \
110 |     --learning_rate $LEARNING_RATE \
111 |     --warmup_ratio $WARMUP_RATIO \
112 |     --weight_decay $WEIGHT_DECAY \
113 |     --logging_steps 3 \
114 |     --deepspeed ${DEEPSPEED_CONFIG_PATH} \
115 |     --gradient_checkpointing True \
116 |     --deepspeed_gradient_checkpointing False \
117 |     --report_to tensorboard \
118 |     --tf32 True \
119 |     --lr_scheduler_type "linear" \
120 |     --flash_attention \
121 |     --use_wsd \
122 |     --log_dir $LOG_DIR \
123 |     --profile False \
124 |     --torch_compile \
125 |     --max_grad_norm 1 \
126 |     --hyper_param_decay_rate 0 \
127 |     --logging_dir ${LOG_DIR} \
128 |     --ddp_timeout 3600 \
129 |     --adam_beta1 0.9 \
130 |     --adam_beta2 0.95 \
131 |     --run_name $LOG_PREFIX \
132 |     --adam_epsilon 1e-15 \
133 |     --dataloader_num_workers 4 \
134 |     --dataloader_prefetch_factor 2 \
135 |     --shrink_alpha 1 \
136 |     --init_scale_o 1 \
137 |     --qk_layernorm False \
138 |     --hidden_size 1920 \
139 |     --intermediate_size 4800 \
140 |     --num_hidden_layers 56 \
141 |     --num_attention_heads 30 \
142 |     --num_key_value_heads 6 \
143 |     --model_reproduce cerebras \
144 |     --scale_emb 10 \
145 |     --tie_word_embeddings True \
146 |     --attention_bias True \
147 |     --z_loss 0.0001 \
148 |     --gradient_checkpointing_step 56 \
149 |     --use_muparam_lr True \
150 |     --initializer_range 0.00005 \
151 |     --q_proj_alpha 0.3651483716701107 \
152 |     --k_proj_alpha 0.3651483716701107 \
153 |     --v_proj_alpha 0.3651483716701107 \
154 |     --gate_up_proj_alpha 0.3651483716701107 \
155 |     --o_proj_alpha 0.03450327796711771 \
156 |     --down_proj_alpha 0.03450327796711771 \
157 |     --input_layernorm_alpha 1 \
158 |     --post_attention_layernorm_alpha 1 \
159 |     --norm_alpha 1 \
160 |     --lm_head_alpha 1 \
161 |     --dim_model_base_lr 256 \
162 |     --dim_model_base_logits 1920 \
163 |     --vi_residual_alpha 1.4 \
164 |     --wesar_weights True \
165 |     --use_norm_alpha True \
166 |     --use_emb_alpha False \
167 |     --resume_from_checkpoint $MODEL_PATH \
168 |     --add_rms_norm $DO_RMS_NORM \
169 |     --modify_trainer_state $MODIFY_TRAINER_STATE \
170 |     --update_trained_steps_and_epochs $UPDATE_TRAINED_STEPS_AND_EPOCHS \
171 |     --start_lambda $START_LAMBDA \
172 |     --end_lambda $END_LAMBDA \
173 |     --start_global_step $START_GLOBAL_STEP \
174 |     --end_global_step $END_GLOBAL_STEP \
175 |     --wsd_style 1sqrt \
176 | 
--------------------------------------------------------------------------------
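For reference, a self-contained sketch of how the decay scripts' settings map onto get_wsd_scheduler from pretrain/train_utils.py. The toy single-parameter model and the scaled-down step counts are illustrative, not from a real run; it assumes pretrain/ is on PYTHONPATH with its dependencies installed.

import torch

from train_utils import get_wsd_scheduler

# One parameter is enough to observe the schedule.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1e-2)  # LEARNING_RATE in the scripts

# Scaled-down analogue of the phase-26 setup: "1sqrt" annealing from
# start_lambda=1 to end_lambda=0 over the [1000, 1200] global-step window,
# with a constant 1.0 plateau before the window (stable_ratio defaults to 1.0).
scheduler = get_wsd_scheduler(optimizer,
                              num_warmup_steps=0,
                              num_training_steps=1200,
                              wsd_style="1sqrt",
                              start_lambda=1,
                              end_lambda=0,
                              start_global_step=1000,
                              end_global_step=1200)

for step in range(1, 1201):
    optimizer.step()   # no-op update on the toy parameter
    scheduler.step()
    if step % 200 == 0:
        # Stays at 1e-2 through step 1000, then decays as 1 - sqrt(progress).
        print(step, scheduler.get_last_lr()[0])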