├── assets └── loss.png ├── configs ├── llama_tokenizer.model ├── 10w_vocab_wudao5_pile10.model ├── 6w_vocab_wudao5_pile10.model ├── train_config.py └── default_config.yaml ├── requirements.txt ├── models └── llama.py ├── data ├── download_wudao.sh ├── download_the_pile.sh ├── preprocess_the_pile.py └── preprocess_wudao.py ├── LICENSE ├── dataset ├── validation.py ├── train_tokenizer.py ├── pretrain_dataset.py ├── data_iter.py └── tokenizer.py ├── .gitignore ├── pretrain_llama.py ├── README.md └── README_en.md /assets/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beichao1314/Open-Llama/HEAD/assets/loss.png -------------------------------------------------------------------------------- /configs/llama_tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beichao1314/Open-Llama/HEAD/configs/llama_tokenizer.model -------------------------------------------------------------------------------- /configs/10w_vocab_wudao5_pile10.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beichao1314/Open-Llama/HEAD/configs/10w_vocab_wudao5_pile10.model -------------------------------------------------------------------------------- /configs/6w_vocab_wudao5_pile10.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beichao1314/Open-Llama/HEAD/configs/6w_vocab_wudao5_pile10.model -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | torchvision 3 | torchaudio 4 | zstandard 5 | accelerate 6 | datasets 7 | wandb 8 | deepspeed 9 | absl-py 10 | torchinfo 11 | scikit-learn 12 | datasets==2.10.1 13 | matplotlib 14 | seaborn 15 | sentencepiece 16 | triton 17 | functorch==1.13.1 18 | xformers 19 | git+https://github.com/Bayes-Song/transformers.git -------------------------------------------------------------------------------- /configs/train_config.py: -------------------------------------------------------------------------------- 1 | max_length = 1024 2 | train_batch_size = 2 3 | num_training_steps = 1000000 4 | num_warmup_steps = 2000 5 | initializer_range = 1e-2 6 | lr = 2e-4 7 | weight_decay = 1e-1 8 | tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model' 9 | patterns = [ 10 | 'data/pretrain_data/part-*.jsonl.zst' 11 | ] 12 | # global step 13 | log_interval = 5 14 | eval_interval = 200 15 | save_interval = 800 16 | work_dir = 'data/saved_ckpt/' -------------------------------------------------------------------------------- /models/llama.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-17 13:21:33 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 23:13:57 6 | FilePath: /Open-Llama/models/llama.py 7 | Description: 8 | Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf 9 | Performance and effectiveness optimization based on the implementation in the Transformer library. 10 | https://github.com/Bayes-Song/transformers 11 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
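Note: the pretraining entry point (pretrain_llama.py) instantiates the model via LlamaForCausalLM / LlamaConfig from the forked transformers library listed in requirements.txt (git+https://github.com/Bayes-Song/transformers.git), which carries the xformers memory_efficient_attention path plus the use_stable_embedding and shared_input_output_embedding options; this file only documents that work.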
12 | ''' 13 | -------------------------------------------------------------------------------- /data/download_wudao.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### 3 | # @Author: LiangSong(sl12160010@gmail.com) 4 | # @Date: 2023-03-16 21:21:56 5 | # @LastEditors: LiangSong(sl12160010@gmail.com) 6 | # @LastEditTime: 2023-03-26 22:58:11 7 | # @FilePath: /Open-Llama/data/download_wudao.sh 8 | # @Description: 9 | # download wudao dataset and preprocess 10 | # Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 11 | ### 12 | apt install unrar 13 | for i in {1..100} 14 | do 15 | curl -C - --retry 100 'https://dorc.baai.ac.cn/resources/data/WuDaoCorpora2.0/WuDaoCorpus2.0_base_200G.rar?AccessKeyId=AKLTNasiLRBBTcOgPqzlkPzu1w&Expires=1679127659&Signature=7jh%2FpnJyC2hAeumm9EjaeE5HN9E%3D' -o data/WuDaoCorpus2.0_base_200G.rar 16 | done 17 | unrar x data/WuDaoCorpus2.0_base_200G.rar 18 | mkdir data/pretrain_data 19 | python3 data/preprocess_wudao.py -------------------------------------------------------------------------------- /configs/default_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_accumulation_steps: 12 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 1 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | dynamo_backend: 'no' 13 | # dynamo_config: 14 | # dynamo_backend: INDUCTOR 15 | # dynamo_mode: default 16 | # dynamo_use_dynamic: true 17 | # dynamo_use_fullgraph: false 18 | fsdp_config: {} 19 | machine_rank: 0 20 | main_training_function: main 21 | megatron_lm_config: {} 22 | mixed_precision: bf16 23 | num_machines: 1 24 | num_processes: 8 25 | rdzv_backend: static 26 | same_network: true 27 | tpu_env: [] 28 | tpu_use_cluster: false 29 | tpu_use_sudo: false 30 | use_cpu: false 31 | -------------------------------------------------------------------------------- /data/download_the_pile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### 3 | # @Author: LiangSong(sl12160010@gmail.com) 4 | # @Date: 2023-03-16 21:21:38 5 | # @LastEditors: LiangSong(sl12160010@gmail.com) 6 | # @LastEditTime: 2023-03-26 22:58:02 7 | # @FilePath: /Open-Llama/data/download_the_pile.sh 8 | # @Description: 9 | # download the pile dataset and preprocess 10 | # Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 11 | ### 12 | start=0 13 | end=29 14 | mkdir data/the_pile 15 | for (( i=$start; i<=$end; i++ )) 16 | do 17 | url="https://the-eye.eu/public/AI/pile/train/$(printf "%02d" $i).jsonl.zst" 18 | echo "Downloading file: $url" 19 | curl -C - $url -o data/the_pile/"$(printf "%02d" $i).jsonl.zst" 20 | done 21 | 22 | wait 23 | 24 | echo "All files downloaded successfully." 
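# Shard the raw .jsonl.zst files into data/pretrain_data (at most 16384 lines per output file, see data/preprocess_the_pile.py) so training can stream-read them in parallel.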
25 | mkdir data/pretrain_data 26 | python3 data/preprocess_the_pile.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 S 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/preprocess_the_pile.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-16 22:35:38 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 22:59:38 6 | FilePath: /Open-Llama/data/preprocess_the_pile.py 7 | Description: 8 | Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines, 9 | making it easy for parallel training to perform streaming reads. 10 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 11 | ''' 12 | import json 13 | from glob import glob 14 | from tqdm import tqdm 15 | import zstandard as zstd 16 | 17 | paths = glob('data/the_pile/*.jsonl.zst') 18 | write_path = 'data/pretrain_data/part-pile-{}.jsonl.zst' 19 | total_num = 0 20 | file_num = 0 21 | wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') 22 | for path in tqdm(paths, total=len(paths)): 23 | with zstd.open(path, 'r', encoding='utf-8') as fp: 24 | for line in fp: 25 | if total_num % 16384 == 0 and total_num > 0: 26 | file_num += 1 27 | wfp.close() 28 | wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') 29 | wfp.write(line.encode('utf-8')) 30 | total_num += 1 31 | wfp.close() 32 | print('total line: {}\ntotal files: {}'.format(total_num, file_num)) -------------------------------------------------------------------------------- /dataset/validation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-18 00:06:41 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-27 01:09:20 6 | FilePath: /Open-Llama/dataset/validation.py 7 | Description: 8 | 9 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
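val_set below is a fixed list of Chinese and English prompts (classical poetry and prose, open-ended questions, simple math, and code) used as a qualitative validation set: pretrain_llama.py runs model.generate on each prompt every eval_interval steps and logs the generations to a wandb table.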
10 | ''' 11 | val_set = [ 12 | '白日依山尽,', 13 | '君不见,黄河之水天上来,奔流到海不复回。君不见,', 14 | '秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。', 15 | '古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?', 16 | '当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。', 17 | '这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,', 18 | '人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如', 19 | 'In recent years, there has been increasing interest in the use of machine learning to', 20 | '已知三个数分别为1, 2, 3,则它们的平均数是', 21 | '小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?', 22 | '根据牛顿第二定律,物体的加速度等于', 23 | '碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳', 24 | '下面是一段用python写的快速排序的代码:', 25 | 'The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on', 26 | '下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset' 27 | ] -------------------------------------------------------------------------------- /data/preprocess_wudao.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-16 22:10:44 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 22:59:55 6 | FilePath: /Open-Llama/data/preprocess_wudao.py 7 | Description: 8 | Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines, 9 | making it easy for parallel training to perform streaming reads. 10 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 11 | ''' 12 | import json 13 | from glob import glob 14 | from tqdm import tqdm 15 | import zstandard as zstd 16 | 17 | paths = glob('data/WuDaoCorpus2.0_base_200G/part*') 18 | write_path = 'data/pretrain_data/part-wudao-{}.jsonl.zst' 19 | total_num = 0 20 | file_num = 0 21 | wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') 22 | for path in tqdm(paths, total=len(paths)): 23 | with open(path, 'r') as fp: 24 | data = json.load(fp) 25 | for line in data: 26 | if total_num % 16384 == 0 and total_num > 0: 27 | file_num += 1 28 | wfp.close() 29 | wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') 30 | wfp.write(json.dumps(line).encode('utf-8')) 31 | wfp.write('\n'.encode('utf-8')) 32 | total_num += 1 33 | wfp.close() 34 | print('total line: {}\ntotal files: {}'.format(total_num, file_num)) -------------------------------------------------------------------------------- /dataset/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-24 20:49:03 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 23:43:59 6 | FilePath: /Open-Llama/dataset/train_tokenizer.py 7 | Description: 8 | 9 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
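Trains a SentencePiece BPE tokenizer (vocab_size=100000, byte fallback, digits split) on a sample of 5 wudao shards and 10 pile shards, then writes the resulting model to configs/10w_vocab_wudao5_pile10.model.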
10 | ''' 11 | import random 12 | from dataset.data_iter import create_data_iter, create_shard_kwargs 13 | 14 | wudao_patterns = [ 15 | 'data/pretrain_data/part-wudao-*.jsonl.zst', 16 | ] 17 | wudao_paths = create_shard_kwargs(wudao_patterns) 18 | random.shuffle(wudao_paths) 19 | 20 | pile_patterns = [ 21 | 'data/pretrain_data/part-pile-*.jsonl.zst', 22 | ] 23 | pile_paths = create_shard_kwargs(pile_patterns) 24 | random.shuffle(pile_paths) 25 | paths = wudao_paths[: 5] + pile_paths[: 10] 26 | transform_dict = { 27 | 'wudao': lambda line: [(line['title'] + '\n' + line['content'])], 28 | 'pile': lambda line: [line['text']] 29 | } 30 | data_iter = create_data_iter(paths, transform_dict) 31 | 32 | import io 33 | import sentencepiece as spm 34 | 35 | # Loads model from URL as iterator and stores the model to BytesIO. 36 | model = io.BytesIO() 37 | spm.SentencePieceTrainer.train( 38 | sentence_iterator=data_iter, model_writer=model, shuffle_input_sentence=False, train_extremely_large_corpus=True, 39 | # hyperparameters of tokenizer 40 | max_sentence_length=16384, pad_id=3, model_type='BPE', vocab_size=100000, 41 | # split digits and fallback to byte same as Llama. 42 | # set split_by_unicode_script to True to avoid grouping punctuation and characters together. 43 | split_digits=True, split_by_unicode_script=True, byte_fallback=True, 44 | # reserve whitespace and \n and \t etc. for code generation 45 | allow_whitespace_only_pieces=True, remove_extra_whitespaces=False, normalization_rule_name='nfkc') 46 | 47 | # Serialize the model as file. 48 | with open('configs/10w_vocab_wudao5_pile10.model', 'wb') as f: 49 | f.write(model.getvalue()) 50 | 51 | # Directly load the model from serialized model. 52 | sp = spm.SentencePieceProcessor(model_proto=model.getvalue()) 53 | print(sp.decode(sp.encode('只因你太美🤗▃ \n 1'))) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .DS_Store 131 | pretrain_data/ 132 | wandb/ -------------------------------------------------------------------------------- /dataset/pretrain_dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-17 20:41:25 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 23:07:56 6 | FilePath: /Open-Llama/dataset/pretrain_dataset.py 7 | Description: 8 | 9 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 10 | ''' 11 | import math 12 | import torch 13 | 14 | def preprocess_wudao_gen(tokenizer, segment_max_length=1024): 15 | def preprocess_wudao(line): 16 | ''' 17 | The format of the data is roughly as follows. 18 | {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'} 19 | Split the data based on the tokenized length according to the maximum length. 20 | ''' 21 | total = line['title'] + '\n' + line['content'] 22 | out = tokenizer(total) 23 | input_ids = out['input_ids'] 24 | return [input_ids[i*segment_max_length: (i+1)*segment_max_length] 25 | for i in range(math.ceil(len(input_ids)/segment_max_length))] 26 | return preprocess_wudao 27 | 28 | def preprocess_the_pile_gen(tokenizer, segment_max_length=1024): 29 | def preprocess_the_pile(line): 30 | ''' 31 | The format of the data is roughly as follows. 32 | {'text': 'some text', 'meta': {'pile_set_name': 'Github'}} 33 | Split the data based on the tokenized length according to the maximum length. 
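For example, with segment_max_length=1024 a 2500-token document is returned as three segments of 1024, 1024 and 452 tokens; the short final segment is later padded back up to 1024 with pad_id by pretrain_collate_fn.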
34 | ''' 35 | total = line['text'] 36 | out = tokenizer(total) 37 | input_ids = out['input_ids'] 38 | return [input_ids[i*segment_max_length: (i+1)*segment_max_length] 39 | for i in range(math.ceil(len(input_ids)/segment_max_length))] 40 | return preprocess_the_pile 41 | 42 | def pretrain_collate_fn_gen(tokenizer, segment_max_length=1024): 43 | ''' 44 | Organize data into tensors by padding based on the preset maximum length. 45 | ''' 46 | pad_id = tokenizer.pad_id 47 | def pretrain_collate_fn(batch): 48 | input_ids = [] 49 | for i in batch: 50 | input_len = len(i) 51 | input_ids.append(i+[pad_id]*(segment_max_length-input_len)) 52 | inputs = { 53 | 'input_ids': torch.tensor(input_ids, dtype=torch.int64), 54 | } 55 | return inputs 56 | return pretrain_collate_fn 57 | 58 | if __name__ == '__main__': 59 | import sentencepiece as spm 60 | from datasets import IterableDataset 61 | from torch.utils.data import DataLoader 62 | 63 | from dataset.tokenizer import Tokenizer 64 | from dataset.data_iter import create_shard_kwargs, create_data_iter 65 | 66 | sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model') 67 | tokenizer = Tokenizer(sp_model) 68 | patterns = [ 69 | 'data/pretrain_data/part-*.jsonl.zst' 70 | ] 71 | paths = create_shard_kwargs(patterns) 72 | transform_dict = { 73 | 'wudao': preprocess_wudao_gen(tokenizer), 74 | 'pile': preprocess_the_pile_gen(tokenizer) 75 | } 76 | data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={'paths': paths, 'transform_dict': transform_dict}) 77 | train_loader = DataLoader(data_set, batch_size=8, num_workers=4, 78 | collate_fn=pretrain_collate_fn_gen(tokenizer), drop_last=True) 79 | for batch in train_loader: 80 | for k, v in batch.items(): 81 | print(k, v.shape) 82 | break -------------------------------------------------------------------------------- /dataset/data_iter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-17 19:32:20 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 23:03:32 6 | FilePath: /Open-Llama/dataset/data_iter.py 7 | Description: 8 | 9 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 10 | ''' 11 | import json 12 | from glob import glob 13 | import zstandard as zstd 14 | 15 | 16 | def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1): 17 | ''' 18 | Currently, the allowed storage formats are jsonl and jsonl.zst. 19 | Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading. 
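paths is the list of (index, path) tuples built by create_shard_kwargs. When num_processes > 1, a shard is only read by the rank whose process_index equals index % num_processes, so no shard is duplicated across data-parallel workers. The dataset name ('wudao' or 'pile') is recovered from the file name and selects the matching transform from transform_dict.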
20 | ''' 21 | past = None 22 | for i, path in paths: 23 | dataset_name = path.split('-')[-2] 24 | if past != dataset_name: 25 | print('Loading data from {}'.format(path)) 26 | past = path 27 | if num_processes > 1 and i % num_processes != process_index: 28 | continue 29 | if path.endswith('jsonl.zst'): 30 | with zstd.open(path, 'r', encoding='utf-8') as fp: 31 | for line in fp: 32 | if isinstance(line, bytes): 33 | line = line.decode('utf-8') 34 | line = json.loads(line) 35 | line['dataset'] = dataset_name 36 | if transform_dict: 37 | line = transform_dict[dataset_name](line) 38 | if isinstance(line, str): 39 | yield line 40 | elif isinstance(line, list): 41 | for i in line: 42 | yield i 43 | else: 44 | raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name])) 45 | else: 46 | yield line 47 | elif path.endswith('jsonl'): 48 | with open(path, 'r') as fp: 49 | for line in fp: 50 | if isinstance(line, bytes): 51 | line = line.decode('utf-8') 52 | line = json.loads(line) 53 | line['dataset'] = dataset_name 54 | if transform_dict: 55 | line = transform_dict[dataset_name](line) 56 | if isinstance(line, str): 57 | yield line 58 | elif isinstance(line, list): 59 | for i in line: 60 | yield i 61 | else: 62 | raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name])) 63 | else: 64 | yield line 65 | else: 66 | raise Exception('File format of {} is not supported yet.'.format(path)) 67 | 68 | def create_shard_kwargs(patterns, repeat=1): 69 | ''' 70 | Assign numbers to different shards of data to ensure that data is not duplicated 71 | when allocated to different nodes during distributed training. 72 | ''' 73 | all_path = [] 74 | for p in patterns: 75 | all_path.extend(glob(p)) 76 | all_path *= repeat 77 | return [(i, p) for i, p in enumerate(all_path)] 78 | 79 | if __name__ == '__main__': 80 | patterns = [ 81 | 'data/pretrain_data/part-wudao*.jsonl.zst' 82 | ] 83 | paths = create_shard_kwargs(patterns) 84 | transform_dict = { 85 | 'wudao': lambda x: x['title'], 86 | 'pile': lambda x: [x['text']] 87 | } 88 | data_iter = create_data_iter(paths, transform_dict=transform_dict) 89 | for i, data in enumerate(data_iter): 90 | print(i, data) 91 | if i == 20: 92 | break -------------------------------------------------------------------------------- /dataset/tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-20 21:39:47 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 23:09:39 6 | FilePath: /Open-Llama/dataset/tokenizer.py 7 | Description: 8 | 9 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
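A thin wrapper around a SentencePiece processor that mimics the Hugging Face tokenizer interface used by the pretraining DataLoader: __call__ dispatches to encode or encode_batch, truncation keeps max_length-1 tokens so the appended eos always fits, padding='max_length' pads with pad_id (optionally returning an attention_mask), and decode drops everything after the first eos token.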
10 | ''' 11 | import torch 12 | 13 | class Tokenizer: 14 | def __init__(self, sp_model): 15 | self.sp_model = sp_model 16 | self.bos_id = self.sp_model.bos_id() 17 | self.eos_id = self.sp_model.eos_id() 18 | self.pad_id = self.sp_model.pad_id() 19 | self.vocab_size = self.sp_model.vocab_size() 20 | 21 | def __call__(self, inputs, padding=None, max_length=256, return_tensors=False, truncation=False, 22 | add_special_tokens=True, return_mask=False): 23 | if isinstance(inputs, str): 24 | return self.encode(inputs, padding=padding, max_length=max_length, 25 | return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask) 26 | else: 27 | return self.encode_batch(inputs, padding=padding, max_length=max_length, 28 | return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask) 29 | 30 | def encode(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False, 31 | add_special_tokens=True, return_mask=False): 32 | assert(isinstance(inputs, str)) 33 | input_ids = self.sp_model.Encode(inputs) 34 | if return_mask: 35 | attention_mask = [1] * len(input_ids) 36 | if truncation: 37 | # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780 38 | # 参考Transformer中的实现 默认最后一位一定是pad或者eos 39 | input_ids = input_ids[: max_length-1] 40 | if return_mask: 41 | attention_mask = attention_mask[: max_length-1] 42 | if add_special_tokens: 43 | input_ids = input_ids + [self.eos_id] 44 | if return_mask: 45 | attention_mask = attention_mask + [0] 46 | if padding == 'max_length': 47 | input_ids = input_ids + [self.pad_id] * (max_length-len(input_ids)) 48 | if return_mask: 49 | attention_mask = attention_mask + [0] * (max_length-len(attention_mask)) 50 | if return_tensors: 51 | input_ids = torch.tensor([input_ids]) 52 | out = { 53 | 'input_ids': input_ids, 54 | } 55 | if return_mask: 56 | attention_mask = torch.tensor([attention_mask]) 57 | out['attention_mask'] = attention_mask 58 | else: 59 | out = { 60 | 'input_ids': input_ids, 61 | } 62 | if return_mask: 63 | out['attention_mask'] = attention_mask 64 | return out 65 | 66 | def encode_batch(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False, 67 | add_special_tokens=True, return_mask=False): 68 | input_ids = self.sp_model.Encode(inputs) 69 | if return_mask: 70 | attention_mask = [[1] * len(i) for i in input_ids] 71 | if truncation: 72 | input_ids = [i[: max_length-1] for i in input_ids] 73 | if return_mask: 74 | attention_mask = [i[: max_length-1] for i in attention_mask] 75 | if add_special_tokens: 76 | input_ids = [i+[self.eos_id] for i in input_ids] 77 | if return_mask: 78 | attention_mask = [i+[0] for i in attention_mask] 79 | if padding == 'max_length': 80 | input_ids_pad = [] 81 | if return_mask: 82 | attention_mask_pad = [] 83 | for idx, i in enumerate(input_ids): 84 | input_ids_pad.append(i + [self.pad_id] * (max_length-len(i))) 85 | if return_mask: 86 | j = attention_mask[idx] 87 | attention_mask_pad.append(j + [0] * (max_length-len(j))) 88 | input_ids = input_ids_pad 89 | if return_mask: 90 | attention_mask = attention_mask_pad 91 | if return_tensors: 92 | input_ids = torch.tensor(input_ids) 93 | out = { 94 | 'input_ids': input_ids, 95 | } 96 | if return_mask: 97 | attention_mask = torch.tensor(attention_mask) 98 | out['attention_mask'] = attention_mask 99 | else: 100 | out = { 101 | 'input_ids': input_ids, 102 | } 103 | if return_mask: 104 | 
out['attention_mask'] = attention_mask 105 | return out 106 | 107 | def decode(self, inputs): 108 | inputs = inputs.tolist() 109 | out = [] 110 | for i in inputs: 111 | if self.eos_id in i: 112 | eos_idx = i.index(self.eos_id) 113 | i = i[: eos_idx] 114 | out.append(i) 115 | out = self.sp_model.Decode(out) 116 | return out 117 | 118 | if __name__ == '__main__': 119 | import sentencepiece as spm 120 | from unicodedata import normalize 121 | # Using sentencepiece may not be able to process some reserved keywords like '▁'. 122 | sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model') 123 | tokenizer = Tokenizer(sp_model) 124 | tmp = ['hello world', 125 | '这是开源项目的V1版本,this is the first version of a open-source project!', 126 | '# this is a python script\nfor i in range(10):\n print(i)\n for j in range(10):\n print(j)'] 127 | print(tmp) 128 | out = tokenizer(tmp, padding='max_length', return_tensors=True, max_length=64, truncation=True) 129 | for k, v in out.items(): 130 | print(k, v.shape) 131 | print(out['input_ids']) 132 | out = tokenizer.decode(out['input_ids']) 133 | print(out) 134 | for i, j in zip(tmp, out): 135 | assert(normalize('NFKC', i) == j) 136 | 137 | from dataset.data_iter import create_shard_kwargs, create_data_iter 138 | patterns = [ 139 | 'data/pretrain_data/part-wudao*.jsonl.zst' 140 | ] 141 | paths = create_shard_kwargs(patterns) 142 | data_iter = create_data_iter(paths) 143 | for i, data in enumerate(data_iter): 144 | assert(normalize('NFKC', data['content']) == sp_model.Decode(sp_model.Encode(data['content'])) or '▁' in data['content']) 145 | if i == 1000: 146 | break -------------------------------------------------------------------------------- /pretrain_llama.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: LiangSong(sl12160010@gmail.com) 3 | Date: 2023-03-17 14:27:28 4 | LastEditors: LiangSong(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-27 01:07:25 6 | FilePath: /Open-Llama/pretrain_llama.py 7 | Description: 8 | pretrain GPT 9 | Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
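Typically launched with `accelerate launch --config_file configs/default_config.yaml pretrain_llama.py`. Hyperparameters are imported from configs/train_config.py, the DeepSpeed/Accelerate setup comes from configs/default_config.yaml, metrics and sample generations are logged to wandb (set the WANDB_API_KEY environment variable first), and checkpoints are saved to work_dir.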
10 | ''' 11 | import os 12 | import time 13 | import wandb 14 | import torch 15 | import random 16 | import sentencepiece as spm 17 | from torchinfo import summary 18 | from accelerate import Accelerator 19 | from datasets import IterableDataset 20 | from torch.utils.data import DataLoader 21 | from deepspeed.ops.adam import FusedAdam 22 | from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup 23 | 24 | from dataset.validation import val_set 25 | from dataset.tokenizer import Tokenizer 26 | from dataset.data_iter import create_shard_kwargs, create_data_iter 27 | from dataset.pretrain_dataset import preprocess_the_pile_gen, preprocess_wudao_gen, pretrain_collate_fn_gen 28 | from configs.train_config import * 29 | 30 | accelerator = Accelerator() 31 | 32 | if accelerator.is_main_process: 33 | wandb.init( 34 | project='LLAMA Pretrain' 35 | ) 36 | 37 | log_interval *= accelerator.gradient_accumulation_steps 38 | eval_interval *= accelerator.gradient_accumulation_steps 39 | save_interval *= accelerator.gradient_accumulation_steps 40 | 41 | sp_model = spm.SentencePieceProcessor(model_file=tokenizer_model_path) 42 | tokenizer = Tokenizer(sp_model) 43 | 44 | paths = create_shard_kwargs(patterns) 45 | random.shuffle(paths) 46 | transform_dict = { 47 | 'wudao': preprocess_wudao_gen(tokenizer, max_length), 48 | 'pile': preprocess_the_pile_gen(tokenizer, max_length) 49 | } 50 | data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={ 51 | 'paths': paths, 52 | 'transform_dict': transform_dict, 53 | 'process_index': accelerator.process_index, 54 | 'num_processes': accelerator.num_processes 55 | }) 56 | train_loader = DataLoader(data_set, batch_size=train_batch_size, num_workers=1, 57 | collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), drop_last=True) 58 | # smaller initializer_range make training more stable 59 | # add stabel embedding to token embedding 60 | raw_model = LlamaForCausalLM(LlamaConfig(vocab_size=tokenizer.vocab_size, 61 | initializer_range=initializer_range, 62 | pad_token_id=tokenizer.pad_id, 63 | rms_norm_eps=1e-5, 64 | hidden_dropout_prob=0.1, 65 | attention_dropout_prob=0.1, 66 | use_stable_embedding=True, 67 | shared_input_output_embedding=True)) 68 | raw_model.eval() 69 | with torch.no_grad(): 70 | summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda()) 71 | no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"] 72 | optimizer_grouped_parameters = [ 73 | { 74 | "params": [p for n, p in raw_model.named_parameters() if not any(nd in n for nd in no_decay)], 75 | "weight_decay": weight_decay, 76 | }, 77 | { 78 | "params": [p for n, p in raw_model.named_parameters() if any(nd in n for nd in no_decay)], 79 | "weight_decay": 0.0, 80 | }, 81 | ] 82 | optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95)) 83 | optim.zero_grad() 84 | factor = accelerator.num_processes / accelerator.gradient_accumulation_steps 85 | scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps * factor, 86 | num_training_steps=num_training_steps * factor) 87 | 88 | _, model, optim, scheduler = accelerator.prepare( 89 | train_loader, raw_model, optim, scheduler 90 | ) 91 | print('start training...') 92 | train_loader_iter = iter(train_loader) 93 | global_step = 0 94 | start_time = time.time() 95 | for data_step in range(num_training_steps): 96 | model.train() 97 | with accelerator.accumulate(model): 98 | batch = next(train_loader_iter) 99 | for k, v in batch.items(): 100 | batch[k] = 
v.to(accelerator.device) 101 | labels = batch['input_ids'].clone() 102 | labels[labels==tokenizer.pad_id] = -100 103 | out = model(**batch, labels=labels) 104 | total_loss = out.loss 105 | losses = { 106 | 'total_loss': total_loss 107 | } 108 | accelerator.backward(total_loss) 109 | optim.step() 110 | scheduler.step() 111 | optim.zero_grad() 112 | if accelerator.sync_gradients: 113 | global_step += 1 114 | if data_step % log_interval == 0 and data_step > 0 and accelerator.is_main_process: 115 | cost_time = time.time() - start_time 116 | start_time = time.time() 117 | tokens = train_batch_size * log_interval * max_length 118 | wandb.log({'Training/Token per second per gpu': tokens/cost_time}) 119 | for k, v in losses.items(): 120 | wandb.log({'Losses/{}'.format(k): v}) 121 | current_lr = optim.param_groups[0]['lr'] 122 | wandb.log({'Training/LR': current_lr}) 123 | if optim.scaler is not None: 124 | wandb.log({'Training/Loss Scale': optim.scaler.get_scale()}) 125 | wandb.log({'Training/Data Step': data_step}) 126 | wandb.log({'Training/Global Step': global_step}) 127 | accelerator.print('Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}'.format( 128 | global_step, data_step, losses['total_loss'], tokens/cost_time)) 129 | if data_step % eval_interval == 0 and accelerator.is_main_process: 130 | text_table = wandb.Table(columns=['question', 'pred']) 131 | model.eval() 132 | with torch.no_grad(): 133 | for data in val_set: 134 | raw_inputs = data 135 | inputs_len = len(raw_inputs) 136 | inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False) 137 | for k, v in inputs.items(): 138 | inputs[k] = v.to(accelerator.device) 139 | pred = model.generate(**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0) 140 | pred = tokenizer.decode(pred.cpu())[0] 141 | pred = pred[inputs_len:] 142 | text_table.add_data(raw_inputs, pred) 143 | wandb.log({'Predictions on {}'.format(global_step) : text_table}) 144 | if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process: 145 | if not os.path.isdir(work_dir): 146 | os.mkdir(work_dir) 147 | torch.save(raw_model.state_dict(), '{}/{}.pt'.format(work_dir, global_step)) 148 | wandb.finish() 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 11 | # Open-Llama 12 | 13 | [English](https://github.com/Bayes-Song/Open-Llama/blob/main/README_en.md) 14 | 15 | Open-Llama是一个开源项目,提供了一整套用于构建大型语言模型的训练流程,从数据集准备到分词、预训练、指令调优,以及强化学习技术 RLHF。 16 | 17 | ## **特性** 18 | 19 | ### 易用性 20 | 21 | 我们认为易用性是构建大型语言模型时最重要的特性之一。为了使 Open-LLAMA 更加易于使用,我们特别注重了以下几点: 22 | 23 | - **最简实现**:我们采用了最简单的实现方式,降低了入门的门槛,让初学者也能轻松上手。 24 | - **流程完整**:我们发布了从数据集构建到训练的完整代码,使得构建一个大语言模型的每一步流程都清晰可见。 25 | 26 | ### 高性能 27 | 28 | 由于训练大语言模型的成本高昂,因此在构建大型语言模型时,高性能也是非常重要的。为了实现高性能的训练,我们发布使用了以下技术: 29 | 30 | - **Fused CUDA kernel**:使用[xformers](https://github.com/facebookresearch/xformers)中提供的 fused CUDA kernel 可以将多个操作融合在一起,减少了 GPU 和 CPU 之间的数据传输,从而提高了训练效率。 31 | - **并行化训练**:我们使用[Accelerate](https://huggingface.co/docs/accelerate/index)库支持在多个 GPU 上进行并行化训练,以加快训练速度。 32 | 33 | 对于7B模型,使用Transformers中Pytorch原生版本的Llama模型训练训练速度为1378 token/s/gpu,使用本代码库训练速度达到3290 token/s/gpu,基本达到[Llama原文](https://arxiv.org/pdf/2302.13971.pdf)中的3370 token/s/gpu。 34 | 如果使用500B token进行预训练,需要训练43000 GPU时。按照Google Cloud上A100-80G Spot的价格计算,8卡每小时价格为12.6美元,则总价格为67725美元。 35 | 当使用未加速版本训练时,价格为158744美元。最终降低训练成本9万美元。 36 | ### 通用性 37 | 38 | 
在训练语言模型时,我们希望能够构建一个通用的模型,可以适用于不同的语言和不同的领域。为了实现这一点,我们采用了以下策略: 39 | 40 | - **多语言支持**:我们支持多种语言的语料库,包括英语、中文、日语等多种语言,让用户可以根据自己的需求进行选择。 41 | - **领域通用性**:我们希望模型不仅能在日常问题上能产生帮助,同时希望在专业领域如科学、法律等也能帮助人类。 42 | 43 | ## **要求** 44 | 45 | - Python 3.7 或更高版本 46 | - PyTorch 1.11 或更高版本 47 | - [Transformers库](https://huggingface.co/docs/transformers/index) 48 | - [Accelerate库](https://huggingface.co/docs/accelerate/index) 49 | - CUDA 11.1 或更高版本(用于 GPU 加速,基于CUDA11.7进行测试) 50 | 51 | ## **入门指南** 52 | ### 安装 53 | 54 | 使用下面的命令安装相关依赖 55 | 56 | ```bash 57 | pip install -r requirements.txt 58 | ``` 59 | 60 | ### 数据集准备 61 | 62 | 目前给出了智源开源的悟道数据集和EleutherAI开源的the pile数据集。数据集下载和处理代码在data目录下。 63 | 其中悟道数据集由于需要同意一些协议才能下载因此可能需要修改一下download_wudao中的链接,[悟道](https://data.baai.ac.cn/details/WuDaoCorporaText)。 64 | 65 | 运行下面的命令进行数据下载并进行分片 66 | ```bash 67 | bash data/download_the_pile.sh 68 | bash data/download_wudao.sh 69 | ``` 70 | 数据将按照每个文件最大16384行存储为小文件,便于后续使用多进程训练时进行读取。存储格式为jsonl.zst,使用zstd进行压缩,最终数据大小为519.5G,合计16466个文件。 71 | 72 | 其中the pile数据集包含210607728行json line,悟道数据集包含59132213行json line。 73 | 74 | 具体数据格式如下 75 | ``` 76 | WuDao 77 | {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'} 78 | 79 | The Pile 80 | {'text': 'some text', 'meta': {'pile_set_name': 'Github'}} 81 | ``` 82 | 83 | ### 数据读取 84 | 数据读取相关代码可见dataset目录,其中包含根据下载的数据集使用SentencePiece训练分词模型,以及根据分词器构建DataLoader。 85 | 86 | 训练分词器使用如下命令 87 | ```bash 88 | python3 dataset/train_tokenizer.py 89 | ``` 90 | 91 | 使用如下命令查看DataLoader输出的结果 92 | ```bash 93 | python3 dataset/pretrain_dataset.py 94 | ``` 95 | 96 | ### 模型结构 97 | 我们基于Transformers库中的[Llama](https://github.com/facebookresearch/llama)参考论文原文中的2.4 Efficient implementation一节进行了修改, 98 | 同时还参考了一些其他论文引入了一些优化。具体来说,我们引入了由META开源的[xformers库](https://github.com/facebookresearch/xformers)中的memory_efficient_attention操作来进行 99 | Self Attention的计算,这对于性能有明显的提升,提升大约30%。 100 | 具体可以参见[modeling_llama.py](https://github.com/Bayes-Song/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L240) 101 | 102 | 同时我们还参考了[Bloom](https://huggingface.co/bigscience/bloom),对于Token Embedding引入了Stable Embedding以更好的稳定训练。 103 | 104 | 最后我们参考[PALM](https://arxiv.org/abs/2204.02311),使用了Shared Input-Output Embeddings。 105 | 106 | ### 预训练 107 | 我们基于Accelerate库进行多GPU并行训练,启动命令如下 108 | ```bash 109 | accelerate launch --config_file configs/default_config.yaml pretrain_llama.py 110 | ``` 111 | 我们使用[Wandb](https://wandb.ai/)进行训练的可视化,需要自行修改环境变量 WANDB_API_KEY 。 112 | 113 | 其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/default_config.yaml。 114 | 115 | 训练相关超参数可见configs/train_config.py,目前我们使用10W词表的7B Llama模型进行训练,具体配置如下 116 | 117 | | max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size | 118 | |------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------| 119 | | 1024 | 2 | 2e-4 | 1e-1 | 6.88B | 4096 | 32 | 32 | 100000 | 120 | 121 | ``` 122 | ========================================================================================================= 123 | Layer (type:depth-idx) Output Shape Param # 124 | ========================================================================================================= 125 | LlamaForCausalLM [1, 64, 32, 128] -- 126 | ├─LlamaModel: 1-1 [1, 64, 32, 128] -- 127 | │ └─Embedding: 2-1 [1, 64, 4096] 409,600,000 128 | │ └─LayerNorm: 2-2 [1, 64, 4096] 8,192 129 | │ └─ModuleList: 2-3 -- -- 130 | │ │ └─LlamaDecoderLayer: x32 [1, 64, 4096] 202,383,360 x 32 131 | │ └─LlamaRMSNorm: 2-4 [1, 
64, 4096] 4,096 132 | ========================================================================================================= 133 | Total params: 6,885,879,808 134 | Trainable params: 6,885,879,808 135 | Non-trainable params: 0 136 | Total mult-adds (G): 6.89 137 | ``` 138 | 139 | 目前的进展 140 | ![](assets/loss.png) 141 | 142 | ### Instruction-Tuning 143 | 144 | ### RLHF 145 | 146 | ## 性能对比 147 | 148 | ### 训练框架 149 | 在训练框架方面我们测试了HuggingFace开源的Accelerate库和HPC-AI开源的ColossalAI,我们测试在打满显卡时性能差异较小。因此最终选择了实现相对简单的Accelerate库作为训练框架 150 | 151 | 测试数据如下,测试过程中使用的模型结构为 152 | | Model | n gpu | n layer | n heads | hidden size | vocab size | seq length | 153 | |-------|-------|---------|---------|-------------|------------|------------| 154 | | GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 | 155 | 156 | 测试结果如下,可以看到当打满时速度和显存相差不大 157 | | | HuggingFace | HuggingFace | ColossalAI | ColossalAI | ColossalAI | 158 | |-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------| 159 | | config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 | 160 | | second pre step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 | 161 | | gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps, 疑似有内存泄漏 | 162 | 163 | ### 性能优化 164 | 在最早版本中我们使用DeepSpeed stage2 + Transformers中的原生Llama实现进行训练但是速度和论文中所说的相差较大,因此后续我们进行了一系列的优化,我们将每一步的性能提升列在下面可供参考。 165 | 166 | 论文中提到对于6.7B模型使用了1T token进行训练,最终的gpu时为82432,因此可以计算出他的训练速度大致为3370 token/s/gpu。 167 | 当使用下面的优化后速度开源基本和论文中速度一致,使用20x8 A100-80G进行测试。预计加入更多融合算子开源取得更好的性能。 168 | 169 | | | V1 | V2 | 170 | |---------------------|--------------|-----------------------| 171 | | Model | Transformers | Transformers+xformers | 172 | | Optimizer | Pytorch Adam | Fused Adam | 173 | | DeepSpeed | stage2 | stage1 | 174 | | Grad Accumulation | 4 | 12 | 175 | | Return Padding Mask | yes | no | 176 | | Speed token/s/gpu | 1378 | 3290 | 177 | 178 | ## 后续计划 179 | 180 | 1. 加入更多训练监控,比如训练数据类别的分布等,加入继续训练相关代码 181 | 2. 开源预训练好的多语言Llama 6.9B的checkpoint 182 | 3. 实现Instruction-tuning代码,并开源相关checkpoint 183 | 4. 使用Gradio搭建在线Demo 184 | 5. 使用[Triton](https://github.com/openai/triton)加入更多高性能算子,进一步提升性能 185 | 6. 加入根据Common Crawl构建预训练数据集相关代码,并开源相关数据集 186 | 7. 加入多模态训练代码 187 | 188 | ## 引用 189 | 190 | ``` 191 | @misc{openllama, 192 | title={Open-Llama}, 193 | author={Liang Song}, 194 | year={2023}, 195 | howpublished={\url{https://github.com/Bayes-Song/Open-Llama}}, 196 | } 197 | ``` -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | 11 | # Open-Llama 12 | 13 | Translated by ChatGPT. 14 | 15 | Open-Llama is an open source project that provides a complete set of training processes for building large-scale language models, from data preparation to tokenization, pre-training, instruction tuning, and reinforcement learning techniques such as RLHF. 16 | 17 | ## **Features** 18 | ### Ease of Use 19 | We believe that ease of use is one of the most important features when building large-scale language models. 
To make Open-Llama more accessible, we focus on the following:
20 | 
21 | - **Minimal implementation**: We use the simplest implementation approach to reduce the barrier to entry and make it easy for beginners to get started.
22 | - **Complete workflow**: We provide complete code from dataset construction to training, making each step of building a large language model clear and visible.
23 | 
24 | ### High Performance
25 | Since training large language models is costly, high performance is also crucial when building large-scale language models. To achieve high-performance training, we employ the following techniques:
26 | 
27 | - **Fused CUDA kernel**: Using fused CUDA kernels provided by [xformers](https://github.com/facebookresearch/xformers) can fuse multiple operations together, reducing data transfer between GPU and CPU, and improving training efficiency.
28 | - **Parallel training**: We use the [Accelerate](https://huggingface.co/docs/accelerate/index) library to support parallel training on multiple GPUs, accelerating the training process.
29 | 
30 | 
31 | For the 7B model, the training speed of the Llama model using the PyTorch native version in the Transformers library is 1378 tokens/s/GPU. With our code, the training speed reaches 3290 tokens/s/GPU, which is close to the reported 3370 tokens/s/GPU in the [Llama paper](https://arxiv.org/pdf/2302.13971.pdf).
32 | If we pretrain with 500 billion tokens, it will take 43,000 GPU hours. Assuming the price of A100-80G Spot on Google Cloud is $12.6 per hour for 8 GPUs, the total cost will be $67,725.
33 | Without acceleration, the cost would be $158,744. Our method reduces the training cost by $91,019 in total.
34 | 
35 | ### Universality
36 | When training language models, we aim to build a universal model that can be used for different languages and fields. To achieve this, we adopt the following strategies:
37 | 
38 | - **Multi-language support**: We support a variety of language corpora, including English, Chinese, Japanese, and other languages, allowing users to choose according to their needs.
39 | - **Field universality**: We hope that the model can not only help with everyday problems but also assist in professional fields such as science and law.
40 | ## **Requirements**
41 | - Python 3.7 or higher
42 | - PyTorch 1.11 or higher
43 | - [Transformers library](https://huggingface.co/docs/transformers/index)
44 | - [Accelerate library](https://huggingface.co/docs/accelerate/index)
45 | - CUDA 11.1 or higher (for GPU acceleration; tested with CUDA 11.7)
46 | ## **Getting Started**
47 | ### Installation
48 | Use the following command to install the required dependencies:
49 | 
50 | ```bash
51 | pip install -r requirements.txt
52 | ```
53 | 
54 | ### Dataset Preparation
55 | Currently, we provide the Wudao dataset from BAAI and The Pile dataset from EleutherAI. The code for downloading and processing the datasets can be found in the data directory. Please note that the Wudao dataset requires agreeing to some agreements before downloading, so you may need to modify the link in download_wudao.sh. [WuDao](https://data.baai.ac.cn/details/WuDaoCorporaText)
56 | 
57 | Use the following commands to download and shard the data:
58 | 
59 | ```bash
60 | bash data/download_the_pile.sh
61 | bash data/download_wudao.sh
62 | ```
63 | The data will be stored as small files with a maximum of 16,384 lines per file for efficient multi-processing training.
The storage format is jsonl.zst compressed with zstd, resulting in a total data size of 519.5 GB and 16,466 files. 64 | 65 | The Pile dataset contains 210,607,728 rows of JSON lines, and the Wudao dataset contains 59,132,213 rows of JSON lines. 66 | 67 | The specific data format is as follows: 68 | 69 | ``` 70 | WuDao 71 | {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'} 72 | 73 | The Pile 74 | {'text': 'some text', 'meta': {'pile_set_name': 'Github'}} 75 | ``` 76 | ### Data Loading 77 | The code for loading data can be found in the dataset directory, which includes training a tokenizer using SentencePiece and constructing a DataLoader based on the tokenizer. 78 | 79 | Train the tokenizer with the following command: 80 | 81 | ```bash 82 | python3 dataset/train_tokenizer.py 83 | ``` 84 | Check the DataLoader output with the following command: 85 | 86 | ```bash 87 | python3 dataset/pretrain_dataset.py 88 | ``` 89 | ### Model Structure 90 | We modified the [Llama](https://github.com/facebookresearch/llama) model in the Transformers library based on section 2.4 "Efficient Implementation" in the original paper and introduced some optimizations from other papers. Specifically, we introduced the memory_efficient_attention operation from the [xformers library](https://github.com/facebookresearch/xformers) by META for computing self-attention, which significantly improves performance by about 30%. Please refer to modeling_llama.py for details. 91 | 92 | We also referred to Bloom for introducing stable embeddings for better training of token embeddings. 93 | 94 | Finally, we referred to PALM and used shared input-output embeddings. 95 | 96 | ### Pretraining 97 | We use the Accelerate library for multi-GPU parallel training. Launch training with the following command: 98 | 99 | ```bash 100 | accelerate launch --config_file configs/default_config.yaml pretrain_llama.py 101 | ``` 102 | We use [Wandb](https://wandb.ai/) for training visualization and you need to modify the environment variable WANDB_API_KEY. 103 | 104 | We use DeepSpeed stage 1 to reduce GPU memory usage. Accelerate-related configurations can be found in configs/default_config.yaml. 105 | 106 | The training-related hyperparameters can be found in configs/train_config.py. 
We currently train a 7B Llama model with a vocabulary size of 100,000, and the specific configuration is as follows: 107 | 108 | | max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size | 109 | |------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------| 110 | | 1024 | 2 | 2e-4 | 1e-1 | 6.88B | 4096 | 32 | 32 | 100000 | 111 | 112 | ``` 113 | ========================================================================================================= 114 | Layer (type:depth-idx) Output Shape Param # 115 | ========================================================================================================= 116 | LlamaForCausalLM [1, 64, 32, 128] -- 117 | ├─LlamaModel: 1-1 [1, 64, 32, 128] -- 118 | │ └─Embedding: 2-1 [1, 64, 4096] 409,600,000 119 | │ └─LayerNorm: 2-2 [1, 64, 4096] 8,192 120 | │ └─ModuleList: 2-3 -- -- 121 | │ │ └─LlamaDecoderLayer: x32 [1, 64, 4096] 202,383,360 x 32 122 | │ └─LlamaRMSNorm: 2-4 [1, 64, 4096] 4,096 123 | ========================================================================================================= 124 | Total params: 6,885,879,808 125 | Trainable params: 6,885,879,808 126 | Non-trainable params: 0 127 | Total mult-adds (G): 6.89 128 | ``` 129 | Current Progress 130 | ![](assets/loss.png) 131 | ### Instruction-Tuning 132 | 133 | ### RLHF 134 | 135 | ## Performance Comparison 136 | 137 | ### Training Framework 138 | In terms of the training framework, we tested the HuggingFace's Accelerate library and HPC-AI's ColossalAI, and found that there was little difference in performance when running on fully utilized GPUs. Therefore, we ultimately chose the relatively simple Accelerate library as our training framework. 139 | 140 | The test data is shown below, and the model structure used during testing is: 141 | 142 | | Model | n gpu | n layer | n heads | hidden size | vocab size | seq length | 143 | |-------|-------|---------|---------|-------------|------------|------------| 144 | | GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 | 145 | 146 | The test results are shown below, and we can see that there is little difference in speed and memory utilization when running on fully utilized GPUs: 147 | 148 | | | HuggingFace | HuggingFace | ColossalAI | ColossalAI | ColossalAI | 149 | |-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------| 150 | | config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 | 151 | | second pre step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 | 152 | | gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps, suspected memory leak | 153 | 154 | ### Performance Optimization 155 | In the earliest version, we used DeepSpeed stage2 and the native Llama implementation in Transformers for training. However, the speed was significantly different from what was reported in the paper. Therefore, we conducted a series of optimizations and list the performance improvements for each step below. 
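One of the changes between V1 and V2 below is routing self-attention through xformers' fused kernel (the roughly 30% gain noted in the Model Structure section). The real change lives in modeling_llama.py of the forked transformers library; the following is only a minimal, self-contained sketch of what that swap looks like, assuming an xformers version that ships `memory_efficient_attention` and `LowerTriangularMask`:

```python
# Illustrative comparison of naive causal self-attention vs. the fused
# xformers kernel. Tensor layout follows xformers: (batch, seq, heads, head_dim).
import torch
import xformers.ops as xops


def naive_causal_attention(q, k, v):
    # Reference implementation: materializes the full (seq, seq) score matrix.
    b, s, h, d = q.shape
    q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # -> (b, h, s, d)
    scores = q @ k.transpose(-1, -2) / d**0.5
    mask = torch.full((s, s), float("-inf"), device=q.device, dtype=q.dtype).triu(1)
    probs = torch.softmax(scores + mask, dim=-1)
    return (probs @ v).transpose(1, 2)  # back to (b, s, h, d)


def fused_causal_attention(q, k, v):
    # The fused kernel never materializes the (seq, seq) attention matrix.
    return xops.memory_efficient_attention(q, k, v, attn_bias=xops.LowerTriangularMask())


if __name__ == "__main__":
    # 32 heads x 128 head_dim as in the 7B config above, sequence length 64.
    q = torch.randn(2, 64, 32, 128, device="cuda", dtype=torch.float16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    print(fused_causal_attention(q, k, v).shape)  # torch.Size([2, 64, 32, 128])
```

Avoiding the full attention matrix is where the memory and speed savings of the fused kernel come from; the rest of the V1→V2 gain comes from Fused Adam, DeepSpeed stage 1, larger gradient accumulation, and dropping the padding mask.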
156 | 
157 | The paper mentions that they trained the 6.7B model on 1T tokens for a total of 82,432 GPU-hours, which works out to a training speed of approximately 3370 tokens/s/GPU. After implementing the following optimizations, our speed is now comparable to that reported in the paper, using 20x8 A100-80G for testing. We expect to achieve better performance by adding more fusion operators in the future.
158 | 
159 | |                     | V1           | V2                    |
160 | |---------------------|--------------|-----------------------|
161 | | Model               | Transformers | Transformers+xformers |
162 | | Optimizer           | Pytorch Adam | Fused Adam            |
163 | | DeepSpeed           | stage2       | stage1                |
164 | | Grad Accumulation   | 4            | 12                    |
165 | | Return Padding Mask | yes          | no                    |
166 | | Speed token/s/gpu   | 1378         | 3290                  |
167 | 
168 | ## Future Plans
169 | 1. Add more training monitoring, such as the distribution of training data categories, and add code for resuming training.
170 | 2. Release the pre-trained checkpoint for the multilingual Llama 6.9B model.
171 | 3. Implement instruction-tuning code and open-source the related checkpoints.
172 | 4. Build an online demo using Gradio.
173 | 5. Use [Triton](https://github.com/openai/triton) to add more high-performance operators and further improve performance.
174 | 6. Add code for building pre-training datasets based on Common Crawl and open-source the related datasets.
175 | 7. Add code for multi-modal training.
176 | ## Citation
177 | ```
178 | @misc{openllama,
179 |   title={Open-Llama},
180 |   author={Liang Song},
181 |   year={2023},
182 |   howpublished={\url{https://github.com/Bayes-Song/Open-Llama}},
183 | }
184 | ```
--------------------------------------------------------------------------------