├── assets └── goldfish-loss.jpg ├── requirements.txt ├── tutorials ├── images │ └── prepare_dataset │ │ ├── alpaca.jpg │ │ ├── dolly.jpg │ │ ├── lima.jpg │ │ ├── longform.jpg │ │ └── alpaca_libre.jpg ├── download_freewilly_2.md ├── download_longchat.md ├── download_vicuna.md ├── download_function_calling_llama_2.md ├── download_openllama.md ├── download_gemma.md ├── download_falcon.md ├── download_dolly.md ├── download_stablecode.md ├── download_redpajama_incite.md ├── evaluation.md ├── download_pythia.md ├── download_code_llama.md ├── download_llama_2.md ├── download_tinyllama.md ├── download_mistral.md ├── download_stablelm.md ├── pretrain_openwebtext.md ├── finetune_full.md ├── oom.md ├── convert_lit_models.md ├── inference.md └── download_phi.md ├── axonn_fabric ├── __init__.py └── megatron_logging.py ├── lit_gpt ├── data │ ├── __init__.py │ ├── dolly.py │ ├── longform.py │ ├── tinyllama.py │ ├── json.py │ ├── base.py │ └── lima.py ├── __init__.py ├── rmsnorm.py ├── data_loading_utils.py ├── args.py ├── tokenizer.py ├── multiple_negative_ranking_loss.py ├── retrieval_attn_utils.py └── doc_block_utils.py ├── .gitignore ├── scripts ├── check_model_exists.py ├── convert_lit_ckpt.sh ├── push_to_hub.py ├── prepare_slimpajama.py ├── prepare_starcoder.py ├── merge_lora.py ├── simulate_lr.py ├── prepare_openwebtext.py ├── convert_checkpoint_to_hf.py ├── convert_pretrained_checkpoint.py ├── download.py ├── prepare_retrieval_data.py ├── prepare_longform.py └── prepare_csv.py ├── requirements-all.txt ├── launch_scripts ├── config │ ├── config_quick_run.yaml │ ├── tinyllama-1b-control.yaml │ ├── tinyllama-1b.yaml │ ├── tinyllama-1b-equal-supervised-tokens.yaml │ ├── tinyllama-1b-equal-supervised-tokens_mbs11.yaml │ ├── tinyllama-1b-equal-supervised-tokens_mbs8.yaml │ └── tinyllama-1b-equal-supervised-tokens_.yaml └── launch_jobs_1b_hashtable.sh.sh ├── .flake8 ├── pyproject.toml ├── data_checks └── count_token_lengths.py ├── install.sh ├── eval ├── alpaca_eval_generate.py └── factmem_rephrase.py └── README.md /assets/goldfish-loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/assets/goldfish-loss.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.1.2 2 | lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af 3 | -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/alpaca.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/dolly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/dolly.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/lima.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/lima.jpg -------------------------------------------------------------------------------- 
/tutorials/images/prepare_dataset/longform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/longform.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca_libre.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/alpaca_libre.jpg -------------------------------------------------------------------------------- /axonn_fabric/__init__.py: -------------------------------------------------------------------------------- 1 | from .fabric import AxoNNFabric 2 | from .megatron_logging import pretty_log 3 | from .hf_llama_tensor_parallel import monkey_patch_llama_with_axonn 4 | -------------------------------------------------------------------------------- /lit_gpt/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from lit_gpt.data.base import LitDataModule, SFTDataset, apply_prompt_template, get_sft_collate_fn 4 | from lit_gpt.data.alpaca import Alpaca 5 | from lit_gpt.data.json import JSON 6 | from lit_gpt.data.dolly import Dolly 7 | from lit_gpt.data.flan import FLAN 8 | from lit_gpt.data.lima import LIMA 9 | from lit_gpt.data.longform import LongForm 10 | from lit_gpt.data.tinyllama import TinyLlama 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | .DS_Store 4 | *.egg-info 5 | build 6 | .venv 7 | .vscode 8 | 9 | # data 10 | data 11 | datasets 12 | !lit_gpt/data 13 | !tests/data 14 | checkpoints 15 | out 16 | output 17 | outputs 18 | log 19 | wandb 20 | events.out.tfevents* 21 | results 22 | 23 | tests/reference_models 24 | 25 | # umd artifact paths 26 | slurm_logs/ 27 | logs/ 28 | checkpoints/ 29 | 30 | jwk_scratch/ahans/* 31 | mia_outputs/ 32 | data_extraction_outputs/ 33 | mem_output/ 34 | launch_scripts/ahans/archive 35 | 36 | .github* -------------------------------------------------------------------------------- /scripts/check_model_exists.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | import sys 4 | from huggingface_hub import delete_repo 5 | import os 6 | 7 | if __name__ == "__main__": 8 | from transformers import AutoTokenizer, AutoModel 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_name", type=str) 13 | 14 | args = parser.parse_args() 15 | 16 | try: 17 | model = AutoModel.from_pretrained(f"tomg-group-umd/{args.model_name}") 18 | print(f"Repo {args.model_name} exists") 19 | sys.exit(0) 20 | except Exception as e: 21 | try: 22 | delete_repo(repo_id = args.model_name, token = os.environ["HF_TOKEN_WRITE"]) 23 | except Exception as e: 24 | pass 25 | print(f"Repo {args.model_name} does NOT exist") 26 | sys.exit(1) -------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | jsonargparse[signatures] # CLI 3 | bitsandbytes==0.41.0 # quantization 4 | scipy # required by bitsandbytes 5 | 
sentencepiece # llama-based models 6 | tokenizers # pythia, falcon, redpajama 7 | datasets # eval 8 | requests # scripts/prepare_* 9 | zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py 10 | pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py 11 | pyarrow # scripts/prepare_starcoder.py 12 | tensorboard # pretrain/tinyllama.py 13 | torchmetrics # pretrain/tinyllama.py 14 | # eval 15 | git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529 16 | # scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py, pretrain/tinyllama.py 17 | lightning[data] @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af 18 | wandb 19 | docstring_parser -------------------------------------------------------------------------------- /scripts/convert_lit_ckpt.sh: -------------------------------------------------------------------------------- 1 | user=$(whoami) 2 | echo "User: $user" 3 | python scripts/convert_lit_checkpoint.py --checkpoint_path /lustre/orion/csc569/scratch/$user/lit-gpt-dev/out/lit-tiny-llama-1.1b/step-00120000.pth --output_path /lustre/orion/csc569/scratch/$user/lit-gpt-dev/transformer_ckpts/lit-tiny-llama-1.1b-120k-steps-500B-tokens --model_name tiny-llama-1.1b 4 | cd /lustre/orion/csc569/scratch/$user/lit-gpt-dev/transformer_ckpts/lit-tiny-llama-1.1b-120k-steps-500B-tokens 5 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/special_tokens_map.json 6 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/tokenizer_config.json 7 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/tokenizer.json 8 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/tokenizer.model 9 | cd /lustre/orion/csc569/scratch/$user/lit-gpt-dev 10 | python scripts/push_to_hub.py --model_name tiny-llama-1.1b-120k-steps-500B-tokens --model_path /lustre/orion/csc569/scratch/$user/lit-gpt-dev/transformer_ckpts/lit-tiny-llama-1.1b-120k-steps-500B-tokens --token_id $HF_TOKEN -------------------------------------------------------------------------------- /tutorials/download_freewilly_2.md: -------------------------------------------------------------------------------- 1 | 2 | ## Download [FreeWilly 2](https://stability.ai/blog/freewilly-large-instruction-fine-tuned-models) weights 3 | 4 | Stability AI announced FreeWilly inspired by the methodology pioneered by Microsoft in its paper: "Orca: Progressive Learning from Complex Explanation Traces of GPT-4”. 5 | FreeWilly2 leverages the Llama 2 70B foundation model to reach a performance that compares favorably with GPT-3.5 for some tasks. 6 | 7 | ```bash 8 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 9 | 10 | python scripts/download.py --repo_id stabilityai/FreeWilly2 11 | 12 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/stabilityai/FreeWilly2 13 | ``` 14 | 15 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 16 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 17 | 18 | You're done! 
To execute the model just run: 19 | 20 | ```bash 21 | pip install sentencepiece 22 | 23 | python chat/base.py --checkpoint_dir checkpoints/stabilityai/FreeWilly2 24 | ``` 25 | -------------------------------------------------------------------------------- /lit_gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import re 4 | import logging 5 | 6 | from lit_gpt.model import GPT 7 | from lit_gpt.config import Config 8 | from lit_gpt.tokenizer import Tokenizer 9 | 10 | from lightning_utilities.core.imports import RequirementCache 11 | 12 | _LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0") 13 | if not bool(_LIGHTNING_AVAILABLE): 14 | raise ImportError( 15 | "Lit-GPT requires lightning nightly. Please run:\n" 16 | f" pip uninstall -y lightning; pip install -r requirements.txt\n{str(_LIGHTNING_AVAILABLE)}" 17 | ) 18 | 19 | # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 20 | pattern = re.compile(".*Profiler function .* will be ignored") 21 | logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) 22 | 23 | # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint 24 | logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True 25 | logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True 26 | 27 | __all__ = ["GPT", "Config", "Tokenizer"] 28 | -------------------------------------------------------------------------------- /lit_gpt/rmsnorm.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import torch 4 | 5 | 6 | class RMSNorm(torch.nn.Module): 7 | """Root Mean Square Layer Normalization. 8 | 9 | Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License: 10 | https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE. 
11 | """ 12 | 13 | def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_unit_offset: bool = False) -> None: 14 | super().__init__() 15 | self.weight = torch.nn.Parameter(torch.ones(size)) 16 | self.eps = eps 17 | self.dim = dim 18 | self.add_unit_offset = add_unit_offset 19 | 20 | def forward(self, x: torch.Tensor) -> torch.Tensor: 21 | dtype = x.dtype 22 | x = x.float() 23 | # NOTE: the original RMSNorm paper implementation is not equivalent 24 | norm_x = torch.mean(x * x, dim=self.dim, keepdim=True) 25 | x_normed = x * torch.rsqrt(norm_x + self.eps) 26 | x_normed = x_normed.to(dtype=dtype) 27 | if self.add_unit_offset: 28 | # Gemma model requires a unit offset 29 | # https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L176 30 | return x_normed * (1 + self.weight) 31 | return x_normed * self.weight 32 | 33 | def reset_parameters(self) -> None: 34 | torch.nn.init.ones_(self.weight) 35 | -------------------------------------------------------------------------------- /scripts/push_to_hub.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from huggingface_hub import create_repo 4 | 5 | transformers.logging.set_verbosity_info() 6 | 7 | 8 | if __name__ == "__main__": 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | import argparse 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--model_path", type=str, required=True) 14 | parser.add_argument("--tokenizer_path", type=str, default=None) 15 | # parser.add_argument("--repo_name", type=str, required=True) 16 | parser.add_argument("--model_name", type=str, required=True) 17 | parser.add_argument("--token_id", type=str, required=True) 18 | 19 | args = parser.parse_args() 20 | args.repo_name = f"tomg-group-umd/{args.model_name}" 21 | if args.tokenizer_path is None: 22 | args.tokenizer_path = args.model_path 23 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) 24 | # model = AutoModelForCausalLM.from_pretrained(args.model_path) 25 | state_dict = torch.load(f"{args.model_path}/pytorch_model.bin") 26 | model = AutoModelForCausalLM.from_pretrained(args.model_path, state_dict=state_dict) 27 | print(model) 28 | 29 | create_repo(args.repo_name, private=True, token=args.token_id, exist_ok=True) 30 | model.push_to_hub(args.repo_name, use_temp_dir=True, token=args.token_id, overwrite=True) 31 | tokenizer.push_to_hub(args.repo_name, use_temp_dir=True, token=args.token_id) 32 | 33 | print(f"Model pushed to {model}") 34 | -------------------------------------------------------------------------------- /tutorials/download_longchat.md: -------------------------------------------------------------------------------- 1 | ## Download [LongChat](https://lmsys.org/blog/2023-06-29-longchat) weights 2 | 3 | LongChat is an open-source family of chatbots based on LLaMA featuring an extended context length up to 16K tokens. 4 | The technique used to extend the context length is described in [this blogpost](https://kaiokendev.github.io/context). 
5 | 6 | To see all the available checkpoints, run: 7 | 8 | ```bash 9 | python scripts/download.py | grep longchat 10 | ``` 11 | 12 | which will print 13 | 14 | ```text 15 | lmsys/longchat-7b-16k 16 | lmsys/longchat-13b-16k 17 | ``` 18 | 19 | In order to use a specific checkpoint, for instance [longchat-7b-16k](https://huggingface.co/lmsys/longchat-7b-16k), download the weights and convert the checkpoint to the lit-gpt format: 20 | 21 | ```bash 22 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 23 | 24 | python scripts/download.py --repo_id lmsys/longchat-7b-16k 25 | 26 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/lmsys/longchat-7b-16k 27 | ``` 28 | 29 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 30 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 31 | 32 | You're done! To execute the model just run: 33 | 34 | ```bash 35 | pip install sentencepiece 36 | 37 | python chat/base.py --checkpoint_dir checkpoints/lmsys/longchat-7b-16k 38 | ``` 39 | -------------------------------------------------------------------------------- /tutorials/download_vicuna.md: -------------------------------------------------------------------------------- 1 | ## Download [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) weights 2 | 3 | Vicuna is an open-source family of chatbots trained by fine-tuning LLaMA on user-shared conversations collected from [ShareGPT](https://sharegpt.com). 4 | 5 | To see all the available checkpoints for Vicuna, run: 6 | 7 | ```bash 8 | python scripts/download.py | grep vicuna 9 | ``` 10 | 11 | which will print 12 | 13 | ```text 14 | lmsys/vicuna-7b-v1.3 15 | lmsys/vicuna-13b-v1.3 16 | lmsys/vicuna-33b-v1.3 17 | lmsys/vicuna-7b-v1.5 18 | lmsys/vicuna-7b-v1.5-16k 19 | lmsys/vicuna-13b-v1.5 20 | lmsys/vicuna-13b-v1.5-16k 21 | ``` 22 | 23 | In order to use a specific Vicuna checkpoint, for instance [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), download the weights and convert the checkpoint to the lit-gpt format: 24 | 25 | ```bash 26 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 27 | 28 | python scripts/download.py --repo_id lmsys/vicuna-7b-v1.5 29 | 30 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/lmsys/vicuna-7b-v1.5 31 | ``` 32 | 33 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 34 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 35 | 36 | You're done! 
To execute the model just run: 37 | 38 | ```bash 39 | pip install sentencepiece 40 | 41 | python chat/base.py --checkpoint_dir checkpoints/lmsys/vicuna-7b-v1.5 42 | ``` 43 | -------------------------------------------------------------------------------- /launch_scripts/config/config_quick_run.yaml: -------------------------------------------------------------------------------- 1 | run_name: default-run 2 | out_dir: null 3 | resume: true 4 | max_tokens: 1000000000000 5 | max_iters: null 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | world_batch_size: 32 10 | learning_rate: 0.0004 11 | warmup_steps: 2000 12 | weight_decay: 0.1 13 | beta1: 0.9 14 | beta2: 0.95 15 | grad_clip: 1.0 16 | lr_schedule: cosine 17 | decay_lr: true 18 | min_lr: 4.0e-05 19 | neptune_from_tokens: null 20 | neptune_till_tokens: null 21 | neptune_noise_alpha: null 22 | label_smoothing: 0.0 23 | k_token_loss_dropout: null 24 | fabric_strategy: ddp 25 | fabric_precision: bf16-true 26 | micro_batch_size: 4 27 | compile_model: true 28 | matmul_precision: high 29 | dataloader_num_workers: 0 30 | n_chunks: 4 31 | logger_name: wandb 32 | logger_project: tinyllama 33 | data_telemetry: false 34 | log_step_interval: 1 35 | eval_iters: 100 36 | save_and_eval_interval: 2000 37 | save_last_step: false 38 | sanity_validate: true 39 | measure_flops: false 40 | text_key: text 41 | pad_to_block_size: false 42 | add_bos: true 43 | add_eos: true 44 | shuffle_filenames: true 45 | collate_checks_enabled: true 46 | all_block_size_tensors: false 47 | data_config: 48 | train_data: 49 | - type: pkds 50 | prefix: '' 51 | weight: 1 52 | val_data: 53 | - type: pkds 54 | prefix: '' 55 | weight: 1 56 | train_data_dir: /lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd 57 | val_data_dir: /lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd 58 | tokenizer_path: /lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T 59 | -------------------------------------------------------------------------------- /tutorials/download_function_calling_llama_2.md: -------------------------------------------------------------------------------- 1 | ## Download [Function Calling Llama 2](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) weights 2 | 3 | Llama-7B with function calling is licensed according to the Meta Community license. 4 | 5 | Function calling Llama extends the hugging face Llama 2 models with function calling capabilities. 6 | The model responds with a structured json argument with the function name and arguments. 7 | 8 | In order to use the checkpoint, download the weights and convert the checkpoint to the lit-gpt format. 9 | 10 | ```bash 11 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 12 | 13 | python scripts/download.py --repo_id Trelis/Llama-2-7b-chat-hf-function-calling-v2 --from_safetensors true 14 | 15 | python scripts/convert_hf_checkpoint.py --checkpoint_dir Trelis/Llama-2-7b-chat-hf-function-calling-v2 16 | ``` 17 | 18 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 19 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 20 | 21 | You're done! 
To execute the model just run: 22 | 23 | ```bash 24 | pip install sentencepiece 25 | 26 | python chat/base.py --checkpoint_dir Trelis/Llama-2-7b-chat-hf-function-calling-v2 27 | ``` 28 | It is strongly recommended to visit the model [repository](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) to learn how to format the prompt. 29 | 30 | The chat script has a generic use case with a single function defined; feel free to adapt it to your needs, for instance to make HTTP requests with the model outputs. 31 | 32 | Have fun! 33 | -------------------------------------------------------------------------------- /tutorials/download_openllama.md: -------------------------------------------------------------------------------- 1 | ## Download [OpenLLaMA](https://github.com/openlm-research/open_llama) weights 2 | 3 | OpenLLaMA is a permissively licensed open source reproduction of [Meta AI’s LLaMA](https://github.com/facebookresearch/llama) 4 | 7B and 13B checkpoints trained on the [RedPajama dataset](https://github.com/togethercomputer/RedPajama-Data). 5 | The weights can serve as a drop-in replacement for LLaMA in existing implementations. We also provide a smaller 3B variant. 6 | 7 | To see all the available checkpoints for Open LLaMA, run: 8 | 9 | ```bash 10 | python scripts/download.py | grep open_llama 11 | ``` 12 | 13 | which will print 14 | 15 | ```text 16 | openlm-research/open_llama_3b 17 | openlm-research/open_llama_7b 18 | openlm-research/open_llama_13b 19 | ``` 20 | 21 | In order to use a specific OpenLLaMA checkpoint, for instance [open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b), download the weights and convert the checkpoint to the lit-gpt format: 22 | 23 | ```bash 24 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 25 | 26 | python scripts/download.py --repo_id openlm-research/open_llama_3b 27 | 28 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/openlm-research/open_llama_3b 29 | ``` 30 | 31 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 32 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 33 | 34 | You're done! To execute the model just run: 35 | 36 | ```bash 37 | pip install sentencepiece 38 | 39 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/openlm-research/open_llama_3b 40 | ``` 41 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | enable-extensions = G 3 | select = B,C,E,F,G,P,SIM1,T4,W,B9 4 | max-line-length = 120 5 | # track with black in pyproject.toml 6 | # 7 | # 8 | # anytime you really hate a rule and don't want to follow it, add it here: 9 | # C408 ignored because we like the dict keyword argument syntax 10 | # E501 is not flexible enough, we're using B950 instead 11 | ignore = 12 | E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, 13 | # fix these lints in the future 14 | E275, 15 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying 16 | # to line this up with executable bit 17 | EXE001, 18 | # these ignores are from flake8-bugbear; please fix! 
19 | B007,B008,B017,B019,B020,B023,B024,B026,B028,B903,B904,B905,B906,B907 20 | # these ignores are from flake8-comprehensions; please fix! 21 | C407, 22 | # these ignores are from flake8-logging-format; please fix! 23 | G100,G101,G200,G201,G202 24 | # these ignores are from flake8-simplify. please fix or ignore with commented reason 25 | SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, 26 | # flake8-simplify code styles 27 | SIM102,SIM103,SIM106,SIM112, 28 | # I claim to know what I'm doing when doing this: 29 | B006, 30 | # We like commented out code sometimes :< 31 | E800, 32 | # and inefficient logging: 33 | G004, 34 | # overkill for ML code 35 | ECE001, 36 | # will get people to do this anyway: 37 | E731, 38 | # litgpt: 39 | B011, 40 | PT015 41 | per-file-ignores = 42 | __init__.py: F401 43 | optional-ascii-coding = True 44 | exclude = 45 | .git 46 | ./.git 47 | /build 48 | notebooks 49 | scripts 50 | __pycache__ 51 | dl/* 52 | log/* 53 | *.pyi 54 | -------------------------------------------------------------------------------- /tutorials/download_gemma.md: -------------------------------------------------------------------------------- 1 | ## Download [Gemma](https://blog.google/technology/developers/gemma-open-models/) weights 2 | 3 | Google developed and publicly released the Gemma large language models (LLMs), a collection of pretrained models in 2B and 7B parameter size that are based on the Gemini architecture. 4 | 5 | For more information, please see the [technical report](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf). 6 | 7 | 8 | To see all the available checkpoints, run: 9 | 10 | ```bash 11 | python scripts/download.py | grep gemma 12 | ``` 13 | 14 | which will print 15 | 16 | ```text 17 | google/gemma-7b 18 | google/gemma-2b 19 | google/gemma-7b-it 20 | google/gemma-2b-it 21 | ``` 22 | 23 | In the list above, `gemma-2b` and `gemma-7b` are the pretrained models, and `gemma-2b-it` and `gemma-7b-it` are the instruction-finetuned models. 24 | 25 | In order to use a specific checkpoint, for instance [gemma-2b](https://huggingface.co/google/gemma-2b), download the weights and convert the checkpoint to the lit-gpt format. 26 | 27 | This requires that you've been granted access to the weights on the HuggingFace hub. You can do so by following the steps at . 28 | After access is granted, you can find your HF hub token in . 29 | 30 | ```bash 31 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 32 | 33 | python scripts/download.py --repo_id google/gemma-2b --access_token your_hf_token --from_safetensors true 34 | 35 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/google/gemma-2b 36 | ``` 37 | 38 | By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM 39 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 40 | 41 | You're done! 
To execute the model just run: 42 | 43 | ```bash 44 | python chat/base.py --checkpoint_dir checkpoints/google/gemma-2b 45 | ``` 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | requires-python = ">= 3.11" 6 | 7 | [project] 8 | name = "lit-gpt-umd" 9 | version = "0.1" 10 | dependencies = [ 11 | "torch==2.1.2", 12 | "lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af", 13 | "pytorch-lightning==2.2.1", 14 | "jsonargparse", 15 | "requests", 16 | "tensorboard", 17 | "torchmetrics", 18 | "submitit @ git+https://github.com/jwkirchenbauer/submitit.git", 19 | "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529", 20 | "wandb", 21 | "sentencepiece", 22 | "tokenizers", 23 | "datasets", 24 | ] 25 | # Note: The order really matters here! 26 | # We really should migrate to lm-eval 0.4.* eventually (or wait for lit-gpt to migrate) 27 | # Not really a best practice to inscribe exact packages here :) 28 | 29 | [project.optional-dependencies] 30 | # only for testing 31 | dev = [ 32 | "pytest", 33 | "pytest-rerunfailures", 34 | "pytest-timeout", 35 | "transformers>=4.38.0", 36 | "einops", 37 | "protobuf", 38 | "docstring_parser", 39 | "lightning-cloud", 40 | ] 41 | 42 | # only for data preproc 43 | data = [ 44 | "lightning[data] @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af", 45 | "requests", 46 | "zstandard", 47 | "pandas", 48 | "pyarrow", 49 | ] 50 | 51 | quant = [ 52 | "bitsandbytes>=0.41.0", 53 | "scipy", 54 | ] 55 | 56 | # only on the cluster: 57 | hpc = [ 58 | "packaging", 59 | "ninja", 60 | "flash_attn @ git+https://github.com/ROCmSoftwarePlatform/flash-attention", 61 | "axonn", # requires headers 62 | ] 63 | 64 | [tool.black] 65 | line-length = 120 66 | 67 | [tool.setuptools.packages.find] 68 | include = ["lit-gpt", "axonn_fabric", "generate", "eval", "scripts", "finetune", "analysis", "chat"] 69 | 70 | [project.entry-points.console_scripts] 71 | train = "pretrain_umd.module:train" 72 | push_to_hub = "scripts.module:push_to_hub" 73 | launch = "launch_scripts.module:launch_submitit" 74 | 75 | -------------------------------------------------------------------------------- /tutorials/download_falcon.md: -------------------------------------------------------------------------------- 1 | ## Download [Falcon](https://falconllm.tii.ae) weights 2 | 3 | UAE's Technology Innovation Institute has open-sourced Falcon LLM. 4 | It is trained on [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora 5 | Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 6 | 7 | The first Falcon release includes a base model and an instruction tuned model of sizes 7B and 40B called `falcon-7b-instruct` and `falcon-40b-instruct`. Recently, checkpoints for 180B parameter models were added as well; the 180B instruction tuned model is called `falcon-180B-chat` and similar to the `falcon-40b-instruct` architecture except for its larger size. 
8 | 9 | To see all the available checkpoints for Falcon, run: 10 | 11 | ```bash 12 | python scripts/download.py | grep falcon 13 | ``` 14 | 15 | which will print 16 | 17 | ```text 18 | tiiuae/falcon-7b 19 | tiiuae/falcon-7b-instruct 20 | tiiuae/falcon-40b 21 | tiiuae/falcon-40b-instruct 22 | tiiuae/falcon-180B 23 | tiiuae/falcon-180B-chat 24 | ``` 25 | 26 | In order to use a specific Falcon checkpoint, for instance [falcon-7b](https://huggingface.co/tiiuae/falcon-7b), download the weights and convert the checkpoint to the lit-gpt format: 27 | 28 | ```bash 29 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 30 | 31 | python scripts/download.py --repo_id tiiuae/falcon-7b 32 | 33 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falcon-7b 34 | ``` 35 | 36 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 37 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 38 | 39 | You're done! To execute the model just run: 40 | 41 | ```bash 42 | pip install tokenizers 43 | 44 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/tiiuae/falcon-7b 45 | ``` 46 | 47 | or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Lightning-AI/lit-gpt/blob/main/notebooks/falcon-inference.ipynb) 48 | -------------------------------------------------------------------------------- /tutorials/download_dolly.md: -------------------------------------------------------------------------------- 1 | ## Download [Dolly](https://github.com/databrickslabs/dolly) weights 2 | 3 | Databricks’ [Dolly](https://huggingface.co/databricks/dolly-v2-12b) is an instruction-following large language model trained on the Databricks machine learning platform 4 | that is licensed for commercial use. Based on `pythia-12b`, Dolly is trained on ~15k instruction/response fine tuning records 5 | [`databricks-dolly-15k`](https://huggingface.co/datasets/databricks/databricks-dolly-15k) generated 6 | by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, 7 | information extraction, open QA and summarization. `dolly-v2-12b` is not a state-of-the-art model, but does exhibit surprisingly 8 | high quality instruction following behavior not characteristic of the foundation model on which it is based. 9 | 10 | For detailed info on the models, their training, and their behavior, please see the [Dolly repository](https://github.com/databrickslabs/dolly). 
11 | 12 | To see all the available checkpoints for Dolly, run: 13 | 14 | ```bash 15 | python scripts/download.py | grep dolly 16 | ``` 17 | 18 | which will print 19 | 20 | ```text 21 | databricks/dolly-v2-3b 22 | databricks/dolly-v2-7b 23 | databricks/dolly-v2-12b 24 | ``` 25 | 26 | In order to use a specific Dolly checkpoint, for instance [dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b), download the weights and convert the checkpoint to the lit-gpt format: 27 | 28 | ```bash 29 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 30 | 31 | python scripts/download.py --repo_id databricks/dolly-v2-3b 32 | 33 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/databricks/dolly-v2-3b 34 | ``` 35 | 36 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 37 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 38 | 39 | You're done! To execute the model just run: 40 | 41 | ```bash 42 | pip install tokenizers 43 | 44 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/databricks/dolly-v2-3b 45 | ``` 46 | -------------------------------------------------------------------------------- /scripts/prepare_slimpajama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import os 5 | import sys 6 | import time 7 | from pathlib import Path 8 | 9 | import zstandard as zstd 10 | from lightning.data.streaming import DataChunkRecipe, DataProcessor 11 | 12 | # support running without installing as a package 13 | wd = Path(__file__).parent.parent.resolve() 14 | sys.path.append(str(wd)) 15 | 16 | from lit_gpt import Tokenizer 17 | from lit_gpt.utils import CLI 18 | 19 | 20 | class SlimPajamaDataRecipe(DataChunkRecipe): 21 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 22 | super().__init__(chunk_size) 23 | self.tokenizer = tokenizer 24 | 25 | def prepare_structure(self, input_dir): 26 | files = Path(input_dir).rglob("*.zst") 27 | return [str(file) for file in files] 28 | 29 | def prepare_item(self, filepath): 30 | with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: 31 | for row in f: 32 | text = json.loads(row)["text"] 33 | if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": 34 | continue # exclude the GitHub data since it overlaps with starcoder 35 | text_ids = self.tokenizer.encode(text, bos=False, eos=True) 36 | yield text_ids 37 | 38 | 39 | def prepare( 40 | input_dir: Path = Path("data/SlimPajama-627B/train"), 41 | output_dir: Path = Path("data/slimpajama/train"), 42 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 43 | chunk_size: int = (2049 * 16384), 44 | fast_dev_run: bool = False, 45 | ) -> None: 46 | tokenizer = Tokenizer(tokenizer_path) 47 | data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 48 | data_processor = DataProcessor( 49 | input_dir=str(input_dir), 50 | output_dir=str(output_dir), 51 | fast_dev_run=fast_dev_run, 52 | num_workers=os.cpu_count(), 53 | num_downloaders=1, 54 | ) 55 | 56 | start_time = time.time() 57 | data_processor.run(data_recipe) 58 | elapsed_time = time.time() - start_time 59 | print(f"Time taken: {elapsed_time:.2f} seconds") 60 | 61 | 62 | if __name__ == "__main__": 63 | 
CLI(prepare) 64 | -------------------------------------------------------------------------------- /tutorials/download_stablecode.md: -------------------------------------------------------------------------------- 1 | ## Download [StableCode](https://huggingface.co/collections/stabilityai/stable-code-64f9dfb4ebc8a1be0a3f7650) weights 2 | 3 | StableCode is a suite of 4 developer assistant models. 4 | 5 | Each one of them is a decoder-only code completion model with 3 billion parameters, pre-trained on a diverse collection of programming languages that ranked highest in the 2023 StackOverflow developer survey. 6 | 7 | For more info on the models, please visit the [StableCode repository](https://huggingface.co/collections/stabilityai/stable-code-64f9dfb4ebc8a1be0a3f7650). 8 | 9 | ------ 10 | 11 | To see all the available checkpoints for StableCode, run: 12 | 13 | ```bash 14 | python scripts/download.py | grep -E "stable-?code" 15 | ``` 16 | 17 | which will print: 18 | 19 | ```text 20 | stabilityai/stablecode-completion-alpha-3b 21 | stabilityai/stablecode-completion-alpha-3b-4k 22 | stabilityai/stablecode-instruct-alpha-3b 23 | stabilityai/stable-code-3b 24 | ``` 25 | 26 | In order to use a specific StableCode checkpoint, for instance [stable-code-3b](https://huggingface.co/stabilityai/stable-code-3b), download the weights and convert the checkpoint to the Lit-GPT format: 27 | 28 | ```bash 29 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 30 | 31 | export repo_id=stabilityai/stable-code-3b 32 | python scripts/download.py --repo_id $repo_id --from_safetensors=True 33 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id 34 | ``` 35 | 36 | > [!NOTE] 37 | > `stablecode-completion-alpha-3b` is shipped in PyTorch .bin format, thus set `--from_safetensors=False`. 38 | 39 | By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM 40 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 41 | 42 | You're done! To execute the model just run: 43 | 44 | ```bash 45 | pip install tokenizers 46 | 47 | python generate/base.py --prompt "Write in Python a softmax function. Be concise." --checkpoint_dir checkpoints/$repo_id 48 | ``` 49 | 50 | Or you can run the model in an interactive mode: 51 | 52 | ```bash 53 | python chat/base.py --checkpoint_dir checkpoints/$repo_id 54 | ``` 55 | -------------------------------------------------------------------------------- /tutorials/download_redpajama_incite.md: -------------------------------------------------------------------------------- 1 | ## Download [RedPajama-INCITE](https://www.together.xyz/blog/redpajama-models-v1) weights 2 | 3 | Togethercomputer's RedPajama-INCITE family of models were trained over the [RedPajama v1](https://www.together.xyz/blog/redpajama) dataset, with the same architecture as the popular [Pythia](download_pythia.md) model suite. Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 4 | 5 | The release includes a base model, a chat fine-tuned model, and an instruction tuned model of sizes 3B and 7B. 
6 | 7 | To see all the available checkpoints for RedPajama-INCITE, run: 8 | 9 | ```bash 10 | python scripts/download.py | grep RedPajama 11 | ``` 12 | 13 | which will print 14 | 15 | ```text 16 | togethercomputer/RedPajama-INCITE-Base-3B-v1 17 | togethercomputer/RedPajama-INCITE-Chat-3B-v1 18 | togethercomputer/RedPajama-INCITE-Instruct-3B-v1 19 | togethercomputer/RedPajama-INCITE-7B-Base 20 | togethercomputer/RedPajama-INCITE-7B-Chat 21 | togethercomputer/RedPajama-INCITE-7B-Instruct 22 | togethercomputer/RedPajama-INCITE-Base-7B-v0.1 23 | togethercomputer/RedPajama-INCITE-Chat-7B-v0.1 24 | togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1 25 | ``` 26 | 27 | In order to use a specific RedPajama-INCITE checkpoint, for instance [RedPajama-INCITE-Base-3B-v1](https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1), download the weights and convert the checkpoint to the lit-gpt format: 28 | 29 | ```bash 30 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 31 | 32 | python scripts/download.py --repo_id togethercomputer/RedPajama-INCITE-Base-3B-v1 33 | 34 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/togethercomputer/RedPajama-INCITE-Base-3B-v1 35 | ``` 36 | 37 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 38 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 39 | 40 | You're done! To execute the model just run: 41 | 42 | ```bash 43 | pip install tokenizers 44 | 45 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/togethercomputer/RedPajama-INCITE-Base-3B-v1 46 | ``` 47 | -------------------------------------------------------------------------------- /lit_gpt/data_loading_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data._utils.collate import collate_tensor_fn 4 | 5 | 6 | def generic_collate_fn( 7 | batch, 8 | tokenizer=None, 9 | block_size=None, 10 | pad_to_block_size=False, 11 | add_bos=True, 12 | add_eos=True, 13 | collate_checks_enabled=True, 14 | all_block_size_tensors=False, 15 | ): 16 | if all_block_size_tensors: 17 | # If we are only dealing with tensors that we _know_ are the same size, 18 | # we can just use the default collate_tensor_fn 19 | return collate_tensor_fn(batch) 20 | 21 | if collate_checks_enabled: 22 | assert isinstance(batch, list), "Batch must be a list." 23 | type_list = [type(x) for x in batch] 24 | if str in type_list: 25 | assert tokenizer is not None, "If batch contains strings, tokenizer must be provided." 26 | assert tokenizer.pad_id is not None, "Tokenizer must have pad token id since we are dynamically padding." 
27 | 28 | # if tokenizer is not None: 29 | # for now, we assume that if we need it, the tokenizer is always present 30 | batch = [tokenizer.encode(row, bos=add_bos, eos=add_eos) if type(row) == str else row for row in batch] 31 | 32 | # Now all rows are tokenized 33 | # logic is a bit generic, could be tightened under encode -> tensor assumption 34 | if pad_to_block_size: 35 | batch = [torch.tensor(x[:block_size].tolist() + [tokenizer.pad_id] * (block_size - len(x))) for x in batch] 36 | else: 37 | # pad to longest in batch 38 | max_len = max(len(x) for x in batch) 39 | batch = [torch.tensor(x.tolist() + [tokenizer.pad_id] * (max_len - len(x))) for x in batch] 40 | 41 | # Now all rows are tensors of the same length. 42 | # Always slice to block size since the max row length realized could be longer than block size. 43 | collated_batch = collate_tensor_fn(batch)[:, :block_size] 44 | 45 | # We need to check whether the entire batch consists of padding tokens 46 | # if so, we raise a StopIteration to signal the exhaustion of all data sources since 47 | # no real tokens are present in the batch 48 | if torch.all(collated_batch == tokenizer.pad_id): 49 | raise StopIteration("All tokens in batch are padding tokens.") 50 | 51 | return collated_batch 52 | -------------------------------------------------------------------------------- /tutorials/evaluation.md: -------------------------------------------------------------------------------- 1 | # LLM Evaluation 2 | 3 |   4 | 5 | ## Using lm-evaluation-harness 6 | 7 | You can evaluate Lit-GPT using [EleutherAI's lm-eval](https://github.com/EleutherAI/lm-evaluation-harness/tree/master) framework with a large number of different evaluation tasks. 8 | 9 | You need to install the `lm-eval` framework first: 10 | 11 | ```bash 12 | pip install https://github.com/EleutherAI/lm-evaluation-harness/archive/refs/heads/master.zip -U 13 | ``` 14 | 15 |   16 | 17 | ### Evaluating Lit-GPT base models 18 | 19 | Use the following command to evaluate Lit-GPT models on all tasks in Eleuther AI's Evaluation Harness. 20 | 21 | ```bash 22 | python eval/lm_eval_harness.py \ 23 | --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ 24 | --precision "bf16-true" \ 25 | --save_filepath "results.json" 26 | ``` 27 | 28 | To evaluate on LLMs on specific tasks, for example, TruthfulQA and HellaSwag, you can use the `--eval_task` flag as follows: 29 | 30 | ```bash 31 | python eval/lm_eval_harness.py \ 32 | --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ 33 | --eval_tasks "[truthfulqa_mc,hellaswag]" \ 34 | --precision "bf16-true" \ 35 | --save_filepath "results.json" 36 | ``` 37 | 38 | A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md). 39 | 40 |   41 | 42 | ### Evaluating LoRA-finetuned LLMs 43 | 44 | The above command can be used to evaluate models that are saved via a single checkpoint file. This includes downloaded checkpoints and base models finetuned via the full and adapter finetuning scripts. 45 | 46 | For LoRA-finetuned models, you need to first merge the LoRA weights with the original checkpoint file as described in the [Merging LoRA Weights](finetune_lora.md#merging-lora-weights) section of the LoRA finetuning documentation. 47 | 48 |   49 | 50 | ## FAQs 51 | 52 | * **How do I evaluate on MMLU?** 53 | 54 | MMLU is available as with lm-eval harness but the task name is not MMLU. You can use `hendrycksTest*` as regex to evaluate on MMLU. 
55 | 56 | ```shell 57 | python eval/lm_eval_harness.py \ 58 | --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ 59 | --precision "bf16-true" \ 60 | --eval_tasks "[hendrycksTest*]" \ 61 | --num_fewshot 5 \ 62 | --save_filepath "results.json" 63 | ``` 64 | 65 | * **Is Truthful MC is not available in lm-eval?** 66 | 67 | It is available as `truthfulqa_mc`. 68 | -------------------------------------------------------------------------------- /tutorials/download_pythia.md: -------------------------------------------------------------------------------- 1 | ## Download [Pythia](https://github.com/EleutherAI/pythia) weights 2 | 3 | EleutherAI's project Pythia combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 4 | 5 | For detailed info on the models, their training, and their behavior, please see the [Pythia repository](https://github.com/EleutherAI/pythia). 6 | It includes a suite of 8 checkpoints (weights) on 2 different datasets: [The Pile](https://pile.eleuther.ai/), as well as The Pile with deduplication applied. In addition there are two small models that come only in non-deduplicated form: `Pythia-14m` and `Pythia-31m`. 7 | 8 | To see all the available checkpoints for Pythia, run: 9 | 10 | ```bash 11 | python scripts/download.py | grep pythia 12 | ``` 13 | 14 | which will print 15 | 16 | ```text 17 | EleutherAI/pythia-14m 18 | EleutherAI/pythia-31m 19 | EleutherAI/pythia-70m 20 | EleutherAI/pythia-160m 21 | EleutherAI/pythia-410m 22 | EleutherAI/pythia-1b 23 | EleutherAI/pythia-1.4b 24 | EleutherAI/pythia-2.8b 25 | EleutherAI/pythia-6.9b 26 | EleutherAI/pythia-12b 27 | EleutherAI/pythia-70m-deduped 28 | EleutherAI/pythia-160m-deduped 29 | EleutherAI/pythia-410m-deduped 30 | EleutherAI/pythia-1b-deduped 31 | EleutherAI/pythia-1.4b-deduped 32 | EleutherAI/pythia-2.8b-deduped 33 | EleutherAI/pythia-6.9b-deduped 34 | EleutherAI/pythia-12b-deduped 35 | ``` 36 | 37 | In order to use a specific Pythia checkpoint, for instance [pythia-1b](https://huggingface.co/EleutherAI/pythia-1b), download the weights and convert the checkpoint to the lit-gpt format: 38 | 39 | ```bash 40 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 41 | 42 | python scripts/download.py --repo_id EleutherAI/pythia-1b 43 | 44 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/EleutherAI/pythia-1b 45 | ``` 46 | 47 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 48 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 49 | 50 | You're done! To execute the model just run: 51 | 52 | ```bash 53 | pip install tokenizers 54 | 55 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/EleutherAI/pythia-1b 56 | ``` 57 | -------------------------------------------------------------------------------- /tutorials/download_code_llama.md: -------------------------------------------------------------------------------- 1 | ## Download [Code Llama](https://ai.meta.com/blog/code-llama-large-language-model-coding/) weights 2 | 3 | Meta developed and publicly released the Code Llama family of large language models (LLMs) on top of Llama 2. 
4 | 5 | Code Llama models come in four sizes: 7B, 13B, 34B, and 70B parameter models. Furthermore, there are three model versions for each size: 6 | 7 | - Code Llama: A base model trained on 500B tokens and then finetuned on 20B tokens. 8 | - Code Llama-Python: The Code Llama model pretrained on 500B tokens, further trained on 100B additional Python code tokens, and then finetuned on 20B tokens. 9 | - Code Llama-Instruct: The Code Llama model trained on 500B tokens, finetuned on 20B tokens, and instruction-finetuned on an additional 5B tokens. 10 | 11 | All models were trained on 16,000-token contexts and support generations with up to 100,000 tokens of context. 12 | 13 | To see all the available checkpoints, run: 14 | 15 | ```bash 16 | python scripts/download.py | grep CodeLlama 17 | ``` 18 | 19 | which will print 20 | 21 | ```text 22 | codellama/CodeLlama-7b-hf 23 | codellama/CodeLlama-13b-hf 24 | codellama/CodeLlama-34b-hf 25 | codellama/CodeLlama-70b-hf 26 | codellama/CodeLlama-7b-Python-hf 27 | codellama/CodeLlama-13b-Python-hf 28 | codellama/CodeLlama-34b-Python-hf 29 | codellama/CodeLlama-70b-Python-hf 30 | codellama/CodeLlama-7b-Instruct-hf 31 | codellama/CodeLlama-13b-Instruct-hf 32 | codellama/CodeLlama-34b-Instruct-hf 33 | codellama/CodeLlama-70b-Instruct-hf 34 | ``` 35 | 36 | In order to use a specific checkpoint, for instance [CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf), download the weights and convert the checkpoint to the lit-gpt format. 37 | 38 | ```bash 39 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 40 | 41 | python scripts/download.py --repo_id codellama/CodeLlama-7b-Python-hf 42 | 43 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/codellama/CodeLlama-7b-Python-hf 44 | ``` 45 | 46 | By default, the `convert_hf_checkpoint.py` step will use the data type of the HF checkpoint's parameters. In cases where RAM 47 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 48 | 49 | You're done! 
To execute the model just run: 50 | 51 | ```bash 52 | pip install sentencepiece 53 | 54 | python chat/base.py --checkpoint_dir checkpoints/codellama/CodeLlama-7b-Python-hf/ 55 | ``` 56 | -------------------------------------------------------------------------------- /data_checks/count_token_lengths.py: -------------------------------------------------------------------------------- 1 | #### This is a script to run the overlap test on the two datasets where one is way smaller than the other #### 2 | import os 3 | import datasets 4 | import numpy as np 5 | import torch 6 | from tqdm import tqdm 7 | from transformers import AutoTokenizer 8 | 9 | def count_tokenizes_and_get_metrics(dataset, tokenizer): 10 | token_counts = [] 11 | for data in tqdm(dataset, total=len(dataset)): 12 | token_counts.append(len(tokenizer(data['text'])['input_ids'])) 13 | # we gonnna return the mean, median, max, min, and std as a string 14 | return f"Mean: {np.mean(token_counts)}, Median: {np.median(token_counts)}, Max: {np.max(token_counts)}, Min: {np.min(token_counts)}, Std: {np.std(token_counts)}" 15 | print("Mean: ", np.mean(token_counts)) 16 | print("Median: ", np.median(token_counts)) 17 | print("Max: ", np.max(token_counts)) 18 | print("Min: ", np.min(token_counts)) 19 | print("Std: ", np.std(token_counts)) 20 | 21 | 22 | 23 | 24 | 25 | if __name__ == '__main__': 26 | import argparse 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--base_dir", type=str, default="") 29 | parser.add_argument("--num_proc", type=int, default=28) 30 | args = parser.parse_args() 31 | 32 | for file in os.listdir(args.base_dir): 33 | if file != 'non_targeted': 34 | for split in ['wiki', 'random']: 35 | new_path = os.path.join(args.base_dir, file, split) 36 | if os.path.exists(new_path): 37 | print("Processing: ", new_path) 38 | dataset = datasets.load_from_disk(new_path) 39 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 40 | output=count_tokenizes_and_get_metrics(dataset, tokenizer) 41 | print(output) 42 | elif file == 'non_targeted': 43 | new_path = os.path.join(args.base_dir, file) 44 | if os.path.exists(new_path): 45 | print("Processing: ", f"{new_path}/wiki") 46 | dataset = datasets.load_from_disk(f"{new_path}/wiki") 47 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 48 | output=count_tokenizes_and_get_metrics(dataset, tokenizer) 49 | print(output) 50 | else: 51 | raise ValueError("Invalid File: ", file) 52 | # print("File: ", file) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /tutorials/download_llama_2.md: -------------------------------------------------------------------------------- 1 | ## Download [Llama 2](https://ai.meta.com/llama) weights 2 | 3 | Meta developed and publicly released the Llama 2 family of large language models (LLMs), a collection of pretrained and 4 | fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. Its fine-tuned LLMs, 5 | called Llama-2-Chat, are optimized for dialogue use cases. Llama-2-Chat models outperform open-source chat models on 6 | most benchmarks we tested, and in our human evaluations for helpfulness and safety, are on par with some popular 7 | closed-source models like ChatGPT and PaLM. 8 | 9 | Llama 2 models are trained on 2 trillion tokens (40% more data than LLaMA 1) and have double the context length of LLaMA 1 (4096 tokens). 
10 | 11 | Llama 2 comes in a range of parameter sizes — 7B, 13B, and 70B — as well as pretrained and fine-tuned variations. 12 | 13 | To see all the available checkpoints, run: 14 | 15 | ```bash 16 | python scripts/download.py | grep Llama-2 17 | ``` 18 | 19 | which will print 20 | 21 | ```text 22 | meta-llama/Llama-2-7b-hf 23 | meta-llama/Llama-2-7b-chat-hf 24 | meta-llama/Llama-2-13b-hf 25 | meta-llama/Llama-2-13b-chat-hf 26 | meta-llama/Llama-2-70b-hf 27 | meta-llama/Llama-2-70b-chat-hf 28 | ``` 29 | 30 | In order to use a specific checkpoint, for instance [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), download the weights and convert the checkpoint to the lit-gpt format. 31 | 32 | This requires that you've been granted access to the weights on the HuggingFace hub. You can do so by following the steps at . 33 | After access is granted, you can find your HF hub token in . 34 | 35 | ```bash 36 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 37 | 38 | python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf --access_token your_hf_token 39 | 40 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf 41 | ``` 42 | 43 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 44 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 45 | 46 | You're done! To execute the model just run: 47 | 48 | ```bash 49 | pip install sentencepiece 50 | 51 | python chat/base.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf 52 | ``` 53 | -------------------------------------------------------------------------------- /scripts/prepare_starcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
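# Example invocation (an illustrative sketch based on the defaults of `prepare` below; it assumes
# the StarCoder parquet shards and a Llama 2 tokenizer checkpoint have already been downloaded to
# the paths shown):
#
#   python scripts/prepare_starcoder.py \
#       --input_dir data/starcoderdata \
#       --output_dir data/starcoder \
#       --tokenizer_path checkpoints/Llama-2-7b-hf/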
2 | 3 | import os 4 | import sys 5 | import time 6 | import traceback 7 | from pathlib import Path 8 | 9 | import pyarrow.parquet as pq 10 | from lightning.data.streaming import DataChunkRecipe, DataProcessor 11 | 12 | # support running without installing as a package 13 | wd = Path(__file__).parent.parent.resolve() 14 | sys.path.append(str(wd)) 15 | 16 | from lit_gpt import Tokenizer 17 | from lit_gpt.utils import CLI 18 | 19 | 20 | class StarcoderDataRecipe(DataChunkRecipe): 21 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 22 | super().__init__(chunk_size) 23 | self.tokenizer = tokenizer 24 | 25 | def prepare_structure(self, input_dir): 26 | files = Path(input_dir).rglob("*.parquet") 27 | return [str(file) for file in files] 28 | 29 | def prepare_item(self, item_metadata): 30 | filepath = item_metadata 31 | start = time.time() 32 | 33 | try: 34 | parquet_file = pq.ParquetFile(filepath) 35 | # reduce RAM usage 36 | for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): 37 | for text in batch.to_pandas()["content"]: 38 | yield self.tokenizer.encode(text, bos=False, eos=True) 39 | 40 | except Exception: 41 | print(traceback.format_exc()) 42 | print(f"Error reading {filepath}") 43 | return 44 | 45 | parquet_file.close() 46 | end = time.time() 47 | print(f"Took {end - start:.2f} seconds total", filepath) 48 | 49 | 50 | def prepare( 51 | input_dir: Path = Path("data/starcoderdata"), 52 | output_dir: Path = Path("data/starcoder"), 53 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 54 | chunk_size: int = (2049 * 8192), 55 | fast_dev_run: bool = False, 56 | ) -> None: 57 | tokenizer = Tokenizer(tokenizer_path) 58 | data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 59 | data_processor = DataProcessor( 60 | input_dir=str(input_dir), 61 | output_dir=str(output_dir), 62 | fast_dev_run=fast_dev_run, 63 | num_workers=os.cpu_count(), 64 | num_downloaders=1, 65 | ) 66 | 67 | start_time = time.time() 68 | data_processor.run(data_recipe) 69 | elapsed_time = time.time() - start_time 70 | print(f"Time taken: {elapsed_time:.2f} seconds") 71 | 72 | 73 | if __name__ == "__main__": 74 | CLI(prepare) 75 | -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-control.yaml: -------------------------------------------------------------------------------- 1 | # Main settings 2 | 3 | # run_name: tinyllama-1b 4 | resume: true 5 | out_dir: null # mention in --extra_args 6 | max_tokens: 20000000000 # 20B 7 | max_iters: null 8 | seed: 1337 9 | 10 | # Model configuration 11 | model_name: tiny-llama-1.1b 12 | block_size: 2048 13 | 14 | # Training hyperparameters 15 | world_batch_size: 1024 16 | learning_rate: 4.0e-04 17 | warmup_steps: 1000 # out of 9536.74 total steps 18 | weight_decay: 0.1 19 | beta1: 0.9 20 | beta2: 0.95 21 | grad_clip: 1.0 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | 26 | # Regularization 27 | neptune_from_tokens: null 28 | neptune_till_tokens: null 29 | neptune_noise_alpha: null 30 | label_smoothing: 0.0 31 | # tld_strategy: static specify in --extra_args 32 | # k_goldfish: specify in --extra_args 33 | 34 | 35 | # Implementation and backend 36 | fabric_strategy: ddp 37 | fabric_precision: bf16-true 38 | micro_batch_size: 8 39 | compile_model: true 40 | matmul_precision: high 41 | dataloader_num_workers: 0 42 | n_chunks: 4 43 | 44 | # Logging 45 | logger_name: wandb 46 | logger_project: TLD-TinyLLaMA-1B 47 | data_telemetry: false 48 | 
log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | sanity_validate: true 52 | measure_flops: false 53 | save_n_min_before_job_done: 5 54 | save_last_step: true 55 | 56 | # Data Handling 57 | text_key: text 58 | pad_to_block_size: true 59 | add_bos: false 60 | add_eos: true 61 | shuffle_filenames: true 62 | collate_checks_enabled: true 63 | all_block_size_tensors: false 64 | 65 | # use redpajama_v2_sample_100b_tinyllama_tokd and wikipedia-en-2k-samples 66 | # Data configuration/paths 67 | tokenizer_path: /lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T 68 | data_config: 69 | train_data: 70 | - type: pkds 71 | prefix: '' 72 | weight: 1 # 20B - 204.8M tokens 73 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" # check eos/bos token is used or not 74 | name: redpajama_v2_sample_100b_tinyllama_tokd 75 | val_data: # do verify in latest Jonas' code that TLD is not used 76 | - type: pkds 77 | prefix: '' 78 | weight: 0.98986379474 # 20B - 204.8M tokens 79 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" # check eos/bos token is used or not 80 | name: redpajama_v2_sample_100b_tinyllama_tokd 81 | - type: hfds 82 | prefix: 'wikipedia-en-2k' 83 | weight: 0.01013620526 # 204.8M tokens 84 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 85 | name: wikipedia-en-2k-samples -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b.yaml: -------------------------------------------------------------------------------- 1 | # Main settings 2 | 3 | resume: true 4 | out_dir: null # mention in --extra_args 5 | max_tokens: 20000000000 # 20B 6 | max_iters: null 7 | seed: 1337 8 | 9 | # Model configuration 10 | model_name: tiny-llama-1.1b 11 | block_size: 2048 12 | 13 | # Training hyperparameters 14 | world_batch_size: 1024 15 | learning_rate: 4.0e-04 16 | warmup_steps: 1000 # out of 9536.74 total steps 17 | weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | grad_clip: 1.0 21 | lr_schedule: cosine 22 | decay_lr: true 23 | min_lr: 4.0e-05 24 | 25 | # Regularization 26 | neptune_from_tokens: null 27 | neptune_till_tokens: null 28 | neptune_noise_alpha: null 29 | label_smoothing: 0.0 30 | # goldfish_strategy: static specify in --extra_args 31 | # k_goldfish: specify in --extra_args 32 | 33 | # Implementation and backend 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 8 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | 42 | # Logging 43 | logger_name: wandb 44 | logger_project: goldfish-TinyLLaMA-1B 45 | data_telemetry: false 46 | log_step_interval: 1 47 | eval_iters: 2000 48 | save_and_eval_interval: 2000 49 | sanity_validate: true 50 | measure_flops: false 51 | save_n_min_before_job_done: 5 52 | save_last_step: true 53 | 54 | # Data Handling 55 | text_key: text 56 | pad_to_block_size: true 57 | add_bos: false 58 | add_eos: true 59 | shuffle_filenames: true 60 | collate_checks_enabled: true 61 | all_block_size_tensors: false 62 | 63 | # use redpajama_v2_sample_100b_tinyllama_tokd and wikipedia-en-2k-samples 64 | # Data configuration/paths 65 | tokenizer_path: /lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T 66 | data_config: 67 | train_data: 68 | - type: 
pkds 69 | prefix: '' 70 | weight: 0.98986379474 # 20B - 204.8M tokens 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: 'wikipedia-en-2k' 75 | weight: 0.01013620526 # 204.8M tokens 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 # 20B - 204.8M tokens 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: 'wikipedia-en-2k' 86 | weight: 0.01013620526 # 204.8M tokens 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples -------------------------------------------------------------------------------- /lit_gpt/data/dolly.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import torch 7 | from torch.utils.data import random_split 8 | from lit_gpt.data import SFTDataset, Alpaca 9 | from lit_gpt.data.alpaca import prompt_template 10 | 11 | _URL: str = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" 12 | 13 | 14 | class Dolly(Alpaca): 15 | """Dolly data module for supervised finetuning. 16 | 17 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 18 | """ 19 | 20 | def __init__( 21 | self, 22 | mask_prompt: bool = False, 23 | test_split_fraction: float = 0.1, 24 | ignore_index: int = -1, 25 | seed: int = 42, 26 | num_workers: int = 4, 27 | data_file_url: str = _URL, 28 | data_file_name: str = "dolly_data_cleaned.json", 29 | download_dir: Path = Path("./data/dolly"), 30 | ) -> None: 31 | super().__init__( 32 | mask_prompt=mask_prompt, 33 | test_split_fraction=test_split_fraction, 34 | ignore_index=ignore_index, 35 | seed=seed, 36 | num_workers=num_workers, 37 | data_file_url=data_file_url, 38 | data_file_name=data_file_name, 39 | download_dir=download_dir, 40 | ) 41 | 42 | def setup(self, stage: str = "") -> None: 43 | with open(self.download_dir / self.data_file_name, "r", encoding="utf-8") as file: 44 | data = file.readlines() 45 | data = [json.loads(line) for line in data] 46 | for item in data: 47 | item["input"] = item.pop("context") 48 | item["output"] = item.pop("response") 49 | 50 | # Partition the dataset into train and test 51 | train_data, test_data = random_split( 52 | data, 53 | [1.0 - self.test_split_fraction, self.test_split_fraction], 54 | generator=torch.Generator().manual_seed(self.seed) 55 | ) 56 | train_data, test_data = list(train_data), list(test_data) 57 | 58 | self.train_dataset = SFTDataset( 59 | data=train_data, 60 | tokenizer=self.tokenizer, 61 | prompt_template=prompt_template, 62 | max_seq_length=self.max_seq_length, 63 | mask_prompt=self.mask_prompt, 64 | ignore_index=self.ignore_index, 65 | ) 66 | self.test_dataset = SFTDataset( 67 | data=test_data, 68 | tokenizer=self.tokenizer, 69 | prompt_template=prompt_template, 70 | max_seq_length=self.max_seq_length, 71 | mask_prompt=self.mask_prompt, 72 | ignore_index=self.ignore_index, 73 | ) 74 | 
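# Example wiring (a minimal sketch, not part of the training pipeline; it assumes the
# Alpaca/LitDataModule interface used elsewhere in `lit_gpt.data` — `connect`, `prepare_data`,
# `setup`, `train_dataloader` — and an illustrative tokenizer checkpoint path):
#
#   from pathlib import Path
#   from lit_gpt.data import Dolly
#   from lit_gpt.tokenizer import Tokenizer
#
#   data = Dolly(mask_prompt=False, test_split_fraction=0.1)
#   data.connect(
#       tokenizer=Tokenizer(Path("checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")),
#       batch_size=4,
#       max_seq_length=512,
#   )
#   data.prepare_data()
#   data.setup()
#   train_loader = data.train_dataloader()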
-------------------------------------------------------------------------------- /tutorials/download_tinyllama.md: -------------------------------------------------------------------------------- 1 | ## Download TinyLlama weights 2 | 3 | [TinyLlama 1.1B](https://github.com/jzhang38/TinyLlama/) is Apache 2.0 licensed and can be used without restrictions. 4 | At the time of writing, checkpoints for the model trained on up to 3T tokens are available. 5 | The target is to train it for ~3 epochs on 3T tokens total. For more details on the schedule and progress of the pretraining, see the official [README](https://github.com/jzhang38/TinyLlama/tree/main). 6 | 7 | There are two versions of TinyLlama available: a base one and a fine-tuned "Chat" version. 8 | To see all available versions, run: 9 | 10 | ```bash 11 | python scripts/download.py | grep TinyLlama 12 | ``` 13 | 14 | which will print 15 | 16 | ```text 17 | TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 18 | TinyLlama/TinyLlama-1.1B-Chat-v1.0 19 | ``` 20 | 21 | In order to use a specific checkpoint, for instance [TinyLlama 1.1B base model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T), which requires about 5 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 22 | 23 | ```bash 24 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 25 | 26 | python scripts/download.py --repo_id TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 27 | 28 | python scripts/convert_hf_checkpoint.py \ 29 | --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 30 | ``` 31 | 32 | ----- 33 | 34 | With the `Chat` version of the model, the download and conversion procedures are slightly different. 35 | As this version of the model is stored in the `safetensors` format, an additional flag is required to download it: 36 | 37 | ```bash 38 | python scripts/download.py --repo_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --from_safetensors=True 39 | ``` 40 | 41 | The model is shipped in `bfloat16` format, so if your hardware doesn't support it, you can provide the `--dtype` argument during model conversion. For example, we can convert the weights into `float32` format: 42 | 43 | ```bash 44 | python scripts/convert_hf_checkpoint.py \ 45 | --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dtype=float32 46 | ``` 47 | 48 | ----- 49 | 50 | You're done!
To execute the model just run: 51 | 52 | ```bash 53 | pip install sentencepiece 54 | 55 | # base version 56 | python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 57 | 58 | # or 59 | 60 | # chat version 61 | python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 62 | ``` 63 | 64 | To improve the response from Chat version you can also provide these args (as in the [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0)): 65 | 66 | ```bash 67 | python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 --top_k=50 --temperature=0.7 68 | ``` 69 | -------------------------------------------------------------------------------- /axonn_fabric/megatron_logging.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def get_mem(): 4 | curr = torch.cuda.memory_allocated() / 1024 / 1024 / 1024 5 | peak = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024 6 | return curr, peak 7 | 8 | def get_tflops(config, batch_size): 9 | N = config.n_layer 10 | B = batch_size 11 | S = config.block_size 12 | V = config.padded_vocab_size 13 | H = config.n_embd 14 | IH = config.intermediate_size 15 | 16 | 17 | linear_flops = N*(32*B*S*H*H + 24 * B * S * H * IH) 18 | attention_flops = N*(16 * B * S * S * H) 19 | head_flops = 6 * B * S * H * V 20 | if config.gradient_checkpointing: 21 | flops = linear_flops + attention_flops + head_flops 22 | else: 23 | flops = 3/4*(linear_flops + attention_flops) + head_flops 24 | 25 | return flops/1e12 26 | 27 | def pretty_log(iteration, 28 | train_iters, 29 | consumed_train_samples, 30 | elapsed_time_per_iteration, 31 | learning_rate, 32 | batch_size, 33 | train_loss, 34 | grad_norm=None, 35 | model_name=None, 36 | config=None): 37 | log_string = '> global batch {:8d}/{:8d} |'.format( 38 | iteration, train_iters) 39 | log_string += ' consumed samples: {:12d} |'.format( 40 | consumed_train_samples) 41 | log_string += ' elapsed time per global batch (ms): {:.1f} |'.format( 42 | elapsed_time_per_iteration * 1000.0) 43 | log_string += ' learning rate: {:.3E} |'.format(learning_rate) 44 | log_string += ' global batch size: {:5d} |'.format(batch_size) 45 | log_string += ' loss: {:.5f} |'.format(train_loss) 46 | #log_string += ' loss scale: {:.1f} |'.format(loss_scale) 47 | if grad_norm is not None: 48 | log_string += ' grad norm: {:.3f} |'.format(grad_norm) 49 | #if num_zeros_in_grad is not None: 50 | # log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) 51 | #if params_norm is not None: 52 | # log_string += ' params norm: {:.3f} |'.format(params_norm) 53 | #log_string += ' number of skipped iterations: {:3d} |'.format( 54 | # total_loss_dict[skipped_iters_key]) 55 | #log_string += ' number of nan iterations: {:3d} |'.format( 56 | # total_loss_dict[nan_iters_key]) 57 | #log_string += ' theoretical FLOP/s: {:.3f} TFLOP/s | '.format(get_flops(elapsed_time_per_iteration)) 58 | #log_string += ' model size: {:.3f} B params | '.format(get_params()) 59 | curr, peak = get_mem() 60 | log_string += ' memory used by tensors {:.3f} GB (peak {:.3f} GB) |'.format(curr, peak) 61 | if model_name is not None: 62 | log_string += f' model name {model_name} |' 63 | if config is not None: 64 | log_string += f' {get_tflops(config, batch_size)/elapsed_time_per_iteration/torch.distributed.get_world_size():.2f} TFLOP/s per GPU' 65 | return log_string 66 | 
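# Example (illustrative numbers only; `grad_norm`, `model_name`, and `config` are optional and
# omitted here — note that the memory readout requires a CUDA-enabled build of PyTorch, and
# passing `config` additionally requires torch.distributed to be initialized):
#
#   msg = pretty_log(iteration=100, train_iters=9536, consumed_train_samples=102400,
#                    elapsed_time_per_iteration=1.25, learning_rate=4.0e-4,
#                    batch_size=1024, train_loss=2.87)
#   print(msg)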
-------------------------------------------------------------------------------- /lit_gpt/args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | 6 | @dataclass 7 | class TrainArgs: 8 | """Training related arguments""" 9 | 10 | save_interval: int = 1000 11 | """Number of optimizer steps between checkpoints""" 12 | log_interval: int = 1 13 | """Number of iterations between logging calls""" 14 | global_batch_size: int = 64 15 | """Number of samples between optimizer steps across data-parallel ranks""" 16 | micro_batch_size: int = 4 17 | """Number of samples per data-parallel rank""" 18 | lr_warmup_steps: int = 100 19 | """Number of iterations with learning rate warmup active""" 20 | epochs: Optional[int] = None 21 | """Number of epochs to run""" 22 | epoch_size: Optional[int] = None 23 | """Size of the epoch""" 24 | # TODO: pretrain/tinyllama is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 25 | max_tokens: Optional[int] = None 26 | """Total number of tokens to train on""" 27 | max_seq_length: Optional[int] = None 28 | """Limits the length of samples. Off by default""" 29 | 30 | # Optimization args 31 | learning_rate: float = 1e-3 32 | weight_decay: float = 0.02 33 | beta1: float = 0.9 34 | beta2: float = 0.95 35 | max_norm: Optional[float] = None 36 | min_lr: float = 6e-5 37 | 38 | def max_iters(self, devices: int) -> int: 39 | """Number of iterations""" 40 | max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size 41 | assert max_iters > 0 42 | return max_iters 43 | 44 | def gradient_accumulation_iters(self, devices: int) -> int: 45 | """Number of iterations between gradient synchronizations""" 46 | gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size 47 | assert gradient_accumulation_iters > 0 48 | return gradient_accumulation_iters 49 | 50 | def batch_size(self, devices: int) -> int: 51 | """Number of samples between optimizer steps per data-parallel rank""" 52 | batch_size = self.global_batch_size // devices 53 | assert batch_size > 0 54 | return batch_size 55 | 56 | 57 | @dataclass 58 | class EvalArgs: 59 | """Evaluation related arguments""" 60 | 61 | interval: int = 600 62 | """Number of optimizer steps between evaluation calls""" 63 | max_new_tokens: Optional[int] = None 64 | """Number of tokens to generate""" 65 | max_iters: int = 100 66 | """Number of iterations""" 67 | 68 | 69 | @dataclass 70 | class IOArgs: 71 | """Inputs and outputs related arguments""" 72 | 73 | # Optional because pretrain/tinyllama hardcodes the path 74 | train_data_dir: Optional[Path] = Path("data/alpaca") 75 | """Where to read training data from""" 76 | val_data_dir: Optional[Path] = None 77 | """Where to read validation data from""" 78 | checkpoint_dir: Optional[Path] = None 79 | """Where to read weights and tokenizer data from""" 80 | out_dir: Path = Path("out/adapter/alpaca") 81 | """Where to save artifacts""" 82 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | # Complete, reproducible script to build and prepare environment 2 | LITGPT_REPO=$(pwd) 3 | 4 | # modify the installation path and env name if you want 5 | INSTALLDIR=${HOME} 6 | ENV_NAME="goldfish_loss" 7 | 8 | cd ${INSTALLDIR} 9 | 10 | # Base the installation on conda from module load 11 | source 
deactivate > /dev/null 2>&1 # discard potentially preloaded conda environments 12 | module load miniforge3 13 | echo "Conda Version:" $(which conda) 14 | 15 | 16 | # Create conda environment, and print whether it is loaded correctly 17 | conda create --prefix ${INSTALLDIR}/$ENV_NAME python=3.11 --yes -c defaults 18 | source activate ${INSTALLDIR}/$ENV_NAME 19 | echo "Pip Version:" $(which pip) # should be from the new environment! 20 | 21 | # Conda packages: 22 | conda install -c conda-forge conda-pack --yes # install here, for the unpack 23 | 24 | 25 | # Load module family 26 | module load PrgEnv-cray # also loads cray-mpich and related stuff, will be loaded by default 27 | module load amd-mixed/5.6.0 # will need to match if updating pytorch version 28 | module load craype-accel-amd-gfx90a 29 | module load libfabric 30 | module load libtool # careful with LD_Library paths with this loaded, see RCCL notes below 31 | # module load cce/16.0.1 # doesnt fix flash-attention with C++20 headers 32 | 33 | ######### COMPILE PIP PACKAGES ######################## 34 | 35 | # MPI 36 | MPICC="cc -shared" pip install --no-cache-dir --no-binary=mpi4py mpi4py 37 | 38 | # pytorch and core reqs 39 | cd "${LITGPT_REPO}" 40 | pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/rocm5.6 41 | pip install . 42 | cd ${INSTALLDIR} 43 | 44 | # flash attention 45 | pip install packaging ninja numpy 46 | git clone https://github.com/ROCmSoftwarePlatform/flash-attention 47 | cd flash-attention 48 | sed -i 's/c++20/c++17/g' setup.py # Annoying patch for now, there used to be a particular module config that loads a more modern cc version 49 | PYTORCH_ROCM_ARCH='gfx90a' GPU_ARCHS='gfx90a' pip install . 50 | cd ${INSTALLDIR} 51 | 52 | # interconnects 53 | mkdir -p ${INSTALLDIR}/tiny_plugins_rccl 54 | git clone https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl 55 | cd aws-ofi-rccl 56 | ./autogen.sh 57 | CC=cc CXX=CC ./configure --with-libfabric=/opt/cray/libfabric/1.15.0.0 --with-hip=/opt/rocm-5.6.0/ \ 58 | --with-rccl=${CONDA_PREFIX}/lib/python3.11/site-packages/torch/lib/ \ 59 | --prefix=${INSTALLDIR}/tiny_plugins_rccl 60 | CC=cc CXX=CC make -j install 61 | cd ${INSTALLDIR} 62 | 63 | # Finally axonn 64 | # pip install axonn 65 | git clone https://github.com/axonn-ai/axonn 66 | cd axonn 67 | git checkout 3a3c5386c48a889e4ae1f81acfd51ea1bc7f6f98 68 | pip install . 69 | cd ${INSTALLDIR} 70 | 71 | 72 | # Clean-up 73 | cd ${INSTALLDIR} 74 | rm -rf axonn 75 | rm -rf flash-attention 76 | rm -rf aws-ofi-rccl 77 | 78 | ######### PACK A STATIC COPY OF THE ENVIRONMENT ######################## 79 | # This step needs to be repeated if the env is changed 80 | 81 | # Pack up the entire thing 82 | cd ${INSTALLDIR} 83 | rm -f ${ENV_NAME}_env_packed.tar.gz 84 | conda pack -p ${INSTALLDIR}/$ENV_NAME -o ${ENV_NAME}_env_packed.tar.gz --compress-level=1 -------------------------------------------------------------------------------- /tutorials/download_mistral.md: -------------------------------------------------------------------------------- 1 | ## Download [Mistral](https://mistral.ai) weights 2 | 3 | ### Mistral 4 | 5 | [Mistral 7B](https://mistral.ai/news/announcing-mistral-7b) is Apache 2.0 licensed and can be used without restrictions. 
It: 6 | 7 | * Outperforms Llama 2 13B on all benchmarks 8 | * Outperforms Llama 1 34B on many benchmarks 9 | * Approaches CodeLlama 7B performance on code, while remaining good at English tasks 10 | * Uses Grouped-query attention (GQA) for faster inference 11 | * ~~Uses Sliding Window Attention (SWA) to handle longer sequences at smaller cost~~. 12 | This project's implementation does not use Sliding Window Attention, so the context length is limited to 4096 tokens. 13 | 14 | Details about the data used to train the model or training procedure have not been made public. 15 | 16 | To see all the available checkpoints, run: 17 | 18 | ```bash 19 | python scripts/download.py | grep -E 'Mistral|Mixtral' 20 | ``` 21 | 22 | which will print 23 | 24 | ```text 25 | mistralai/Mistral-7B-v0.1 26 | mistralai/Mistral-7B-Instruct-v0.1 27 | mistralai/Mixtral-8x7B-v0.1 28 | mistralai/Mixtral-8x7B-Instruct-v0.1 29 | mistralai/Mistral-7B-Instruct-v0.2 30 | ``` 31 | 32 | In order to use the Mistral 7B model checkpoint, which requires about 14 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 33 | 34 | ```bash 35 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 36 | 37 | python scripts/download.py --repo_id mistralai/Mistral-7B-Instruct-v0.2 38 | 39 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 40 | ``` 41 | 42 | You're done! To execute the model just run: 43 | 44 | ```bash 45 | pip install sentencepiece 46 | 47 | python chat/base.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 48 | ``` 49 | 50 | ### Mixtral 51 | 52 | [Mixtral 8x7B](https://mistral.ai/news/mixtral-of-experts) is a pretrained generative Sparse Mixture of Experts model based on Mistral 7B. 53 | Mixtral 8x7B outperforms Llama 2 70B on most benchmarks tested. 54 | 55 | Details about the data used to train the model or training procedure have not been made public. 56 | 57 | In order to use the Mixtral 8x7B model checkpoint, which requires about 94 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 58 | 59 | ```bash 60 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 61 | 62 | python scripts/download.py --repo_id mistralai/Mixtral-8x7B-Instruct-v0.1 --from_safetensors true 63 | 64 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1 65 | ``` 66 | 67 | Due to the size of the model, currently only the multi-device sequential generation script can handle it. 68 | 69 | ```bash 70 | pip install sentencepiece 71 | 72 | python generate/sequentially.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1 73 | ``` 74 | 75 | You will need enough devices (2, 4, or 8) whose combined memory is higher than 94 GB to fit the model in memory. 76 | Please check out [this section](inference.md#run-a-large-model-on-multiple-smaller-devices) for more information about this script. 77 | -------------------------------------------------------------------------------- /scripts/merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
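# Example invocation (a sketch using the defaults of `merge_lora` below; the LoRA hyperparameters
# should match the values used during finetuning so that the adapter weights load and merge correctly):
#
#   python scripts/merge_lora.py \
#       --lora_path out/lora/alpaca/lit_model_lora_finetuned.pth \
#       --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \
#       --out_dir out/lora/checkpoint \
#       --lora_r 8 --lora_alpha 16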
2 | 3 | """This script merges the LoRA weights with the base model""" 4 | 5 | import sys 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | import lightning as L 10 | import torch 11 | 12 | # support running without installing as a package 13 | wd = Path(__file__).parent.parent.resolve() 14 | sys.path.append(str(wd)) 15 | 16 | from lit_gpt.lora import GPT, Config, lora_filter, merge_lora_weights 17 | from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load 18 | 19 | 20 | def merge_lora( 21 | lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), 22 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 23 | out_dir: Path = Path("out/lora/checkpoint"), 24 | precision: Optional[str] = None, 25 | lora_r: int = 8, 26 | lora_alpha: int = 16, 27 | lora_dropout: float = 0.05, 28 | lora_query: bool = True, 29 | lora_key: bool = False, 30 | lora_value: bool = True, 31 | lora_projection: bool = False, 32 | lora_mlp: bool = False, 33 | lora_head: bool = False, 34 | ) -> None: 35 | """Generates a response based on a given instruction and an optional input. 36 | This script will only work with checkpoints from the instruction-tuned GPT-LoRA model. 37 | See `finetune/lora.py`. 38 | 39 | Args: 40 | lora_path: Path to the checkpoint with trained adapter weights, which are the output of 41 | `finetune/lora.py`. 42 | checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. 43 | out_dir: The path to the merged model that is created by this script. 44 | precision: Indicates the Fabric precision setting to use. 45 | """ 46 | check_valid_checkpoint_dir(checkpoint_dir) 47 | out_dir.mkdir(parents=True, exist_ok=True) 48 | 49 | precision = precision or get_default_supported_precision(training=False) 50 | fabric = L.Fabric(devices=1, precision=precision) 51 | 52 | config = Config.from_json( 53 | checkpoint_dir / "lit_config.json", 54 | r=lora_r, 55 | alpha=lora_alpha, 56 | dropout=lora_dropout, 57 | to_query=lora_query, 58 | to_key=lora_key, 59 | to_value=lora_value, 60 | to_projection=lora_projection, 61 | to_mlp=lora_mlp, 62 | to_head=lora_head, 63 | ) 64 | 65 | with fabric.init_module(empty_init=True): 66 | model = GPT(config) 67 | checkpoint_path = checkpoint_dir / "lit_model.pth" 68 | checkpoint = lazy_load(checkpoint_path) 69 | lora_checkpoint = lazy_load(lora_path) 70 | checkpoint.update(lora_checkpoint.get("model", lora_checkpoint)) 71 | model.load_state_dict(checkpoint) 72 | 73 | merge_lora_weights(model) 74 | 75 | save_path = out_dir / "lit_model.pth" 76 | fabric.print(f"Saving weights to {str(save_path)!r}") 77 | # remove lora parameters and the lora linear substring 78 | state_dict = {k.replace("linear.", ""): v for k, v in model.state_dict().items() if not lora_filter(k, v)} 79 | torch.save(state_dict, save_path) 80 | 81 | 82 | if __name__ == "__main__": 83 | CLI(merge_lora) 84 | -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens.yaml: -------------------------------------------------------------------------------- 1 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 2 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_aFYDh8o" 3 | resume: true 4 | max_tokens: 26666666667 5 | max_iters: 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | 
ignore_block_size_mismatch: false 10 | model_checkpoint: 11 | doc_block_attn: false 12 | cache_attn: false 13 | eod_token: 14 | world_batch_size: 1408 15 | learning_rate: 0.0004 16 | warmup_steps: 1000 17 | weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | adamw_eps: 1.0e-08 21 | grad_clip: 1 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | no_weight_decay_for_bias_and_norm_params: false 26 | neptune_from_tokens: 27 | neptune_till_tokens: 28 | neptune_noise_alpha: 29 | label_smoothing: 0 30 | goldfish_strategy: hash-table 31 | k_goldfish: 4 32 | goldfish_start_position: 0 33 | goldfish_context_width: 13 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 11 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | tensor_parallel_size: 1 42 | torch_dist_init_barrier: false 43 | gradient_checkpointing_axonn: false 44 | logger_name: wandb 45 | logger_project: goldfish-TinyLLaMA-1B 46 | data_telemetry: false 47 | shape_watching_iters: 3 48 | log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | save_step_interval: 2000 52 | eval_step_interval: 2000 53 | save_last_step: true 54 | save_n_min_before_job_done: 5 55 | sanity_validate: true 56 | measure_flops: false 57 | torch_cpp_log_level: 58 | torch_distributed_debug: 59 | text_key: text 60 | pad_to_block_size: true 61 | add_bos: false 62 | add_eos: true 63 | shuffle_filenames: true 64 | collate_checks_enabled: true 65 | all_block_size_tensors: false 66 | data_config: 67 | train_data: 68 | - type: pkds 69 | prefix: '' 70 | weight: 0.98986379474 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: wikipedia-en-2k 75 | weight: 0.01013620526 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: wikipedia-en-2k 86 | weight: 0.01013620526 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples 89 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 90 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens_mbs11.yaml: -------------------------------------------------------------------------------- 1 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 2 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_aFYDh8o" 3 | resume: true 4 | max_tokens: 26666666667 5 | max_iters: 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | ignore_block_size_mismatch: false 10 | model_checkpoint: 11 | doc_block_attn: false 12 | cache_attn: 
false 13 | eod_token: 14 | world_batch_size: 1408 15 | learning_rate: 0.0004 16 | warmup_steps: 1000 17 | weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | adamw_eps: 1.0e-08 21 | grad_clip: 1 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | no_weight_decay_for_bias_and_norm_params: false 26 | neptune_from_tokens: 27 | neptune_till_tokens: 28 | neptune_noise_alpha: 29 | label_smoothing: 0 30 | goldfish_strategy: hash-table 31 | k_goldfish: 4 32 | goldfish_start_position: 0 33 | goldfish_context_width: 13 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 11 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | tensor_parallel_size: 1 42 | torch_dist_init_barrier: false 43 | gradient_checkpointing_axonn: false 44 | logger_name: wandb 45 | logger_project: goldfish-TinyLLaMA-1B 46 | data_telemetry: false 47 | shape_watching_iters: 3 48 | log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | save_step_interval: 2000 52 | eval_step_interval: 2000 53 | save_last_step: true 54 | save_n_min_before_job_done: 5 55 | sanity_validate: true 56 | measure_flops: false 57 | torch_cpp_log_level: 58 | torch_distributed_debug: 59 | text_key: text 60 | pad_to_block_size: true 61 | add_bos: false 62 | add_eos: true 63 | shuffle_filenames: true 64 | collate_checks_enabled: true 65 | all_block_size_tensors: false 66 | data_config: 67 | train_data: 68 | - type: pkds 69 | prefix: '' 70 | weight: 0.98986379474 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: wikipedia-en-2k 75 | weight: 0.01013620526 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: wikipedia-en-2k 86 | weight: 0.01013620526 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples 89 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 90 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens_mbs8.yaml: -------------------------------------------------------------------------------- 1 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 2 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_u2UQQOY" 3 | resume: true 4 | max_tokens: 26666666667 5 | max_iters: 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | ignore_block_size_mismatch: false 10 | model_checkpoint: 11 | doc_block_attn: false 12 | cache_attn: false 13 | eod_token: 14 | world_batch_size: 1024 15 | learning_rate: 0.0004 16 | warmup_steps: 1000 17 | 
weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | adamw_eps: 1.0e-08 21 | grad_clip: 1 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | no_weight_decay_for_bias_and_norm_params: false 26 | neptune_from_tokens: 27 | neptune_till_tokens: 28 | neptune_noise_alpha: 29 | label_smoothing: 0 30 | goldfish_strategy: hash-table 31 | k_goldfish: 4 32 | goldfish_start_position: 0 33 | goldfish_context_width: 13 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 8 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | tensor_parallel_size: 1 42 | torch_dist_init_barrier: false 43 | gradient_checkpointing_axonn: false 44 | logger_name: wandb 45 | logger_project: goldfish-TinyLLaMA-1B 46 | data_telemetry: false 47 | shape_watching_iters: 3 48 | log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | save_step_interval: 2000 52 | eval_step_interval: 2000 53 | save_last_step: true 54 | save_n_min_before_job_done: 5 55 | sanity_validate: true 56 | measure_flops: false 57 | torch_cpp_log_level: 58 | torch_distributed_debug: 59 | text_key: text 60 | pad_to_block_size: true 61 | add_bos: false 62 | add_eos: true 63 | shuffle_filenames: true 64 | collate_checks_enabled: true 65 | all_block_size_tensors: false 66 | data_config: 67 | train_data: 68 | - type: pkds 69 | prefix: '' 70 | weight: 0.98986379474 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: wikipedia-en-2k 75 | weight: 0.01013620526 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: wikipedia-en-2k 86 | weight: 0.01013620526 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples 89 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 90 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" 92 | -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens_.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 3 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_aFYDh8o" 4 | resume: true 5 | max_tokens: 26666666667 6 | max_iters: 7 | seed: 1337 8 | model_name: tiny-llama-1.1b 9 | block_size: 2048 10 | ignore_block_size_mismatch: false 11 | model_checkpoint: 12 | doc_block_attn: false 13 | cache_attn: false 14 | eod_token: 15 | world_batch_size: 1408 16 | learning_rate: 0.0004 17 | warmup_steps: 1000 18 | weight_decay: 0.1 19 | beta1: 0.9 20 | beta2: 0.95 21 | adamw_eps: 1.0e-08 22 | grad_clip: 1 23 
| lr_schedule: cosine 24 | decay_lr: true 25 | min_lr: 4.0e-05 26 | no_weight_decay_for_bias_and_norm_params: false 27 | neptune_from_tokens: 28 | neptune_till_tokens: 29 | neptune_noise_alpha: 30 | label_smoothing: 0 31 | goldfish_strategy: hash-table 32 | k_goldfish: 4 33 | goldfish_start_position: 0 34 | goldfish_context_width: 13 35 | fabric_strategy: ddp 36 | fabric_precision: bf16-true 37 | micro_batch_size: 11 38 | compile_model: true 39 | matmul_precision: high 40 | dataloader_num_workers: 0 41 | n_chunks: 4 42 | tensor_parallel_size: 1 43 | torch_dist_init_barrier: false 44 | gradient_checkpointing_axonn: false 45 | logger_name: wandb 46 | logger_project: goldfish-TinyLLaMA-1B 47 | data_telemetry: false 48 | shape_watching_iters: 3 49 | log_step_interval: 1 50 | eval_iters: 2000 51 | save_and_eval_interval: 2000 52 | save_step_interval: 2000 53 | eval_step_interval: 2000 54 | save_last_step: true 55 | save_n_min_before_job_done: 5 56 | sanity_validate: true 57 | measure_flops: false 58 | torch_cpp_log_level: 59 | torch_distributed_debug: 60 | text_key: text 61 | pad_to_block_size: true 62 | add_bos: false 63 | add_eos: true 64 | shuffle_filenames: true 65 | collate_checks_enabled: true 66 | all_block_size_tensors: false 67 | data_config: 68 | train_data: 69 | - type: pkds 70 | prefix: '' 71 | weight: 0.98986379474 72 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 73 | name: redpajama_v2_sample_100b_tinyllama_tokd 74 | - type: hfds 75 | prefix: wikipedia-en-2k 76 | weight: 0.01013620526 77 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 78 | name: wikipedia-en-2k-samples 79 | val_data: 80 | - type: pkds 81 | prefix: '' 82 | weight: 0.98986379474 83 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 84 | name: redpajama_v2_sample_100b_tinyllama_tokd 85 | - type: hfds 86 | prefix: wikipedia-en-2k 87 | weight: 0.01013620526 88 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 89 | name: wikipedia-en-2k-samples 90 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 92 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" 93 | -------------------------------------------------------------------------------- /launch_scripts/launch_jobs_1b_hashtable.sh.sh: -------------------------------------------------------------------------------- 1 | # k = 3, 4, 8, 32, 128, inf 2 | 3 | # 3-goldfish 4 | python launch_scripts/launcher.py \ 5 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 6 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 7 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 8 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 9 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish3_hash-table \ 10 | --extra_args="--k_goldfish=3 --goldfish_strategy=hash-table" 11 | 12 | # 4-goldfish 13 | python launch_scripts/launcher.py \ 14 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 15 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 16 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 17 | --budget_minutes=0 --budget_hours 10 --nodes=16 
--email=ahans1@umd.edu \ 18 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table \ 19 | --extra_args="--k_goldfish=4 --goldfish_strategy=hash-table" 20 | 21 | 22 | # 8-goldfish 23 | python launch_scripts/launcher.py \ 24 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 25 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 26 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 27 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 28 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish8_hash-table \ 29 | --extra_args="--k_goldfish=8 --goldfish_strategy=hash-table" 30 | 31 | # 32-goldfish 32 | python launch_scripts/launcher.py \ 33 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 34 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 35 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 36 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 37 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish32_hash-table \ 38 | --extra_args="--k_goldfish=32 --goldfish_strategy=hash-table" 39 | 40 | 41 | # 128-goldfish 42 | python launch_scripts/launcher.py \ 43 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 44 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 45 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 46 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 47 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish128_hash-table \ 48 | --extra_args="--k_goldfish=128 --goldfish_strategy=hash-table" 49 | 58 | 59 | # inf-goldfish or standard loss 60 | python launch_scripts/launcher.py \ 61 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 62 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 63 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 64 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 65 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish128_hash-table -------------------------------------------------------------------------------- /tutorials/download_stablelm.md: -------------------------------------------------------------------------------- 1 | ## Download [StableLM](https://github.com/Stability-AI/StableLM) weights 2 | 3 | StableLM is a family of generative language models trained by StabilityAI. 4 | 5 | To see all the available checkpoints for StableLM, run: 6 | 7 | ```bash 8 | python scripts/download.py | grep stablelm 9 | ``` 10 | 11 | which will print: 12 | 13 | ```text 14 | stabilityai/stablelm-base-alpha-3b 15 | stabilityai/stablelm-base-alpha-7b 16 | stabilityai/stablelm-tuned-alpha-3b 17 | stabilityai/stablelm-tuned-alpha-7b 18 | stabilityai/stablelm-3b-4e1t 19 | stabilityai/stablelm-zephyr-3b 20 | ``` 21 | 22 | > [!Important] 23 | > `stablelm-base-alpha-(3,7)b` and `stablelm-tuned-alpha-(3,7)b` are deprecated and are no longer in the StableLM collection. These models were last updated in April 2023. Consider using `stablelm-3b-4e1t` (base model) or `stablelm-zephyr-3b` (instruct fine-tuned).
24 | 25 | ### StableLM-3B-4E1T 26 | 27 | StableLM-3B-4E1T is a 3 billion (3B) parameter language model pre-trained under the multi-epoch regime to study the impact of repeated tokens on downstream performance. 28 | 29 | Building on past achievements, StabilityAI underwent training on 1 trillion tokens for 4 epochs, as recommended by Muennighoff et al. (2023) in their study "Scaling Data-Constrained Language Models." They noted that training with repeated data over 4 epochs has minimal impact on loss compared to using unique data. Additionally, insights from "Go smol or go home" (De Vries, 2023) guided the choice of token count. The research suggests that a 2.96B model trained on 2.85 trillion tokens can achieve a loss similar to a compute-optimized 9.87B language model. 30 | More info can be found on [GitHub](https://github.com/Stability-AI/StableLM?tab=readme-ov-file#stablelm-3b-4e1t). 31 | 32 | ### StableLM Zephyr 3B 33 | 34 | Lightweight LLM, preference tuned for instruction following and Q&A-type tasks. This model is an extension of the pre-existing StableLM 3B-4e1t model and is inspired by the Zephyr 7B model from HuggingFace. With StableLM Zephyr's 3 billion parameters, this model efficiently caters to a wide range of text generation needs, from simple queries to complex instructional contexts on edge devices. 35 | More details can be found in the [announcement](https://stability.ai/news/stablelm-zephyr-3b-stability-llm). 36 | 37 | ### Usage 38 | 39 | In order to use a specific StableLM checkpoint, for instance [StableLM Zephyr 3B](https://huggingface.co/stabilityai/stablelm-zephyr-3b), download the weights and convert the checkpoint to the Lit-GPT format: 40 | 41 | ```bash 42 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 43 | 44 | export repo_id=stabilityai/stablelm-zephyr-3b 45 | python scripts/download.py --repo_id $repo_id --from_safetensors=True 46 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id 47 | ``` 48 | 49 | By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM 50 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 51 | 52 | You're done! To execute the model just run: 53 | 54 | ```bash 55 | pip install tokenizers 56 | 57 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/$repo_id 58 | ``` 59 | 60 | Or you can run the model in an interactive mode: 61 | 62 | ```bash 63 | python chat/base.py --checkpoint_dir checkpoints/$repo_id 64 | ``` 65 | -------------------------------------------------------------------------------- /scripts/simulate_lr.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is helpful to simulate a learning rate schedule if you want to train for an existing hot checkpoint. 3 | 4 | By default it configures the hardcoded hyperparameters for the TinyLLaMA model. 5 | https://github.com/jzhang38/TinyLlama/blob/bf122247c486b6b897050e98cbb7bedae8eeba73/pretrain/tinyllama.py#L30:40 6 | You can change the hyperparameters to simulate the learning rate schedule for other models. 7 | 8 | TODO: Parameterize the script to accept the hyperparameters as arguments. 
9 | """ 10 | import sys 11 | import os 12 | from dataclasses import dataclass 13 | from functools import partial 14 | import torch 15 | 16 | # Add the root directory of the project to the path 17 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 18 | from pretrain_umd.train import CLISettings, get_lr 19 | from lit_gpt.utils import * 20 | 21 | class CfgWithoutValidation(CLISettings): 22 | def __post_init__(self): 23 | pass 24 | 25 | def main(): 26 | cfg = CfgWithoutValidation( 27 | max_iters=1_430_512, 28 | min_lr=4e-5, 29 | lr_schedule="cosine", 30 | learning_rate=4e-4, 31 | warmup_steps=2000 32 | ) 33 | cfg.warmup_iters = cfg.warmup_steps # assumes steps == iters i.e. gradient accumulation steps = 1 34 | 35 | # Computing hot lr for https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-480k-1T 36 | lr = get_lr(it=480_000 + 1, lr_decay_iters=cfg.max_iters, cfg=cfg) # Can use this as new max_lr and resume training 37 | print(f"Hot LR: {lr}") # Hot LR: 0.00030937179340707335 38 | return 39 | 40 | def tld_loss_sanity(): 41 | cfg = CfgWithoutValidation( 42 | label_smoothing = 0, 43 | tld_strategy = 'hash-avalanche', 44 | k_token_loss_dropout = 3 45 | ) 46 | 47 | vocab_size = 32_000 48 | block_size = 20 49 | mbs = 2 50 | 51 | targets_swapped = torch.randint(0, vocab_size, (mbs+2, block_size)) 52 | torch.manual_seed(1337) 53 | logits = torch.randn(mbs, block_size, vocab_size) 54 | targets = torch.randint(0, vocab_size, (mbs, block_size)) 55 | 56 | swapped_targets = torch.cat((targets_swapped, targets[:1]), dim=0) 57 | 58 | ignore_index = -1 59 | 60 | loss_func = partial( 61 | chunked_cross_entropy, 62 | label_smoothing=cfg.label_smoothing, 63 | tld_strategy=cfg.tld_strategy, 64 | k_token_loss_dropout=cfg.k_token_loss_dropout, 65 | tld_start_position=cfg.tld_start_position, 66 | ignore_index=ignore_index, 67 | ) 68 | 69 | loss = loss_func(logits=logits, targets=targets) 70 | all_token_loss = loss_func(logits=logits, targets=targets, tld_strategy=None) 71 | post_tld_targets, _ = apply_tld(targets=targets, strategy=cfg.tld_strategy, k=cfg.k_token_loss_dropout, tld_start_position=cfg.tld_start_position, ignore_index=ignore_index) 72 | swapped_tld_targets, _ = post_tld_targets, _ = apply_tld(targets=swapped_targets, strategy=cfg.tld_strategy, k=cfg.k_token_loss_dropout, tld_start_position=cfg.tld_start_position, ignore_index=ignore_index) 73 | 74 | assert torch.all(swapped_tld_targets[-1] == post_tld_targets[-1]) 75 | 76 | # random TLD strategy 77 | k = cfg.k_token_loss_dropout 78 | 79 | random_tensor = torch.randint(1, k + 1, size=targets.size()) 80 | mask = (random_tensor == k).int() 81 | dropped_token_indices = mask.nonzero().reshape(mbs, -1) 82 | 83 | breakpoint() 84 | 85 | if __name__ == '__main__': 86 | # main() 87 | tld_loss_sanity() 88 | -------------------------------------------------------------------------------- /scripts/prepare_openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | # saves the openwebtext dataset to a binary file for training. 
following was helpful: 4 | # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py 5 | import os 6 | import sys 7 | from pathlib import Path 8 | from typing import Union 9 | 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | # support running without installing as a package 14 | wd = Path(__file__).parent.parent.resolve() 15 | sys.path.append(str(wd)) 16 | 17 | from lit_gpt import Tokenizer 18 | from lit_gpt.utils import CLI 19 | 20 | 21 | def prepare( 22 | destination_path: Path = Path("data/openwebtext"), 23 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 24 | seed: int = 42, 25 | test_size: Union[float, int, None] = 0.0005, 26 | ) -> None: 27 | from datasets import load_dataset # huggingface datasets 28 | 29 | destination_path.mkdir(parents=True, exist_ok=True) 30 | 31 | tokenizer = Tokenizer(checkpoint_dir) 32 | 33 | # number of workers in .map() call 34 | # good number to use is ~order number of cpu cores // 2 35 | num_proc = os.cpu_count() // 2 36 | 37 | # number of workers in load_dataset() call 38 | # best number might be different from num_proc above as it also depends on HW speed. 39 | # it is better than 1 usually though 40 | num_proc_load_dataset = num_proc 41 | 42 | # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) 43 | dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset) 44 | 45 | # owt by default only contains the 'train' split, so create a test split 46 | split_dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed, shuffle=True) 47 | split_dataset["val"] = split_dataset.pop("test") # rename the test split to val 48 | 49 | def process(example): 50 | ids = tokenizer.encode(example["text"]).tolist() 51 | ids.append(tokenizer.eos_id) 52 | 53 | # ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens 54 | # ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe 55 | # note: I think eot should be prepended not appended... hmm. it's called "eot" though... 56 | return {"ids": ids, "len": len(ids)} 57 | 58 | # tokenize the dataset 59 | tokenized = split_dataset.map(process, remove_columns=["text"], desc="tokenizing the splits", num_proc=num_proc) 60 | 61 | # concatenate all the ids in each dataset into one large file we can use for training 62 | for split, dset in tokenized.items(): 63 | arr_len = np.sum(dset["len"], dtype=np.uint64) 64 | filename = destination_path / f"{split}.bin" 65 | dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) 66 | arr = np.memmap(str(filename), dtype=dtype, mode="w+", shape=(arr_len,)) 67 | total_batches = 1024 68 | 69 | idx = 0 70 | for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"): 71 | # Batch together samples for faster write 72 | batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format("numpy") 73 | arr_batch = np.concatenate(batch["ids"]) 74 | # Write into mmap 75 | arr[idx : idx + len(arr_batch)] = arr_batch 76 | idx += len(arr_batch) 77 | arr.flush() 78 | 79 | 80 | if __name__ == "__main__": 81 | CLI(prepare) 82 | -------------------------------------------------------------------------------- /lit_gpt/data/longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
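# Example wiring (a minimal sketch; the tokenizer checkpoint path is an illustrative assumption):
#
#   from pathlib import Path
#   from lit_gpt.tokenizer import Tokenizer
#
#   data = LongForm(download_dir=Path("./data/longform"))
#   data.connect(tokenizer=Tokenizer(Path("checkpoints/<model>")), batch_size=4, max_seq_length=512)
#   data.prepare_data()           # downloads train.json / val.json if missing
#   train_loader = data.train_dataloader()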
2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import torch 8 | from torch.utils.data import DataLoader 9 | from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule 10 | from lit_gpt.data.alpaca import download_if_missing 11 | from lit_gpt.tokenizer import Tokenizer 12 | 13 | 14 | _URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" 15 | 16 | 17 | class LongForm(LitDataModule): 18 | """LongForm data module for supervised finetuning. 19 | 20 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 21 | """ 22 | 23 | def __init__( 24 | self, 25 | mask_prompt: bool = False, 26 | ignore_index: int = -1, 27 | seed: int = 42, 28 | num_workers: int = 4, 29 | download_dir: Path = Path("./data/longform"), 30 | ) -> None: 31 | super().__init__() 32 | self.mask_prompt = mask_prompt 33 | self.ignore_index = ignore_index 34 | self.seed = seed 35 | self.num_workers = num_workers 36 | self.download_dir = download_dir 37 | 38 | self.tokenizer: Optional[Tokenizer] = None 39 | self.batch_size: int = 1 40 | self.max_seq_length: int = -1 41 | self.train_dataset: Optional[SFTDataset] = None 42 | self.test_dataset: Optional[SFTDataset] = None 43 | 44 | def connect( 45 | self, 46 | tokenizer: Optional[Tokenizer] = None, 47 | batch_size: int = 1, 48 | max_seq_length: Optional[int] = None 49 | ) -> None: 50 | self.tokenizer = tokenizer 51 | self.batch_size = batch_size 52 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 53 | 54 | def prepare_data(self) -> None: 55 | self.download_dir.mkdir(parents=True, exist_ok=True) 56 | download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") 57 | download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") 58 | 59 | def train_dataloader(self): 60 | return self._dataloader("train") 61 | 62 | def val_dataloader(self): 63 | return self._dataloader("val") 64 | 65 | def _dataloader(self, split: str) -> DataLoader: 66 | with open(self.download_dir / f"{split}.json", "r", encoding="utf-8") as file: 67 | data = json.load(file) 68 | 69 | dataset = SFTDataset( 70 | data=data, 71 | tokenizer=self.tokenizer, 72 | prompt_template=prompt_template, 73 | max_seq_length=self.max_seq_length, 74 | mask_prompt=self.mask_prompt, 75 | ignore_index=self.ignore_index, 76 | ) 77 | return DataLoader( 78 | dataset=dataset, 79 | batch_size=self.batch_size, 80 | shuffle=(split == "train"), 81 | generator=torch.Generator().manual_seed(self.seed), 82 | num_workers=self.num_workers, 83 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 84 | ) 85 | 86 | 87 | def prompt_template(example: dict) -> str: 88 | """A modified Alpaca prompt template without the 'input'.""" 89 | return ( 90 | "Below is an instruction that describes a task, paired with an input that provides further context. 
" 91 | "Write a response that appropriately completes the request.\n\n" 92 | f"### Instruction:\n{example['input']}\n\n### Response:\n" 93 | ) 94 | -------------------------------------------------------------------------------- /scripts/convert_checkpoint_to_hf.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | import sys 4 | from pathlib import Path 5 | 6 | import torch 7 | 8 | from convert_pretrained_checkpoint import convert_checkpoint 9 | from convert_lit_checkpoint import convert_lit_checkpoint 10 | 11 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 12 | from huggingface_hub import create_repo 13 | 14 | # support running without installing as a package 15 | wd = Path(__file__).parent.parent.resolve() 16 | sys.path.append(str(wd)) 17 | 18 | from lit_gpt.utils import CLI 19 | 20 | 21 | @torch.inference_mode() 22 | def convert_checkpoint_to_hf( 23 | checkpoint_file: Path, 24 | tokenizer_dir: Path, 25 | model_name: str, 26 | axonn_patch: bool = False, 27 | push_to_hub: bool = True, 28 | ) -> None: 29 | ### convert training checkpoint to lit checkpoint 30 | parent_dir = checkpoint_file.parent.absolute() 31 | with open(parent_dir / "model_config.json") as f: 32 | model_config = json.load(f) 33 | config_name = model_config["name"] 34 | convert_checkpoint(checkpoint_file, tokenizer_dir, config_name, parent_dir / f"lit_checkpoint_{model_name}") 35 | 36 | ### convert training checkpoint to hf checkpoint 37 | convert_lit_checkpoint( 38 | parent_dir / f"lit_checkpoint_{model_name}/lit_model.pth", 39 | parent_dir / f"hf_checkpoint_{model_name}/pytorch_model.bin", 40 | parent_dir / f"lit_checkpoint_{model_name}/lit_config.json", 41 | axonn_patch=axonn_patch, 42 | ) 43 | 44 | for tokenizer_file in tokenizer_dir.glob("tokenizer*"): 45 | shutil.copyfile(tokenizer_file, parent_dir / f"hf_checkpoint_{model_name}" / tokenizer_file.name) 46 | 47 | if (tokenizer_dir / "generation_config.json").is_file(): 48 | shutil.copyfile( 49 | tokenizer_dir / "generation_config.json", 50 | parent_dir / f"hf_checkpoint_{model_name}" / "generation_config.json", 51 | ) 52 | 53 | if (tokenizer_dir / "special_tokens_map.json").is_file(): 54 | shutil.copyfile( 55 | tokenizer_dir / "special_tokens_map.json", 56 | parent_dir / f"hf_checkpoint_{model_name}" / "special_tokens_map.json", 57 | ) 58 | 59 | if (tokenizer_dir / "added_tokens.json").is_file(): 60 | shutil.copyfile( 61 | tokenizer_dir / "added_tokens.json", parent_dir / f"hf_checkpoint_{model_name}" / "added_tokens.json" 62 | ) 63 | 64 | if (tokenizer_dir / "config.json").is_file(): 65 | shutil.copyfile(tokenizer_dir / "config.json", parent_dir / f"hf_checkpoint_{model_name}" / "config.json") 66 | 67 | # hf_org = model_config["hf_config"]["org"] 68 | # hf_name = model_config["hf_config"]["name"] 69 | # hf_config = AutoConfig.from_pretrained(f"{hf_org}/{hf_name}") 70 | # hf_config = hf_config.to_dict() 71 | # with open(parent_dir / f"hf_checkpoint_{model_name}" / "config.json", "w") as f: 72 | # json.dump(hf_config, f, indent=4) 73 | 74 | if not push_to_hub: 75 | return 76 | 77 | ### push to hub 78 | repo_name = f"tomg-group-umd/{model_name}" 79 | tokenizer = AutoTokenizer.from_pretrained(parent_dir / f"hf_checkpoint_{model_name}") 80 | state_dict = torch.load(parent_dir / f"hf_checkpoint_{model_name}/pytorch_model.bin") 81 | model = AutoModelForCausalLM.from_pretrained(parent_dir / f"hf_checkpoint_{model_name}", state_dict=state_dict) 82 | create_repo(repo_name, 
private=True) 83 | model.push_to_hub(repo_name, use_temp_dir=True) 84 | tokenizer.push_to_hub(repo_name, use_temp_dir=True) 85 | 86 | print(f"Model pushed to {repo_name}") 87 | 88 | 89 | if __name__ == "__main__": 90 | CLI(convert_checkpoint_to_hf) 91 | -------------------------------------------------------------------------------- /scripts/convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import shutil 5 | import sys 6 | from dataclasses import asdict 7 | from pathlib import Path 8 | 9 | import torch 10 | 11 | # support running without installing as a package 12 | wd = Path(__file__).parent.parent.resolve() 13 | sys.path.append(str(wd)) 14 | 15 | from lit_gpt import Config 16 | from lit_gpt.utils import CLI, incremental_save 17 | 18 | 19 | @torch.inference_mode() 20 | def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: str, output_dir: Path) -> None: 21 | """Convert a checkpoint after pretraining. 22 | 23 | The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training 24 | is finished. This script will export the state-dict of the model and place it in the chosen output folder together 25 | with the tokenizer and model config, which then can be loaded by other scripts for inference, evaluation, etc. 26 | 27 | Args: 28 | checkpoint_file: Path to a checkpoint file scripts produced by the scripts in ``lit_gpt/pretrain/``. 29 | tokenizer_dir: A path to the folder that holds the tokenizer configuration files that were used to train 30 | the model. All files with a name starting with 'tokenizer' will be copied to the output folder. 31 | config_name: The name of the model loaded with the ``lit_gpt.Config``. The configuration will be saved as a 32 | JSON file to the output folder. 33 | output_dir: The output folder where model state-dict file, the tokenizer config file, and the model config 34 | file will be saved. 35 | """ 36 | 37 | if output_dir.is_dir() and output_dir.glob("*"): 38 | raise FileExistsError( 39 | f"The output folder exists and is not empty: {str(output_dir)}." 40 | " Please delete it first or choose a different name." 
41 | ) 42 | if not tokenizer_dir.is_dir(): 43 | raise FileNotFoundError(f"The tokenizer_dir must be a directory: {str(output_dir)}.") 44 | 45 | output_dir.mkdir(parents=True) 46 | output_checkpoint_file = output_dir / "lit_model.pth" 47 | output_config_file = output_dir / "lit_config.json" 48 | 49 | # Save the config to output folder 50 | config = Config.from_name(config_name) 51 | with open(output_config_file, "w") as json_config: 52 | json.dump(asdict(config), json_config) 53 | 54 | # Export the tokenizer configuration to output folder 55 | for tokenizer_file in tokenizer_dir.glob("tokenizer*"): 56 | shutil.copyfile(tokenizer_file, output_dir / tokenizer_file.name) 57 | 58 | # Copy config for tokenization if found 59 | if (tokenizer_dir / "generation_config.json").is_file(): 60 | shutil.copyfile(tokenizer_dir / "generation_config.json", output_dir / "generation_config.json") 61 | 62 | # Extract the model state dict and save to output folder 63 | with incremental_save(output_checkpoint_file) as saver: 64 | print("Processing", checkpoint_file) 65 | full_checkpoint = torch.load(str(checkpoint_file), mmap=True) 66 | loaded_state_dict = full_checkpoint["model"] 67 | converted_state_dict = {} 68 | for param_name, param in loaded_state_dict.items(): 69 | saver.store_early(param) 70 | # remove prefix for compiled model (if any) 71 | param_name = param_name.replace("_orig_mod.", "") 72 | converted_state_dict[param_name] = param 73 | print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") 74 | saver.save(converted_state_dict) 75 | 76 | 77 | if __name__ == "__main__": 78 | CLI(convert_checkpoint) 79 | -------------------------------------------------------------------------------- /eval/alpaca_eval_generate.py: -------------------------------------------------------------------------------- 1 | """This script generates the evaluation responses that can e used by eval_scoring.py""" 2 | 3 | import argparse 4 | from functools import partial 5 | import json 6 | 7 | from datasets import Dataset 8 | import datasets 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | import torch 11 | from conversation import get_conv_template 12 | 13 | 14 | def apply_conv_template(example, template_type): 15 | # preprocess instructions into prompted inputs 16 | conv = get_conv_template(template_type) 17 | conv.append_message(conv.roles[0], example["instruction"]) 18 | conv.append_message(conv.roles[1], "") 19 | prompt = conv.get_prompt() 20 | 21 | example.update({"prompt": prompt}) 22 | 23 | return example 24 | 25 | 26 | def generate_responses_batched(example, model, tokenizer, kwargs): 27 | prompt = example["prompt"] 28 | 29 | encoding = tokenizer( 30 | prompt, 31 | return_tensors="pt", 32 | padding="longest", 33 | max_length=tokenizer.model_max_length, 34 | truncation=True, 35 | ) 36 | encoding = encoding.to(model.device) 37 | with torch.no_grad(): 38 | model_output = model.generate(**encoding, **kwargs) 39 | input_len = encoding.input_ids.shape[-1] 40 | model_output = model_output[:, input_len:].cpu() 41 | decoded_output = tokenizer.batch_decode( 42 | model_output, skip_special_tokens=True, clean_up_tokenization_spaces=False 43 | ) 44 | 45 | example.update({"output": decoded_output}) 46 | example.update({"metadata": [kwargs] * len(decoded_output)}) 47 | 48 | return example 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--model", default="llama/7B_sharded", type=str) 54 | parser.add_argument("--model_name", default=None, 
type=str) 55 | parser.add_argument("--template_type", default="alpaca", type=str) 56 | parser.add_argument("--save_file_name", default="outputs/answers/self-instruct_llama7B.jsonl", type=str) 57 | parser.add_argument("--batch_size", default=4, type=int) 58 | parser.add_argument( 59 | "--debug", 60 | action="store_true", 61 | help="This reduce the number of generation examples to 4, so that we can debug faster.", 62 | ) 63 | args = parser.parse_args() 64 | 65 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 66 | model = AutoModelForCausalLM.from_pretrained(args.model).to(device) 67 | tokenizer = AutoTokenizer.from_pretrained(args.model, model_max_length=2048, padding_side="left") 68 | tokenizer.pad_token = tokenizer.eos_token 69 | 70 | ## set the models to eval mode 71 | model = model.eval() 72 | 73 | raw_data = datasets.load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval")["eval"] 74 | raw_data = raw_data.map(lambda x: {"generator": args.model_name if args.model_name else args.model}) 75 | 76 | ## preprocess 77 | eval_preproc = partial(apply_conv_template, template_type=args.template_type) 78 | raw_data = raw_data.map(eval_preproc) 79 | 80 | # reduce number of examples for debugging 81 | if args.debug: 82 | raw_data = raw_data.select(range(4)) 83 | 84 | ## run generation 85 | generate_kwargs = dict( 86 | max_length=2048, do_sample=True, top_p=0.7, num_return_sequences=1, temperature=1, repetition_penalty=1.2 87 | ) 88 | generate = partial( 89 | generate_responses_batched, 90 | model=model, 91 | tokenizer=tokenizer, 92 | kwargs=generate_kwargs, 93 | ) 94 | 95 | dataset_w_responses = raw_data.map(generate, batched=True, batch_size=args.batch_size) 96 | dataset_w_responses = dataset_w_responses.map(lambda x: {"metadata": json.dumps(x["metadata"])}) 97 | dataset_w_responses.to_json(args.save_file_name, orient="records", lines=False, indent=True) 98 | -------------------------------------------------------------------------------- /scripts/download.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import os 4 | import sys 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | import torch 9 | from lightning_utilities.core.imports import RequirementCache 10 | 11 | # support running without installing as a package 12 | wd = Path(__file__).parent.parent.resolve() 13 | sys.path.append(str(wd)) 14 | 15 | from lit_gpt.utils import CLI 16 | 17 | _SAFETENSORS_AVAILABLE = RequirementCache("safetensors") 18 | _HF_TRANSFER_AVAILABLE = RequirementCache("hf_transfer") 19 | 20 | 21 | def download_from_hub( 22 | repo_id: Optional[str] = None, 23 | access_token: Optional[str] = os.getenv("HF_TOKEN"), 24 | from_safetensors: bool = False, 25 | tokenizer_only: bool = False, 26 | checkpoint_dir: Path = Path("checkpoints"), 27 | ) -> None: 28 | if repo_id is None: 29 | from lit_gpt.config import configs 30 | 31 | options = [f"{config['hf_config']['org']}/{config['hf_config']['name']}" for config in configs] 32 | print("Please specify --repo_id . Available values:") 33 | print("\n".join(options)) 34 | return 35 | 36 | from huggingface_hub import snapshot_download 37 | 38 | if ("meta-llama" in repo_id or "falcon-180" in repo_id) and not access_token: 39 | raise ValueError( 40 | f"{repo_id} requires authentication, please set the `HF_TOKEN=your_token` environment" 41 | " variable or pass --access_token=your_token. 
You can find your token by visiting" 42 | " https://huggingface.co/settings/tokens" 43 | ) 44 | 45 | download_files = ["tokenizer*", "generation_config.json", "config.json"] 46 | if not tokenizer_only: 47 | if from_safetensors: 48 | if not _SAFETENSORS_AVAILABLE: 49 | raise ModuleNotFoundError(str(_SAFETENSORS_AVAILABLE)) 50 | download_files.append("*.safetensors") 51 | else: 52 | # covers `.bin` files and `.bin.index.json` 53 | download_files.append("*.bin*") 54 | elif from_safetensors: 55 | raise ValueError("`--from_safetensors=True` won't have an effect with `--tokenizer_only=True`") 56 | 57 | import huggingface_hub._snapshot_download as download 58 | import huggingface_hub.constants as constants 59 | 60 | previous = constants.HF_HUB_ENABLE_HF_TRANSFER 61 | if _HF_TRANSFER_AVAILABLE and not previous: 62 | print("Setting HF_HUB_ENABLE_HF_TRANSFER=1") 63 | constants.HF_HUB_ENABLE_HF_TRANSFER = True 64 | download.HF_HUB_ENABLE_HF_TRANSFER = True 65 | 66 | directory = checkpoint_dir / repo_id 67 | snapshot_download( 68 | repo_id, 69 | local_dir=directory, 70 | local_dir_use_symlinks=False, 71 | resume_download=True, 72 | allow_patterns=download_files, 73 | token=access_token, 74 | ) 75 | 76 | constants.HF_HUB_ENABLE_HF_TRANSFER = previous 77 | download.HF_HUB_ENABLE_HF_TRANSFER = previous 78 | 79 | # convert safetensors to PyTorch binaries 80 | if from_safetensors: 81 | from safetensors import SafetensorError 82 | from safetensors.torch import load_file as safetensors_load 83 | 84 | print("Converting .safetensor files to PyTorch binaries (.bin)") 85 | for safetensor_path in directory.glob("*.safetensors"): 86 | bin_path = safetensor_path.with_suffix(".bin") 87 | try: 88 | result = safetensors_load(safetensor_path) 89 | except SafetensorError as e: 90 | raise RuntimeError(f"{safetensor_path} is likely corrupted. Please try to re-download it.") from e 91 | print(f"{safetensor_path} --> {bin_path}") 92 | torch.save(result, bin_path) 93 | os.remove(safetensor_path) 94 | 95 | 96 | if __name__ == "__main__": 97 | CLI(download_from_hub) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Goldfish Loss: Mitigating Memorization in Generative LLMs [[paper]](https://arxiv.org/abs/2406.10209)[[checkpoints]](https://huggingface.co/collections/tomg-group-umd/goldfish-loss-mitigating-memorization-in-llms-66c175becb6aab07744f7272) 3 | 4 |

5 | A very smart goldfish 6 |

7 | 8 | 9 | #### We introduce goldfish loss — a strikingly simple technique for mitigating extractable memorization in large language models. 10 | 11 | ## Getting Started 12 | 13 | This codebase is set up to (pre)train large language models in a distributed training setup using SLURM. This repo is written and tested on AMD compute nodes and is a fork of Lightning AI's [LitGPT repository](https://github.com/Lightning-AI/litgpt). 14 | 15 | For the implementation of goldfish loss, please check out the [`apply_goldfish`](https://github.com/ahans30/goldfish-loss/blob/70bfad87dcf69da2921bcad08e662925ab2ab60b/lit_gpt/utils.py#L241) method in `lit_gpt/utils.py`. 16 | 17 | ### Installation 18 | Before running the command below, check out the script and set up the path variables specific to your compute instance (e.g., `INSTALLDIR`). 19 | 20 | ```bash 21 | $ bash install.sh 22 | ``` 23 | 24 | This will take some time, but it will create a folder called `goldfish_loss` and a folder called `tiny_plugins_rccl` in the installation directory (which is `$HOME` by default). The conda folder (`goldfish_loss`) contains a fully functional conda environment with all necessary packages to run our training scripts. The RCCL folder (`tiny_plugins_rccl`) contains code for the interconnect plugin that is crucial for multi-node jobs. You can enable this environment by using `source activate ${INSTALLDIR}/`. 25 | 26 | Please note that this installation is specific to AMD compute nodes and is written as such. 27 | 28 | 29 | ## Usage 30 | 31 | The command below invokes a Python script that will in turn queue a SLURM job specific to the Frontier cluster used for this work. You can use `--dryrun` to retrieve the `sbatch` script, which can be tuned for other clusters. 32 | 33 | ```bash 34 | $ launch_scripts/launch_jobs_1b_hashtable.sh 35 | ``` 36 | 37 | You can find the training configs in `launch_scripts/config/` in YAML format. 38 | 39 | 40 | ## Contributing 41 | 42 | This is a work-in-progress repo, and we are working on porting all experiments from the paper. 43 | 44 | We believe in open-source community development. Feel free to add things, open issues, start pull requests, or reach out to us with any thoughts or questions. 45 | 46 | ## Cite our work 47 | 48 | If you find our work useful, please cite our paper: 49 | 50 | ```bibtex 51 | @inproceedings{ 52 | hans2024goldfishloss, 53 | title={Be like a Goldfish, Don't Memorize! Mitigating Memorization in Generative {LLM}s}, 54 | author={Abhimanyu Hans and John Kirchenbauer and Yuxin Wen and Neel Jain and Hamid Kazemi and Prajwal Singhania and Siddharth Singh and Gowthami Somepalli and Jonas Geiping and Abhinav Bhatele and Tom Goldstein}, 55 | booktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems}, 56 | year={2024}, 57 | url={https://openreview.net/forum?id=DylSyAfmWs} 58 | } 59 | ``` 60 | 61 | ## Acknowledgements 62 | 1. This codebase is developed as a collective effort from [tomg-group-umd](https://github.com/tomg-group-umd) members for LLM (pre)training and is a fork of the [LitGPT codebase](https://github.com/Lightning-AI/litgpt). 63 | 1. This code is tested on *frontier* resources of the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725. An award of computer time was provided by the U.S. Department of Energy’s (DOE) Innovative and Novel Computational Impact on Theory and Experiment (INCITE) Program.
Financial support was provided by the ONR MURI program and the AFOSR MURI program. Private support was provided by Capital One Bank, the Amazon Research Award program, and Open Philanthropy. Further support was provided by the National Science Foundation (IIS-2212182), and by the NSF TRAILS Institute (2229885). 64 | -------------------------------------------------------------------------------- /tutorials/pretrain_openwebtext.md: -------------------------------------------------------------------------------- 1 | # Pretrain Llama 2 on OpenWebText 2 | 3 | This tutorial will walk you through setting up the OpenWebText dataset and launching the pretraining script. 4 | 5 | ## What's OpenWebText 6 | 7 | [OpenWebText](https://github.com/jcpeterson/openwebtext) is an open-source reproduction of OpenAI's unreleased WebText training dataset, which was originally used to train GPT-2. The version that is used here consists of 8M documents and is loaded via the `load_dataset("openwebtext", ...)` function from the [datasets](https://github.com/huggingface/datasets) Python package. [Please refer to the website hosting the dataset](https://huggingface.co/datasets/Skylion007/openwebtext) for the licensing information. 8 | 9 | ## Prepare OpenWebText for training 10 | 11 | In order to start pretraining lit-gpt on it, you need to read, tokenize, and write the data in binary format. 12 | 13 | To prepare the dataset with the Llama 2 tokenizer, run 14 | 15 | ```bash 16 | pip install datasets 17 | 18 | python scripts/prepare_openwebtext.py \ 19 | --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ \ 20 | --destination_path data/openwebtext 21 | ``` 22 | 23 | The script will take about 15 min to run. 24 | 25 | ## Pretraining 26 | 27 | Running the pretraining script with its default settings requires at least 4 GPUs with 40GB+ each. (However, alternatively, you can train a smaller Pythia-70m on 1 GPU, more information about that further below). 28 | 29 | ```bash 30 | python pretrain/openwebtext.py --devices 4 31 | ``` 32 | 33 | The script will save checkpoints periodically to the folder `out/`. 34 | 35 | By default, the `pretrain/openwebtext.py` script will pretrain the Llama 2 7B model with FSDP in 36 | `bfloat16` precision and gradient accumulation. 37 | 38 | You can easily change the size of the model by passing a different string to the model name variable 39 | 40 | ```shell 41 | --model_name "Llama-2-7b-hf" 42 | ``` 43 | 44 | at the top of this script. 45 | 46 | The currently supported model names are contained in the [config.py](https://github.com/Lightning-AI/lit-gpt/lit_gpt/config.py) file. 47 | You can 48 | 49 | 1) either search this file for lines containing "name =", 50 | 2) or run `python scripts/download.py` without additional command line arguments, 51 | 52 | Keep in mind that the original LLaMA training for the 7B model required 83k A100 80GB 53 | hours (on a bigger dataset). However, for full pretraining on OpenWebText, you'll likely still need access to a cluster. 
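As mentioned above, the supported model names live in `config.py`; if you would rather print them from Python than skim that file, the following sketch does it. This snippet is a convenience illustration rather than part of the repo: it assumes you run it from the repository root so that `lit_gpt` is importable, and that each entry in `configs` carries a `name` field, as `scripts/download.py` suggests.

```python
# List the model names that `--model_name` accepts (run from the repo root).
from lit_gpt.config import configs

for config in configs:
    print(config["name"])
```
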
54 | 55 | Once you're in a cluster, you can follow [these instructions](https://lightning.ai/docs/fabric/stable/fundamentals/launch.html#launch-on-a-cluster) 56 | to launch the script across machines: 57 | 58 | - [SLURM cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/slurm.html) 59 | - [Barebones cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/barebones.html) 60 | - [MPI](https://lightning.ai/docs/fabric/stable/guide/multi_node/other.html) 61 | 62 | The script exposes several hyperparameters you can tweak through the command line. 63 | 64 | For instance, `--train.micro_batch_size` should be adjusted so the process will use the available 65 | GPU memory. For more tips to avoid out-of-memory issues, please also see the more detailed 66 | [Dealing with out-of-memory (OOM) errors](oom.md) guide. 67 | 68 | Last, logging is kept minimal in the script. In order to use a particular logger, 69 | please refer to the Lightning Fabric documentation on loggers, or 70 | call a logging client library like `wandb` directly. 71 | 72 | ## Training a smaller model on a single GPU 73 | 74 | To train a smaller Pythia 70M model on a single GPU, you can pass the `--model_name "pythia-70m"` argument. 75 | 76 | (Please see the `download_*` scripts in the [tutorials](.) for more information on downloading model checkpoints for different models.) 77 | 78 | Also, before you start training, note that you will need to prepare the dataset specifically for this model since it may use a different tokenizer: 79 | 80 | ```bash 81 | python scripts/prepare_openwebtext.py \ 82 | --checkpoint_dir checkpoints/EleutherAI/pythia-70m/ \ 83 | --destination_path data/lit-openwebtext 84 | 85 | python pretrain/openwebtext.py --devices 1 --model_name "pythia-70m" 86 | ``` 87 | -------------------------------------------------------------------------------- /lit_gpt/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Optional, Union 6 | 7 | import torch 8 | 9 | 10 | class Tokenizer: 11 | def __init__(self, checkpoint_dir: Union[Path, str]) -> None: 12 | checkpoint_dir = Path(checkpoint_dir) 13 | if not checkpoint_dir.exists(): 14 | raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}") 15 | 16 | self.bos_id = None 17 | self.eos_id = None 18 | self.pad_id = None 19 | 20 | if (checkpoint_dir / "tokenizer.json").is_file(): 21 | from transformers import AutoTokenizer 22 | 23 | self.processor = AutoTokenizer.from_pretrained( 24 | str(checkpoint_dir), add_bos_token=False, add_eos_token=False 25 | ) 26 | 27 | self.backend = "huggingface" 28 | 29 | if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file(): 30 | with open(special_tokens_path) as fp: 31 | config = json.load(fp) 32 | self.bos_id = self.processor.bos_token_id 33 | self.eos_id = self.processor.eos_token_id 34 | self.pad_id = self.processor.pad_token_id 35 | if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file(): 36 | with open(special_tokens_path) as fp: 37 | config = json.load(fp) 38 | if self.bos_id is None: 39 | self.bos_id = config.get("bos_token_id") 40 | if self.eos_id is None: 41 | self.eos_id = config.get("eos_token_id") 42 | if self.pad_id is None: 43 | self.pad_id = config.get("pad_token_id") # idk if this will always work 44 | elif "open_llama" in str(checkpoint_dir): 45 | from transformers import LlamaTokenizer 46 | 47 | self.processor = LlamaTokenizer.from_pretrained( 48 | str(checkpoint_dir), add_bos_token=False, add_eos_token=False 49 | ) 50 | 51 | self.backend = "huggingface" 52 | 53 | if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file(): 54 | with open(special_tokens_path) as fp: 55 | config = json.load(fp) 56 | self.bos_id = self.processor.bos_token_id 57 | self.eos_id = self.processor.eos_token_id 58 | self.pad_id = self.processor.pad_token_id 59 | if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file(): 60 | with open(special_tokens_path) as fp: 61 | config = json.load(fp) 62 | if self.bos_id is None: 63 | self.bos_id = config.get("bos_token_id") 64 | if self.eos_id is None: 65 | self.eos_id = config.get("eos_token_id") 66 | if self.pad_id is None: 67 | self.pad_id = config.get("pad_token_id") # idk if this will always work 68 | else: 69 | raise NotImplementedError 70 | 71 | @property 72 | def vocab_size(self) -> int: 73 | return self.processor.vocab_size 74 | 75 | def encode( 76 | self, 77 | string: str, 78 | device: Optional[torch.device] = None, 79 | bos: Optional[bool] = None, 80 | eos: bool = False, 81 | max_length: int = -1, 82 | ) -> torch.Tensor: 83 | tokens = self.processor.encode(string) 84 | 85 | if bos: 86 | bos_id = self.bos_id 87 | if bos_id is None: 88 | raise NotImplementedError("This tokenizer does not have a defined a bos token") 89 | tokens = [bos_id] + tokens 90 | if eos: 91 | tokens = tokens + [self.eos_id] 92 | if max_length > 0: 93 | tokens = tokens[:max_length] 94 | return torch.tensor(tokens, dtype=torch.int, device=device) 95 | 96 | def decode(self, tensor: torch.Tensor) -> str: 97 | tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist() 98 | return self.processor.decode(tokens) 99 | -------------------------------------------------------------------------------- /tutorials/finetune_full.md: -------------------------------------------------------------------------------- 1 | # Finetuning the 
whole model 2 | 3 | If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It may only be recommended as a baseline for comparison studies. 4 | 5 | ## Preparation 6 | 7 | The steps here only need to be done once: 8 | 9 | 1. Follow the instructions in the [README](../README.md) to install the dependencies. 10 | 2. Download and convert the weights following our [guide](download_stablelm.md). 11 | 3. Download the data and generate the Alpaca instruction tuning dataset: 12 | 13 | ```bash 14 | python scripts/prepare_alpaca.py --checkpoint_dir checkpoints/tiiuae/falcon-7b 15 | ``` 16 | 17 | or [prepare your own dataset](#tune-on-your-dataset). 18 | 19 | For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial. 20 | 21 | ## Running the finetuning 22 | 23 | ```bash 24 | python finetune/full.py --io.checkpoint_dir checkpoints/tiiuae/falcon-7b 25 | ``` 26 | 27 | Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. 28 | 29 | You can speed up training by setting the `devices` variable in the script to utilize more GPUs if available. 30 | Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. 31 | 32 | This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: 33 | 34 | ```bash 35 | python finetune/full.py --io.out_dir out/full/my-model-finetuned 36 | ``` 37 | 38 | If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. 39 | For instance, to fine-tune on MPS (the GPU on modern Macs), you can run 40 | 41 | ```bash 42 | python finetune/full.py --io.out_dir out/full/my-model-finetuned --precision 32-true 43 | ``` 44 | 45 | Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 46 | 47 | ## Test the model 48 | 49 | You can test the finetuned model with your own instructions by running: 50 | 51 | ```bash 52 | python generate/full.py \ 53 | --prompt "Recommend a movie to watch on the weekend." \ 54 | --checkpoint_dir checkpoints/tiiuae/falcon-7b \ 55 | --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth 56 | ``` 57 | 58 | Output: 59 | 60 | ```text 61 | A good movie to watch on the weekend would be The Lion King, since it's a classic family film that everyone can enjoy... 62 | ``` 63 | 64 | If your GPU supports `bfloat16`, the script will automatically use it. 65 | 66 | ## Tune on your dataset 67 | 68 | With only a few modifications, you can prepare and train on your own instruction dataset. 69 | 70 | 1. Create a json file in which each row holds one instruction-response pair. 71 | A row has an entry for 'instruction', 'input', and 'output', where 'input' is optional an can be 72 | the empty string if the instruction doesn't require a context. Below is an example json file: 73 | 74 | ```text 75 | [ 76 | { 77 | "instruction": "Arrange the given numbers in ascending order.", 78 | "input": "2, 4, 0, 8, 3", 79 | "output": "0, 2, 3, 4, 8" 80 | }, 81 | ... 82 | ] 83 | ``` 84 | 85 | 2. 
Make a copy of `scripts/prepare_alpaca.py` and name it what you want: 86 | 87 | ```bash 88 | cp scripts/prepare_alpaca.py scripts/prepare_mydata.py 89 | ``` 90 | 91 | 3. Modify `scripts/prepare_mydata.py` to read the json data file. 92 | 4. Run the script to generate the preprocessed, tokenized train-val split: 93 | 94 | ```bash 95 | python scripts/prepare_mydata.py --destination_path data/mydata/ 96 | ``` 97 | 98 | 5. Run `finetune/full.py` by passing in the location of your data (and optionally other parameters): 99 | 100 | ```bash 101 | python finetune/full.py \ 102 | --io.train_data_dir data/mydata --io.val_data_dir data/mydata/ \ 103 | --io.checkpoint_dir checkpoints/tiiuae/falcon-7b \ 104 | --io.out_dir data/mydata-finetuned 105 | ``` 106 | -------------------------------------------------------------------------------- /lit_gpt/data/tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from pathlib import Path 4 | from typing import Union, Optional 5 | 6 | from torch.utils.data import DataLoader 7 | 8 | from lit_gpt import Tokenizer 9 | from lit_gpt.data import LitDataModule 10 | 11 | 12 | class TinyLlama(LitDataModule): 13 | """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. 14 | 15 | Provides training and validation streaming dataloaders that return batches of tokens. 16 | 17 | Args: 18 | data_path: The path to the data directory, containing two folders 'slimpajama' and 'starcoder' 19 | which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` 20 | for instructions. The path can also be a remote path (e.g., s3://). 21 | seed: The seed to use for shuffling the training data. 22 | num_workers: The number of workers to use for the dataloaders. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | data_path: Union[str, Path] = Path("data/"), 28 | seed: int = 42, 29 | num_workers: int = 8, 30 | ) -> None: 31 | super().__init__() 32 | self.seed = seed 33 | self.num_workers = num_workers 34 | 35 | self.batch_size = 1 36 | self.seq_length = 2048 37 | 38 | # Could be a remote path (s3://) or a local path 39 | self.slimpajama_train = str(data_path).rstrip("/") + "/slimpajama/train" 40 | self.slimpajama_val = str(data_path).rstrip("/") + "/slimpajama/val" 41 | self.starcoder_train = str(data_path).rstrip("/") + "/starcoder" 42 | 43 | def connect( 44 | self, 45 | tokenizer: Optional[Tokenizer] = None, 46 | batch_size: int = 1, 47 | max_seq_length: Optional[int] = None 48 | ) -> None: 49 | self.batch_size = batch_size 50 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 51 | 52 | def prepare_data(self) -> None: 53 | for path in (self.slimpajama_train, self.slimpajama_val, self.starcoder_train): 54 | if not path.startswith("s3://") and not Path(path).is_dir(): 55 | raise FileNotFoundError( 56 | "The data path for TinyLlama is expected to be the directory containing these subdirectories:" 57 | f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 
58 | ) 59 | 60 | def train_dataloader(self) -> DataLoader: 61 | from lightning.data.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader 62 | 63 | train_datasets = [ 64 | StreamingDataset( 65 | input_dir=self.slimpajama_train, 66 | item_loader=TokensLoader(block_size=self.seq_length), 67 | shuffle=True, 68 | drop_last=True, 69 | ), 70 | StreamingDataset( 71 | input_dir=self.starcoder_train, 72 | item_loader=TokensLoader(block_size=self.seq_length), 73 | shuffle=True, 74 | drop_last=True, 75 | ), 76 | ] 77 | 78 | # Mix SlimPajama data and Starcoder data with these proportions: 79 | weights = (0.693584, 0.306416) 80 | combined_dataset = CombinedStreamingDataset(datasets=train_datasets, seed=self.seed, weights=weights) 81 | train_dataloader = StreamingDataLoader( 82 | combined_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 83 | ) 84 | return train_dataloader 85 | 86 | def val_dataloader(self) -> DataLoader: 87 | from lightning.data.streaming import StreamingDataset, TokensLoader 88 | 89 | val_dataset = StreamingDataset( 90 | input_dir=self.slimpajama_val, 91 | item_loader=TokensLoader(block_size=self.seq_length), 92 | shuffle=True, 93 | # Consider setting to False, but we would lose some samples due to truncation when world size > 1 94 | drop_last=True, 95 | ) 96 | val_dataloader = DataLoader( 97 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 98 | ) 99 | return val_dataloader 100 | -------------------------------------------------------------------------------- /lit_gpt/data/json.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import torch 8 | from torch.utils.data import random_split, DataLoader 9 | from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule 10 | from lit_gpt.data.alpaca import prompt_template 11 | from lit_gpt.tokenizer import Tokenizer 12 | 13 | 14 | class JSON(LitDataModule): 15 | """Loads JSON data for supervised finetuning. 16 | 17 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 18 | 19 | Args: 20 | json_path: A path to a JSON file containing the data. The file should contain a list of samples (dicts). 21 | Each dict must have the keys 'instruction' and 'output', and can optionally have a key 'input' 22 | (see Alpaca). 23 | mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``). 24 | test_split_fraction: A number in the range [0, 1] that determines the fraction of the dataset 25 | to use for testing. 26 | ignore_index: The index to use for elements to be ignored in the label. 27 | seed: The random seed for creating the train/val splits and shuffling the dataset. 28 | num_workers: How many DataLoader processes to use for loading. 
29 | """ 30 | 31 | def __init__( 32 | self, 33 | json_path: Path, 34 | mask_prompt: bool = False, 35 | test_split_fraction: float = 0.1, 36 | ignore_index: int = -1, 37 | seed: int = 42, 38 | num_workers: int = 4, 39 | ) -> None: 40 | super().__init__() 41 | self.json_path = json_path 42 | self.mask_prompt = mask_prompt 43 | self.test_split_fraction = test_split_fraction 44 | self.ignore_index = ignore_index 45 | self.seed = seed 46 | self.num_workers = num_workers 47 | 48 | self.tokenizer: Optional[Tokenizer] = None 49 | self.batch_size: int = 1 50 | self.max_seq_length: int = -1 51 | self.train_dataset: Optional[SFTDataset] = None 52 | self.test_dataset: Optional[SFTDataset] = None 53 | 54 | if not self.json_path.is_file(): 55 | raise FileNotFoundError(f"The file {self.json_path} does not exist.") 56 | 57 | def connect( 58 | self, 59 | tokenizer: Optional[Tokenizer] = None, 60 | batch_size: int = 1, 61 | max_seq_length: Optional[int] = None 62 | ) -> None: 63 | self.tokenizer = tokenizer 64 | self.batch_size = batch_size 65 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 66 | 67 | def setup(self, stage: str = "") -> None: 68 | with open(self.json_path, "r", encoding="utf-8") as file: 69 | data = json.load(file) 70 | 71 | # Partition the dataset into train and test 72 | train_data, test_data = random_split( 73 | data, 74 | [1.0 - self.test_split_fraction, self.test_split_fraction], 75 | generator=torch.Generator().manual_seed(self.seed) 76 | ) 77 | train_data, test_data = list(train_data), list(test_data) 78 | 79 | self.train_dataset = SFTDataset( 80 | data=train_data, 81 | tokenizer=self.tokenizer, 82 | prompt_template=prompt_template, 83 | max_seq_length=self.max_seq_length, 84 | mask_prompt=self.mask_prompt, 85 | ignore_index=self.ignore_index, 86 | ) 87 | self.test_dataset = SFTDataset( 88 | data=test_data, 89 | tokenizer=self.tokenizer, 90 | prompt_template=prompt_template, 91 | max_seq_length=self.max_seq_length, 92 | mask_prompt=self.mask_prompt, 93 | ignore_index=self.ignore_index, 94 | ) 95 | 96 | def train_dataloader(self) -> DataLoader: 97 | return DataLoader( 98 | self.train_dataset, 99 | batch_size=self.batch_size, 100 | shuffle=True, 101 | generator=torch.Generator().manual_seed(self.seed), 102 | num_workers=self.num_workers, 103 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 104 | ) 105 | 106 | def val_dataloader(self) -> DataLoader: 107 | return DataLoader( 108 | self.test_dataset, 109 | batch_size=self.batch_size, 110 | shuffle=False, 111 | num_workers=self.num_workers, 112 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 113 | ) 114 | -------------------------------------------------------------------------------- /lit_gpt/multiple_negative_ranking_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | # from ..SentenceTransformer import SentenceTransformer 5 | from lit_gpt.model import GPT 6 | 7 | 8 | def cos_sim(a: Tensor, b: Tensor) -> Tensor: 9 | """ 10 | Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. 
11 | 12 | :return: Matrix with res[i][j] = cos_sim(a[i], b[j]) 13 | """ 14 | if not isinstance(a, torch.Tensor): 15 | a = torch.tensor(a) 16 | 17 | if not isinstance(b, torch.Tensor): 18 | b = torch.tensor(b) 19 | 20 | if len(a.shape) == 1: 21 | a = a.unsqueeze(0) 22 | 23 | if len(b.shape) == 1: 24 | b = b.unsqueeze(0) 25 | 26 | a_norm = torch.nn.functional.normalize(a, p=2, dim=1) 27 | b_norm = torch.nn.functional.normalize(b, p=2, dim=1) 28 | return torch.mm(a_norm, b_norm.transpose(0, 1)) 29 | 30 | 31 | class MultipleNegativesRankingLoss(nn.Module): 32 | """ 33 | This loss expects as input a batch consisting of sentence pairs (a_1, p_1), (a_2, p_2)..., (a_n, p_n) 34 | where we assume that (a_i, p_i) are a positive pair and (a_i, p_j) for i!=j a negative pair. 35 | 36 | For each a_i, it uses all other p_j as negative samples, i.e., for a_i, we have 1 positive example (p_i) and 37 | n-1 negative examples (p_j). It then minimizes the negative log-likehood for softmax normalized scores. 38 | 39 | This loss function works great to train embeddings for retrieval setups where you have positive pairs (e.g. (query, relevant_doc)) 40 | as it will sample in each batch n-1 negative docs randomly. 41 | 42 | The performance usually increases with increasing batch sizes. 43 | 44 | For more information, see: https://arxiv.org/pdf/1705.00652.pdf 45 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 46 | 47 | You can also provide one or multiple hard negatives per anchor-positive pair by structering the data like this: 48 | (a_1, p_1, n_1), (a_2, p_2, n_2) 49 | 50 | Here, n_1 is a hard negative for (a_1, p_1). The loss will use for the pair (a_i, p_i) all p_j (j!=i) and all n_j as negatives. 51 | 52 | Example:: 53 | 54 | from sentence_transformers import SentenceTransformer, losses, InputExample 55 | from torch.utils.data import DataLoader 56 | 57 | model = SentenceTransformer('distilbert-base-uncased') 58 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1']), 59 | InputExample(texts=['Anchor 2', 'Positive 2'])] 60 | train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32) 61 | train_loss = losses.MultipleNegativesRankingLoss(model=model) 62 | """ 63 | 64 | def __init__(self, model: GPT, scale: float = 20.0, similarity_fct=cos_sim): 65 | """ 66 | :param model: SentenceTransformer model 67 | :param scale: Output of similarity function is multiplied by scale value 68 | :param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1) 69 | """ 70 | super(MultipleNegativesRankingLoss, self).__init__() 71 | self.model = model 72 | self.scale = scale 73 | self.similarity_fct = similarity_fct 74 | self.cross_entropy_loss = nn.CrossEntropyLoss() 75 | 76 | def forward(self, sentence_features: list[Tensor, Tensor], loss_type: str): 77 | embeddings_a_bsz_T_d = sentence_features[0] 78 | embeddings_b_bsz_T_d = sentence_features[1] 79 | if loss_type == "batch_negative": 80 | embeddings_a_bsz_d = embeddings_a_bsz_T_d.reshape(-1, embeddings_a_bsz_T_d.size(-1)) 81 | embeddings_b_bsz_d = embeddings_b_bsz_T_d.reshape(-1, embeddings_b_bsz_T_d.size(-1)) 82 | scores = self.similarity_fct(embeddings_a_bsz_d, embeddings_b_bsz_d) * self.scale # [b, b] 83 | labels = torch.tensor( 84 | range(len(scores)), dtype=torch.long, device=scores.device 85 | ) # Example a[i] should match with b[i] [0, 1, 2, 3, ...] 
86 | accuracy = (torch.argmax(scores, dim=1) == labels).float().mean() # we want to check the retrieval accuracy 87 | return self.cross_entropy_loss(scores, labels), accuracy.clone().detach() 88 | else: 89 | # loss_type == "single_negative" 90 | loss_and_accuracies = [self.forward([embeddings_a_bsz_T_d[i], embeddings_b_bsz_T_d[i]], "batch_negative") for i in range(embeddings_a_bsz_T_d.size(0))] 91 | losses = [x[0] for x in loss_and_accuracies] 92 | accuracies = [x[1] for x in loss_and_accuracies] 93 | return torch.stack(losses).mean(), torch.stack(accuracies).mean().clone().detach() 94 | 95 | def get_config_dict(self): 96 | return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__} -------------------------------------------------------------------------------- /tutorials/oom.md: -------------------------------------------------------------------------------- 1 | ## Dealing with out-of-memory (OOM) errors 2 | 3 | If you got this error while running a script 4 | 5 | ```bash 6 | OutOfMemoryError: CUDA out of memory. Tried to allocate 2.22 GiB. GPU 0 has a total capacity of 79.15 GiB of which 228.38 MiB is free. Including non-PyTorch memory, this process 7 | has 78.93 GiB memory in use. Of the allocated memory 76.28 GiB is allocated by PyTorch, and 2.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory 8 | is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF 9 | ``` 10 | 11 | it means that your GPU memory size wasn't big enough for the model and script configuration. 12 | 13 | Here's a few things you can try: 14 | 15 | ### Reduce the micro batch size 16 | 17 | Adjust the `--train.micro_batch_size` argument in the fine-tuning and pretraining scripts. This variable determines the number of samples loaded per iteration. 18 | 19 | A smaller value will simply load fewer samples simultaneously. The minimum value is 1. 20 | 21 | Experiment with different micro batch sizes to find a balance between memory consumption and computational efficiency. Smaller micro batch sizes consume less memory but may result in slower training convergence. Conversely, larger micro batch sizes require more memory but can accelerate training speed. 22 | 23 | ### Reduce the model's context length 24 | 25 | The context length (`block_size` in the code) plays a significant role in running models with attention. 26 | 27 | * The pretraining scripts are configured to use the full context length of the model to train. 28 | * The finetuning scripts are configured to use the longest sample length of the training data to avoid allocating unnecessary memory (`--train.max_seq_length` argument). 29 | If that's longer than the model's context length, an error is raised. If you try to run a batch that is longer than this, an error is raised. 30 | 31 | However, your hardware may not support such large context lengths. Here's what you can do: 32 | 33 | * For the pretraining scripts, you can simply reduce the `Config(block_size=...)` value. 34 | * For the finetuning scripts, you can trim the length of the samples in your dataset. 35 | Most of the `scripts/prepare_*.py` scripts expose a `--max_seq_length=...` argument. This might also be useful in cases where 36 | sample lengths are highly unbalanced, as the presence of a single very long sample would incur a larger memory usage for all other 37 | shorter samples. For example, the median length of the samples in Alpaca is 110 tokens. 
Truncating the Alpaca dataset to 256 max tokens reduces the memory requirements of a Falcon 7B model from 23.52 GB to 15.73 GB. For more information about the dataset truncation, please see the *Truncating datasets* section in the [prepare_datasets.md](prepare_datasets.md) tutorial. 38 | 39 | Keep in mind that reducing the context length will affect the modelling performance on text sequences longer than the limit. 40 | 41 | ### Use lower precision 42 | 43 | Our scripts expose the `--precision` argument, this directly impacts the memory usage. 44 | 45 | Using true lower precision (`16-true`, `bf16-true`) reduces the memory usage by half compared to `32-true`, however, 46 | the model might start producing NaNs due to the limited range of representable values. 47 | 48 | Mixed precision training (`16-mixed`, `bf16-mixed`) provides better stability but offers limited memory reduction. 49 | 50 | ### Do sharding across multiple GPUs 51 | 52 | For exceptionally large models, the aforementioned techniques might still not suffice. If you have multiple GPUs available, 53 | you can trade off memory for speed by changing the `--devices 1` argument in the scripts. Enabling this option enables a parallelism technique (FSDP), sharding the memory across different GPUs. 54 | 55 | The default configuration already uses activation checkpointing, but you can enable CPU offloading by changing the `cpu_offload=False` argument in the scripts. 56 | 57 | ### Try a different optimizer 58 | 59 | Our scripts use the [`AdamW` optimizer](https://pytorch.org/docs/main/generated/torch.optim.AdamW.html). 60 | It maintains 2 states for each trainable parameter of the model, meaning that the optimizer memory is double compared to 61 | an optimizer like [`SGD`](https://pytorch.org/docs/main/generated/torch.optim.SGD.html). 62 | 63 | You can try replacing it with your optimizer of choice that is lighter in memory requirements. Keep in mind that different optimizers have distinct optimization behaviors, so it's essential to assess their impact on the training process and model performance. 64 | An example would be the recently published [Sophia](https://arxiv.org/abs/2305.14342) or [Lion](https://arxiv.org/abs/2302.06675) optimizers. 65 | 66 | This suggestion is particularly relevant for pretraining, as the trainable parameters in the model represent a small 67 | subset of the total in the fine-tuning scripts. 68 | -------------------------------------------------------------------------------- /tutorials/convert_lit_models.md: -------------------------------------------------------------------------------- 1 | ## Converting Lit-GPT weights to Hugging Face Transformers 2 | 3 | Lit-GPT weights need to be converted to a format that Hugging Face understands with a [conversion script](../scripts/convert_lit_checkpoint.py) before our scripts can run. 4 | 5 | We provide a helpful script to convert models Lit-GPT models back to their equivalent Hugging Face Transformers format: 6 | 7 | ```sh 8 | python scripts/convert_lit_checkpoint.py \ 9 | --checkpoint_path checkpoints/repo_id/lit_model.pth \ 10 | --output_path output_path/converted.pth \ 11 | --config_path checkpoints/repo_id/config.json 12 | ``` 13 | 14 | These paths are just placeholders, you will need to customize them based on which finetuning or pretraining script you ran and it's configuration. 
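Before loading the converted weights elsewhere, a quick sanity check that the output file contains a plain state dict can save debugging time later. A minimal sketch, using the placeholder path from above; the exact key names depend on the model you converted:

```python
import torch

# Load the converted checkpoint on CPU and peek at its contents.
state_dict = torch.load("output_path/converted.pth", map_location="cpu")
print(f"{len(state_dict)} tensors in the converted checkpoint")
print(next(iter(state_dict)))  # one parameter name, e.g. an embedding weight
```
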
15 | 16 | ### Loading converted Lit-GPT checkpoints into transformers 17 | 18 | If you want to load the converted checkpoints into a `transformers` model, please make sure you copied the original `config.json` file into the folder that contains the `converted.pth` file saved via `--output_path` above. 19 | 20 | For example, 21 | 22 | ```bash 23 | cp checkpoints/repo_id/config.json output_path/config.json 24 | ``` 25 | 26 | Then, you can load the checkpoint file in a Python session as follows: 27 | 28 | ```python 29 | import torch 30 | from transformers import AutoModel 31 | 32 | 33 | state_dict = torch.load("output_path/converted.pth") 34 | model = AutoModel.from_pretrained( 35 | "output_path/", local_files_only=True, state_dict=state_dict 36 | ) 37 | ``` 38 | 39 | Alternatively, you can also load the model without copying the `config.json` file as follows: 40 | 41 | ```python 42 | model = AutoModel.from_pretrained("online_repo_id", state_dict=state_dict) 43 | ``` 44 | 45 | 46 | 47 | ### Merging LoRA weights 48 | 49 | Please note that if you want to convert a model that has been fine-tuned using an adapter like LoRA, these weights should be [merged](../scripts/merge_lora.py) to the checkpoint prior to converting. 50 | 51 | ```sh 52 | python scripts/merge_lora.py \ 53 | --checkpoint_dir checkpoints/repo_id \ 54 | --lora_path path/to/litgpt/lora_finetuned.pth \ 55 | --out_dir output_path/merged.ckpt 56 | ``` 57 | 58 |
59 |
60 | 61 | # A finetuning and conversion tutorial 62 | 63 | This section contains a reproducible example for finetuning a Lit-GPT model and converting it back into a HF `transformer` model. 64 | 65 | 1. Download a model of interest: 66 | 67 | For convenience, we first specify an environment variable (optional) to avoid copy and pasting the whole path: 68 | 69 | ```bash 70 | export repo_id=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 71 | ``` 72 | 73 | Instead of using TinyLlama, you can replace the `repo_id` target with any other model repository 74 | specifier that is currently supported by Lit-GPT. You can get a list of supported repository specifier 75 | by running `scripts/download.py` without any additional arguments. 76 | 77 | Then, we download the model we specified via `$repo_id` above: 78 | 79 | ```bash 80 | python scripts/download.py --repo_id $repo_id 81 | ``` 82 | 83 | 2. Convert the model into the Lit-GPT format: 84 | 85 | ```bash 86 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id 87 | ``` 88 | 89 | 3. Prepare a dataset for finetuning: 90 | 91 | ```bash 92 | python scripts/prepare_alpaca.py \ 93 | --checkpoint_dir checkpoints/$repo_id \ 94 | --destination_path data/alpaca 95 | ``` 96 | 97 | 4. Finetune the model: 98 | 99 | 100 | ```bash 101 | export finetuned_dir=out/lit-finetuned-model 102 | 103 | python finetune/lora.py \ 104 | --io.checkpoint_dir checkpoints/$repo_id \ 105 | --io.train_data_dir data/alpaca \ 106 | --io.val_data_dir data/alpaca \ 107 | --train.epochs 1 \ 108 | --io.out_dir $finetuned_dir 109 | ``` 110 | 111 | 5. Merge LoRA weights: 112 | 113 | Note that this step only applies if the model was finetuned with `lora.py` above and not when `full.py` was used for finetuning. 114 | 115 | ```bash 116 | python scripts/merge_lora.py \ 117 | --checkpoint_dir checkpoints/$repo_id \ 118 | --lora_path $finetuned_dir/lit_model_lora_finetuned.pth \ 119 | --out_dir $finetuned_dir/merged/ 120 | ``` 121 | 122 | 123 | 5. Convert the finetuning model back into a HF format: 124 | 125 | ```bash 126 | python scripts/convert_lit_checkpoint.py \ 127 | --checkpoint_path $finetuned_dir/merged/lit_model.pth \ 128 | --output_path out/hf-tinyllama/converted_model.pth \ 129 | --config_path checkpoints/$repo_id/lit_config.json 130 | ``` 131 | 132 | (If you used `full.py` instead of `lora.py` to finetune your model, 133 | replace `$finetuned_dir/merged/lit_model.pth` with `$finetuned_dir/lit_model_finetuned.pth`.) 134 | 135 | 136 | 6. 
Load the model into a `transformers` model: 137 | 138 | ```python 139 | import torch 140 | from transformers import AutoModel 141 | 142 | state_dict = torch.load('out/hf-tinyllama/converted_model.pth') 143 | model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", state_dict=state_dict) 144 | ``` 145 | -------------------------------------------------------------------------------- /lit_gpt/retrieval_attn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def create_dual_triangular_attention_mask(data, eos_id): 6 | bs, max_seq_length = data.size() 7 | attention_mask = torch.zeros(bs, max_seq_length, max_seq_length, dtype=torch.float32, device=data.device) 8 | 9 | for batch_idx in range(bs): 10 | sequence = data[batch_idx] 11 | # Find the indices of eos tokens 12 | eos_indices = (sequence == eos_id).nonzero(as_tuple=False).view(-1) 13 | # Handle cases where the eos_id appears less than 2 times or not at all 14 | if eos_indices.numel() < 2: 15 | # If eos_id does not appear or appears only once, fallback to standard lower triangular mask 16 | # attention_mask[batch_idx, :, :max_seq_length] = torch.tril(torch.ones(max_seq_length, max_seq_length, device=data.device)) 17 | raise ValueError(f"EOS token- {eos_id} does not appear twice in sequence") 18 | else: 19 | # Create mask for the first segment (Prefix) 20 | first_eos_idx = eos_indices[0].item() 21 | attention_mask[batch_idx, :first_eos_idx+1, :first_eos_idx+1] = torch.tril(torch.ones(first_eos_idx+1, first_eos_idx+1, device=data.device)) 22 | # Create mask for the second segment (Suffix) 23 | second_eos_idx = eos_indices[1].item() 24 | attention_mask[batch_idx, first_eos_idx+1:second_eos_idx+1, :second_eos_idx-first_eos_idx] = torch.tril(torch.ones(second_eos_idx-first_eos_idx, second_eos_idx-first_eos_idx, device=data.device)) 25 | 26 | # putting True in rest of the indices (Padding locations) 27 | attention_mask[batch_idx, second_eos_idx+1:, :] = True 28 | 29 | # Reshape the mask to include the additional dimension for heads if necessary 30 | attention_mask = attention_mask.view(bs, 1, max_seq_length, max_seq_length) 31 | return attention_mask 32 | 33 | 34 | def get_ltor_masks_and_position_ids(data, 35 | eod_token, 36 | reset_position_ids, 37 | reset_attention_mask, 38 | eod_mask_loss, 39 | attn_type="doc_block_attn"): 40 | """ 41 | Build masks and position id for left to right model. 42 | Modified from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/utils.py#L162. 43 | """ 44 | 45 | # Extract batch size and sequence length. 46 | micro_batch_size, seq_length = data.size() 47 | 48 | # Attention mask (lower triangular). 
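    # Note: depending on `attn_type`, the mask built below is either a "doc_block_attn"
    # mask (two lower-triangular blocks split at the first two EOS tokens, built by
    # `create_dual_triangular_attention_mask` above) or an "anti_causal_attn" mask
    # (upper triangular). Any other value leaves `attention_mask` unset and the
    # binarization step at the end of this function will fail.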
49 | if reset_attention_mask: 50 | att_mask_batch = micro_batch_size 51 | else: 52 | att_mask_batch = 1 53 | # attention_mask = torch.tril(torch.ones( 54 | # (att_mask_batch, seq_length, seq_length), device=data.device)).view( 55 | # att_mask_batch, 1, seq_length, seq_length) 56 | if attn_type == "doc_block_attn": 57 | attention_mask = create_dual_triangular_attention_mask(data, eod_token) 58 | if attn_type == "anti_causal_attn": 59 | attention_mask = torch.triu(torch.ones( 60 | (att_mask_batch, seq_length, seq_length), device=data.device, dtype=torch.int16)).view( 61 | att_mask_batch, 1, seq_length, seq_length) 62 | 63 | 64 | loss_mask = None 65 | position_ids = None 66 | # text = attention_mask[1, 0].tolist() 67 | # saving text as a txt file 68 | # with open("attention_mask.txt", "w") as f: 69 | # for item in text: 70 | # f.write(f"{item}\n") 71 | 72 | # # Loss mask. 73 | # loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) 74 | # if eod_mask_loss: 75 | # loss_mask[data == eod_token] = 0.0 76 | 77 | # # Position ids. 78 | # position_ids = torch.arange(seq_length, dtype=torch.long, 79 | # device=data.device) 80 | # position_ids = position_ids.unsqueeze(0).expand_as(data) 81 | # # We need to clone as the ids will be modifed based on batch index. 82 | # if reset_position_ids: 83 | # position_ids = position_ids.clone() 84 | 85 | # if reset_position_ids or reset_attention_mask: 86 | # # Loop through the batches: 87 | # for b in range(micro_batch_size): 88 | 89 | # # Find indecies where EOD token is. 90 | # eod_index = position_ids[b, data[b] == eod_token] 91 | # # Detach indecies from positions if going to modify positions. 92 | # if reset_position_ids: 93 | # eod_index = eod_index.clone() 94 | 95 | # # Loop through EOD indecies: 96 | # prev_index = 0 97 | # for j in range(eod_index.size()[0]): 98 | # i = eod_index[j] 99 | # # Mask attention loss. 100 | # if reset_attention_mask: 101 | # attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 102 | # # Reset positions. 103 | # if reset_position_ids: 104 | # position_ids[b, (i + 1):] -= (i + 1 - prev_index) 105 | # prev_index = i + 1 106 | 107 | # Convert attention mask to binary: 108 | attention_mask = (attention_mask > 0.5) 109 | 110 | return attention_mask, loss_mask, position_ids -------------------------------------------------------------------------------- /tutorials/inference.md: -------------------------------------------------------------------------------- 1 | # Inference 2 | 3 | We demonstrate how to run inference (next token prediction) with the GPT base model in the [`generate.py`](generate.py) script: 4 | 5 | ```bash 6 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b 7 | ``` 8 | 9 | Output: 10 | 11 | ```text 12 | Hello, my name is Levi Durrer, I'm an Austrian journalist - Chairman of the Press Blair Party, with 37 years in the Press Blair International, and two years in the Spectre of Austerity for the other. I'm crossing my fingers that you will feel 13 | ``` 14 | 15 | The script assumes you have downloaded and converted the weights as described [here](download_stablelm.md). 16 | 17 | This will run the 3B pre-trained model and require ~7 GB of GPU memory using the `bfloat16` datatype. 18 | 19 | ## Run interactively 20 | 21 | You can also chat with the model interactively: 22 | 23 | ```bash 24 | python chat/base.py --checkpoint_dir checkpoints/stabilityai/stablelm-tuned-alpha-3b 25 | ``` 26 | 27 | This script can work with any checkpoint. 
For the best chat-like experience, we recommend using it with a checkpoint 28 | fine-tuned for chatting such as `stabilityai/stablelm-tuned-alpha-3b` or `togethercomputer/RedPajama-INCITE-Chat-3B-v1`. 29 | 30 | ## Run a large model on one smaller device 31 | 32 | Check out our [quantization tutorial](quantize.md). 33 | 34 | ## Run a large model on multiple smaller devices 35 | 36 | We offer two scripts to leverage multiple devices for inference. 37 | 38 | ### [`generate/sequentially.py`](../generate/sequentially.py) 39 | 40 | This script allows you to run models that wouldn't fit on a single card by partitioning the transformer blocks across all your devices and running them sequentially. 41 | 42 | For instance, `meta-llama/Llama-2-70b-chat-hf` would require ~140 GB of GPU memory to load on a single device, plus the memory for activations. 43 | With 80 transformer layers, we could partition them across 8, 5, 4, or 2 devices. 44 | 45 | ```shell 46 | python generate/sequentially.py \ 47 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 48 | --max_new_tokens 256 \ 49 | --num_samples 2 50 | ``` 51 | 52 | Using A100 40GB GPUs, we need at least 4. You can control the number of devices by setting the `CUDA_VISIBLE_DEVICES=` environment variable. 53 | 54 | | Devices | Max GPU RAM | Token/sec | 55 | |---------|-------------|-----------| 56 | | 2 | OOM | - | 57 | | 4 | 35.64 GB | 7.55 | 58 | | 5 | 28.72 GB | 7.49 | 59 | | 8 | 18.35 GB | 7.47 | 60 | 61 | Note that the memory usage will also depend on the `max_new_tokens` value used. 62 | 63 | The script also supports quantization. Using 4-bit precision, we can now use 2 GPUs: 64 | 65 | ```shell 66 | python generate/sequentially.py \ 67 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 68 | --max_new_tokens 256 \ 69 | --num_samples 2 \ 70 | --quantize bnb.nf4-dq 71 | ``` 72 | 73 | | Devices | Max GPU RAM | Token/sec | 74 | |---------|-------------|-----------| 75 | | 2 | 20.00 GB | 8.63 | 76 | | 4 | 10.80 GB | 8.23 | 77 | | 5 | 8.96 GB | 8.10 | 78 | | 8 | 6.23 GB | 8.18 | 79 | 80 | Smaller devices can also be used to run inference with this technique. 81 | 82 | ### [`generate/tp.py`](../generate/tp.py) 83 | 84 | This script uses tensor parallelism (TP) to run models that wouldn't fit on a single card by sharding the MLP and attention QKV linear layers across all your devices. 85 | 86 | For instance, `meta-llama/Llama-2-70b-chat-hf` would require ~140 GB of GPU memory to load on a single device, plus the memory for activations. 87 | The requirement is that the intermediate size (for the MLP) and the QKV size (for attention) are divisible by the number of devices. 88 | With an intermediate size of 28672, we can use 2, 4, 7, or 8 devices. With a QKV size of 10240, we can use 2, 4, 5, or 8 devices. 89 | Since the script is configured to shard both, the intersection is used: we can only use 2, 4, or 8 devices (the short sketch below recomputes this). 90 | 91 | ```shell 92 | python generate/tp.py \ 93 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 94 | --max_new_tokens 256 \ 95 | --num_samples 2 96 | ``` 97 | 98 | Using A100 40GB GPUs, we need at least 4. You can control the number of devices by setting the `CUDA_VISIBLE_DEVICES=` environment variable. 99 | 100 | | Devices | Max GPU RAM | Token/sec | 101 | |---------|-------------|-----------| 102 | | 2 | OOM | - | 103 | | 4 | 35.46 GB | 9.33 | 104 | | 8 | 18.19 GB | 8.61 | 105 | 106 | Note that the memory usage will also depend on the `max_new_tokens` value used.
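To make the divisibility rule above concrete, here is a small, self-contained sketch (not part of the repository's scripts) that recomputes which device counts work for tensor parallelism; the Llama-2-70B sizes are hard-coded purely for illustration.

```python
# Both the MLP intermediate size and the attention QKV size must be divisible by
# the device count; the usable counts are the intersection of the two sets.
intermediate_size = 28672  # MLP intermediate size quoted above for Llama-2-70B
qkv_size = 10240           # attention QKV size quoted above for Llama-2-70B
max_devices = 8

mlp_ok = {n for n in range(2, max_devices + 1) if intermediate_size % n == 0}
qkv_ok = {n for n in range(2, max_devices + 1) if qkv_size % n == 0}

print("MLP allows:", sorted(mlp_ok))                 # [2, 4, 7, 8]
print("QKV allows:", sorted(qkv_ok))                 # [2, 4, 5, 8]
print("Usable together:", sorted(mlp_ok & qkv_ok))   # [2, 4, 8]
```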
107 | 108 | The script also supports quantization, using 4-bit precision, we can now use 2 GPUs 109 | 110 | ```shell 111 | python generate/tp.py \ 112 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 113 | --max_new_tokens 256 \ 114 | --num_samples 2 \ 115 | --quantize bnb.nf4-dq 116 | ``` 117 | 118 | | Devices | Max GPU RAM | Token/sec | 119 | |---------|-------------|-----------| 120 | | 2 | 19.79 GB | 6.72 | 121 | | 4 | 10.73 GB | 6.48 | 122 | | 8 | 6.15 GB | 6.20 | 123 | 124 | Smaller devices can also be used to run inference with this technique. 125 | -------------------------------------------------------------------------------- /lit_gpt/data/base.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from abc import abstractmethod 3 | from functools import partial 4 | from typing import List, Dict, Union, Optional 5 | 6 | import torch 7 | from torch import Tensor 8 | from torch.utils.data import Dataset 9 | 10 | from lightning import LightningDataModule 11 | from lit_gpt import Tokenizer 12 | 13 | 14 | class LitDataModule(LightningDataModule): 15 | """Base class for all data modules in Lit-GPT.""" 16 | 17 | @abstractmethod 18 | def connect( 19 | self, 20 | tokenizer: Optional[Tokenizer] = None, 21 | batch_size: int = 1, 22 | max_seq_length: Optional[int] = None 23 | ) -> None: 24 | """All settings that can't be determined at the time of instantiation need to be passed through here 25 | before any dataloaders can be accessed. 26 | """ 27 | 28 | def setup(self, stage: str = "") -> None: 29 | # Stub is to redefine the default signature, because the concept of 'stage' does not exist in Lit-GPT 30 | pass 31 | 32 | 33 | class SFTDataset(Dataset): 34 | """An in-memory dataset for supervised finetuning with `input_ids` and `labels`. 35 | 36 | Args: 37 | data: A list of samples (dicts). The target/label must be stored under the key 'output' and the instruction 38 | or other data can be stored under any key as long as it is compatible with the given prompt template. 39 | tokenizer: The tokenizer to use. Should match the one that was used to pretrain the model. 40 | prompt_template: A prompt template (format string or callable). 41 | max_seq_length: Truncate sequences that are longer than this value. By default, no truncation is applied. 42 | mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``). 43 | ignore_index: The index to use for elements to be ignored in the label. 44 | 45 | Returns a dict with two keys: 46 | input_ids: The encoded prompt + response 47 | labels: Same as input_ids, unless ``mask_prompt=True`` in which case the 'prompt' part is replaced with 48 | the ``ignore_index``. 
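
    Example (illustrative only; assumes a ``Tokenizer`` instance named ``tokenizer`` is available):

        dataset = SFTDataset(
            data=[{"instruction": "Say hi", "output": "Hi!"}],
            tokenizer=tokenizer,
            prompt_template="{instruction}: ",
        )
        sample = dataset[0]  # dict with "input_ids" and "labels" tensors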
49 | """ 50 | def __init__( 51 | self, 52 | data: List[Dict[str, str]], 53 | tokenizer: Tokenizer, 54 | prompt_template: Union[str, callable], 55 | max_seq_length: int = -1, 56 | mask_prompt: bool = True, 57 | ignore_index: int = -1, 58 | ) -> None: 59 | self.data = data 60 | self.tokenizer = tokenizer 61 | self.prompt_template = prompt_template 62 | self.max_seq_length = max_seq_length 63 | self.mask_prompt = mask_prompt 64 | self.ignore_index = ignore_index 65 | 66 | def __len__(self) -> int: 67 | return len(self.data) 68 | 69 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: 70 | example = self.data[idx] 71 | prompt = apply_prompt_template(self.prompt_template, example) 72 | prompt_and_response = prompt + example["output"] 73 | encoded_prompt = self.tokenizer.encode(prompt, max_length=self.max_seq_length) 74 | encoded_prompt_and_response = self.tokenizer.encode( 75 | prompt_and_response, 76 | eos=True, 77 | max_length=self.max_seq_length, 78 | ) 79 | 80 | # The labels are the full prompt with response, but with the prompt masked out 81 | labels = encoded_prompt_and_response.clone() 82 | if self.mask_prompt: 83 | labels[: len(encoded_prompt)] = self.ignore_index 84 | 85 | return {"input_ids": encoded_prompt_and_response.type(torch.int64), "labels": labels.type(torch.int64)} 86 | 87 | 88 | def apply_prompt_template(template: Union[str, callable], example: Dict[str, str]) -> str: 89 | if isinstance(template, str): 90 | prompt = template.format(**example) 91 | else: 92 | prompt = template(example) 93 | return prompt 94 | 95 | 96 | def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1): 97 | """Returns the collate function for supervised finetuning (needed in the DataLoader). 98 | 99 | The collate function gets a list of dicts with keys `input_ids` and `labels`. 100 | It returns a dict with batched `input_ids` and `labels`. Also pads short sequences to the longest element in 101 | the batch. Optionally truncates all sequences to the specified maximum length. 102 | """ 103 | return partial(_sft_collate_fn, max_seq_length=max_seq_length, pad_id=pad_id, ignore_index=ignore_index) 104 | 105 | 106 | def _sft_collate_fn( 107 | samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1 108 | ) -> Dict[str, Tensor]: 109 | 110 | batched = {} 111 | for key in ("input_ids", "labels"): 112 | pad_value = pad_id if key == "input_ids" else ignore_index 113 | 114 | # Pad right based on the longest sequence 115 | batched[key] = torch.nn.utils.rnn.pad_sequence( 116 | [sample[key] for sample in samples], batch_first=True, padding_value=pad_value 117 | ) 118 | 119 | # Truncate if needed 120 | if max_seq_length > 0: 121 | batched[key] = batched[key][:, :max_seq_length] 122 | 123 | return batched 124 | -------------------------------------------------------------------------------- /lit_gpt/data/lima.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | """Implementation derived from https://github.com/tloen/alpaca-lora""" 3 | import os 4 | 5 | from typing import Optional, List 6 | 7 | import torch 8 | from torch.utils.data import random_split, DataLoader 9 | from lit_gpt.data import LitDataModule, SFTDataset, get_sft_collate_fn 10 | from lit_gpt.data.alpaca import prompt_template 11 | from lit_gpt.tokenizer import Tokenizer 12 | 13 | 14 | class LIMA(LitDataModule): 15 | """LIMA data module for supervised finetuning. 16 | 17 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 18 | """ 19 | 20 | def __init__( 21 | self, 22 | mask_prompt: bool = False, 23 | test_split_fraction: float = 0.1, 24 | ignore_index: int = -1, 25 | seed: int = 42, 26 | include_multiturn_conversations: bool = False, 27 | data_repo_id: str = "GAIR/lima", 28 | access_token: Optional[str] = os.getenv("HF_TOKEN"), 29 | num_workers: int = 4, 30 | ) -> None: 31 | super().__init__() 32 | if access_token is None: 33 | raise ValueError( 34 | "LIMA requires authentication, please set the `HF_TOKEN=your_token` environment" 35 | " variable or pass --access_token=your_token. You can find your token by visiting" 36 | " https://huggingface.co/settings/tokens" 37 | ) 38 | self.mask_prompt = mask_prompt 39 | self.test_split_fraction = test_split_fraction 40 | self.ignore_index = ignore_index 41 | self.seed = seed 42 | self.num_workers = num_workers 43 | 44 | self.access_token = access_token 45 | self.data_repo_id = data_repo_id 46 | self.include_multiturn_conversations = include_multiturn_conversations 47 | 48 | self.tokenizer: Optional[Tokenizer] = None 49 | self.batch_size = 1 50 | self.max_seq_length = -1 51 | self.train_dataset: Optional[SFTDataset] = None 52 | self.test_dataset: Optional[SFTDataset] = None 53 | 54 | def connect( 55 | self, 56 | tokenizer: Optional[Tokenizer] = None, 57 | batch_size: int = 1, 58 | max_seq_length: Optional[int] = None 59 | ) -> None: 60 | self.tokenizer = tokenizer 61 | self.batch_size = batch_size 62 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 63 | 64 | def prepare_data(self) -> None: 65 | from datasets import load_dataset 66 | 67 | load_dataset(self.data_repo_id, token=self.access_token) 68 | 69 | def setup(self, stage: str = "") -> None: 70 | from datasets import load_dataset 71 | 72 | dataset = load_dataset(self.data_repo_id, token=self.access_token) 73 | data = format_dataset(dataset["train"], self.include_multiturn_conversations) 74 | 75 | # Partition the dataset into train and test 76 | train_data, test_data = random_split( 77 | data, 78 | [1.0 - self.test_split_fraction, self.test_split_fraction], 79 | generator=torch.Generator().manual_seed(self.seed) 80 | ) 81 | train_data, test_data = list(train_data), list(test_data) 82 | 83 | self.train_dataset = SFTDataset( 84 | data=train_data, 85 | tokenizer=self.tokenizer, 86 | prompt_template=prompt_template, 87 | max_seq_length=self.max_seq_length, 88 | mask_prompt=self.mask_prompt, 89 | ignore_index=self.ignore_index, 90 | ) 91 | self.test_dataset = SFTDataset( 92 | data=test_data, 93 | tokenizer=self.tokenizer, 94 | prompt_template=prompt_template, 95 | max_seq_length=self.max_seq_length, 96 | mask_prompt=self.mask_prompt, 97 | ignore_index=self.ignore_index, 98 | ) 99 | 100 | def train_dataloader(self) -> DataLoader: 101 | return DataLoader( 102 | self.train_dataset, 103 | batch_size=self.batch_size, 104 | shuffle=True, 105 | generator=torch.Generator().manual_seed(self.seed), 106 | num_workers=self.num_workers, 107 | 
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), 108 | ) 109 | 110 | def val_dataloader(self) -> DataLoader: 111 | return DataLoader( 112 | self.test_dataset, 113 | batch_size=self.batch_size, 114 | shuffle=False, 115 | num_workers=self.num_workers, 116 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 117 | ) 118 | 119 | 120 | def format_dataset(dataset_partition: dict, include_multi_turn_conversations: bool) -> List[dict]: 121 | formatted_ds = [] 122 | 123 | for entry in dataset_partition: 124 | convo = entry["conversations"] 125 | if include_multi_turn_conversations: 126 | for i in range(0, len(convo) - 1, 2): 127 | formatted_ds.append({"instruction": convo[i], "input": "", "output": convo[i + 1]}) 128 | else: 129 | formatted_ds.append({"instruction": convo[0], "input": "", "output": convo[1]}) 130 | 131 | return formatted_ds 132 | -------------------------------------------------------------------------------- /tutorials/download_phi.md: -------------------------------------------------------------------------------- 1 | ## Download [phi](https://arxiv.org/abs/2309.05463) weights 2 | 3 | ### Phi 2 4 | 5 | Microsoft Research [released](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) Phi 2, which is a 2.7 billion parameter model trained on "textbook-quality" data with knowledge distillation from Phi 1.5. The model achieves sota results among base LLMs with less than 13B parameters and matches or outperforms models up to 25x larger on complex benchmarks, e.g. it achieves better performance compared to 25x larger Llama-2-70B model on multi-step reasoning tasks, i.e., coding and math. Phi 2 was trained on 1.4T tokens and has not undergone any RLHF alignment nor has it been instruct fine-tuned. Phi 2 shares the same architecture with Phi 1.5 and has context length of 2048 tokens. 6 | The model weights are released under [*Microsoft Research license*](https://huggingface.co/microsoft/phi-2#license). 7 | 8 | To download the model weights and convert them to the lit-gpt format, run 9 | 10 | ```bash 11 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 12 | 13 | python scripts/download.py --repo_id microsoft/phi-2 --from_safetensors True 14 | 15 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-2 16 | ``` 17 | 18 | > [!WARNING] 19 | > Phi-2 used [dropout](https://huggingface.co/microsoft/phi-2/blob/cb2f453/config.json#L26) during training which we don't model, so training will not be equal. 20 | 21 | Inference the model in instruct mode: 22 | 23 | ```bash 24 | python chat/base.py --checkpoint_dir checkpoints/microsoft/phi-2 25 | ``` 26 | ```text 27 | >> Prompt: Write a detailed analogy between mathematics and a lighthouse. 28 | >> Reply: Mathematics is like a lighthouse. Mathematics provides a method to guide us through the sometimes chaotic and confusing waters of life. It provides a structured approach to problems which can help us find our way and provide direction. Just as a lighthouse keeps watch over the sea, mathematics can provide us with the tools to try and make sense of the world. Furthermore, just as a lighthouse keeps a watchful eye on the horizon, mathematics can help us reach our goals by showing us the way. 
29 | ``` 30 | 31 | > [!NOTE] 32 | > In order to obtain appropriate answers, you may need to tweak the [input prompt](https://github.com/Lightning-AI/lit-gpt/blob/74b8df0c3f07fc31d9d1a49e870a1f7955329ad8/chat/base.py#L359). For example, we found that using `"Instruct:{prompt}\nOutput:\n"` instead of `"Instruct:{prompt}\nOutput:"` makes the model generate longer answers in some cases. 33 | 34 | Free generation mode: 35 | ```bash 36 | python generate/base.py --prompt "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\nBob:" --checkpoint_dir checkpoints/microsoft/phi-2 37 | ``` 38 | which yields 39 | ```text 40 | Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions? 41 | Bob: Well, one possible reason could be stress. Have you been feeling overwhelmed lately? 42 | Alice: Yes, I've been juggling multiple deadlines and it's been quite taxing. 43 | Carol: Stress can definitely impact your ability to concentrate. Maybe you need 44 | ``` 45 | 46 | ### Phi 1.5 47 | 48 | A team at Microsoft Research has made available Phi 1.5, a 1.3 billion parameter model optimized for common sense reasoning in natural language that shows performance on par with models 5x its size, especially in grade-school mathematics and basic coding. This model retains characteristics of larger LLMs, and a significant reduction in toxic and biased generations was achieved by avoiding web data. It's also worth highlighting that while this model performs well on language understanding and common sense reasoning tasks, it is a base model that has not undergone any supervised instruction finetuning or finetuning with RLHF. 49 | 50 | The model was trained on the same data sources (7B tokens) as its [phi-1](https://arxiv.org/abs/2306.11644) predecessor, which include 51 | 52 | - a Python code subset from [The Stack](https://arxiv.org/abs/2211.15533) v1.2 53 | - Q&A texts from [StackOverflow](https://archive.org/download/stackexchange) 54 | - code from DeepMind [code_contests](https://github.com/deepmind/code_contests) 55 | - synthetic Python textbooks and exercises generated by [gpt-3.5-turbo-0301](https://platform.openai.com/docs/models/gpt-3-5) 56 | 57 | In addition, to create phi-1.5, the authors included additional textbook-quality synthetic text (roughly 20B tokens) in natural language, which was created using the [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) approach. 58 | 59 | The model weights are released under a [*Microsoft Research license*](https://huggingface.co/microsoft/phi-1_5/blob/main/README.md#license). 60 | 61 | In order to use the phi-1.5 model checkpoint, which requires about 3 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 62 | 63 | ```bash 64 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 65 | 66 | python scripts/download.py --repo_id microsoft/phi-1_5 67 | 68 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-1_5 69 | ``` 70 | 71 | You're done!
To execute the model just run: 72 | 73 | ```bash 74 | pip install tokenizers 75 | 76 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/microsoft/phi-1_5 77 | ``` 78 | -------------------------------------------------------------------------------- /lit_gpt/doc_block_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def get_ltor_masks_and_position_ids(data, 6 | eod_token, 7 | reset_position_ids, 8 | reset_attention_mask, 9 | eod_mask_loss): 10 | """ 11 | Build masks and position id for left to right model. 12 | Modified from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/utils.py#L162. 13 | """ 14 | 15 | # Extract batch size and sequence length. 16 | micro_batch_size, seq_length = data.size() 17 | 18 | # Attention mask (lower triangular). 19 | if reset_attention_mask: 20 | att_mask_batch = micro_batch_size 21 | else: 22 | att_mask_batch = 1 23 | attention_mask = torch.tril(torch.ones( 24 | (att_mask_batch, seq_length, seq_length), device=data.device)).view( 25 | att_mask_batch, 1, seq_length, seq_length) 26 | 27 | # Loss mask. 28 | loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) 29 | if eod_mask_loss: 30 | loss_mask[data == eod_token] = 0.0 31 | 32 | # Position ids. 33 | position_ids = torch.arange(seq_length, dtype=torch.long, 34 | device=data.device) 35 | position_ids = position_ids.unsqueeze(0).expand_as(data) 36 | # We need to clone as the ids will be modifed based on batch index. 37 | if reset_position_ids: 38 | position_ids = position_ids.clone() 39 | 40 | if reset_position_ids or reset_attention_mask: 41 | # Loop through the batches: 42 | for b in range(micro_batch_size): 43 | 44 | # Find indecies where EOD token is. 45 | eod_index = position_ids[b, data[b] == eod_token] 46 | # Detach indecies from positions if going to modify positions. 47 | if reset_position_ids: 48 | eod_index = eod_index.clone() 49 | 50 | # Loop through EOD indecies: 51 | prev_index = 0 52 | for j in range(eod_index.size()[0]): 53 | i = eod_index[j] 54 | # Mask attention loss. 55 | if reset_attention_mask: 56 | attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 57 | # Reset positions. 58 | if reset_position_ids: 59 | position_ids[b, (i + 1):] -= (i + 1 - prev_index) 60 | prev_index = i + 1 61 | 62 | # Convert attention mask to binary: 63 | attention_mask = (attention_mask > 0.5) 64 | 65 | return attention_mask, loss_mask, position_ids 66 | 67 | 68 | def get_cache_attn_masks(data, 69 | cache_token, 70 | reset_position_ids, 71 | reset_attention_mask=True, 72 | cache_mask_loss=True): 73 | """ 74 | Build attention masks for cache tokens. 75 | """ 76 | 77 | # Extract batch size and sequence length. 78 | micro_batch_size, seq_length = data.size() 79 | 80 | # Attention mask (lower triangular). 81 | if reset_attention_mask: 82 | att_mask_batch = micro_batch_size 83 | else: 84 | att_mask_batch = 1 85 | attention_mask = torch.tril(torch.ones( 86 | (att_mask_batch, seq_length, seq_length), device=data.device)).view( 87 | att_mask_batch, 1, seq_length, seq_length) 88 | 89 | # Loss mask. 90 | loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) 91 | # Masking the loss on cache tokens. 92 | if cache_mask_loss: 93 | loss_mask[data == cache_token] = 0.0 94 | 95 | # Position ids. 
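    # The position ids below are a plain 0..seq_length-1 range per sample; resetting
    # them at cache-token boundaries is not implemented for this mask type
    # (`reset_position_ids=True` raises NotImplementedError in the loop further down).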
96 | position_ids = torch.arange(seq_length, dtype=torch.long, 97 | device=data.device) 98 | position_ids = position_ids.unsqueeze(0).expand_as(data) 99 | # We need to clone as the ids will be modifed based on batch index. 100 | if reset_position_ids: 101 | position_ids = position_ids.clone() 102 | 103 | if reset_position_ids or reset_attention_mask: 104 | # Loop through the batches: 105 | for b in range(micro_batch_size): 106 | 107 | # Find indecies where EOD token is. 108 | eod_index = position_ids[b, data[b] == cache_token] 109 | # Detach indecies from positions if going to modify positions. 110 | if reset_position_ids: 111 | eod_index = eod_index.clone() 112 | 113 | # Loop through EOD indecies: 114 | prev_index = 0 115 | for j in range(eod_index.size()[0]): 116 | i = eod_index[j] 117 | # Mask attention loss. 118 | if reset_attention_mask: 119 | # TODO: Attend to all cache tokens when there's a block of consecutive cache tokens. 120 | attention_mask[b, 0, (i + 1):, :i] = 0 # Overlapped attn_mask at the *single* cache_token position. 121 | # Reset positions. 122 | if reset_position_ids: 123 | raise NotImplementedError 124 | # position_ids[b, (i + 1):] -= (i + 1 - prev_index) 125 | # prev_index = i + 1 126 | 127 | # Convert attention mask to binary: 128 | attention_mask = (attention_mask > 0.5) 129 | 130 | return attention_mask, loss_mask, position_ids -------------------------------------------------------------------------------- /scripts/prepare_retrieval_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | # saves the openwebtext dataset to a binary file for training. following was helpful: 4 | # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py 5 | import json 6 | import os 7 | import sys 8 | from pathlib import Path 9 | from typing import Union 10 | from functools import partial 11 | 12 | import numpy as np 13 | from tqdm import tqdm 14 | 15 | # support running without installing as a package 16 | wd = Path(__file__).parent.parent.resolve() 17 | sys.path.append(str(wd)) 18 | 19 | from lit_gpt import Tokenizer 20 | import torch 21 | 22 | 23 | def prepare( 24 | destination_path: Path = Path("/fs/cml-projects/llm-pretraining/llm-retrieval/data/orca_retrieval"), 25 | checkpoint_dir: Path = Path("/fs/cml-projects/llm-pretraining/llm-retrieval/checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"), 26 | seed: int = 42, 27 | cache_dir: Path = Path("/fs/cml-projects/llm-pretraining/llm-retrieval/data/cache"), 28 | test_size: Union[float, int, None] = 0.0005, 29 | max_seq_length: int = None, 30 | data_name: str = "openwebtext", 31 | data_type: str = "pretrain" 32 | ) -> None: 33 | np.random.seed(seed) 34 | from datasets import load_dataset # huggingface datasets 35 | 36 | destination_path.mkdir(parents=True, exist_ok=True) 37 | 38 | if max_seq_length is None: 39 | with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: 40 | config = json.load(file) 41 | max_seq_length = config["block_size"] 42 | 43 | tokenizer = Tokenizer(checkpoint_dir) 44 | 45 | # number of workers in .map() call 46 | # good number to use is ~order number of cpu cores // 2 47 | num_proc = os.cpu_count() // 2 48 | 49 | # number of workers in load_dataset() call 50 | # best number might be different from num_proc above as it also depends on HW speed. 
51 | # it is better than 1 usually though 52 | num_proc_load_dataset = num_proc 53 | 54 | # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) 55 | dataset = load_dataset(data_name, num_proc=num_proc_load_dataset, cache_dir=cache_dir) 56 | test_size = 10000 / len(dataset['train']) # picking 10000 samples for test set 57 | # owt by default only contains the 'train' split, so create a test split 58 | split_dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed, shuffle=True) 59 | val_dataset = split_dataset.pop("test") # rename the test split to val 60 | 61 | def process_instruction_data(examples, max_length=1024): 62 | # writing for batched examples 63 | query_ids = [] 64 | corpus_ids = [] 65 | lens = [] 66 | for question, response in zip(examples["question"], examples["response"]): 67 | query_id = tokenizer.encode(question, max_length=max_length, bos=False, eos=False).tolist() # not adding bos, eos for now 68 | corpus_id = tokenizer.encode(response, max_length=max_length).tolist() # not adding bos, eos for now 69 | if len(query_id) <= max_length and len(corpus_id) <= max_length: 70 | query_ids.append(query_id) 71 | corpus_ids.append(corpus_id) 72 | 73 | return {"query": query_ids, "corpus": corpus_ids, 'query_len': [len(q) for q in query_ids], 'corpus_len': [len(c) for c in corpus_ids]} 74 | 75 | def process_pretrain_data(examples, max_length=1024): 76 | # writing for batched examples 77 | query_ids = [] 78 | corpus_ids = [] 79 | lens = [] 80 | for text in examples["text"]: 81 | # splitting the text at random points and make query and corpus 82 | tokenized_text = tokenizer.encode(text, max_length=max_length, bos=False, eos=False).tolist() 83 | if len(tokenized_text) > 8: # making a random choice that the query and corpus are not too small 84 | pos = np.random.randint(5, len(tokenized_text)) 85 | query_id = tokenized_text[:pos] 86 | corpus_id = tokenized_text[pos:] 87 | query_ids.append(query_id) 88 | corpus_ids.append(corpus_id) 89 | 90 | return {"query": query_ids, "corpus": corpus_ids, 'query_len': [len(q) for q in query_ids], 'corpus_len': [len(c) for c in corpus_ids]} 91 | 92 | # tokenize the dataset 93 | if data_type == "pretrain": 94 | tokenize_func = partial(process_pretrain_data, max_length=max_seq_length) 95 | elif data_type == "instruction": 96 | tokenize_func = partial(process_instruction_data, max_length=max_seq_length) 97 | else: 98 | raise ValueError(f"Invalid data_type: {data_type}; Please choose from 'pretrain' or 'instruction'") 99 | tokenized = val_dataset.map(tokenize_func, desc="tokenizing the splits", batched=True, num_proc=num_proc) 100 | # removing all columns except query and corpus 101 | tokenized = tokenized.remove_columns([col for col in tokenized.column_names if col not in ["query", "corpus", "query_len", "corpus_len"]]) 102 | tokenized = tokenized.add_column("qrel", range(len(tokenized))) 103 | # saving as hf dataset 104 | tokenized.save_to_disk(destination_path) 105 | 106 | 107 | if __name__ == "__main__": 108 | from jsonargparse import CLI 109 | 110 | CLI(prepare) 111 | -------------------------------------------------------------------------------- /scripts/prepare_longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | """Implementation derived from https://github.com/tloen/alpaca-lora""" 4 | 5 | import json 6 | import sys 7 | from pathlib import Path 8 | from typing import Optional 9 | 10 | import torch 11 | from tqdm import tqdm 12 | 13 | # support running without installing as a package 14 | wd = Path(__file__).parent.parent.resolve() 15 | sys.path.append(str(wd)) 16 | 17 | from lit_gpt.tokenizer import Tokenizer 18 | from lit_gpt.utils import CLI 19 | from scripts.prepare_alpaca import download_if_missing 20 | 21 | 22 | def prepare( 23 | destination_path: Path = Path("data/longform"), 24 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 25 | mask_inputs: bool = False, # as in alpaca-lora 26 | ignore_index: int = -1, 27 | max_seq_length: Optional[int] = None, 28 | ) -> None: 29 | """Prepare the Alpaca dataset for instruction tuning. 30 | 31 | The output is a training and test dataset saved as `train.pt` and `test.pt`, 32 | which stores the preprocessed and tokenized prompts and labels. 33 | """ 34 | if max_seq_length is None: 35 | with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: 36 | config = json.load(file) 37 | max_seq_length = config["block_size"] 38 | 39 | destination_path.mkdir(parents=True, exist_ok=True) 40 | 41 | train_file_name = "train.json" 42 | # val_file_name = "val.json" 43 | test_file_name = "test.json" 44 | 45 | train_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" 46 | # val_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/val.json" 47 | test_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" 48 | 49 | train_file_path = destination_path / train_file_name 50 | print("Loading train data file...") 51 | download_if_missing(train_file_path, train_file_url) 52 | with open(train_file_path, "r", encoding="utf-8") as file: 53 | train_data = json.load(file) 54 | 55 | test_file_path = destination_path / test_file_name 56 | print("Loading test data file...") 57 | download_if_missing(test_file_path, test_file_url) 58 | with open(test_file_path, "r", encoding="utf-8") as file: 59 | test_data = json.load(file) 60 | 61 | print("Loading tokenizer...") 62 | tokenizer = Tokenizer(checkpoint_dir) 63 | 64 | print(f"train has {len(train_data):,} samples") 65 | print(f"test has {len(test_data):,} samples") 66 | 67 | print("Processing train set ...") 68 | train_data = [ 69 | prepare_sample( 70 | example=sample, 71 | tokenizer=tokenizer, 72 | max_length=max_seq_length, 73 | mask_inputs=mask_inputs, 74 | ignore_index=ignore_index, 75 | ) 76 | for sample in tqdm(train_data) 77 | ] 78 | torch.save(train_data, destination_path / "train.pt") 79 | 80 | print("Processing test set ...") 81 | test_data = [ 82 | prepare_sample( 83 | example=sample, 84 | tokenizer=tokenizer, 85 | max_length=max_seq_length, 86 | mask_inputs=mask_inputs, 87 | ignore_index=ignore_index, 88 | ) 89 | for sample in tqdm(test_data) 90 | ] 91 | torch.save(test_data, destination_path / "test.pt") 92 | 93 | 94 | def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: 95 | """Processes a single sample. 96 | 97 | Each sample in the dataset consists of: 98 | - instruction: A string describing the task 99 | - input: A string holding a special input value for the instruction. 100 | This only applies to some samples, and in others this is empty. 
101 | - output: The response string 102 | 103 | This function processes this data to produce a prompt text and a label for 104 | supervised training. The prompt text is formed as a single message including both 105 | the instruction and the input. The label/target is the same message but with the 106 | response attached. 107 | 108 | Finally, both the prompt and the label get tokenized. If desired, all tokens 109 | in the label that correspond to the original input prompt get masked out (default). 110 | """ 111 | full_prompt = generate_prompt(example) 112 | full_prompt_and_response = full_prompt + example["output"] 113 | encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) 114 | encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) 115 | 116 | # The labels are the full prompt with response, but with the prompt masked out 117 | labels = encoded_full_prompt_and_response.clone() 118 | if mask_inputs: 119 | labels[: len(encoded_full_prompt)] = ignore_index 120 | 121 | return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} 122 | 123 | 124 | def generate_prompt(example: dict) -> str: 125 | """Generates a standardized message to prompt the model with an instruction and a 126 | 'response' field.""" 127 | 128 | return ( 129 | "Below is an instruction that describes a task, paired with an input that provides further context. " 130 | "Write a response that appropriately completes the request.\n\n" 131 | f"### Instruction:\n{example['input']}\n\n### Response:" 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | CLI(prepare) 137 | -------------------------------------------------------------------------------- /eval/factmem_rephrase.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Sample script 3 | 4 | export OPENAI_API_KEY= 5 | 6 | python lit-gpt-dev/eval/factmem_rephrase.py --model tomg-group-umd/tinyllama_1b_redpajama_wiki2k_200B_tld3-step-00009536 --dataset "tomg-group-umd/RedPajama-Data-V2" --subset sample-100B --split train --num_samples 1000 7 | 8 | 9 | ''' 10 | 11 | import time 12 | import argparse 13 | import os 14 | import jsonlines 15 | import json 16 | from tqdm import tqdm 17 | import torch 18 | 19 | import datasets 20 | from transformers import AutoTokenizer, AutoModelForCausalLM 21 | from transformers import set_seed 22 | 23 | def str2bool(v): 24 | """Human friendly boolean cmdline flag parser.""" 25 | if isinstance(v, bool): 26 | return v 27 | if v.lower() in ("yes", "true", "t", "y", "1"): 28 | return True 29 | elif v.lower() in ("no", "false", "f", "n", "0"): 30 | return False 31 | else: 32 | raise argparse.ArgumentTypeError( 33 | f"Boolean value expected. Got: {str(v)}, " f"which cannot be converted to a boolean." 
34 | ) 35 | 36 | 37 | def process_raw_data(raw_data, dataset): 38 | if dataset == "HuggingFaceTB/cosmopedia": 39 | return raw_data["prompt"] + raw_data["text"] 40 | elif dataset == "stingning/ultrachat": 41 | return "\n\n".join(raw_data["data"]) 42 | elif dataset == "tomg-group-umd/RedPajama-Data-V2": 43 | return raw_data["raw_content"] 44 | else: 45 | try: 46 | return raw_data["text"] 47 | except: 48 | raise NotImplementedError(f"{dataset}") 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--model", default=None, required=True) 54 | 55 | parser.add_argument("--dataset", default=None,required=True) 56 | parser.add_argument("--num_samples", default=500, type=int) 57 | parser.add_argument("--max_length", default=256, type=int) 58 | parser.add_argument("--min_length", default=64, type=int) 59 | parser.add_argument("--save_file_name", default=None, type=str) 60 | parser.add_argument("--seed", default=5, type=int) 61 | 62 | 63 | 64 | 65 | parser.add_argument("--dataset_type", default="huggingface") # huggingface, huggingface_disk 66 | parser.add_argument("--subset", default=None) 67 | parser.add_argument("--split", default=None) 68 | 69 | parser.add_argument("--run_prelim_eval", type=str2bool, default=True) 70 | args = parser.parse_args() 71 | 72 | if args.save_file_name is None: 73 | args.save_file_name = f"rephrase_ppl_expts/rephrased/{args.num_samples}_{args.min_length}_{args.max_length}/rephrased.jsonl" 74 | 75 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 76 | if torch.cuda.is_available(): 77 | print(f"Available GPUs: {torch.cuda.device_count()}") 78 | 79 | 80 | tokenizer = AutoTokenizer.from_pretrained(args.model) 81 | tokenizer.pad_token = tokenizer.eos_token 82 | 83 | 84 | 85 | if args.dataset_type == "huggingface": 86 | raw_dataset = datasets.load_dataset(args.dataset, args.subset, split=args.split, streaming=True) 87 | elif args.dataset_type == "huggingface_disk": 88 | raw_dataset = datasets.load_from_disk(args.dataset) 89 | else: 90 | raise NotImplementedError(f"{args.dataset_type}") 91 | # TODO Add support for our hfds and pkds 92 | 93 | print(raw_dataset) 94 | raw_dataset_iterator = iter(raw_dataset) 95 | 96 | 97 | final_strings = [] 98 | count = 0 99 | with torch.no_grad(): 100 | with tqdm(total=args.num_samples) as pbar: 101 | while count < args.num_samples: 102 | torch.cuda.empty_cache() 103 | raw_data = next(raw_dataset_iterator) 104 | 105 | full_sequence = process_raw_data(raw_data, args.dataset) 106 | inputs = tokenizer(full_sequence, truncation=True, max_length=args.max_length, return_tensors="pt") 107 | 108 | if inputs.input_ids.shape[1] <= args.min_length or inputs.input_ids.shape[1] >= args.max_length : 109 | continue 110 | else: 111 | 112 | final_strings.append(full_sequence) 113 | 114 | pbar.update(1) 115 | count += 1 116 | 117 | 118 | 119 | import openai 120 | 121 | # Load your OpenAI API key from the environment variable 122 | openai.api_key = os.getenv('OPENAI_API_KEY') 123 | 124 | # List of strings you want to rephrase 125 | 126 | def rephrase_strings(strings): 127 | rephrased = [] 128 | for string in strings: 129 | completion = openai.chat.completions.create( 130 | model="gpt-3.5-turbo-1106", 131 | max_tokens=args.max_length, 132 | messages=[ 133 | { 134 | "role": "user", 135 | "content": f"Rephrase this sentence: {string}", 136 | }, 137 | ], 138 | ) 139 | rephrased_text = completion.choices[0].message.content 140 | 141 | rephrased.append({ 142 | "original_text": string, 143 | 
"rephrased_text": rephrased_text 144 | }) 145 | return rephrased 146 | 147 | # Rephrase the strings 148 | rephrased_strings = rephrase_strings(final_strings) 149 | 150 | # Save the rephrased strings to a JSON file 151 | os.makedirs(os.path.dirname(args.save_file_name), exist_ok=True) 152 | with jsonlines.open(args.save_file_name, "w") as linewriter: 153 | for row in tqdm(rephrased_strings): 154 | linewriter.write(row) 155 | 156 | -------------------------------------------------------------------------------- /scripts/prepare_csv.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import logging 5 | import sys 6 | from pathlib import Path 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | from torch.utils.data import random_split 11 | from tqdm import tqdm 12 | 13 | # support running without installing as a package 14 | wd = Path(__file__).parent.parent.resolve() 15 | logger = logging.getLogger(__name__) 16 | sys.path.append(str(wd)) 17 | 18 | from lit_gpt.tokenizer import Tokenizer 19 | from lit_gpt.utils import CLI 20 | 21 | 22 | def prepare( 23 | csv_path: Path, 24 | destination_path: Path = Path("data/csv"), 25 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 26 | test_split_fraction: float = 0.1, 27 | seed: int = 42, 28 | mask_inputs: bool = False, 29 | ignore_index: int = -1, 30 | max_seq_length: Optional[int] = None, 31 | columns: Tuple[str, ...] = ("instruction", "input", "output"), 32 | ) -> None: 33 | """Prepare a CSV dataset for instruction tuning. 34 | 35 | The output is a training and test dataset saved as `train.pt` and `test.pt`, 36 | which stores the preprocessed and tokenized prompts and labels. 
37 | """ 38 | if max_seq_length is None: 39 | with open(checkpoint_dir / "lit_config.json", "r") as file: 40 | config = json.load(file) 41 | max_seq_length = config["block_size"] 42 | 43 | destination_path.mkdir(parents=True, exist_ok=True) 44 | logger.info("Loading data file ...") 45 | import pandas as pd 46 | 47 | df = pd.read_csv(csv_path, dtype=str).fillna("") 48 | if not (df.columns.values == columns).all(): 49 | raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") 50 | data = json.loads(df.to_json(orient="records", indent=4)) 51 | 52 | print("Loading tokenizer...") 53 | tokenizer = Tokenizer(checkpoint_dir) 54 | 55 | # Partition the dataset into train and test 56 | train_set, test_set = random_split( 57 | data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) 58 | ) 59 | train_set, test_set = list(train_set), list(test_set) 60 | 61 | print(f"train has {len(train_set):,} samples") 62 | print(f"test has {len(test_set):,} samples") 63 | 64 | print("Processing train split ...") 65 | train_set = [ 66 | prepare_sample( 67 | example=sample, 68 | tokenizer=tokenizer, 69 | max_length=max_seq_length, 70 | mask_inputs=mask_inputs, 71 | ignore_index=ignore_index, 72 | ) 73 | for sample in tqdm(train_set) 74 | ] 75 | torch.save(train_set, destination_path / "train.pt") 76 | 77 | print("Processing test split ...") 78 | test_set = [ 79 | prepare_sample( 80 | example=sample, 81 | tokenizer=tokenizer, 82 | max_length=max_seq_length, 83 | mask_inputs=mask_inputs, 84 | ignore_index=ignore_index, 85 | ) 86 | for sample in tqdm(test_set) 87 | ] 88 | torch.save(test_set, destination_path / "test.pt") 89 | 90 | 91 | def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: 92 | """Processes a single sample. 93 | 94 | Each sample in the dataset consists of: 95 | - instruction: A string describing the task 96 | - input: A string holding a special input value for the instruction. 97 | This only applies to some samples, and in others this is empty. 98 | - output: The response string 99 | 100 | This function processes this data to produce a prompt text and a label for 101 | supervised training. The prompt text is formed as a single message including both 102 | the instruction and the input. The label/target is the same message but with the 103 | response attached. 104 | 105 | Finally, both the prompt and the label get tokenized. If desired, all tokens 106 | in the label that correspond to the original input prompt get masked out (default). 
107 | """ 108 | full_prompt = generate_prompt(example) 109 | full_prompt_and_response = full_prompt + example["output"] 110 | encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) 111 | encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) 112 | 113 | # The labels are the full prompt with response, but with the prompt masked out 114 | labels = encoded_full_prompt_and_response.clone() 115 | if mask_inputs: 116 | labels[: len(encoded_full_prompt)] = ignore_index 117 | 118 | return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} 119 | 120 | 121 | def generate_prompt(example: dict) -> str: 122 | """Generates a standardized message to prompt the model with an instruction, optional input and a 123 | 'response' field.""" 124 | 125 | if example["input"]: 126 | return ( 127 | "Below is an instruction that describes a task, paired with an input that provides further context. " 128 | "Write a response that appropriately completes the request.\n\n" 129 | f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" 130 | ) 131 | return ( 132 | "Below is an instruction that describes a task. " 133 | "Write a response that appropriately completes the request.\n\n" 134 | f"### Instruction:\n{example['instruction']}\n\n### Response:" 135 | ) 136 | 137 | 138 | if __name__ == "__main__": 139 | CLI(prepare) 140 | --------------------------------------------------------------------------------