├── assets └── goldfish-loss.jpg ├── requirements.txt ├── tutorials ├── images │ └── prepare_dataset │ │ ├── alpaca.jpg │ │ ├── dolly.jpg │ │ ├── lima.jpg │ │ ├── longform.jpg │ │ └── alpaca_libre.jpg ├── download_freewilly_2.md ├── download_longchat.md ├── download_vicuna.md ├── download_function_calling_llama_2.md ├── download_openllama.md ├── download_gemma.md ├── download_falcon.md ├── download_dolly.md ├── download_stablecode.md ├── download_redpajama_incite.md ├── evaluation.md ├── download_pythia.md ├── download_code_llama.md ├── download_llama_2.md ├── download_tinyllama.md ├── download_mistral.md ├── download_stablelm.md ├── pretrain_openwebtext.md ├── finetune_full.md ├── oom.md ├── convert_lit_models.md ├── inference.md └── download_phi.md ├── axonn_fabric ├── __init__.py └── megatron_logging.py ├── lit_gpt ├── data │ ├── __init__.py │ ├── dolly.py │ ├── longform.py │ ├── tinyllama.py │ ├── json.py │ ├── base.py │ └── lima.py ├── __init__.py ├── rmsnorm.py ├── data_loading_utils.py ├── args.py ├── tokenizer.py ├── multiple_negative_ranking_loss.py ├── retrieval_attn_utils.py └── doc_block_utils.py ├── .gitignore ├── scripts ├── check_model_exists.py ├── convert_lit_ckpt.sh ├── push_to_hub.py ├── prepare_slimpajama.py ├── prepare_starcoder.py ├── merge_lora.py ├── simulate_lr.py ├── prepare_openwebtext.py ├── convert_checkpoint_to_hf.py ├── convert_pretrained_checkpoint.py ├── download.py ├── prepare_retrieval_data.py ├── prepare_longform.py └── prepare_csv.py ├── requirements-all.txt ├── launch_scripts ├── config │ ├── config_quick_run.yaml │ ├── tinyllama-1b-control.yaml │ ├── tinyllama-1b.yaml │ ├── tinyllama-1b-equal-supervised-tokens.yaml │ ├── tinyllama-1b-equal-supervised-tokens_mbs11.yaml │ ├── tinyllama-1b-equal-supervised-tokens_mbs8.yaml │ └── tinyllama-1b-equal-supervised-tokens_.yaml └── launch_jobs_1b_hashtable.sh.sh ├── .flake8 ├── pyproject.toml ├── data_checks └── count_token_lengths.py ├── install.sh ├── eval ├── alpaca_eval_generate.py └── factmem_rephrase.py └── README.md /assets/goldfish-loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/assets/goldfish-loss.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.1.2 2 | lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af 3 | -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/alpaca.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/dolly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/dolly.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/lima.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/lima.jpg -------------------------------------------------------------------------------- 
/tutorials/images/prepare_dataset/longform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/longform.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca_libre.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahans30/goldfish-loss/HEAD/tutorials/images/prepare_dataset/alpaca_libre.jpg -------------------------------------------------------------------------------- /axonn_fabric/__init__.py: -------------------------------------------------------------------------------- 1 | from .fabric import AxoNNFabric 2 | from .megatron_logging import pretty_log 3 | from .hf_llama_tensor_parallel import monkey_patch_llama_with_axonn 4 | -------------------------------------------------------------------------------- /lit_gpt/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from lit_gpt.data.base import LitDataModule, SFTDataset, apply_prompt_template, get_sft_collate_fn 4 | from lit_gpt.data.alpaca import Alpaca 5 | from lit_gpt.data.json import JSON 6 | from lit_gpt.data.dolly import Dolly 7 | from lit_gpt.data.flan import FLAN 8 | from lit_gpt.data.lima import LIMA 9 | from lit_gpt.data.longform import LongForm 10 | from lit_gpt.data.tinyllama import TinyLlama 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | .DS_Store 4 | *.egg-info 5 | build 6 | .venv 7 | .vscode 8 | 9 | # data 10 | data 11 | datasets 12 | !lit_gpt/data 13 | !tests/data 14 | checkpoints 15 | out 16 | output 17 | outputs 18 | log 19 | wandb 20 | events.out.tfevents* 21 | results 22 | 23 | tests/reference_models 24 | 25 | # umd artifact paths 26 | slurm_logs/ 27 | logs/ 28 | checkpoints/ 29 | 30 | jwk_scratch/ahans/* 31 | mia_outputs/ 32 | data_extraction_outputs/ 33 | mem_output/ 34 | launch_scripts/ahans/archive 35 | 36 | .github* -------------------------------------------------------------------------------- /scripts/check_model_exists.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | import sys 4 | from huggingface_hub import delete_repo 5 | import os 6 | 7 | if __name__ == "__main__": 8 | from transformers import AutoTokenizer, AutoModel 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_name", type=str) 13 | 14 | args = parser.parse_args() 15 | 16 | try: 17 | model = AutoModel.from_pretrained(f"tomg-group-umd/{args.model_name}") 18 | print(f"Repo {args.model_name} exists") 19 | sys.exit(0) 20 | except Exception as e: 21 | try: 22 | delete_repo(repo_id = args.model_name, token = os.environ["HF_TOKEN_WRITE"]) 23 | except Exception as e: 24 | pass 25 | print(f"Repo {args.model_name} does NOT exist") 26 | sys.exit(1) -------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | jsonargparse[signatures] # CLI 3 | bitsandbytes==0.41.0 # quantization 4 | scipy # required by bitsandbytes 5 | 
sentencepiece # llama-based models 6 | tokenizers # pythia, falcon, redpajama 7 | datasets # eval 8 | requests # scripts/prepare_* 9 | zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py 10 | pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py 11 | pyarrow # scripts/prepare_starcoder.py 12 | tensorboard # pretrain/tinyllama.py 13 | torchmetrics # pretrain/tinyllama.py 14 | # eval 15 | git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529 16 | # scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py, pretrain/tinyllama.py 17 | lightning[data] @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af 18 | wandb 19 | docstring_parser -------------------------------------------------------------------------------- /scripts/convert_lit_ckpt.sh: -------------------------------------------------------------------------------- 1 | user=$(whoami) 2 | echo "User: $user" 3 | python scripts/convert_lit_checkpoint.py --checkpoint_path /lustre/orion/csc569/scratch/$user/lit-gpt-dev/out/lit-tiny-llama-1.1b/step-00120000.pth --output_path /lustre/orion/csc569/scratch/$user/lit-gpt-dev/transformer_ckpts/lit-tiny-llama-1.1b-120k-steps-500B-tokens --model_name tiny-llama-1.1b 4 | cd /lustre/orion/csc569/scratch/$user/lit-gpt-dev/transformer_ckpts/lit-tiny-llama-1.1b-120k-steps-500B-tokens 5 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/special_tokens_map.json 6 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/tokenizer_config.json 7 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/tokenizer.json 8 | wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-step-50K-105b/resolve/main/tokenizer.model 9 | cd /lustre/orion/csc569/scratch/$user/lit-gpt-dev 10 | python scripts/push_to_hub.py --model_name tiny-llama-1.1b-120k-steps-500B-tokens --model_path /lustre/orion/csc569/scratch/$user/lit-gpt-dev/transformer_ckpts/lit-tiny-llama-1.1b-120k-steps-500B-tokens --token_id $HF_TOKEN -------------------------------------------------------------------------------- /tutorials/download_freewilly_2.md: -------------------------------------------------------------------------------- 1 | 2 | ## Download [FreeWilly 2](https://stability.ai/blog/freewilly-large-instruction-fine-tuned-models) weights 3 | 4 | Stability AI announced FreeWilly inspired by the methodology pioneered by Microsoft in its paper: "Orca: Progressive Learning from Complex Explanation Traces of GPT-4”. 5 | FreeWilly2 leverages the Llama 2 70B foundation model to reach a performance that compares favorably with GPT-3.5 for some tasks. 6 | 7 | ```bash 8 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 9 | 10 | python scripts/download.py --repo_id stabilityai/FreeWilly2 11 | 12 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/stabilityai/FreeWilly2 13 | ``` 14 | 15 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 16 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 17 | 18 | You're done! 
To execute the model just run: 19 | 20 | ```bash 21 | pip install sentencepiece 22 | 23 | python chat/base.py --checkpoint_dir checkpoints/stabilityai/FreeWilly2 24 | ``` 25 | -------------------------------------------------------------------------------- /lit_gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import re 4 | import logging 5 | 6 | from lit_gpt.model import GPT 7 | from lit_gpt.config import Config 8 | from lit_gpt.tokenizer import Tokenizer 9 | 10 | from lightning_utilities.core.imports import RequirementCache 11 | 12 | _LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0") 13 | if not bool(_LIGHTNING_AVAILABLE): 14 | raise ImportError( 15 | "Lit-GPT requires lightning nightly. Please run:\n" 16 | f" pip uninstall -y lightning; pip install -r requirements.txt\n{str(_LIGHTNING_AVAILABLE)}" 17 | ) 18 | 19 | # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 20 | pattern = re.compile(".*Profiler function .* will be ignored") 21 | logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) 22 | 23 | # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint 24 | logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True 25 | logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True 26 | 27 | __all__ = ["GPT", "Config", "Tokenizer"] 28 | -------------------------------------------------------------------------------- /lit_gpt/rmsnorm.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import torch 4 | 5 | 6 | class RMSNorm(torch.nn.Module): 7 | """Root Mean Square Layer Normalization. 8 | 9 | Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License: 10 | https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE. 
11 | """ 12 | 13 | def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_unit_offset: bool = False) -> None: 14 | super().__init__() 15 | self.weight = torch.nn.Parameter(torch.ones(size)) 16 | self.eps = eps 17 | self.dim = dim 18 | self.add_unit_offset = add_unit_offset 19 | 20 | def forward(self, x: torch.Tensor) -> torch.Tensor: 21 | dtype = x.dtype 22 | x = x.float() 23 | # NOTE: the original RMSNorm paper implementation is not equivalent 24 | norm_x = torch.mean(x * x, dim=self.dim, keepdim=True) 25 | x_normed = x * torch.rsqrt(norm_x + self.eps) 26 | x_normed = x_normed.to(dtype=dtype) 27 | if self.add_unit_offset: 28 | # Gemma model requires a unit offset 29 | # https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L176 30 | return x_normed * (1 + self.weight) 31 | return x_normed * self.weight 32 | 33 | def reset_parameters(self) -> None: 34 | torch.nn.init.ones_(self.weight) 35 | -------------------------------------------------------------------------------- /scripts/push_to_hub.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from huggingface_hub import create_repo 4 | 5 | transformers.logging.set_verbosity_info() 6 | 7 | 8 | if __name__ == "__main__": 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | import argparse 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--model_path", type=str, required=True) 14 | parser.add_argument("--tokenizer_path", type=str, default=None) 15 | # parser.add_argument("--repo_name", type=str, required=True) 16 | parser.add_argument("--model_name", type=str, required=True) 17 | parser.add_argument("--token_id", type=str, required=True) 18 | 19 | args = parser.parse_args() 20 | args.repo_name = f"tomg-group-umd/{args.model_name}" 21 | if args.tokenizer_path is None: 22 | args.tokenizer_path = args.model_path 23 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) 24 | # model = AutoModelForCausalLM.from_pretrained(args.model_path) 25 | state_dict = torch.load(f"{args.model_path}/pytorch_model.bin") 26 | model = AutoModelForCausalLM.from_pretrained(args.model_path, state_dict=state_dict) 27 | print(model) 28 | 29 | create_repo(args.repo_name, private=True, token=args.token_id, exist_ok=True) 30 | model.push_to_hub(args.repo_name, use_temp_dir=True, token=args.token_id, overwrite=True) 31 | tokenizer.push_to_hub(args.repo_name, use_temp_dir=True, token=args.token_id) 32 | 33 | print(f"Model pushed to {model}") 34 | -------------------------------------------------------------------------------- /tutorials/download_longchat.md: -------------------------------------------------------------------------------- 1 | ## Download [LongChat](https://lmsys.org/blog/2023-06-29-longchat) weights 2 | 3 | LongChat is an open-source family of chatbots based on LLaMA featuring an extended context length up to 16K tokens. 4 | The technique used to extend the context length is described in [this blogpost](https://kaiokendev.github.io/context). 
5 | 6 | To see all the available checkpoints, run: 7 | 8 | ```bash 9 | python scripts/download.py | grep longchat 10 | ``` 11 | 12 | which will print 13 | 14 | ```text 15 | lmsys/longchat-7b-16k 16 | lmsys/longchat-13b-16k 17 | ``` 18 | 19 | In order to use a specific checkpoint, for instance [longchat-7b-16k](https://huggingface.co/lmsys/longchat-7b-16k), download the weights and convert the checkpoint to the lit-gpt format: 20 | 21 | ```bash 22 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 23 | 24 | python scripts/download.py --repo_id lmsys/longchat-7b-16k 25 | 26 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/lmsys/longchat-7b-16k 27 | ``` 28 | 29 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 30 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 31 | 32 | You're done! To execute the model just run: 33 | 34 | ```bash 35 | pip install sentencepiece 36 | 37 | python chat/base.py --checkpoint_dir checkpoints/lmsys/longchat-7b-16k 38 | ``` 39 | -------------------------------------------------------------------------------- /tutorials/download_vicuna.md: -------------------------------------------------------------------------------- 1 | ## Download [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) weights 2 | 3 | Vicuna is an open-source family of chatbots trained by fine-tuning LLaMA on user-shared conversations collected from [ShareGPT](https://sharegpt.com). 4 | 5 | To see all the available checkpoints for Vicuna, run: 6 | 7 | ```bash 8 | python scripts/download.py | grep vicuna 9 | ``` 10 | 11 | which will print 12 | 13 | ```text 14 | lmsys/vicuna-7b-v1.3 15 | lmsys/vicuna-13b-v1.3 16 | lmsys/vicuna-33b-v1.3 17 | lmsys/vicuna-7b-v1.5 18 | lmsys/vicuna-7b-v1.5-16k 19 | lmsys/vicuna-13b-v1.5 20 | lmsys/vicuna-13b-v1.5-16k 21 | ``` 22 | 23 | In order to use a specific Vicuna checkpoint, for instance [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), download the weights and convert the checkpoint to the lit-gpt format: 24 | 25 | ```bash 26 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 27 | 28 | python scripts/download.py --repo_id lmsys/vicuna-7b-v1.5 29 | 30 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/lmsys/vicuna-7b-v1.5 31 | ``` 32 | 33 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 34 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 35 | 36 | You're done! 
To execute the model just run: 37 | 38 | ```bash 39 | pip install sentencepiece 40 | 41 | python chat/base.py --checkpoint_dir checkpoints/lmsys/vicuna-7b-v1.5 42 | ``` 43 | -------------------------------------------------------------------------------- /launch_scripts/config/config_quick_run.yaml: -------------------------------------------------------------------------------- 1 | run_name: default-run 2 | out_dir: null 3 | resume: true 4 | max_tokens: 1000000000000 5 | max_iters: null 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | world_batch_size: 32 10 | learning_rate: 0.0004 11 | warmup_steps: 2000 12 | weight_decay: 0.1 13 | beta1: 0.9 14 | beta2: 0.95 15 | grad_clip: 1.0 16 | lr_schedule: cosine 17 | decay_lr: true 18 | min_lr: 4.0e-05 19 | neptune_from_tokens: null 20 | neptune_till_tokens: null 21 | neptune_noise_alpha: null 22 | label_smoothing: 0.0 23 | k_token_loss_dropout: null 24 | fabric_strategy: ddp 25 | fabric_precision: bf16-true 26 | micro_batch_size: 4 27 | compile_model: true 28 | matmul_precision: high 29 | dataloader_num_workers: 0 30 | n_chunks: 4 31 | logger_name: wandb 32 | logger_project: tinyllama 33 | data_telemetry: false 34 | log_step_interval: 1 35 | eval_iters: 100 36 | save_and_eval_interval: 2000 37 | save_last_step: false 38 | sanity_validate: true 39 | measure_flops: false 40 | text_key: text 41 | pad_to_block_size: false 42 | add_bos: true 43 | add_eos: true 44 | shuffle_filenames: true 45 | collate_checks_enabled: true 46 | all_block_size_tensors: false 47 | data_config: 48 | train_data: 49 | - type: pkds 50 | prefix: '' 51 | weight: 1 52 | val_data: 53 | - type: pkds 54 | prefix: '' 55 | weight: 1 56 | train_data_dir: /lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd 57 | val_data_dir: /lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd 58 | tokenizer_path: /lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T 59 | -------------------------------------------------------------------------------- /tutorials/download_function_calling_llama_2.md: -------------------------------------------------------------------------------- 1 | ## Download [Function Calling Llama 2](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) weights 2 | 3 | Llama-7B with function calling is licensed according to the Meta Community license. 4 | 5 | Function calling Llama extends the hugging face Llama 2 models with function calling capabilities. 6 | The model responds with a structured json argument with the function name and arguments. 7 | 8 | In order to use the checkpoint, download the weights and convert the checkpoint to the lit-gpt format. 9 | 10 | ```bash 11 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 12 | 13 | python scripts/download.py --repo_id Trelis/Llama-2-7b-chat-hf-function-calling-v2 --from_safetensors true 14 | 15 | python scripts/convert_hf_checkpoint.py --checkpoint_dir Trelis/Llama-2-7b-chat-hf-function-calling-v2 16 | ``` 17 | 18 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 19 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 20 | 21 | You're done! 
To execute the model just run: 22 | 23 | ```bash 24 | pip install sentencepiece 25 | 26 | python chat/base.py --checkpoint_dir Trelis/Llama-2-7b-chat-hf-function-calling-v2 27 | ``` 28 | It is strongly recommended to visit the model [repository](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) to learn how to format the prompt. 29 | 30 | The chat script has a generic use case with a single function defined; feel free to adapt it to your needs, for instance to make HTTP requests with the model outputs. 31 | 32 | Have fun! 33 | -------------------------------------------------------------------------------- /tutorials/download_openllama.md: -------------------------------------------------------------------------------- 1 | ## Download [OpenLLaMA](https://github.com/openlm-research/open_llama) weights 2 | 3 | OpenLLaMA is a permissively licensed open source reproduction of [Meta AI’s LLaMA](https://github.com/facebookresearch/llama) 4 | 7B and 13B checkpoints trained on the [RedPajama dataset](https://github.com/togethercomputer/RedPajama-Data). 5 | The weights can serve as a drop-in replacement for LLaMA in existing implementations. We also provide a smaller 3B variant. 6 | 7 | To see all the available checkpoints for Open LLaMA, run: 8 | 9 | ```bash 10 | python scripts/download.py | grep open_llama 11 | ``` 12 | 13 | which will print 14 | 15 | ```text 16 | openlm-research/open_llama_3b 17 | openlm-research/open_llama_7b 18 | openlm-research/open_llama_13b 19 | ``` 20 | 21 | In order to use a specific OpenLLaMA checkpoint, for instance [open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b), download the weights and convert the checkpoint to the lit-gpt format: 22 | 23 | ```bash 24 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 25 | 26 | python scripts/download.py --repo_id openlm-research/open_llama_3b 27 | 28 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/openlm-research/open_llama_3b 29 | ``` 30 | 31 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 32 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 33 | 34 | You're done! To execute the model just run: 35 | 36 | ```bash 37 | pip install sentencepiece 38 | 39 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/openlm-research/open_llama_3b 40 | ``` 41 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | enable-extensions = G 3 | select = B,C,E,F,G,P,SIM1,T4,W,B9 4 | max-line-length = 120 5 | # track with black in pyproject.toml 6 | # 7 | # 8 | # anytime you really hate a rule and don't want to follow it, add it here: 9 | # C408 ignored because we like the dict keyword argument syntax 10 | # E501 is not flexible enough, we're using B950 instead 11 | ignore = 12 | E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, 13 | # fix these lints in the future 14 | E275, 15 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying 16 | # to line this up with executable bit 17 | EXE001, 18 | # these ignores are from flake8-bugbear; please fix! 
19 | B007,B008,B017,B019,B020,B023,B024,B026,B028,B903,B904,B905,B906,B907 20 | # these ignores are from flake8-comprehensions; please fix! 21 | C407, 22 | # these ignores are from flake8-logging-format; please fix! 23 | G100,G101,G200,G201,G202 24 | # these ignores are from flake8-simplify. please fix or ignore with commented reason 25 | SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, 26 | # flake8-simplify code styles 27 | SIM102,SIM103,SIM106,SIM112, 28 | # I claim to know what I'm doing when doing this: 29 | B006, 30 | # We like commented out code sometimes :< 31 | E800, 32 | # and inefficient logging: 33 | G004, 34 | # overkill for ML code 35 | ECE001, 36 | # will get people to do this anyway: 37 | E731, 38 | # litgpt: 39 | B011, 40 | PT015 41 | per-file-ignores = 42 | __init__.py: F401 43 | optional-ascii-coding = True 44 | exclude = 45 | .git 46 | ./.git 47 | /build 48 | notebooks 49 | scripts 50 | __pycache__ 51 | dl/* 52 | log/* 53 | *.pyi 54 | -------------------------------------------------------------------------------- /tutorials/download_gemma.md: -------------------------------------------------------------------------------- 1 | ## Download [Gemma](https://blog.google/technology/developers/gemma-open-models/) weights 2 | 3 | Google developed and publicly released the Gemma large language models (LLMs), a collection of pretrained models in 2B and 7B parameter size that are based on the Gemini architecture. 4 | 5 | For more information, please see the [technical report](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf). 6 | 7 | 8 | To see all the available checkpoints, run: 9 | 10 | ```bash 11 | python scripts/download.py | grep gemma 12 | ``` 13 | 14 | which will print 15 | 16 | ```text 17 | google/gemma-7b 18 | google/gemma-2b 19 | google/gemma-7b-it 20 | google/gemma-2b-it 21 | ``` 22 | 23 | In the list above, `gemma-2b` and `gemma-7b` are the pretrained models, and `gemma-2b-it` and `gemma-7b-it` are the instruction-finetuned models. 24 | 25 | In order to use a specific checkpoint, for instance [gemma-2b](https://huggingface.co/google/gemma-2b), download the weights and convert the checkpoint to the lit-gpt format. 26 | 27 | This requires that you've been granted access to the weights on the HuggingFace hub. You can do so by following the steps at . 28 | After access is granted, you can find your HF hub token in . 29 | 30 | ```bash 31 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 32 | 33 | python scripts/download.py --repo_id google/gemma-2b --access_token your_hf_token --from_safetensors true 34 | 35 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/google/gemma-2b 36 | ``` 37 | 38 | By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM 39 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 40 | 41 | You're done! 
To execute the model just run: 42 | 43 | ```bash 44 | python chat/base.py --checkpoint_dir checkpoints/google/gemma-2b 45 | ``` 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | requires-python = ">= 3.11" 6 | 7 | [project] 8 | name = "lit-gpt-umd" 9 | version = "0.1" 10 | dependencies = [ 11 | "torch==2.1.2", 12 | "lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af", 13 | "pytorch-lightning==2.2.1", 14 | "jsonargparse", 15 | "requests", 16 | "tensorboard", 17 | "torchmetrics", 18 | "submitit @ git+https://github.com/jwkirchenbauer/submitit.git", 19 | "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529", 20 | "wandb", 21 | "sentencepiece", 22 | "tokenizers", 23 | "datasets", 24 | ] 25 | # Note: The order really matters here! 26 | # We really should migrate to lm-eval 0.4.* eventually (or wait for lit-gpt to migrate) 27 | # Not really a best practice to inscribe exact packages here :) 28 | 29 | [project.optional-dependencies] 30 | # only for testing 31 | dev = [ 32 | "pytest", 33 | "pytest-rerunfailures", 34 | "pytest-timeout", 35 | "transformers>=4.38.0", 36 | "einops", 37 | "protobuf", 38 | "docstring_parser", 39 | "lightning-cloud", 40 | ] 41 | 42 | # only for data preproc 43 | data = [ 44 | "lightning[data] @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af", 45 | "requests", 46 | "zstandard", 47 | "pandas", 48 | "pyarrow", 49 | ] 50 | 51 | quant = [ 52 | "bitsandbytes>=0.41.0", 53 | "scipy", 54 | ] 55 | 56 | # only on the cluster: 57 | hpc = [ 58 | "packaging", 59 | "ninja", 60 | "flash_attn @ git+https://github.com/ROCmSoftwarePlatform/flash-attention", 61 | "axonn", # requires headers 62 | ] 63 | 64 | [tool.black] 65 | line-length = 120 66 | 67 | [tool.setuptools.packages.find] 68 | include = ["lit-gpt", "axonn_fabric", "generate", "eval", "scripts", "finetune", "analysis", "chat"] 69 | 70 | [project.entry-points.console_scripts] 71 | train = "pretrain_umd.module:train" 72 | push_to_hub = "scripts.module:push_to_hub" 73 | launch = "launch_scripts.module:launch_submitit" 74 | 75 | -------------------------------------------------------------------------------- /tutorials/download_falcon.md: -------------------------------------------------------------------------------- 1 | ## Download [Falcon](https://falconllm.tii.ae) weights 2 | 3 | UAE's Technology Innovation Institute has open-sourced Falcon LLM. 4 | It is trained on [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora 5 | Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 6 | 7 | The first Falcon release includes a base model and an instruction tuned model of sizes 7B and 40B called `falcon-7b-instruct` and `falcon-40b-instruct`. Recently, checkpoints for 180B parameter models were added as well; the 180B instruction tuned model is called `falcon-180B-chat` and similar to the `falcon-40b-instruct` architecture except for its larger size. 
8 | 9 | To see all the available checkpoints for Falcon, run: 10 | 11 | ```bash 12 | python scripts/download.py | grep falcon 13 | ``` 14 | 15 | which will print 16 | 17 | ```text 18 | tiiuae/falcon-7b 19 | tiiuae/falcon-7b-instruct 20 | tiiuae/falcon-40b 21 | tiiuae/falcon-40b-instruct 22 | tiiuae/falcon-180B 23 | tiiuae/falcon-180B-chat 24 | ``` 25 | 26 | In order to use a specific Falcon checkpoint, for instance [falcon-7b](https://huggingface.co/tiiuae/falcon-7b), download the weights and convert the checkpoint to the lit-gpt format: 27 | 28 | ```bash 29 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 30 | 31 | python scripts/download.py --repo_id tiiuae/falcon-7b 32 | 33 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falcon-7b 34 | ``` 35 | 36 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 37 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 38 | 39 | You're done! To execute the model just run: 40 | 41 | ```bash 42 | pip install tokenizers 43 | 44 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/tiiuae/falcon-7b 45 | ``` 46 | 47 | or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Lightning-AI/lit-gpt/blob/main/notebooks/falcon-inference.ipynb) 48 | -------------------------------------------------------------------------------- /tutorials/download_dolly.md: -------------------------------------------------------------------------------- 1 | ## Download [Dolly](https://github.com/databrickslabs/dolly) weights 2 | 3 | Databricks’ [Dolly](https://huggingface.co/databricks/dolly-v2-12b) is an instruction-following large language model trained on the Databricks machine learning platform 4 | that is licensed for commercial use. Based on `pythia-12b`, Dolly is trained on ~15k instruction/response fine tuning records 5 | [`databricks-dolly-15k`](https://huggingface.co/datasets/databricks/databricks-dolly-15k) generated 6 | by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, 7 | information extraction, open QA and summarization. `dolly-v2-12b` is not a state-of-the-art model, but does exhibit surprisingly 8 | high quality instruction following behavior not characteristic of the foundation model on which it is based. 9 | 10 | For detailed info on the models, their training, and their behavior, please see the [Dolly repository](https://github.com/databrickslabs/dolly). 
11 | 12 | To see all the available checkpoints for Dolly, run: 13 | 14 | ```bash 15 | python scripts/download.py | grep dolly 16 | ``` 17 | 18 | which will print 19 | 20 | ```text 21 | databricks/dolly-v2-3b 22 | databricks/dolly-v2-7b 23 | databricks/dolly-v2-12b 24 | ``` 25 | 26 | In order to use a specific Dolly checkpoint, for instance [dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b), download the weights and convert the checkpoint to the lit-gpt format: 27 | 28 | ```bash 29 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 30 | 31 | python scripts/download.py --repo_id databricks/dolly-v2-3b 32 | 33 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/databricks/dolly-v2-3b 34 | ``` 35 | 36 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 37 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 38 | 39 | You're done! To execute the model just run: 40 | 41 | ```bash 42 | pip install tokenizers 43 | 44 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/databricks/dolly-v2-3b 45 | ``` 46 | -------------------------------------------------------------------------------- /scripts/prepare_slimpajama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import os 5 | import sys 6 | import time 7 | from pathlib import Path 8 | 9 | import zstandard as zstd 10 | from lightning.data.streaming import DataChunkRecipe, DataProcessor 11 | 12 | # support running without installing as a package 13 | wd = Path(__file__).parent.parent.resolve() 14 | sys.path.append(str(wd)) 15 | 16 | from lit_gpt import Tokenizer 17 | from lit_gpt.utils import CLI 18 | 19 | 20 | class SlimPajamaDataRecipe(DataChunkRecipe): 21 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 22 | super().__init__(chunk_size) 23 | self.tokenizer = tokenizer 24 | 25 | def prepare_structure(self, input_dir): 26 | files = Path(input_dir).rglob("*.zst") 27 | return [str(file) for file in files] 28 | 29 | def prepare_item(self, filepath): 30 | with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: 31 | for row in f: 32 | text = json.loads(row)["text"] 33 | if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": 34 | continue # exclude the GitHub data since it overlaps with starcoder 35 | text_ids = self.tokenizer.encode(text, bos=False, eos=True) 36 | yield text_ids 37 | 38 | 39 | def prepare( 40 | input_dir: Path = Path("data/SlimPajama-627B/train"), 41 | output_dir: Path = Path("data/slimpajama/train"), 42 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 43 | chunk_size: int = (2049 * 16384), 44 | fast_dev_run: bool = False, 45 | ) -> None: 46 | tokenizer = Tokenizer(tokenizer_path) 47 | data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 48 | data_processor = DataProcessor( 49 | input_dir=str(input_dir), 50 | output_dir=str(output_dir), 51 | fast_dev_run=fast_dev_run, 52 | num_workers=os.cpu_count(), 53 | num_downloaders=1, 54 | ) 55 | 56 | start_time = time.time() 57 | data_processor.run(data_recipe) 58 | elapsed_time = time.time() - start_time 59 | print(f"Time taken: {elapsed_time:.2f} seconds") 60 | 61 | 62 | if __name__ == "__main__": 63 | 
CLI(prepare) 64 | -------------------------------------------------------------------------------- /tutorials/download_stablecode.md: -------------------------------------------------------------------------------- 1 | ## Download [StableCode](https://huggingface.co/collections/stabilityai/stable-code-64f9dfb4ebc8a1be0a3f7650) weights 2 | 3 | StableCode is a suite of 4 developer assistant models. 4 | 5 | Each one of them is a decoder-only code completion model with 3 billion parameters, pre-trained on a diverse collection of programming languages that ranked highest in the 2023 StackOverflow developer survey. 6 | 7 | For more info on the models, please visit the [StableCode repository](https://huggingface.co/collections/stabilityai/stable-code-64f9dfb4ebc8a1be0a3f7650). 8 | 9 | ------ 10 | 11 | To see all the available checkpoints for StableCode, run: 12 | 13 | ```bash 14 | python scripts/download.py | grep -E "stable-?code" 15 | ``` 16 | 17 | which will print: 18 | 19 | ```text 20 | stabilityai/stablecode-completion-alpha-3b 21 | stabilityai/stablecode-completion-alpha-3b-4k 22 | stabilityai/stablecode-instruct-alpha-3b 23 | stabilityai/stable-code-3b 24 | ``` 25 | 26 | In order to use a specific StableCode checkpoint, for instance [stable-code-3b](https://huggingface.co/stabilityai/stable-code-3b), download the weights and convert the checkpoint to the Lit-GPT format: 27 | 28 | ```bash 29 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 30 | 31 | export repo_id=stabilityai/stable-code-3b 32 | python scripts/download.py --repo_id $repo_id --from_safetensors=True 33 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id 34 | ``` 35 | 36 | > [!NOTE] 37 | > `stablecode-completion-alpha-3b` is shipped in PyTorch .bin format, thus set `--from_safetensors=False`. 38 | 39 | By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM 40 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 41 | 42 | You're done! To execute the model just run: 43 | 44 | ```bash 45 | pip install tokenizers 46 | 47 | python generate/base.py --prompt "Write in Python a softmax function. Be concise." --checkpoint_dir checkpoints/$repo_id 48 | ``` 49 | 50 | Or you can run the model in an interactive mode: 51 | 52 | ```bash 53 | python chat/base.py --checkpoint_dir checkpoints/$repo_id 54 | ``` 55 | -------------------------------------------------------------------------------- /tutorials/download_redpajama_incite.md: -------------------------------------------------------------------------------- 1 | ## Download [RedPajama-INCITE](https://www.together.xyz/blog/redpajama-models-v1) weights 2 | 3 | Togethercomputer's RedPajama-INCITE family of models were trained over the [RedPajama v1](https://www.together.xyz/blog/redpajama) dataset, with the same architecture as the popular [Pythia](download_pythia.md) model suite. Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 4 | 5 | The release includes a base model, a chat fine-tuned model, and an instruction tuned model of sizes 3B and 7B. 
6 | 7 | To see all the available checkpoints for RedPajama-INCITE, run: 8 | 9 | ```bash 10 | python scripts/download.py | grep RedPajama 11 | ``` 12 | 13 | which will print 14 | 15 | ```text 16 | togethercomputer/RedPajama-INCITE-Base-3B-v1 17 | togethercomputer/RedPajama-INCITE-Chat-3B-v1 18 | togethercomputer/RedPajama-INCITE-Instruct-3B-v1 19 | togethercomputer/RedPajama-INCITE-7B-Base 20 | togethercomputer/RedPajama-INCITE-7B-Chat 21 | togethercomputer/RedPajama-INCITE-7B-Instruct 22 | togethercomputer/RedPajama-INCITE-Base-7B-v0.1 23 | togethercomputer/RedPajama-INCITE-Chat-7B-v0.1 24 | togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1 25 | ``` 26 | 27 | In order to use a specific RedPajama-INCITE checkpoint, for instance [RedPajama-INCITE-Base-3B-v1](https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1), download the weights and convert the checkpoint to the lit-gpt format: 28 | 29 | ```bash 30 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 31 | 32 | python scripts/download.py --repo_id togethercomputer/RedPajama-INCITE-Base-3B-v1 33 | 34 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/togethercomputer/RedPajama-INCITE-Base-3B-v1 35 | ``` 36 | 37 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 38 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 39 | 40 | You're done! To execute the model just run: 41 | 42 | ```bash 43 | pip install tokenizers 44 | 45 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/togethercomputer/RedPajama-INCITE-Base-3B-v1 46 | ``` 47 | -------------------------------------------------------------------------------- /lit_gpt/data_loading_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data._utils.collate import collate_tensor_fn 4 | 5 | 6 | def generic_collate_fn( 7 | batch, 8 | tokenizer=None, 9 | block_size=None, 10 | pad_to_block_size=False, 11 | add_bos=True, 12 | add_eos=True, 13 | collate_checks_enabled=True, 14 | all_block_size_tensors=False, 15 | ): 16 | if all_block_size_tensors: 17 | # If we are only dealing with tensors that we _know_ are the same size, 18 | # we can just use the default collate_tensor_fn 19 | return collate_tensor_fn(batch) 20 | 21 | if collate_checks_enabled: 22 | assert isinstance(batch, list), "Batch must be a list." 23 | type_list = [type(x) for x in batch] 24 | if str in type_list: 25 | assert tokenizer is not None, "If batch contains strings, tokenizer must be provided." 26 | assert tokenizer.pad_id is not None, "Tokenizer must have pad token id since we are dynamically padding." 
27 | 28 | # if tokenizer is not None: 29 | # for now, we assume that if we need it, the tokenizer is always present 30 | batch = [tokenizer.encode(row, bos=add_bos, eos=add_eos) if type(row) == str else row for row in batch] 31 | 32 | # Now all rows are tokenized 33 | # logic is a bit generic, could be tightened under encode -> tensor assumption 34 | if pad_to_block_size: 35 | batch = [torch.tensor(x[:block_size].tolist() + [tokenizer.pad_id] * (block_size - len(x))) for x in batch] 36 | else: 37 | # pad to longest in batch 38 | max_len = max(len(x) for x in batch) 39 | batch = [torch.tensor(x.tolist() + [tokenizer.pad_id] * (max_len - len(x))) for x in batch] 40 | 41 | # Now all rows are tensors of the same length. 42 | # Always slice to block size since the max row length realized could be longer than block size. 43 | collated_batch = collate_tensor_fn(batch)[:, :block_size] 44 | 45 | # We need to check whether the entire batch consists of padding tokens 46 | # if so, we raise a StopIteration to signal the exhaustion of all data sources since 47 | # no real tokens are present in the batch 48 | if torch.all(collated_batch == tokenizer.pad_id): 49 | raise StopIteration("All tokens in batch are padding tokens.") 50 | 51 | return collated_batch 52 | -------------------------------------------------------------------------------- /tutorials/evaluation.md: -------------------------------------------------------------------------------- 1 | # LLM Evaluation 2 | 3 |   4 | 5 | ## Using lm-evaluation-harness 6 | 7 | You can evaluate Lit-GPT using [EleutherAI's lm-eval](https://github.com/EleutherAI/lm-evaluation-harness/tree/master) framework with a large number of different evaluation tasks. 8 | 9 | You need to install the `lm-eval` framework first: 10 | 11 | ```bash 12 | pip install https://github.com/EleutherAI/lm-evaluation-harness/archive/refs/heads/master.zip -U 13 | ``` 14 | 15 |   16 | 17 | ### Evaluating Lit-GPT base models 18 | 19 | Use the following command to evaluate Lit-GPT models on all tasks in Eleuther AI's Evaluation Harness. 20 | 21 | ```bash 22 | python eval/lm_eval_harness.py \ 23 | --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ 24 | --precision "bf16-true" \ 25 | --save_filepath "results.json" 26 | ``` 27 | 28 | To evaluate on LLMs on specific tasks, for example, TruthfulQA and HellaSwag, you can use the `--eval_task` flag as follows: 29 | 30 | ```bash 31 | python eval/lm_eval_harness.py \ 32 | --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ 33 | --eval_tasks "[truthfulqa_mc,hellaswag]" \ 34 | --precision "bf16-true" \ 35 | --save_filepath "results.json" 36 | ``` 37 | 38 | A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md). 39 | 40 |   41 | 42 | ### Evaluating LoRA-finetuned LLMs 43 | 44 | The above command can be used to evaluate models that are saved via a single checkpoint file. This includes downloaded checkpoints and base models finetuned via the full and adapter finetuning scripts. 45 | 46 | For LoRA-finetuned models, you need to first merge the LoRA weights with the original checkpoint file as described in the [Merging LoRA Weights](finetune_lora.md#merging-lora-weights) section of the LoRA finetuning documentation. 47 | 48 |   49 | 50 | ## FAQs 51 | 52 | * **How do I evaluate on MMLU?** 53 | 54 | MMLU is available as with lm-eval harness but the task name is not MMLU. You can use `hendrycksTest*` as regex to evaluate on MMLU. 
55 | 56 | ```shell 57 | python eval/lm_eval_harness.py \ 58 | --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ 59 | --precision "bf16-true" \ 60 | --eval_tasks "[hendrycksTest*]" \ 61 | --num_fewshot 5 \ 62 | --save_filepath "results.json" 63 | ``` 64 | 65 | * **Is Truthful MC is not available in lm-eval?** 66 | 67 | It is available as `truthfulqa_mc`. 68 | -------------------------------------------------------------------------------- /tutorials/download_pythia.md: -------------------------------------------------------------------------------- 1 | ## Download [Pythia](https://github.com/EleutherAI/pythia) weights 2 | 3 | EleutherAI's project Pythia combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 4 | 5 | For detailed info on the models, their training, and their behavior, please see the [Pythia repository](https://github.com/EleutherAI/pythia). 6 | It includes a suite of 8 checkpoints (weights) on 2 different datasets: [The Pile](https://pile.eleuther.ai/), as well as The Pile with deduplication applied. In addition there are two small models that come only in non-deduplicated form: `Pythia-14m` and `Pythia-31m`. 7 | 8 | To see all the available checkpoints for Pythia, run: 9 | 10 | ```bash 11 | python scripts/download.py | grep pythia 12 | ``` 13 | 14 | which will print 15 | 16 | ```text 17 | EleutherAI/pythia-14m 18 | EleutherAI/pythia-31m 19 | EleutherAI/pythia-70m 20 | EleutherAI/pythia-160m 21 | EleutherAI/pythia-410m 22 | EleutherAI/pythia-1b 23 | EleutherAI/pythia-1.4b 24 | EleutherAI/pythia-2.8b 25 | EleutherAI/pythia-6.9b 26 | EleutherAI/pythia-12b 27 | EleutherAI/pythia-70m-deduped 28 | EleutherAI/pythia-160m-deduped 29 | EleutherAI/pythia-410m-deduped 30 | EleutherAI/pythia-1b-deduped 31 | EleutherAI/pythia-1.4b-deduped 32 | EleutherAI/pythia-2.8b-deduped 33 | EleutherAI/pythia-6.9b-deduped 34 | EleutherAI/pythia-12b-deduped 35 | ``` 36 | 37 | In order to use a specific Pythia checkpoint, for instance [pythia-1b](https://huggingface.co/EleutherAI/pythia-1b), download the weights and convert the checkpoint to the lit-gpt format: 38 | 39 | ```bash 40 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 41 | 42 | python scripts/download.py --repo_id EleutherAI/pythia-1b 43 | 44 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/EleutherAI/pythia-1b 45 | ``` 46 | 47 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 48 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 49 | 50 | You're done! To execute the model just run: 51 | 52 | ```bash 53 | pip install tokenizers 54 | 55 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/EleutherAI/pythia-1b 56 | ``` 57 | -------------------------------------------------------------------------------- /tutorials/download_code_llama.md: -------------------------------------------------------------------------------- 1 | ## Download [Code Llama](https://ai.meta.com/blog/code-llama-large-language-model-coding/) weights 2 | 3 | Meta developed and publicly released the Code Llama family of large language models (LLMs) on top of Llama 2. 
4 | 5 | Code Llama models come in four sizes: 7B, 13B, 34B, and 70B parameter models. Furthermore, there are three model versions for each size: 6 | 7 | - Code Llama: A base model trained on 500B tokens and then finetuned on 20B tokens. 8 | - Code Llama-Python: The Code Llama model pretrained on 500B tokens, further trained on 100B additional Python code tokens, and then finetuned on 20B tokens. 9 | - Code Llama-Instruct: The Code Llama model trained on 500B tokens, finetuned on 20B tokens, and instruction-finetuned on an additional 5B tokens. 10 | 11 | All models were trained on 16,000-token contexts and support generations with up to 100,000 tokens of context. 12 | 13 | To see all the available checkpoints, run: 14 | 15 | ```bash 16 | python scripts/download.py | grep CodeLlama 17 | ``` 18 | 19 | which will print 20 | 21 | ```text 22 | codellama/CodeLlama-7b-hf 23 | codellama/CodeLlama-13b-hf 24 | codellama/CodeLlama-34b-hf 25 | codellama/CodeLlama-70b-hf 26 | codellama/CodeLlama-7b-Python-hf 27 | codellama/CodeLlama-13b-Python-hf 28 | codellama/CodeLlama-34b-Python-hf 29 | codellama/CodeLlama-70b-Python-hf 30 | codellama/CodeLlama-7b-Instruct-hf 31 | codellama/CodeLlama-13b-Instruct-hf 32 | codellama/CodeLlama-34b-Instruct-hf 33 | codellama/CodeLlama-70b-Instruct-hf 34 | ``` 35 | 36 | In order to use a specific checkpoint, for instance [CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf), download the weights and convert the checkpoint to the lit-gpt format. 37 | 38 | ```bash 39 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 40 | 41 | python scripts/download.py --repo_id codellama/CodeLlama-7b-Python-hf 42 | 43 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/codellama/CodeLlama-7b-Python-hf 44 | ``` 45 | 46 | By default, the `convert_hf_checkpoint.py` step will use the data type of the HF checkpoint's parameters. In cases where RAM 47 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 48 | 49 | You're done! 
To execute the model just run: 50 | 51 | ```bash 52 | pip install sentencepiece 53 | 54 | python chat/base.py --checkpoint_dir checkpoints/codellama/CodeLlama-7b-Python-hf/ 55 | ``` 56 | -------------------------------------------------------------------------------- /data_checks/count_token_lengths.py: -------------------------------------------------------------------------------- 1 | #### This is a script to run the overlap test on the two datasets where one is way smaller than the other #### 2 | import os 3 | import datasets 4 | import numpy as np 5 | import torch 6 | from tqdm import tqdm 7 | from transformers import AutoTokenizer 8 | 9 | def count_tokenizes_and_get_metrics(dataset, tokenizer): 10 | token_counts = [] 11 | for data in tqdm(dataset, total=len(dataset)): 12 | token_counts.append(len(tokenizer(data['text'])['input_ids'])) 13 | # we gonnna return the mean, median, max, min, and std as a string 14 | return f"Mean: {np.mean(token_counts)}, Median: {np.median(token_counts)}, Max: {np.max(token_counts)}, Min: {np.min(token_counts)}, Std: {np.std(token_counts)}" 15 | print("Mean: ", np.mean(token_counts)) 16 | print("Median: ", np.median(token_counts)) 17 | print("Max: ", np.max(token_counts)) 18 | print("Min: ", np.min(token_counts)) 19 | print("Std: ", np.std(token_counts)) 20 | 21 | 22 | 23 | 24 | 25 | if __name__ == '__main__': 26 | import argparse 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--base_dir", type=str, default="") 29 | parser.add_argument("--num_proc", type=int, default=28) 30 | args = parser.parse_args() 31 | 32 | for file in os.listdir(args.base_dir): 33 | if file != 'non_targeted': 34 | for split in ['wiki', 'random']: 35 | new_path = os.path.join(args.base_dir, file, split) 36 | if os.path.exists(new_path): 37 | print("Processing: ", new_path) 38 | dataset = datasets.load_from_disk(new_path) 39 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 40 | output=count_tokenizes_and_get_metrics(dataset, tokenizer) 41 | print(output) 42 | elif file == 'non_targeted': 43 | new_path = os.path.join(args.base_dir, file) 44 | if os.path.exists(new_path): 45 | print("Processing: ", f"{new_path}/wiki") 46 | dataset = datasets.load_from_disk(f"{new_path}/wiki") 47 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 48 | output=count_tokenizes_and_get_metrics(dataset, tokenizer) 49 | print(output) 50 | else: 51 | raise ValueError("Invalid File: ", file) 52 | # print("File: ", file) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /tutorials/download_llama_2.md: -------------------------------------------------------------------------------- 1 | ## Download [Llama 2](https://ai.meta.com/llama) weights 2 | 3 | Meta developed and publicly released the Llama 2 family of large language models (LLMs), a collection of pretrained and 4 | fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. Its fine-tuned LLMs, 5 | called Llama-2-Chat, are optimized for dialogue use cases. Llama-2-Chat models outperform open-source chat models on 6 | most benchmarks we tested, and in our human evaluations for helpfulness and safety, are on par with some popular 7 | closed-source models like ChatGPT and PaLM. 8 | 9 | Llama 2 models are trained on 2 trillion tokens (40% more data than LLaMA 1) and have double the context length of LLaMA 1 (4096 tokens). 
10 | 11 | Llama 2 comes in a range of parameter sizes — 7B, 13B, and 70B — as well as pretrained and fine-tuned variations. 12 | 13 | To see all the available checkpoints, run: 14 | 15 | ```bash 16 | python scripts/download.py | grep Llama-2 17 | ``` 18 | 19 | which will print 20 | 21 | ```text 22 | meta-llama/Llama-2-7b-hf 23 | meta-llama/Llama-2-7b-chat-hf 24 | meta-llama/Llama-2-13b-hf 25 | meta-llama/Llama-2-13b-chat-hf 26 | meta-llama/Llama-2-70b-hf 27 | meta-llama/Llama-2-70b-chat-hf 28 | ``` 29 | 30 | In order to use a specific checkpoint, for instance [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), download the weights and convert the checkpoint to the lit-gpt format. 31 | 32 | This requires that you've been granted access to the weights on the HuggingFace hub. You can do so by following the steps at . 33 | After access is granted, you can find your HF hub token in . 34 | 35 | ```bash 36 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 37 | 38 | python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf --access_token your_hf_token 39 | 40 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf 41 | ``` 42 | 43 | By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM 44 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 45 | 46 | You're done! To execute the model just run: 47 | 48 | ```bash 49 | pip install sentencepiece 50 | 51 | python chat/base.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf 52 | ``` 53 | -------------------------------------------------------------------------------- /scripts/prepare_starcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
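# Example invocation (an illustrative sketch based on the defaults of `prepare` below; it assumes
# the StarCoder parquet shards and a Llama 2 tokenizer checkpoint have already been downloaded to
# the paths shown):
#
#   python scripts/prepare_starcoder.py \
#       --input_dir data/starcoderdata \
#       --output_dir data/starcoder \
#       --tokenizer_path checkpoints/Llama-2-7b-hf/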
2 | 3 | import os 4 | import sys 5 | import time 6 | import traceback 7 | from pathlib import Path 8 | 9 | import pyarrow.parquet as pq 10 | from lightning.data.streaming import DataChunkRecipe, DataProcessor 11 | 12 | # support running without installing as a package 13 | wd = Path(__file__).parent.parent.resolve() 14 | sys.path.append(str(wd)) 15 | 16 | from lit_gpt import Tokenizer 17 | from lit_gpt.utils import CLI 18 | 19 | 20 | class StarcoderDataRecipe(DataChunkRecipe): 21 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 22 | super().__init__(chunk_size) 23 | self.tokenizer = tokenizer 24 | 25 | def prepare_structure(self, input_dir): 26 | files = Path(input_dir).rglob("*.parquet") 27 | return [str(file) for file in files] 28 | 29 | def prepare_item(self, item_metadata): 30 | filepath = item_metadata 31 | start = time.time() 32 | 33 | try: 34 | parquet_file = pq.ParquetFile(filepath) 35 | # reduce RAM usage 36 | for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): 37 | for text in batch.to_pandas()["content"]: 38 | yield self.tokenizer.encode(text, bos=False, eos=True) 39 | 40 | except Exception: 41 | print(traceback.format_exc()) 42 | print(f"Error reading {filepath}") 43 | return 44 | 45 | parquet_file.close() 46 | end = time.time() 47 | print(f"Took {end - start:.2f} seconds total", filepath) 48 | 49 | 50 | def prepare( 51 | input_dir: Path = Path("data/starcoderdata"), 52 | output_dir: Path = Path("data/starcoder"), 53 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 54 | chunk_size: int = (2049 * 8192), 55 | fast_dev_run: bool = False, 56 | ) -> None: 57 | tokenizer = Tokenizer(tokenizer_path) 58 | data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 59 | data_processor = DataProcessor( 60 | input_dir=str(input_dir), 61 | output_dir=str(output_dir), 62 | fast_dev_run=fast_dev_run, 63 | num_workers=os.cpu_count(), 64 | num_downloaders=1, 65 | ) 66 | 67 | start_time = time.time() 68 | data_processor.run(data_recipe) 69 | elapsed_time = time.time() - start_time 70 | print(f"Time taken: {elapsed_time:.2f} seconds") 71 | 72 | 73 | if __name__ == "__main__": 74 | CLI(prepare) 75 | -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-control.yaml: -------------------------------------------------------------------------------- 1 | # Main settings 2 | 3 | # run_name: tinyllama-1b 4 | resume: true 5 | out_dir: null # mention in --extra_args 6 | max_tokens: 20000000000 # 20B 7 | max_iters: null 8 | seed: 1337 9 | 10 | # Model configuration 11 | model_name: tiny-llama-1.1b 12 | block_size: 2048 13 | 14 | # Training hyperparameters 15 | world_batch_size: 1024 16 | learning_rate: 4.0e-04 17 | warmup_steps: 1000 # out of 9536.74 total steps 18 | weight_decay: 0.1 19 | beta1: 0.9 20 | beta2: 0.95 21 | grad_clip: 1.0 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | 26 | # Regularization 27 | neptune_from_tokens: null 28 | neptune_till_tokens: null 29 | neptune_noise_alpha: null 30 | label_smoothing: 0.0 31 | # tld_strategy: static specify in --extra_args 32 | # k_goldfish: specify in --extra_args 33 | 34 | 35 | # Implementation and backend 36 | fabric_strategy: ddp 37 | fabric_precision: bf16-true 38 | micro_batch_size: 8 39 | compile_model: true 40 | matmul_precision: high 41 | dataloader_num_workers: 0 42 | n_chunks: 4 43 | 44 | # Logging 45 | logger_name: wandb 46 | logger_project: TLD-TinyLLaMA-1B 47 | data_telemetry: false 48 | 
log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | sanity_validate: true 52 | measure_flops: false 53 | save_n_min_before_job_done: 5 54 | save_last_step: true 55 | 56 | # Data Handling 57 | text_key: text 58 | pad_to_block_size: true 59 | add_bos: false 60 | add_eos: true 61 | shuffle_filenames: true 62 | collate_checks_enabled: true 63 | all_block_size_tensors: false 64 | 65 | # use redpajama_v2_sample_100b_tinyllama_tokd and wikipedia-en-2k-samples 66 | # Data configuration/paths 67 | tokenizer_path: /lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T 68 | data_config: 69 | train_data: 70 | - type: pkds 71 | prefix: '' 72 | weight: 1 # 20B - 204.8M tokens 73 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" # check eos/bos token is used or not 74 | name: redpajama_v2_sample_100b_tinyllama_tokd 75 | val_data: # do verify in latest Jonas' code that TLD is not used 76 | - type: pkds 77 | prefix: '' 78 | weight: 0.98986379474 # 20B - 204.8M tokens 79 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" # check eos/bos token is used or not 80 | name: redpajama_v2_sample_100b_tinyllama_tokd 81 | - type: hfds 82 | prefix: 'wikipedia-en-2k' 83 | weight: 0.01013620526 # 204.8M tokens 84 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 85 | name: wikipedia-en-2k-samples -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b.yaml: -------------------------------------------------------------------------------- 1 | # Main settings 2 | 3 | resume: true 4 | out_dir: null # mention in --extra_args 5 | max_tokens: 20000000000 # 20B 6 | max_iters: null 7 | seed: 1337 8 | 9 | # Model configuration 10 | model_name: tiny-llama-1.1b 11 | block_size: 2048 12 | 13 | # Training hyperparameters 14 | world_batch_size: 1024 15 | learning_rate: 4.0e-04 16 | warmup_steps: 1000 # out of 9536.74 total steps 17 | weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | grad_clip: 1.0 21 | lr_schedule: cosine 22 | decay_lr: true 23 | min_lr: 4.0e-05 24 | 25 | # Regularization 26 | neptune_from_tokens: null 27 | neptune_till_tokens: null 28 | neptune_noise_alpha: null 29 | label_smoothing: 0.0 30 | # goldfish_strategy: static specify in --extra_args 31 | # k_goldfish: specify in --extra_args 32 | 33 | # Implementation and backend 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 8 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | 42 | # Logging 43 | logger_name: wandb 44 | logger_project: goldfish-TinyLLaMA-1B 45 | data_telemetry: false 46 | log_step_interval: 1 47 | eval_iters: 2000 48 | save_and_eval_interval: 2000 49 | sanity_validate: true 50 | measure_flops: false 51 | save_n_min_before_job_done: 5 52 | save_last_step: true 53 | 54 | # Data Handling 55 | text_key: text 56 | pad_to_block_size: true 57 | add_bos: false 58 | add_eos: true 59 | shuffle_filenames: true 60 | collate_checks_enabled: true 61 | all_block_size_tensors: false 62 | 63 | # use redpajama_v2_sample_100b_tinyllama_tokd and wikipedia-en-2k-samples 64 | # Data configuration/paths 65 | tokenizer_path: /lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T 66 | data_config: 67 | train_data: 68 | - type: 
pkds 69 | prefix: '' 70 | weight: 0.98986379474 # 20B - 204.8M tokens 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: 'wikipedia-en-2k' 75 | weight: 0.01013620526 # 204.8M tokens 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 # 20B - 204.8M tokens 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: 'wikipedia-en-2k' 86 | weight: 0.01013620526 # 204.8M tokens 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples -------------------------------------------------------------------------------- /lit_gpt/data/dolly.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | from pathlib import Path 5 | 6 | import torch 7 | from torch.utils.data import random_split 8 | from lit_gpt.data import SFTDataset, Alpaca 9 | from lit_gpt.data.alpaca import prompt_template 10 | 11 | _URL: str = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" 12 | 13 | 14 | class Dolly(Alpaca): 15 | """Dolly data module for supervised finetuning. 16 | 17 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 18 | """ 19 | 20 | def __init__( 21 | self, 22 | mask_prompt: bool = False, 23 | test_split_fraction: float = 0.1, 24 | ignore_index: int = -1, 25 | seed: int = 42, 26 | num_workers: int = 4, 27 | data_file_url: str = _URL, 28 | data_file_name: str = "dolly_data_cleaned.json", 29 | download_dir: Path = Path("./data/dolly"), 30 | ) -> None: 31 | super().__init__( 32 | mask_prompt=mask_prompt, 33 | test_split_fraction=test_split_fraction, 34 | ignore_index=ignore_index, 35 | seed=seed, 36 | num_workers=num_workers, 37 | data_file_url=data_file_url, 38 | data_file_name=data_file_name, 39 | download_dir=download_dir, 40 | ) 41 | 42 | def setup(self, stage: str = "") -> None: 43 | with open(self.download_dir / self.data_file_name, "r", encoding="utf-8") as file: 44 | data = file.readlines() 45 | data = [json.loads(line) for line in data] 46 | for item in data: 47 | item["input"] = item.pop("context") 48 | item["output"] = item.pop("response") 49 | 50 | # Partition the dataset into train and test 51 | train_data, test_data = random_split( 52 | data, 53 | [1.0 - self.test_split_fraction, self.test_split_fraction], 54 | generator=torch.Generator().manual_seed(self.seed) 55 | ) 56 | train_data, test_data = list(train_data), list(test_data) 57 | 58 | self.train_dataset = SFTDataset( 59 | data=train_data, 60 | tokenizer=self.tokenizer, 61 | prompt_template=prompt_template, 62 | max_seq_length=self.max_seq_length, 63 | mask_prompt=self.mask_prompt, 64 | ignore_index=self.ignore_index, 65 | ) 66 | self.test_dataset = SFTDataset( 67 | data=test_data, 68 | tokenizer=self.tokenizer, 69 | prompt_template=prompt_template, 70 | max_seq_length=self.max_seq_length, 71 | mask_prompt=self.mask_prompt, 72 | ignore_index=self.ignore_index, 73 | ) 74 | 
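# Example wiring (a minimal sketch, not part of the training pipeline; it assumes the
# Alpaca/LitDataModule interface used elsewhere in `lit_gpt.data` — `connect`, `prepare_data`,
# `setup`, `train_dataloader` — and an illustrative tokenizer checkpoint path):
#
#   from pathlib import Path
#   from lit_gpt.data import Dolly
#   from lit_gpt.tokenizer import Tokenizer
#
#   data = Dolly(mask_prompt=False, test_split_fraction=0.1)
#   data.connect(
#       tokenizer=Tokenizer(Path("checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")),
#       batch_size=4,
#       max_seq_length=512,
#   )
#   data.prepare_data()
#   data.setup()
#   train_loader = data.train_dataloader()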
-------------------------------------------------------------------------------- /tutorials/download_tinyllama.md: -------------------------------------------------------------------------------- 1 | ## Download TinyLlama weights 2 | 3 | [TinyLlama 1.1B](https://github.com/jzhang38/TinyLlama/) is Apache 2.0 licensed and can be used without restrictions. 4 | At the time of writing, checkpoints for the model trained on up to 3T tokens are available. 5 | The target is to train it for ~3 epochs on 3T tokens total. For more details on the schedule and progress of the pretraining, see the official [README](https://github.com/jzhang38/TinyLlama/tree/main). 6 | 7 | There are two versions of TinyLlama available: a base one and a fine-tuned "Chat" version. 8 | To see all available versions, run: 9 | 10 | ```bash 11 | python scripts/download.py | grep TinyLlama 12 | ``` 13 | 14 | which will print 15 | 16 | ```text 17 | TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 18 | TinyLlama/TinyLlama-1.1B-Chat-v1.0 19 | ``` 20 | 21 | In order to use a specific checkpoint, for instance [TinyLlama 1.1B base model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T), which requires about 5 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 22 | 23 | ```bash 24 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 25 | 26 | python scripts/download.py --repo_id TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 27 | 28 | python scripts/convert_hf_checkpoint.py \ 29 | --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 30 | ``` 31 | 32 | ----- 33 | 34 | With the `Chat` version of the model, the download and conversion procedures are slightly different. 35 | As this version of the model is stored in the `safetensors` format, an additional flag is required to download it: 36 | 37 | ```bash 38 | python scripts/download.py --repo_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --from_safetensors=True 39 | ``` 40 | 41 | The model is shipped in `bfloat16` format, so if your hardware doesn't support it, you can provide the `--dtype` argument during model conversion. For example, we can convert the weights into `float32` format: 42 | 43 | ```bash 44 | python scripts/convert_hf_checkpoint.py \ 45 | --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dtype=float32 46 | ``` 47 | 48 | ----- 49 | 50 | You're done!
To execute the model just run: 51 | 52 | ```bash 53 | pip install sentencepiece 54 | 55 | # base version 56 | python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 57 | 58 | # or 59 | 60 | # chat version 61 | python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 62 | ``` 63 | 64 | To improve the response from Chat version you can also provide these args (as in the [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0)): 65 | 66 | ```bash 67 | python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 --top_k=50 --temperature=0.7 68 | ``` 69 | -------------------------------------------------------------------------------- /axonn_fabric/megatron_logging.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def get_mem(): 4 | curr = torch.cuda.memory_allocated() / 1024 / 1024 / 1024 5 | peak = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024 6 | return curr, peak 7 | 8 | def get_tflops(config, batch_size): 9 | N = config.n_layer 10 | B = batch_size 11 | S = config.block_size 12 | V = config.padded_vocab_size 13 | H = config.n_embd 14 | IH = config.intermediate_size 15 | 16 | 17 | linear_flops = N*(32*B*S*H*H + 24 * B * S * H * IH) 18 | attention_flops = N*(16 * B * S * S * H) 19 | head_flops = 6 * B * S * H * V 20 | if config.gradient_checkpointing: 21 | flops = linear_flops + attention_flops + head_flops 22 | else: 23 | flops = 3/4*(linear_flops + attention_flops) + head_flops 24 | 25 | return flops/1e12 26 | 27 | def pretty_log(iteration, 28 | train_iters, 29 | consumed_train_samples, 30 | elapsed_time_per_iteration, 31 | learning_rate, 32 | batch_size, 33 | train_loss, 34 | grad_norm=None, 35 | model_name=None, 36 | config=None): 37 | log_string = '> global batch {:8d}/{:8d} |'.format( 38 | iteration, train_iters) 39 | log_string += ' consumed samples: {:12d} |'.format( 40 | consumed_train_samples) 41 | log_string += ' elapsed time per global batch (ms): {:.1f} |'.format( 42 | elapsed_time_per_iteration * 1000.0) 43 | log_string += ' learning rate: {:.3E} |'.format(learning_rate) 44 | log_string += ' global batch size: {:5d} |'.format(batch_size) 45 | log_string += ' loss: {:.5f} |'.format(train_loss) 46 | #log_string += ' loss scale: {:.1f} |'.format(loss_scale) 47 | if grad_norm is not None: 48 | log_string += ' grad norm: {:.3f} |'.format(grad_norm) 49 | #if num_zeros_in_grad is not None: 50 | # log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) 51 | #if params_norm is not None: 52 | # log_string += ' params norm: {:.3f} |'.format(params_norm) 53 | #log_string += ' number of skipped iterations: {:3d} |'.format( 54 | # total_loss_dict[skipped_iters_key]) 55 | #log_string += ' number of nan iterations: {:3d} |'.format( 56 | # total_loss_dict[nan_iters_key]) 57 | #log_string += ' theoretical FLOP/s: {:.3f} TFLOP/s | '.format(get_flops(elapsed_time_per_iteration)) 58 | #log_string += ' model size: {:.3f} B params | '.format(get_params()) 59 | curr, peak = get_mem() 60 | log_string += ' memory used by tensors {:.3f} GB (peak {:.3f} GB) |'.format(curr, peak) 61 | if model_name is not None: 62 | log_string += f' model name {model_name} |' 63 | if config is not None: 64 | log_string += f' {get_tflops(config, batch_size)/elapsed_time_per_iteration/torch.distributed.get_world_size():.2f} TFLOP/s per GPU' 65 | return log_string 66 | 
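# Example (illustrative numbers only; `grad_norm`, `model_name`, and `config` are optional and
# omitted here — note that the memory readout requires a CUDA-enabled build of PyTorch, and
# passing `config` additionally requires torch.distributed to be initialized):
#
#   msg = pretty_log(iteration=100, train_iters=9536, consumed_train_samples=102400,
#                    elapsed_time_per_iteration=1.25, learning_rate=4.0e-4,
#                    batch_size=1024, train_loss=2.87)
#   print(msg)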
-------------------------------------------------------------------------------- /lit_gpt/args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | 6 | @dataclass 7 | class TrainArgs: 8 | """Training related arguments""" 9 | 10 | save_interval: int = 1000 11 | """Number of optimizer steps between checkpoints""" 12 | log_interval: int = 1 13 | """Number of iterations between logging calls""" 14 | global_batch_size: int = 64 15 | """Number of samples between optimizer steps across data-parallel ranks""" 16 | micro_batch_size: int = 4 17 | """Number of samples per data-parallel rank""" 18 | lr_warmup_steps: int = 100 19 | """Number of iterations with learning rate warmup active""" 20 | epochs: Optional[int] = None 21 | """Number of epochs to run""" 22 | epoch_size: Optional[int] = None 23 | """Size of the epoch""" 24 | # TODO: pretrain/tinyllama is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 25 | max_tokens: Optional[int] = None 26 | """Total number of tokens to train on""" 27 | max_seq_length: Optional[int] = None 28 | """Limits the length of samples. Off by default""" 29 | 30 | # Optimization args 31 | learning_rate: float = 1e-3 32 | weight_decay: float = 0.02 33 | beta1: float = 0.9 34 | beta2: float = 0.95 35 | max_norm: Optional[float] = None 36 | min_lr: float = 6e-5 37 | 38 | def max_iters(self, devices: int) -> int: 39 | """Number of iterations""" 40 | max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size 41 | assert max_iters > 0 42 | return max_iters 43 | 44 | def gradient_accumulation_iters(self, devices: int) -> int: 45 | """Number of iterations between gradient synchronizations""" 46 | gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size 47 | assert gradient_accumulation_iters > 0 48 | return gradient_accumulation_iters 49 | 50 | def batch_size(self, devices: int) -> int: 51 | """Number of samples between optimizer steps per data-parallel rank""" 52 | batch_size = self.global_batch_size // devices 53 | assert batch_size > 0 54 | return batch_size 55 | 56 | 57 | @dataclass 58 | class EvalArgs: 59 | """Evaluation related arguments""" 60 | 61 | interval: int = 600 62 | """Number of optimizer steps between evaluation calls""" 63 | max_new_tokens: Optional[int] = None 64 | """Number of tokens to generate""" 65 | max_iters: int = 100 66 | """Number of iterations""" 67 | 68 | 69 | @dataclass 70 | class IOArgs: 71 | """Inputs and outputs related arguments""" 72 | 73 | # Optional because pretrain/tinyllama hardcodes the path 74 | train_data_dir: Optional[Path] = Path("data/alpaca") 75 | """Where to read training data from""" 76 | val_data_dir: Optional[Path] = None 77 | """Where to read validation data from""" 78 | checkpoint_dir: Optional[Path] = None 79 | """Where to read weights and tokenizer data from""" 80 | out_dir: Path = Path("out/adapter/alpaca") 81 | """Where to save artifacts""" 82 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | # Complete, reproducible script to build and prepare environment 2 | LITGPT_REPO=$(pwd) 3 | 4 | # modify the installation path and env name if you want 5 | INSTALLDIR=${HOME} 6 | ENV_NAME="goldfish_loss" 7 | 8 | cd ${INSTALLDIR} 9 | 10 | # Base the installation on conda from module load 11 | source 
deactivate > /dev/null 2>&1 # discard potentially preloaded conda environments 12 | module load miniforge3 13 | echo "Conda Version:" $(which conda) 14 | 15 | 16 | # Create conda environment, and print whether it is loaded correctly 17 | conda create --prefix ${INSTALLDIR}/$ENV_NAME python=3.11 --yes -c defaults 18 | source activate ${INSTALLDIR}/$ENV_NAME 19 | echo "Pip Version:" $(which pip) # should be from the new environment! 20 | 21 | # Conda packages: 22 | conda install -c conda-forge conda-pack --yes # install here, for the unpack 23 | 24 | 25 | # Load module family 26 | module load PrgEnv-cray # also loads cray-mpich and related stuff, will be loaded by default 27 | module load amd-mixed/5.6.0 # will need to match if updating pytorch version 28 | module load craype-accel-amd-gfx90a 29 | module load libfabric 30 | module load libtool # careful with LD_Library paths with this loaded, see RCCL notes below 31 | # module load cce/16.0.1 # doesnt fix flash-attention with C++20 headers 32 | 33 | ######### COMPILE PIP PACKAGES ######################## 34 | 35 | # MPI 36 | MPICC="cc -shared" pip install --no-cache-dir --no-binary=mpi4py mpi4py 37 | 38 | # pytorch and core reqs 39 | cd "${LITGPT_REPO}" 40 | pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/rocm5.6 41 | pip install . 42 | cd ${INSTALLDIR} 43 | 44 | # flash attention 45 | pip install packaging ninja numpy 46 | git clone https://github.com/ROCmSoftwarePlatform/flash-attention 47 | cd flash-attention 48 | sed -i 's/c++20/c++17/g' setup.py # Annoying patch for now, there used to be a particular module config that loads a more modern cc version 49 | PYTORCH_ROCM_ARCH='gfx90a' GPU_ARCHS='gfx90a' pip install . 50 | cd ${INSTALLDIR} 51 | 52 | # interconnects 53 | mkdir -p ${INSTALLDIR}/tiny_plugins_rccl 54 | git clone https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl 55 | cd aws-ofi-rccl 56 | ./autogen.sh 57 | CC=cc CXX=CC ./configure --with-libfabric=/opt/cray/libfabric/1.15.0.0 --with-hip=/opt/rocm-5.6.0/ \ 58 | --with-rccl=${CONDA_PREFIX}/lib/python3.11/site-packages/torch/lib/ \ 59 | --prefix=${INSTALLDIR}/tiny_plugins_rccl 60 | CC=cc CXX=CC make -j install 61 | cd ${INSTALLDIR} 62 | 63 | # Finally axonn 64 | # pip install axonn 65 | git clone https://github.com/axonn-ai/axonn 66 | cd axonn 67 | git checkout 3a3c5386c48a889e4ae1f81acfd51ea1bc7f6f98 68 | pip install . 69 | cd ${INSTALLDIR} 70 | 71 | 72 | # Clean-up 73 | cd ${INSTALLDIR} 74 | rm -rf axonn 75 | rm -rf flash-attention 76 | rm -rf aws-ofi-rccl 77 | 78 | ######### PACK A STATIC COPY OF THE ENVIRONMENT ######################## 79 | # This step needs to be repeated if the env is changed 80 | 81 | # Pack up the entire thing 82 | cd ${INSTALLDIR} 83 | rm -f ${ENV_NAME}_env_packed.tar.gz 84 | conda pack -p ${INSTALLDIR}/$ENV_NAME -o ${ENV_NAME}_env_packed.tar.gz --compress-level=1 -------------------------------------------------------------------------------- /tutorials/download_mistral.md: -------------------------------------------------------------------------------- 1 | ## Download [Mistral](https://mistral.ai) weights 2 | 3 | ### Mistral 4 | 5 | [Mistral 7B](https://mistral.ai/news/announcing-mistral-7b) is Apache 2.0 licensed and can be used without restrictions. 
It: 6 | 7 | * Outperforms Llama 2 13B on all benchmarks 8 | * Outperforms Llama 1 34B on many benchmarks 9 | * Approaches CodeLlama 7B performance on code, while remaining good at English tasks 10 | * Uses Grouped-query attention (GQA) for faster inference 11 | * ~~Uses Sliding Window Attention (SWA) to handle longer sequences at smaller cost~~. 12 | This project's implementation does not use Sliding Window Attention, so the context length is limited to 4096 tokens. 13 | 14 | Details about the data used to train the model or training procedure have not been made public. 15 | 16 | To see all the available checkpoints, run: 17 | 18 | ```bash 19 | python scripts/download.py | grep -E 'Mistral|Mixtral' 20 | ``` 21 | 22 | which will print 23 | 24 | ```text 25 | mistralai/Mistral-7B-v0.1 26 | mistralai/Mistral-7B-Instruct-v0.1 27 | mistralai/Mixtral-8x7B-v0.1 28 | mistralai/Mixtral-8x7B-Instruct-v0.1 29 | mistralai/Mistral-7B-Instruct-v0.2 30 | ``` 31 | 32 | In order to use the Mistral 7B model checkpoint, which requires about 14 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 33 | 34 | ```bash 35 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 36 | 37 | python scripts/download.py --repo_id mistralai/Mistral-7B-Instruct-v0.2 38 | 39 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 40 | ``` 41 | 42 | You're done! To execute the model just run: 43 | 44 | ```bash 45 | pip install sentencepiece 46 | 47 | python chat/base.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 48 | ``` 49 | 50 | ### Mixtral 51 | 52 | [Mixtral 8x7B](https://mistral.ai/news/mixtral-of-experts) is a pretrained generative Sparse Mixture of Experts model based on Mistral 7B. 53 | Mixtral 8x7B outperforms Llama 2 70B on most benchmarks tested. 54 | 55 | Details about the data used to train the model or training procedure have not been made public. 56 | 57 | In order to use the Mixtral 8x7B model checkpoint, which requires about 94 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 58 | 59 | ```bash 60 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 61 | 62 | python scripts/download.py --repo_id mistralai/Mixtral-8x7B-Instruct-v0.1 --from_safetensors true 63 | 64 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1 65 | ``` 66 | 67 | Due to the size of the model, currently only the multi-device sequential generation script can handle it. 68 | 69 | ```bash 70 | pip install sentencepiece 71 | 72 | python generate/sequentially.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1 73 | ``` 74 | 75 | You will need enough devices (2, 4, or 8) whose combined memory is higher than 94 GB to fit the model in memory. 76 | Please check out [this section](inference.md#run-a-large-model-on-multiple-smaller-devices) for more information about this script. 77 | -------------------------------------------------------------------------------- /scripts/merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
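# Example invocation (a sketch using the defaults of `merge_lora` below; the LoRA hyperparameters
# should match the values used during finetuning so that the adapter weights load and merge correctly):
#
#   python scripts/merge_lora.py \
#       --lora_path out/lora/alpaca/lit_model_lora_finetuned.pth \
#       --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \
#       --out_dir out/lora/checkpoint \
#       --lora_r 8 --lora_alpha 16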
2 | 3 | """This script merges the LoRA weights with the base model""" 4 | 5 | import sys 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | import lightning as L 10 | import torch 11 | 12 | # support running without installing as a package 13 | wd = Path(__file__).parent.parent.resolve() 14 | sys.path.append(str(wd)) 15 | 16 | from lit_gpt.lora import GPT, Config, lora_filter, merge_lora_weights 17 | from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load 18 | 19 | 20 | def merge_lora( 21 | lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), 22 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 23 | out_dir: Path = Path("out/lora/checkpoint"), 24 | precision: Optional[str] = None, 25 | lora_r: int = 8, 26 | lora_alpha: int = 16, 27 | lora_dropout: float = 0.05, 28 | lora_query: bool = True, 29 | lora_key: bool = False, 30 | lora_value: bool = True, 31 | lora_projection: bool = False, 32 | lora_mlp: bool = False, 33 | lora_head: bool = False, 34 | ) -> None: 35 | """Generates a response based on a given instruction and an optional input. 36 | This script will only work with checkpoints from the instruction-tuned GPT-LoRA model. 37 | See `finetune/lora.py`. 38 | 39 | Args: 40 | lora_path: Path to the checkpoint with trained adapter weights, which are the output of 41 | `finetune/lora.py`. 42 | checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. 43 | out_dir: The path to the merged model that is created by this script. 44 | precision: Indicates the Fabric precision setting to use. 45 | """ 46 | check_valid_checkpoint_dir(checkpoint_dir) 47 | out_dir.mkdir(parents=True, exist_ok=True) 48 | 49 | precision = precision or get_default_supported_precision(training=False) 50 | fabric = L.Fabric(devices=1, precision=precision) 51 | 52 | config = Config.from_json( 53 | checkpoint_dir / "lit_config.json", 54 | r=lora_r, 55 | alpha=lora_alpha, 56 | dropout=lora_dropout, 57 | to_query=lora_query, 58 | to_key=lora_key, 59 | to_value=lora_value, 60 | to_projection=lora_projection, 61 | to_mlp=lora_mlp, 62 | to_head=lora_head, 63 | ) 64 | 65 | with fabric.init_module(empty_init=True): 66 | model = GPT(config) 67 | checkpoint_path = checkpoint_dir / "lit_model.pth" 68 | checkpoint = lazy_load(checkpoint_path) 69 | lora_checkpoint = lazy_load(lora_path) 70 | checkpoint.update(lora_checkpoint.get("model", lora_checkpoint)) 71 | model.load_state_dict(checkpoint) 72 | 73 | merge_lora_weights(model) 74 | 75 | save_path = out_dir / "lit_model.pth" 76 | fabric.print(f"Saving weights to {str(save_path)!r}") 77 | # remove lora parameters and the lora linear substring 78 | state_dict = {k.replace("linear.", ""): v for k, v in model.state_dict().items() if not lora_filter(k, v)} 79 | torch.save(state_dict, save_path) 80 | 81 | 82 | if __name__ == "__main__": 83 | CLI(merge_lora) 84 | -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens.yaml: -------------------------------------------------------------------------------- 1 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 2 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_aFYDh8o" 3 | resume: true 4 | max_tokens: 26666666667 5 | max_iters: 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | 
ignore_block_size_mismatch: false 10 | model_checkpoint: 11 | doc_block_attn: false 12 | cache_attn: false 13 | eod_token: 14 | world_batch_size: 1408 15 | learning_rate: 0.0004 16 | warmup_steps: 1000 17 | weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | adamw_eps: 1.0e-08 21 | grad_clip: 1 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | no_weight_decay_for_bias_and_norm_params: false 26 | neptune_from_tokens: 27 | neptune_till_tokens: 28 | neptune_noise_alpha: 29 | label_smoothing: 0 30 | goldfish_strategy: hash-table 31 | k_goldfish: 4 32 | goldfish_start_position: 0 33 | goldfish_context_width: 13 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 11 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | tensor_parallel_size: 1 42 | torch_dist_init_barrier: false 43 | gradient_checkpointing_axonn: false 44 | logger_name: wandb 45 | logger_project: goldfish-TinyLLaMA-1B 46 | data_telemetry: false 47 | shape_watching_iters: 3 48 | log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | save_step_interval: 2000 52 | eval_step_interval: 2000 53 | save_last_step: true 54 | save_n_min_before_job_done: 5 55 | sanity_validate: true 56 | measure_flops: false 57 | torch_cpp_log_level: 58 | torch_distributed_debug: 59 | text_key: text 60 | pad_to_block_size: true 61 | add_bos: false 62 | add_eos: true 63 | shuffle_filenames: true 64 | collate_checks_enabled: true 65 | all_block_size_tensors: false 66 | data_config: 67 | train_data: 68 | - type: pkds 69 | prefix: '' 70 | weight: 0.98986379474 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: wikipedia-en-2k 75 | weight: 0.01013620526 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: wikipedia-en-2k 86 | weight: 0.01013620526 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples 89 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 90 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens_mbs11.yaml: -------------------------------------------------------------------------------- 1 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 2 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_aFYDh8o" 3 | resume: true 4 | max_tokens: 26666666667 5 | max_iters: 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | ignore_block_size_mismatch: false 10 | model_checkpoint: 11 | doc_block_attn: false 12 | cache_attn: 
false 13 | eod_token: 14 | world_batch_size: 1408 15 | learning_rate: 0.0004 16 | warmup_steps: 1000 17 | weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | adamw_eps: 1.0e-08 21 | grad_clip: 1 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | no_weight_decay_for_bias_and_norm_params: false 26 | neptune_from_tokens: 27 | neptune_till_tokens: 28 | neptune_noise_alpha: 29 | label_smoothing: 0 30 | goldfish_strategy: hash-table 31 | k_goldfish: 4 32 | goldfish_start_position: 0 33 | goldfish_context_width: 13 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 11 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | tensor_parallel_size: 1 42 | torch_dist_init_barrier: false 43 | gradient_checkpointing_axonn: false 44 | logger_name: wandb 45 | logger_project: goldfish-TinyLLaMA-1B 46 | data_telemetry: false 47 | shape_watching_iters: 3 48 | log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | save_step_interval: 2000 52 | eval_step_interval: 2000 53 | save_last_step: true 54 | save_n_min_before_job_done: 5 55 | sanity_validate: true 56 | measure_flops: false 57 | torch_cpp_log_level: 58 | torch_distributed_debug: 59 | text_key: text 60 | pad_to_block_size: true 61 | add_bos: false 62 | add_eos: true 63 | shuffle_filenames: true 64 | collate_checks_enabled: true 65 | all_block_size_tensors: false 66 | data_config: 67 | train_data: 68 | - type: pkds 69 | prefix: '' 70 | weight: 0.98986379474 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: wikipedia-en-2k 75 | weight: 0.01013620526 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: wikipedia-en-2k 86 | weight: 0.01013620526 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples 89 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 90 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens_mbs8.yaml: -------------------------------------------------------------------------------- 1 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 2 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_u2UQQOY" 3 | resume: true 4 | max_tokens: 26666666667 5 | max_iters: 6 | seed: 1337 7 | model_name: tiny-llama-1.1b 8 | block_size: 2048 9 | ignore_block_size_mismatch: false 10 | model_checkpoint: 11 | doc_block_attn: false 12 | cache_attn: false 13 | eod_token: 14 | world_batch_size: 1024 15 | learning_rate: 0.0004 16 | warmup_steps: 1000 17 | 
weight_decay: 0.1 18 | beta1: 0.9 19 | beta2: 0.95 20 | adamw_eps: 1.0e-08 21 | grad_clip: 1 22 | lr_schedule: cosine 23 | decay_lr: true 24 | min_lr: 4.0e-05 25 | no_weight_decay_for_bias_and_norm_params: false 26 | neptune_from_tokens: 27 | neptune_till_tokens: 28 | neptune_noise_alpha: 29 | label_smoothing: 0 30 | goldfish_strategy: hash-table 31 | k_goldfish: 4 32 | goldfish_start_position: 0 33 | goldfish_context_width: 13 34 | fabric_strategy: ddp 35 | fabric_precision: bf16-true 36 | micro_batch_size: 8 37 | compile_model: true 38 | matmul_precision: high 39 | dataloader_num_workers: 0 40 | n_chunks: 4 41 | tensor_parallel_size: 1 42 | torch_dist_init_barrier: false 43 | gradient_checkpointing_axonn: false 44 | logger_name: wandb 45 | logger_project: goldfish-TinyLLaMA-1B 46 | data_telemetry: false 47 | shape_watching_iters: 3 48 | log_step_interval: 1 49 | eval_iters: 2000 50 | save_and_eval_interval: 2000 51 | save_step_interval: 2000 52 | eval_step_interval: 2000 53 | save_last_step: true 54 | save_n_min_before_job_done: 5 55 | sanity_validate: true 56 | measure_flops: false 57 | torch_cpp_log_level: 58 | torch_distributed_debug: 59 | text_key: text 60 | pad_to_block_size: true 61 | add_bos: false 62 | add_eos: true 63 | shuffle_filenames: true 64 | collate_checks_enabled: true 65 | all_block_size_tensors: false 66 | data_config: 67 | train_data: 68 | - type: pkds 69 | prefix: '' 70 | weight: 0.98986379474 71 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 72 | name: redpajama_v2_sample_100b_tinyllama_tokd 73 | - type: hfds 74 | prefix: wikipedia-en-2k 75 | weight: 0.01013620526 76 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 77 | name: wikipedia-en-2k-samples 78 | val_data: 79 | - type: pkds 80 | prefix: '' 81 | weight: 0.98986379474 82 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 83 | name: redpajama_v2_sample_100b_tinyllama_tokd 84 | - type: hfds 85 | prefix: wikipedia-en-2k 86 | weight: 0.01013620526 87 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 88 | name: wikipedia-en-2k-samples 89 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 90 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" 92 | -------------------------------------------------------------------------------- /launch_scripts/config/tinyllama-1b-equal-supervised-tokens_.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | run_name: tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13 3 | out_dir: "/lustre/orion/csc569/scratch/njain17/new_workspace/lit-gpt-dev/output/tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table_h13_aFYDh8o" 4 | resume: true 5 | max_tokens: 26666666667 6 | max_iters: 7 | seed: 1337 8 | model_name: tiny-llama-1.1b 9 | block_size: 2048 10 | ignore_block_size_mismatch: false 11 | model_checkpoint: 12 | doc_block_attn: false 13 | cache_attn: false 14 | eod_token: 15 | world_batch_size: 1408 16 | learning_rate: 0.0004 17 | warmup_steps: 1000 18 | weight_decay: 0.1 19 | beta1: 0.9 20 | beta2: 0.95 21 | adamw_eps: 1.0e-08 22 | grad_clip: 1 23 
| lr_schedule: cosine 24 | decay_lr: true 25 | min_lr: 4.0e-05 26 | no_weight_decay_for_bias_and_norm_params: false 27 | neptune_from_tokens: 28 | neptune_till_tokens: 29 | neptune_noise_alpha: 30 | label_smoothing: 0 31 | goldfish_strategy: hash-table 32 | k_goldfish: 4 33 | goldfish_start_position: 0 34 | goldfish_context_width: 13 35 | fabric_strategy: ddp 36 | fabric_precision: bf16-true 37 | micro_batch_size: 11 38 | compile_model: true 39 | matmul_precision: high 40 | dataloader_num_workers: 0 41 | n_chunks: 4 42 | tensor_parallel_size: 1 43 | torch_dist_init_barrier: false 44 | gradient_checkpointing_axonn: false 45 | logger_name: wandb 46 | logger_project: goldfish-TinyLLaMA-1B 47 | data_telemetry: false 48 | shape_watching_iters: 3 49 | log_step_interval: 1 50 | eval_iters: 2000 51 | save_and_eval_interval: 2000 52 | save_step_interval: 2000 53 | eval_step_interval: 2000 54 | save_last_step: true 55 | save_n_min_before_job_done: 5 56 | sanity_validate: true 57 | measure_flops: false 58 | torch_cpp_log_level: 59 | torch_distributed_debug: 60 | text_key: text 61 | pad_to_block_size: true 62 | add_bos: false 63 | add_eos: true 64 | shuffle_filenames: true 65 | collate_checks_enabled: true 66 | all_block_size_tensors: false 67 | data_config: 68 | train_data: 69 | - type: pkds 70 | prefix: '' 71 | weight: 0.98986379474 72 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 73 | name: redpajama_v2_sample_100b_tinyllama_tokd 74 | - type: hfds 75 | prefix: wikipedia-en-2k 76 | weight: 0.01013620526 77 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/train" 78 | name: wikipedia-en-2k-samples 79 | val_data: 80 | - type: pkds 81 | prefix: '' 82 | weight: 0.98986379474 83 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/redpajama_v2_sample_100b_tinyllama_tokd" 84 | name: redpajama_v2_sample_100b_tinyllama_tokd 85 | - type: hfds 86 | prefix: wikipedia-en-2k 87 | weight: 0.01013620526 88 | data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/wikipedia-en-2k-samples/val" 89 | name: wikipedia-en-2k-samples 90 | train_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 91 | val_data_dir: "/lustre/orion/csc569/proj-shared/language_datasets/processed/spj_star_combined_full_tinyllama_tokd" 92 | tokenizer_path: "/lustre/orion/csc569/proj-shared/language_models/external/TinyLlama-1.1B-intermediate-step-1431k-3T" 93 | -------------------------------------------------------------------------------- /launch_scripts/launch_jobs_1b_hashtable.sh.sh: -------------------------------------------------------------------------------- 1 | # k = 3, 4, 8, 32, 128, inf 2 | 3 | # 3-goldfish 4 | python launch_scripts/launcher.py \ 5 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 6 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 7 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 8 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 9 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish3_hash-table \ 10 | --extra_args="--k_goldfish=3 --goldfish_strategy=hash-table" 11 | 12 | # 4-goldfish 13 | python launch_scripts/launcher.py \ 14 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 15 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 16 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 17 | --budget_minutes=0 --budget_hours 10 --nodes=16 
--email=ahans1@umd.edu \ 18 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish4_hash-table \ 19 | --extra_args="--k_goldfish=4 --goldfish_strategy=hash-table" 20 | 21 | 22 | # 8-goldfish 23 | python launch_scripts/launcher.py \ 24 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 25 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 26 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 27 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 28 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish8_hash-table \ 29 | --extra_args="--k_goldfish=8 --goldfish_strategy=hash-table" 30 | 31 | # 32-goldfish 32 | python launch_scripts/launcher.py \ 33 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 34 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 35 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 36 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 37 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish32_hash-table \ 38 | --extra_args="--k_goldfish=32 --goldfish_strategy=hash-table" 39 | 40 | 41 | # 128-goldfish 42 | python launch_scripts/launcher.py \ 43 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 44 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 45 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 46 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 47 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish128_hash-table \ 48 | --extra_args="--k_goldfish=128 --goldfish_strategy=hash-table" 49 | 58 | 59 | # inf-goldfish or standard loss 60 | python launch_scripts/launcher.py \ 61 | --env_packed ${INSTALLDIR}/goldfish_loss_env_packed.tar.gz \ 62 | --rccl_installdir="${HOME}/tiny_plugins_rccl/lib" \ 63 | --config="launch_scripts/config/tinyllama-1b.yaml" \ 64 | --budget_minutes=0 --budget_hours 10 --nodes=16 --email=ahans1@umd.edu \ 65 | --run_name=tinyllama_1b_redpajama_wiki2k_20B_goldfish128_hash-table -------------------------------------------------------------------------------- /tutorials/download_stablelm.md: -------------------------------------------------------------------------------- 1 | ## Download [StableLM](https://github.com/Stability-AI/StableLM) weights 2 | 3 | StableLM is a family of generative language models trained by StabilityAI. 4 | 5 | To see all the available checkpoints for StableLM, run: 6 | 7 | ```bash 8 | python scripts/download.py | grep stablelm 9 | ``` 10 | 11 | which will print: 12 | 13 | ```text 14 | stabilityai/stablelm-base-alpha-3b 15 | stabilityai/stablelm-base-alpha-7b 16 | stabilityai/stablelm-tuned-alpha-3b 17 | stabilityai/stablelm-tuned-alpha-7b 18 | stabilityai/stablelm-3b-4e1t 19 | stabilityai/stablelm-zephyr-3b 20 | ``` 21 | 22 | > [!Important] 23 | > `stablelm-base-alpha-(3,7)b` and `stablelm-tuned-alpha-(3,7)b` are deprecated and are no longer in the StableLM collection. These models were last updated in April 2023. Consider using `stablelm-3b-4e1t` (base model) or `stablelm-zephyr-3b` (instruct fine-tuned).
24 | 25 | ### StableLM-3B-4E1T 26 | 27 | StableLM-3B-4E1T is a 3 billion (3B) parameter language model pre-trained under the multi-epoch regime to study the impact of repeated tokens on downstream performance. 28 | 29 | Building on past achievements, StabilityAI underwent training on 1 trillion tokens for 4 epochs, as recommended by Muennighoff et al. (2023) in their study "Scaling Data-Constrained Language Models." They noted that training with repeated data over 4 epochs has minimal impact on loss compared to using unique data. Additionally, insights from "Go smol or go home" (De Vries, 2023) guided the choice of token count. The research suggests that a 2.96B model trained on 2.85 trillion tokens can achieve a loss similar to a compute-optimized 9.87B language model. 30 | More info can be found on [GitHub](https://github.com/Stability-AI/StableLM?tab=readme-ov-file#stablelm-3b-4e1t). 31 | 32 | ### StableLM Zephyr 3B 33 | 34 | Lightweight LLM, preference tuned for instruction following and Q&A-type tasks. This model is an extension of the pre-existing StableLM 3B-4e1t model and is inspired by the Zephyr 7B model from HuggingFace. With StableLM Zephyr's 3 billion parameters, this model efficiently caters to a wide range of text generation needs, from simple queries to complex instructional contexts on edge devices. 35 | More details can be found in the [announcement](https://stability.ai/news/stablelm-zephyr-3b-stability-llm). 36 | 37 | ### Usage 38 | 39 | In order to use a specific StableLM checkpoint, for instance [StableLM Zephyr 3B](https://huggingface.co/stabilityai/stablelm-zephyr-3b), download the weights and convert the checkpoint to the Lit-GPT format: 40 | 41 | ```bash 42 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 43 | 44 | export repo_id=stabilityai/stablelm-zephyr-3b 45 | python scripts/download.py --repo_id $repo_id --from_safetensors=True 46 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id 47 | ``` 48 | 49 | By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM 50 | or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. 51 | 52 | You're done! To execute the model just run: 53 | 54 | ```bash 55 | pip install tokenizers 56 | 57 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/$repo_id 58 | ``` 59 | 60 | Or you can run the model in an interactive mode: 61 | 62 | ```bash 63 | python chat/base.py --checkpoint_dir checkpoints/$repo_id 64 | ``` 65 | -------------------------------------------------------------------------------- /scripts/simulate_lr.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is helpful to simulate a learning rate schedule if you want to train for an existing hot checkpoint. 3 | 4 | By default it configures the hardcoded hyperparameters for the TinyLLaMA model. 5 | https://github.com/jzhang38/TinyLlama/blob/bf122247c486b6b897050e98cbb7bedae8eeba73/pretrain/tinyllama.py#L30:40 6 | You can change the hyperparameters to simulate the learning rate schedule for other models. 7 | 8 | TODO: Parameterize the script to accept the hyperparameters as arguments. 
9 | """ 10 | import sys 11 | import os 12 | from dataclasses import dataclass 13 | from functools import partial 14 | import torch 15 | 16 | # Add the root directory of the project to the path 17 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 18 | from pretrain_umd.train import CLISettings, get_lr 19 | from lit_gpt.utils import * 20 | 21 | class CfgWithoutValidation(CLISettings): 22 | def __post_init__(self): 23 | pass 24 | 25 | def main(): 26 | cfg = CfgWithoutValidation( 27 | max_iters=1_430_512, 28 | min_lr=4e-5, 29 | lr_schedule="cosine", 30 | learning_rate=4e-4, 31 | warmup_steps=2000 32 | ) 33 | cfg.warmup_iters = cfg.warmup_steps # assumes steps == iters i.e. gradient accumulation steps = 1 34 | 35 | # Computing hot lr for https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-480k-1T 36 | lr = get_lr(it=480_000 + 1, lr_decay_iters=cfg.max_iters, cfg=cfg) # Can use this as new max_lr and resume training 37 | print(f"Hot LR: {lr}") # Hot LR: 0.00030937179340707335 38 | return 39 | 40 | def tld_loss_sanity(): 41 | cfg = CfgWithoutValidation( 42 | label_smoothing = 0, 43 | tld_strategy = 'hash-avalanche', 44 | k_token_loss_dropout = 3 45 | ) 46 | 47 | vocab_size = 32_000 48 | block_size = 20 49 | mbs = 2 50 | 51 | targets_swapped = torch.randint(0, vocab_size, (mbs+2, block_size)) 52 | torch.manual_seed(1337) 53 | logits = torch.randn(mbs, block_size, vocab_size) 54 | targets = torch.randint(0, vocab_size, (mbs, block_size)) 55 | 56 | swapped_targets = torch.cat((targets_swapped, targets[:1]), dim=0) 57 | 58 | ignore_index = -1 59 | 60 | loss_func = partial( 61 | chunked_cross_entropy, 62 | label_smoothing=cfg.label_smoothing, 63 | tld_strategy=cfg.tld_strategy, 64 | k_token_loss_dropout=cfg.k_token_loss_dropout, 65 | tld_start_position=cfg.tld_start_position, 66 | ignore_index=ignore_index, 67 | ) 68 | 69 | loss = loss_func(logits=logits, targets=targets) 70 | all_token_loss = loss_func(logits=logits, targets=targets, tld_strategy=None) 71 | post_tld_targets, _ = apply_tld(targets=targets, strategy=cfg.tld_strategy, k=cfg.k_token_loss_dropout, tld_start_position=cfg.tld_start_position, ignore_index=ignore_index) 72 | swapped_tld_targets, _ = post_tld_targets, _ = apply_tld(targets=swapped_targets, strategy=cfg.tld_strategy, k=cfg.k_token_loss_dropout, tld_start_position=cfg.tld_start_position, ignore_index=ignore_index) 73 | 74 | assert torch.all(swapped_tld_targets[-1] == post_tld_targets[-1]) 75 | 76 | # random TLD strategy 77 | k = cfg.k_token_loss_dropout 78 | 79 | random_tensor = torch.randint(1, k + 1, size=targets.size()) 80 | mask = (random_tensor == k).int() 81 | dropped_token_indices = mask.nonzero().reshape(mbs, -1) 82 | 83 | breakpoint() 84 | 85 | if __name__ == '__main__': 86 | # main() 87 | tld_loss_sanity() 88 | -------------------------------------------------------------------------------- /scripts/prepare_openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | # saves the openwebtext dataset to a binary file for training. 
following was helpful: 4 | # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py 5 | import os 6 | import sys 7 | from pathlib import Path 8 | from typing import Union 9 | 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | # support running without installing as a package 14 | wd = Path(__file__).parent.parent.resolve() 15 | sys.path.append(str(wd)) 16 | 17 | from lit_gpt import Tokenizer 18 | from lit_gpt.utils import CLI 19 | 20 | 21 | def prepare( 22 | destination_path: Path = Path("data/openwebtext"), 23 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 24 | seed: int = 42, 25 | test_size: Union[float, int, None] = 0.0005, 26 | ) -> None: 27 | from datasets import load_dataset # huggingface datasets 28 | 29 | destination_path.mkdir(parents=True, exist_ok=True) 30 | 31 | tokenizer = Tokenizer(checkpoint_dir) 32 | 33 | # number of workers in .map() call 34 | # good number to use is ~order number of cpu cores // 2 35 | num_proc = os.cpu_count() // 2 36 | 37 | # number of workers in load_dataset() call 38 | # best number might be different from num_proc above as it also depends on HW speed. 39 | # it is better than 1 usually though 40 | num_proc_load_dataset = num_proc 41 | 42 | # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) 43 | dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset) 44 | 45 | # owt by default only contains the 'train' split, so create a test split 46 | split_dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed, shuffle=True) 47 | split_dataset["val"] = split_dataset.pop("test") # rename the test split to val 48 | 49 | def process(example): 50 | ids = tokenizer.encode(example["text"]).tolist() 51 | ids.append(tokenizer.eos_id) 52 | 53 | # ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens 54 | # ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe 55 | # note: I think eot should be prepended not appended... hmm. it's called "eot" though... 56 | return {"ids": ids, "len": len(ids)} 57 | 58 | # tokenize the dataset 59 | tokenized = split_dataset.map(process, remove_columns=["text"], desc="tokenizing the splits", num_proc=num_proc) 60 | 61 | # concatenate all the ids in each dataset into one large file we can use for training 62 | for split, dset in tokenized.items(): 63 | arr_len = np.sum(dset["len"], dtype=np.uint64) 64 | filename = destination_path / f"{split}.bin" 65 | dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) 66 | arr = np.memmap(str(filename), dtype=dtype, mode="w+", shape=(arr_len,)) 67 | total_batches = 1024 68 | 69 | idx = 0 70 | for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"): 71 | # Batch together samples for faster write 72 | batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format("numpy") 73 | arr_batch = np.concatenate(batch["ids"]) 74 | # Write into mmap 75 | arr[idx : idx + len(arr_batch)] = arr_batch 76 | idx += len(arr_batch) 77 | arr.flush() 78 | 79 | 80 | if __name__ == "__main__": 81 | CLI(prepare) 82 | -------------------------------------------------------------------------------- /lit_gpt/data/longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
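# Example wiring (a minimal sketch; the tokenizer checkpoint path is an illustrative assumption):
#
#   from pathlib import Path
#   from lit_gpt.tokenizer import Tokenizer
#
#   data = LongForm(download_dir=Path("./data/longform"))
#   data.connect(tokenizer=Tokenizer(Path("checkpoints/<model>")), batch_size=4, max_seq_length=512)
#   data.prepare_data()           # downloads train.json / val.json if missing
#   train_loader = data.train_dataloader()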
2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import torch 8 | from torch.utils.data import DataLoader 9 | from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule 10 | from lit_gpt.data.alpaca import download_if_missing 11 | from lit_gpt.tokenizer import Tokenizer 12 | 13 | 14 | _URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" 15 | 16 | 17 | class LongForm(LitDataModule): 18 | """LongForm data module for supervised finetuning. 19 | 20 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 21 | """ 22 | 23 | def __init__( 24 | self, 25 | mask_prompt: bool = False, 26 | ignore_index: int = -1, 27 | seed: int = 42, 28 | num_workers: int = 4, 29 | download_dir: Path = Path("./data/longform"), 30 | ) -> None: 31 | super().__init__() 32 | self.mask_prompt = mask_prompt 33 | self.ignore_index = ignore_index 34 | self.seed = seed 35 | self.num_workers = num_workers 36 | self.download_dir = download_dir 37 | 38 | self.tokenizer: Optional[Tokenizer] = None 39 | self.batch_size: int = 1 40 | self.max_seq_length: int = -1 41 | self.train_dataset: Optional[SFTDataset] = None 42 | self.test_dataset: Optional[SFTDataset] = None 43 | 44 | def connect( 45 | self, 46 | tokenizer: Optional[Tokenizer] = None, 47 | batch_size: int = 1, 48 | max_seq_length: Optional[int] = None 49 | ) -> None: 50 | self.tokenizer = tokenizer 51 | self.batch_size = batch_size 52 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 53 | 54 | def prepare_data(self) -> None: 55 | self.download_dir.mkdir(parents=True, exist_ok=True) 56 | download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") 57 | download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") 58 | 59 | def train_dataloader(self): 60 | return self._dataloader("train") 61 | 62 | def val_dataloader(self): 63 | return self._dataloader("val") 64 | 65 | def _dataloader(self, split: str) -> DataLoader: 66 | with open(self.download_dir / f"{split}.json", "r", encoding="utf-8") as file: 67 | data = json.load(file) 68 | 69 | dataset = SFTDataset( 70 | data=data, 71 | tokenizer=self.tokenizer, 72 | prompt_template=prompt_template, 73 | max_seq_length=self.max_seq_length, 74 | mask_prompt=self.mask_prompt, 75 | ignore_index=self.ignore_index, 76 | ) 77 | return DataLoader( 78 | dataset=dataset, 79 | batch_size=self.batch_size, 80 | shuffle=(split == "train"), 81 | generator=torch.Generator().manual_seed(self.seed), 82 | num_workers=self.num_workers, 83 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 84 | ) 85 | 86 | 87 | def prompt_template(example: dict) -> str: 88 | """A modified Alpaca prompt template without the 'input'.""" 89 | return ( 90 | "Below is an instruction that describes a task, paired with an input that provides further context. 
" 91 | "Write a response that appropriately completes the request.\n\n" 92 | f"### Instruction:\n{example['input']}\n\n### Response:\n" 93 | ) 94 | -------------------------------------------------------------------------------- /scripts/convert_checkpoint_to_hf.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | import sys 4 | from pathlib import Path 5 | 6 | import torch 7 | 8 | from convert_pretrained_checkpoint import convert_checkpoint 9 | from convert_lit_checkpoint import convert_lit_checkpoint 10 | 11 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 12 | from huggingface_hub import create_repo 13 | 14 | # support running without installing as a package 15 | wd = Path(__file__).parent.parent.resolve() 16 | sys.path.append(str(wd)) 17 | 18 | from lit_gpt.utils import CLI 19 | 20 | 21 | @torch.inference_mode() 22 | def convert_checkpoint_to_hf( 23 | checkpoint_file: Path, 24 | tokenizer_dir: Path, 25 | model_name: str, 26 | axonn_patch: bool = False, 27 | push_to_hub: bool = True, 28 | ) -> None: 29 | ### convert training checkpoint to lit checkpoint 30 | parent_dir = checkpoint_file.parent.absolute() 31 | with open(parent_dir / "model_config.json") as f: 32 | model_config = json.load(f) 33 | config_name = model_config["name"] 34 | convert_checkpoint(checkpoint_file, tokenizer_dir, config_name, parent_dir / f"lit_checkpoint_{model_name}") 35 | 36 | ### convert training checkpoint to hf checkpoint 37 | convert_lit_checkpoint( 38 | parent_dir / f"lit_checkpoint_{model_name}/lit_model.pth", 39 | parent_dir / f"hf_checkpoint_{model_name}/pytorch_model.bin", 40 | parent_dir / f"lit_checkpoint_{model_name}/lit_config.json", 41 | axonn_patch=axonn_patch, 42 | ) 43 | 44 | for tokenizer_file in tokenizer_dir.glob("tokenizer*"): 45 | shutil.copyfile(tokenizer_file, parent_dir / f"hf_checkpoint_{model_name}" / tokenizer_file.name) 46 | 47 | if (tokenizer_dir / "generation_config.json").is_file(): 48 | shutil.copyfile( 49 | tokenizer_dir / "generation_config.json", 50 | parent_dir / f"hf_checkpoint_{model_name}" / "generation_config.json", 51 | ) 52 | 53 | if (tokenizer_dir / "special_tokens_map.json").is_file(): 54 | shutil.copyfile( 55 | tokenizer_dir / "special_tokens_map.json", 56 | parent_dir / f"hf_checkpoint_{model_name}" / "special_tokens_map.json", 57 | ) 58 | 59 | if (tokenizer_dir / "added_tokens.json").is_file(): 60 | shutil.copyfile( 61 | tokenizer_dir / "added_tokens.json", parent_dir / f"hf_checkpoint_{model_name}" / "added_tokens.json" 62 | ) 63 | 64 | if (tokenizer_dir / "config.json").is_file(): 65 | shutil.copyfile(tokenizer_dir / "config.json", parent_dir / f"hf_checkpoint_{model_name}" / "config.json") 66 | 67 | # hf_org = model_config["hf_config"]["org"] 68 | # hf_name = model_config["hf_config"]["name"] 69 | # hf_config = AutoConfig.from_pretrained(f"{hf_org}/{hf_name}") 70 | # hf_config = hf_config.to_dict() 71 | # with open(parent_dir / f"hf_checkpoint_{model_name}" / "config.json", "w") as f: 72 | # json.dump(hf_config, f, indent=4) 73 | 74 | if not push_to_hub: 75 | return 76 | 77 | ### push to hub 78 | repo_name = f"tomg-group-umd/{model_name}" 79 | tokenizer = AutoTokenizer.from_pretrained(parent_dir / f"hf_checkpoint_{model_name}") 80 | state_dict = torch.load(parent_dir / f"hf_checkpoint_{model_name}/pytorch_model.bin") 81 | model = AutoModelForCausalLM.from_pretrained(parent_dir / f"hf_checkpoint_{model_name}", state_dict=state_dict) 82 | create_repo(repo_name, 
private=True) 83 | model.push_to_hub(repo_name, use_temp_dir=True) 84 | tokenizer.push_to_hub(repo_name, use_temp_dir=True) 85 | 86 | print(f"Model pushed to {repo_name}") 87 | 88 | 89 | if __name__ == "__main__": 90 | CLI(convert_checkpoint_to_hf) 91 | -------------------------------------------------------------------------------- /scripts/convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import shutil 5 | import sys 6 | from dataclasses import asdict 7 | from pathlib import Path 8 | 9 | import torch 10 | 11 | # support running without installing as a package 12 | wd = Path(__file__).parent.parent.resolve() 13 | sys.path.append(str(wd)) 14 | 15 | from lit_gpt import Config 16 | from lit_gpt.utils import CLI, incremental_save 17 | 18 | 19 | @torch.inference_mode() 20 | def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: str, output_dir: Path) -> None: 21 | """Convert a checkpoint after pretraining. 22 | 23 | The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training 24 | is finished. This script will export the state-dict of the model and place it in the chosen output folder together 25 | with the tokenizer and model config, which then can be loaded by other scripts for inference, evaluation, etc. 26 | 27 | Args: 28 | checkpoint_file: Path to a checkpoint file scripts produced by the scripts in ``lit_gpt/pretrain/``. 29 | tokenizer_dir: A path to the folder that holds the tokenizer configuration files that were used to train 30 | the model. All files with a name starting with 'tokenizer' will be copied to the output folder. 31 | config_name: The name of the model loaded with the ``lit_gpt.Config``. The configuration will be saved as a 32 | JSON file to the output folder. 33 | output_dir: The output folder where model state-dict file, the tokenizer config file, and the model config 34 | file will be saved. 35 | """ 36 | 37 | if output_dir.is_dir() and output_dir.glob("*"): 38 | raise FileExistsError( 39 | f"The output folder exists and is not empty: {str(output_dir)}." 40 | " Please delete it first or choose a different name." 
41 | ) 42 | if not tokenizer_dir.is_dir(): 43 | raise FileNotFoundError(f"The tokenizer_dir must be a directory: {str(output_dir)}.") 44 | 45 | output_dir.mkdir(parents=True) 46 | output_checkpoint_file = output_dir / "lit_model.pth" 47 | output_config_file = output_dir / "lit_config.json" 48 | 49 | # Save the config to output folder 50 | config = Config.from_name(config_name) 51 | with open(output_config_file, "w") as json_config: 52 | json.dump(asdict(config), json_config) 53 | 54 | # Export the tokenizer configuration to output folder 55 | for tokenizer_file in tokenizer_dir.glob("tokenizer*"): 56 | shutil.copyfile(tokenizer_file, output_dir / tokenizer_file.name) 57 | 58 | # Copy config for tokenization if found 59 | if (tokenizer_dir / "generation_config.json").is_file(): 60 | shutil.copyfile(tokenizer_dir / "generation_config.json", output_dir / "generation_config.json") 61 | 62 | # Extract the model state dict and save to output folder 63 | with incremental_save(output_checkpoint_file) as saver: 64 | print("Processing", checkpoint_file) 65 | full_checkpoint = torch.load(str(checkpoint_file), mmap=True) 66 | loaded_state_dict = full_checkpoint["model"] 67 | converted_state_dict = {} 68 | for param_name, param in loaded_state_dict.items(): 69 | saver.store_early(param) 70 | # remove prefix for compiled model (if any) 71 | param_name = param_name.replace("_orig_mod.", "") 72 | converted_state_dict[param_name] = param 73 | print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") 74 | saver.save(converted_state_dict) 75 | 76 | 77 | if __name__ == "__main__": 78 | CLI(convert_checkpoint) 79 | -------------------------------------------------------------------------------- /eval/alpaca_eval_generate.py: -------------------------------------------------------------------------------- 1 | """This script generates the evaluation responses that can e used by eval_scoring.py""" 2 | 3 | import argparse 4 | from functools import partial 5 | import json 6 | 7 | from datasets import Dataset 8 | import datasets 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | import torch 11 | from conversation import get_conv_template 12 | 13 | 14 | def apply_conv_template(example, template_type): 15 | # preprocess instructions into prompted inputs 16 | conv = get_conv_template(template_type) 17 | conv.append_message(conv.roles[0], example["instruction"]) 18 | conv.append_message(conv.roles[1], "") 19 | prompt = conv.get_prompt() 20 | 21 | example.update({"prompt": prompt}) 22 | 23 | return example 24 | 25 | 26 | def generate_responses_batched(example, model, tokenizer, kwargs): 27 | prompt = example["prompt"] 28 | 29 | encoding = tokenizer( 30 | prompt, 31 | return_tensors="pt", 32 | padding="longest", 33 | max_length=tokenizer.model_max_length, 34 | truncation=True, 35 | ) 36 | encoding = encoding.to(model.device) 37 | with torch.no_grad(): 38 | model_output = model.generate(**encoding, **kwargs) 39 | input_len = encoding.input_ids.shape[-1] 40 | model_output = model_output[:, input_len:].cpu() 41 | decoded_output = tokenizer.batch_decode( 42 | model_output, skip_special_tokens=True, clean_up_tokenization_spaces=False 43 | ) 44 | 45 | example.update({"output": decoded_output}) 46 | example.update({"metadata": [kwargs] * len(decoded_output)}) 47 | 48 | return example 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--model", default="llama/7B_sharded", type=str) 54 | parser.add_argument("--model_name", default=None, 
type=str) 55 | parser.add_argument("--template_type", default="alpaca", type=str) 56 | parser.add_argument("--save_file_name", default="outputs/answers/self-instruct_llama7B.jsonl", type=str) 57 | parser.add_argument("--batch_size", default=4, type=int) 58 | parser.add_argument( 59 | "--debug", 60 | action="store_true", 61 | help="This reduce the number of generation examples to 4, so that we can debug faster.", 62 | ) 63 | args = parser.parse_args() 64 | 65 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 66 | model = AutoModelForCausalLM.from_pretrained(args.model).to(device) 67 | tokenizer = AutoTokenizer.from_pretrained(args.model, model_max_length=2048, padding_side="left") 68 | tokenizer.pad_token = tokenizer.eos_token 69 | 70 | ## set the models to eval mode 71 | model = model.eval() 72 | 73 | raw_data = datasets.load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval")["eval"] 74 | raw_data = raw_data.map(lambda x: {"generator": args.model_name if args.model_name else args.model}) 75 | 76 | ## preprocess 77 | eval_preproc = partial(apply_conv_template, template_type=args.template_type) 78 | raw_data = raw_data.map(eval_preproc) 79 | 80 | # reduce number of examples for debugging 81 | if args.debug: 82 | raw_data = raw_data.select(range(4)) 83 | 84 | ## run generation 85 | generate_kwargs = dict( 86 | max_length=2048, do_sample=True, top_p=0.7, num_return_sequences=1, temperature=1, repetition_penalty=1.2 87 | ) 88 | generate = partial( 89 | generate_responses_batched, 90 | model=model, 91 | tokenizer=tokenizer, 92 | kwargs=generate_kwargs, 93 | ) 94 | 95 | dataset_w_responses = raw_data.map(generate, batched=True, batch_size=args.batch_size) 96 | dataset_w_responses = dataset_w_responses.map(lambda x: {"metadata": json.dumps(x["metadata"])}) 97 | dataset_w_responses.to_json(args.save_file_name, orient="records", lines=False, indent=True) 98 | -------------------------------------------------------------------------------- /scripts/download.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import os 4 | import sys 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | import torch 9 | from lightning_utilities.core.imports import RequirementCache 10 | 11 | # support running without installing as a package 12 | wd = Path(__file__).parent.parent.resolve() 13 | sys.path.append(str(wd)) 14 | 15 | from lit_gpt.utils import CLI 16 | 17 | _SAFETENSORS_AVAILABLE = RequirementCache("safetensors") 18 | _HF_TRANSFER_AVAILABLE = RequirementCache("hf_transfer") 19 | 20 | 21 | def download_from_hub( 22 | repo_id: Optional[str] = None, 23 | access_token: Optional[str] = os.getenv("HF_TOKEN"), 24 | from_safetensors: bool = False, 25 | tokenizer_only: bool = False, 26 | checkpoint_dir: Path = Path("checkpoints"), 27 | ) -> None: 28 | if repo_id is None: 29 | from lit_gpt.config import configs 30 | 31 | options = [f"{config['hf_config']['org']}/{config['hf_config']['name']}" for config in configs] 32 | print("Please specify --repo_id . Available values:") 33 | print("\n".join(options)) 34 | return 35 | 36 | from huggingface_hub import snapshot_download 37 | 38 | if ("meta-llama" in repo_id or "falcon-180" in repo_id) and not access_token: 39 | raise ValueError( 40 | f"{repo_id} requires authentication, please set the `HF_TOKEN=your_token` environment" 41 | " variable or pass --access_token=your_token. 
You can find your token by visiting" 42 | " https://huggingface.co/settings/tokens" 43 | ) 44 | 45 | download_files = ["tokenizer*", "generation_config.json", "config.json"] 46 | if not tokenizer_only: 47 | if from_safetensors: 48 | if not _SAFETENSORS_AVAILABLE: 49 | raise ModuleNotFoundError(str(_SAFETENSORS_AVAILABLE)) 50 | download_files.append("*.safetensors") 51 | else: 52 | # covers `.bin` files and `.bin.index.json` 53 | download_files.append("*.bin*") 54 | elif from_safetensors: 55 | raise ValueError("`--from_safetensors=True` won't have an effect with `--tokenizer_only=True`") 56 | 57 | import huggingface_hub._snapshot_download as download 58 | import huggingface_hub.constants as constants 59 | 60 | previous = constants.HF_HUB_ENABLE_HF_TRANSFER 61 | if _HF_TRANSFER_AVAILABLE and not previous: 62 | print("Setting HF_HUB_ENABLE_HF_TRANSFER=1") 63 | constants.HF_HUB_ENABLE_HF_TRANSFER = True 64 | download.HF_HUB_ENABLE_HF_TRANSFER = True 65 | 66 | directory = checkpoint_dir / repo_id 67 | snapshot_download( 68 | repo_id, 69 | local_dir=directory, 70 | local_dir_use_symlinks=False, 71 | resume_download=True, 72 | allow_patterns=download_files, 73 | token=access_token, 74 | ) 75 | 76 | constants.HF_HUB_ENABLE_HF_TRANSFER = previous 77 | download.HF_HUB_ENABLE_HF_TRANSFER = previous 78 | 79 | # convert safetensors to PyTorch binaries 80 | if from_safetensors: 81 | from safetensors import SafetensorError 82 | from safetensors.torch import load_file as safetensors_load 83 | 84 | print("Converting .safetensor files to PyTorch binaries (.bin)") 85 | for safetensor_path in directory.glob("*.safetensors"): 86 | bin_path = safetensor_path.with_suffix(".bin") 87 | try: 88 | result = safetensors_load(safetensor_path) 89 | except SafetensorError as e: 90 | raise RuntimeError(f"{safetensor_path} is likely corrupted. Please try to re-download it.") from e 91 | print(f"{safetensor_path} --> {bin_path}") 92 | torch.save(result, bin_path) 93 | os.remove(safetensor_path) 94 | 95 | 96 | if __name__ == "__main__": 97 | CLI(download_from_hub) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Goldfish Loss: Mitigating Memorization in Generative LLMs [[paper]](https://arxiv.org/abs/2406.10209)[[checkpoints]](https://huggingface.co/collections/tomg-group-umd/goldfish-loss-mitigating-memorization-in-llms-66c175becb6aab07744f7272) 3 | 4 |

5 | A very smart goldfish 6 |

7 | 8 | 9 | #### We introduce goldfish loss — a strikingly simple technique for mitigating extractable memorization in large language models. 10 | 11 | ## Getting Started 12 | 13 | This codebase is set up to (pre)train large language models in a distributed training setup using SLURM. This repo is written and tested on AMD compute nodes and is a fork of Lightning AI's [LitGPT repository](https://github.com/Lightning-AI/litgpt). 14 | 15 | For the implementation of goldfish loss, please check out the [`apply_goldfish`](https://github.com/ahans30/goldfish-loss/blob/70bfad87dcf69da2921bcad08e662925ab2ab60b/lit_gpt/utils.py#L241) method in `lit_gpt/utils.py`. 16 | 17 | ### Installation 18 | Before running the command below, check out the script and set up the path variables specific to your compute instance (e.g., `INSTALLDIR`). 19 | 20 | ```bash 21 | $ bash install.sh 22 | ``` 23 | 24 | This will take some time, but it will create a folder called `goldfish_loss` and a folder called `tiny_plugins_rccl` in the installation directory (which is `$HOME` by default). The conda folder (`goldfish_loss`) contains a fully functional conda environment with all necessary packages to run our training scripts. The RCCL folder (`tiny_plugins_rccl`) contains code for the interconnect plugin that is crucial for multi-node jobs. You can enable this environment by using `source activate ${INSTALLDIR}/`. 25 | 26 | Please note that this installation is specific to AMD compute nodes and is written as such. 27 | 28 | 29 | ## Usage 30 | 31 | The command below invokes a Python script that will in turn queue a SLURM job specific to the Frontier cluster used for this work. You can use `--dryrun` to retrieve the `sbatch` script, which can be tuned for other clusters. 32 | 33 | ```bash 34 | $ launch_scripts/launch_jobs_1b_hashtable.sh 35 | ``` 36 | 37 | You can find the training configs in `launch_scripts/config/` in YAML format. 38 | 39 | 40 | ## Contributing 41 | 42 | This is a work-in-progress repo, and we are working on porting all experiments from the paper. 43 | 44 | We believe in open-source community development. Feel free to add things, open issues, start pull requests, or reach out to us with any thoughts or questions. 45 | 46 | ## Cite our work 47 | 48 | If you find our work useful, please cite our paper: 49 | 50 | ```bibtex 51 | @inproceedings{ 52 | hans2024goldfishloss, 53 | title={Be like a Goldfish, Don't Memorize! Mitigating Memorization in Generative {LLM}s}, 54 | author={Abhimanyu Hans and John Kirchenbauer and Yuxin Wen and Neel Jain and Hamid Kazemi and Prajwal Singhania and Siddharth Singh and Gowthami Somepalli and Jonas Geiping and Abhinav Bhatele and Tom Goldstein}, 55 | booktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems}, 56 | year={2024}, 57 | url={https://openreview.net/forum?id=DylSyAfmWs} 58 | } 59 | ``` 60 | 61 | ## Acknowledgements 62 | 1. This codebase is developed as a collective effort from [tomg-group-umd](https://github.com/tomg-group-umd) members for LLM (pre)training and is a fork of the [LitGPT codebase](https://github.com/Lightning-AI/litgpt). 63 | 1. This code is tested on *frontier* resources of the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725. An award of computer time was provided by the U.S. Department of Energy’s (DOE) Innovative and Novel Computational Impact on Theory and Experiment (INCITE) Program.
Financial support was provided by the ONR MURI program and the AFOSR MURI program. Private support was provided by Capital One Bank, the Amazon Research Award program, and Open Philanthropy. Further support was provided by the National Science Foundation (IIS-2212182), and by the NSF TRAILS Institute (2229885). 64 | -------------------------------------------------------------------------------- /tutorials/pretrain_openwebtext.md: -------------------------------------------------------------------------------- 1 | # Pretrain Llama 2 on OpenWebText 2 | 3 | This tutorial will walk you through setting up the OpenWebText dataset and launching the pretraining script. 4 | 5 | ## What's OpenWebText 6 | 7 | [OpenWebText](https://github.com/jcpeterson/openwebtext) is an open-source reproduction of OpenAI's unreleased WebText training dataset, which was originally used to train GPT-2. The version that is used here consists of 8M documents and is loaded via the `load_dataset("openwebtext", ...)` function from the [datasets](https://github.com/huggingface/datasets) Python package. [Please refer to the website hosting the dataset](https://huggingface.co/datasets/Skylion007/openwebtext) for the licensing information. 8 | 9 | ## Prepare OpenWebText for training 10 | 11 | In order to start pretraining lit-gpt on it, you need to read, tokenize, and write the data in binary format. 12 | 13 | To prepare the dataset with the Llama 2 tokenizer, run 14 | 15 | ```bash 16 | pip install datasets 17 | 18 | python scripts/prepare_openwebtext.py \ 19 | --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ \ 20 | --destination_path data/openwebtext 21 | ``` 22 | 23 | The script will take about 15 min to run. 24 | 25 | ## Pretraining 26 | 27 | Running the pretraining script with its default settings requires at least 4 GPUs with 40GB+ each. (However, alternatively, you can train a smaller Pythia-70m on 1 GPU, more information about that further below). 28 | 29 | ```bash 30 | python pretrain/openwebtext.py --devices 4 31 | ``` 32 | 33 | The script will save checkpoints periodically to the folder `out/`. 34 | 35 | By default, the `pretrain/openwebtext.py` script will pretrain the Llama 2 7B model with FSDP in 36 | `bfloat16` precision and gradient accumulation. 37 | 38 | You can easily change the size of the model by passing a different string to the model name variable 39 | 40 | ```shell 41 | --model_name "Llama-2-7b-hf" 42 | ``` 43 | 44 | at the top of this script. 45 | 46 | The currently supported model names are contained in the [config.py](https://github.com/Lightning-AI/lit-gpt/lit_gpt/config.py) file. 47 | You can 48 | 49 | 1) either search this file for lines containing "name =", 50 | 2) or run `python scripts/download.py` without additional command line arguments, 51 | 52 | Keep in mind that the original LLaMA training for the 7B model required 83k A100 80GB 53 | hours (on a bigger dataset). However, for full pretraining on OpenWebText, you'll likely still need access to a cluster. 
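As mentioned above, the supported model names live in `config.py`; if you would rather print them from Python than skim that file, the following sketch does it. This snippet is a convenience illustration rather than part of the repo: it assumes you run it from the repository root so that `lit_gpt` is importable, and that each entry in `configs` carries a `name` field, as `scripts/download.py` suggests.

```python
# List the model names that `--model_name` accepts (run from the repo root).
from lit_gpt.config import configs

for config in configs:
    print(config["name"])
```
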
54 | 55 | Once you're in a cluster, you can follow [these instructions](https://lightning.ai/docs/fabric/stable/fundamentals/launch.html#launch-on-a-cluster) 56 | to launch the script across machines: 57 | 58 | - [SLURM cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/slurm.html) 59 | - [Barebones cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/barebones.html) 60 | - [MPI](https://lightning.ai/docs/fabric/stable/guide/multi_node/other.html) 61 | 62 | The script exposes several hyperparameters you can tweak through the command line. 63 | 64 | For instance, `--train.micro_batch_size` should be adjusted so the process will use the available 65 | GPU memory. For more tips to avoid out-of-memory issues, please also see the more detailed 66 | [Dealing with out-of-memory (OOM) errors](oom.md) guide. 67 | 68 | Last, logging is kept minimal in the script. In order to use a particular logger, 69 | please refer to the Lightning Fabric documentation on loggers, or 70 | call a logging client library like `wandb` directly. 71 | 72 | ## Training a smaller model on a single GPU 73 | 74 | To train a smaller Pythia 70M model on a single GPU, you can pass the `--model_name "pythia-70m"` argument. 75 | 76 | (Please see the `download_*` scripts in the [tutorials](.) for more information on downloading model checkpoints for different models.) 77 | 78 | Also, before you start training, note that you will need to prepare the dataset specifically for this model since it may use a different tokenizer: 79 | 80 | ```bash 81 | python scripts/prepare_openwebtext.py \ 82 | --checkpoint_dir checkpoints/EleutherAI/pythia-70m/ \ 83 | --destination_path data/lit-openwebtext 84 | 85 | python pretrain/openwebtext.py --devices 1 --model_name "pythia-70m" 86 | ``` 87 | -------------------------------------------------------------------------------- /lit_gpt/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Optional, Union 6 | 7 | import torch 8 | 9 | 10 | class Tokenizer: 11 | def __init__(self, checkpoint_dir: Union[Path, str]) -> None: 12 | checkpoint_dir = Path(checkpoint_dir) 13 | if not checkpoint_dir.exists(): 14 | raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}") 15 | 16 | self.bos_id = None 17 | self.eos_id = None 18 | self.pad_id = None 19 | 20 | if (checkpoint_dir / "tokenizer.json").is_file(): 21 | from transformers import AutoTokenizer 22 | 23 | self.processor = AutoTokenizer.from_pretrained( 24 | str(checkpoint_dir), add_bos_token=False, add_eos_token=False 25 | ) 26 | 27 | self.backend = "huggingface" 28 | 29 | if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file(): 30 | with open(special_tokens_path) as fp: 31 | config = json.load(fp) 32 | self.bos_id = self.processor.bos_token_id 33 | self.eos_id = self.processor.eos_token_id 34 | self.pad_id = self.processor.pad_token_id 35 | if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file(): 36 | with open(special_tokens_path) as fp: 37 | config = json.load(fp) 38 | if self.bos_id is None: 39 | self.bos_id = config.get("bos_token_id") 40 | if self.eos_id is None: 41 | self.eos_id = config.get("eos_token_id") 42 | if self.pad_id is None: 43 | self.pad_id = config.get("pad_token_id") # idk if this will always work 44 | elif "open_llama" in str(checkpoint_dir): 45 | from transformers import LlamaTokenizer 46 | 47 | self.processor = LlamaTokenizer.from_pretrained( 48 | str(checkpoint_dir), add_bos_token=False, add_eos_token=False 49 | ) 50 | 51 | self.backend = "huggingface" 52 | 53 | if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file(): 54 | with open(special_tokens_path) as fp: 55 | config = json.load(fp) 56 | self.bos_id = self.processor.bos_token_id 57 | self.eos_id = self.processor.eos_token_id 58 | self.pad_id = self.processor.pad_token_id 59 | if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file(): 60 | with open(special_tokens_path) as fp: 61 | config = json.load(fp) 62 | if self.bos_id is None: 63 | self.bos_id = config.get("bos_token_id") 64 | if self.eos_id is None: 65 | self.eos_id = config.get("eos_token_id") 66 | if self.pad_id is None: 67 | self.pad_id = config.get("pad_token_id") # idk if this will always work 68 | else: 69 | raise NotImplementedError 70 | 71 | @property 72 | def vocab_size(self) -> int: 73 | return self.processor.vocab_size 74 | 75 | def encode( 76 | self, 77 | string: str, 78 | device: Optional[torch.device] = None, 79 | bos: Optional[bool] = None, 80 | eos: bool = False, 81 | max_length: int = -1, 82 | ) -> torch.Tensor: 83 | tokens = self.processor.encode(string) 84 | 85 | if bos: 86 | bos_id = self.bos_id 87 | if bos_id is None: 88 | raise NotImplementedError("This tokenizer does not have a defined a bos token") 89 | tokens = [bos_id] + tokens 90 | if eos: 91 | tokens = tokens + [self.eos_id] 92 | if max_length > 0: 93 | tokens = tokens[:max_length] 94 | return torch.tensor(tokens, dtype=torch.int, device=device) 95 | 96 | def decode(self, tensor: torch.Tensor) -> str: 97 | tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist() 98 | return self.processor.decode(tokens) 99 | -------------------------------------------------------------------------------- /tutorials/finetune_full.md: -------------------------------------------------------------------------------- 1 | # Finetuning the 
whole model 2 | 3 | If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It may only be recommended as a baseline for comparison studies. 4 | 5 | ## Preparation 6 | 7 | The steps here only need to be done once: 8 | 9 | 1. Follow the instructions in the [README](../README.md) to install the dependencies. 10 | 2. Download and convert the weights following our [guide](download_stablelm.md). 11 | 3. Download the data and generate the Alpaca instruction tuning dataset: 12 | 13 | ```bash 14 | python scripts/prepare_alpaca.py --checkpoint_dir checkpoints/tiiuae/falcon-7b 15 | ``` 16 | 17 | or [prepare your own dataset](#tune-on-your-dataset). 18 | 19 | For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial. 20 | 21 | ## Running the finetuning 22 | 23 | ```bash 24 | python finetune/full.py --io.checkpoint_dir checkpoints/tiiuae/falcon-7b 25 | ``` 26 | 27 | Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. 28 | 29 | You can speed up training by setting the `devices` variable in the script to utilize more GPUs if available. 30 | Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. 31 | 32 | This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: 33 | 34 | ```bash 35 | python finetune/full.py --io.out_dir out/full/my-model-finetuned 36 | ``` 37 | 38 | If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. 39 | For instance, to fine-tune on MPS (the GPU on modern Macs), you can run 40 | 41 | ```bash 42 | python finetune/full.py --io.out_dir out/full/my-model-finetuned --precision 32-true 43 | ``` 44 | 45 | Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 46 | 47 | ## Test the model 48 | 49 | You can test the finetuned model with your own instructions by running: 50 | 51 | ```bash 52 | python generate/full.py \ 53 | --prompt "Recommend a movie to watch on the weekend." \ 54 | --checkpoint_dir checkpoints/tiiuae/falcon-7b \ 55 | --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth 56 | ``` 57 | 58 | Output: 59 | 60 | ```text 61 | A good movie to watch on the weekend would be The Lion King, since it's a classic family film that everyone can enjoy... 62 | ``` 63 | 64 | If your GPU supports `bfloat16`, the script will automatically use it. 65 | 66 | ## Tune on your dataset 67 | 68 | With only a few modifications, you can prepare and train on your own instruction dataset. 69 | 70 | 1. Create a json file in which each row holds one instruction-response pair. 71 | A row has an entry for 'instruction', 'input', and 'output', where 'input' is optional an can be 72 | the empty string if the instruction doesn't require a context. Below is an example json file: 73 | 74 | ```text 75 | [ 76 | { 77 | "instruction": "Arrange the given numbers in ascending order.", 78 | "input": "2, 4, 0, 8, 3", 79 | "output": "0, 2, 3, 4, 8" 80 | }, 81 | ... 82 | ] 83 | ``` 84 | 85 | 2. 
Make a copy of `scripts/prepare_alpaca.py` and name it what you want: 86 | 87 | ```bash 88 | cp scripts/prepare_alpaca.py scripts/prepare_mydata.py 89 | ``` 90 | 91 | 3. Modify `scripts/prepare_mydata.py` to read the json data file. 92 | 4. Run the script to generate the preprocessed, tokenized train-val split: 93 | 94 | ```bash 95 | python scripts/prepare_mydata.py --destination_path data/mydata/ 96 | ``` 97 | 98 | 5. Run `finetune/full.py` by passing in the location of your data (and optionally other parameters): 99 | 100 | ```bash 101 | python finetune/full.py \ 102 | --io.train_data_dir data/mydata --io.val_data_dir data/mydata/ \ 103 | --io.checkpoint_dir checkpoints/tiiuae/falcon-7b \ 104 | --io.out_dir data/mydata-finetuned 105 | ``` 106 | -------------------------------------------------------------------------------- /lit_gpt/data/tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from pathlib import Path 4 | from typing import Union, Optional 5 | 6 | from torch.utils.data import DataLoader 7 | 8 | from lit_gpt import Tokenizer 9 | from lit_gpt.data import LitDataModule 10 | 11 | 12 | class TinyLlama(LitDataModule): 13 | """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. 14 | 15 | Provides training and validation streaming dataloaders that return batches of tokens. 16 | 17 | Args: 18 | data_path: The path to the data directory, containing two folders 'slimpajama' and 'starcoder' 19 | which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` 20 | for instructions. The path can also be a remote path (e.g., s3://). 21 | seed: The seed to use for shuffling the training data. 22 | num_workers: The number of workers to use for the dataloaders. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | data_path: Union[str, Path] = Path("data/"), 28 | seed: int = 42, 29 | num_workers: int = 8, 30 | ) -> None: 31 | super().__init__() 32 | self.seed = seed 33 | self.num_workers = num_workers 34 | 35 | self.batch_size = 1 36 | self.seq_length = 2048 37 | 38 | # Could be a remote path (s3://) or a local path 39 | self.slimpajama_train = str(data_path).rstrip("/") + "/slimpajama/train" 40 | self.slimpajama_val = str(data_path).rstrip("/") + "/slimpajama/val" 41 | self.starcoder_train = str(data_path).rstrip("/") + "/starcoder" 42 | 43 | def connect( 44 | self, 45 | tokenizer: Optional[Tokenizer] = None, 46 | batch_size: int = 1, 47 | max_seq_length: Optional[int] = None 48 | ) -> None: 49 | self.batch_size = batch_size 50 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 51 | 52 | def prepare_data(self) -> None: 53 | for path in (self.slimpajama_train, self.slimpajama_val, self.starcoder_train): 54 | if not path.startswith("s3://") and not Path(path).is_dir(): 55 | raise FileNotFoundError( 56 | "The data path for TinyLlama is expected to be the directory containing these subdirectories:" 57 | f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 
58 | ) 59 | 60 | def train_dataloader(self) -> DataLoader: 61 | from lightning.data.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader 62 | 63 | train_datasets = [ 64 | StreamingDataset( 65 | input_dir=self.slimpajama_train, 66 | item_loader=TokensLoader(block_size=self.seq_length), 67 | shuffle=True, 68 | drop_last=True, 69 | ), 70 | StreamingDataset( 71 | input_dir=self.starcoder_train, 72 | item_loader=TokensLoader(block_size=self.seq_length), 73 | shuffle=True, 74 | drop_last=True, 75 | ), 76 | ] 77 | 78 | # Mix SlimPajama data and Starcoder data with these proportions: 79 | weights = (0.693584, 0.306416) 80 | combined_dataset = CombinedStreamingDataset(datasets=train_datasets, seed=self.seed, weights=weights) 81 | train_dataloader = StreamingDataLoader( 82 | combined_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 83 | ) 84 | return train_dataloader 85 | 86 | def val_dataloader(self) -> DataLoader: 87 | from lightning.data.streaming import StreamingDataset, TokensLoader 88 | 89 | val_dataset = StreamingDataset( 90 | input_dir=self.slimpajama_val, 91 | item_loader=TokensLoader(block_size=self.seq_length), 92 | shuffle=True, 93 | # Consider setting to False, but we would lose some samples due to truncation when world size > 1 94 | drop_last=True, 95 | ) 96 | val_dataloader = DataLoader( 97 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 98 | ) 99 | return val_dataloader 100 | -------------------------------------------------------------------------------- /lit_gpt/data/json.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import torch 8 | from torch.utils.data import random_split, DataLoader 9 | from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule 10 | from lit_gpt.data.alpaca import prompt_template 11 | from lit_gpt.tokenizer import Tokenizer 12 | 13 | 14 | class JSON(LitDataModule): 15 | """Loads JSON data for supervised finetuning. 16 | 17 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 18 | 19 | Args: 20 | json_path: A path to a JSON file containing the data. The file should contain a list of samples (dicts). 21 | Each dict must have the keys 'instruction' and 'output', and can optionally have a key 'input' 22 | (see Alpaca). 23 | mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``). 24 | test_split_fraction: A number in the range [0, 1] that determines the fraction of the dataset 25 | to use for testing. 26 | ignore_index: The index to use for elements to be ignored in the label. 27 | seed: The random seed for creating the train/val splits and shuffling the dataset. 28 | num_workers: How many DataLoader processes to use for loading. 
29 | """ 30 | 31 | def __init__( 32 | self, 33 | json_path: Path, 34 | mask_prompt: bool = False, 35 | test_split_fraction: float = 0.1, 36 | ignore_index: int = -1, 37 | seed: int = 42, 38 | num_workers: int = 4, 39 | ) -> None: 40 | super().__init__() 41 | self.json_path = json_path 42 | self.mask_prompt = mask_prompt 43 | self.test_split_fraction = test_split_fraction 44 | self.ignore_index = ignore_index 45 | self.seed = seed 46 | self.num_workers = num_workers 47 | 48 | self.tokenizer: Optional[Tokenizer] = None 49 | self.batch_size: int = 1 50 | self.max_seq_length: int = -1 51 | self.train_dataset: Optional[SFTDataset] = None 52 | self.test_dataset: Optional[SFTDataset] = None 53 | 54 | if not self.json_path.is_file(): 55 | raise FileNotFoundError(f"The file {self.json_path} does not exist.") 56 | 57 | def connect( 58 | self, 59 | tokenizer: Optional[Tokenizer] = None, 60 | batch_size: int = 1, 61 | max_seq_length: Optional[int] = None 62 | ) -> None: 63 | self.tokenizer = tokenizer 64 | self.batch_size = batch_size 65 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 66 | 67 | def setup(self, stage: str = "") -> None: 68 | with open(self.json_path, "r", encoding="utf-8") as file: 69 | data = json.load(file) 70 | 71 | # Partition the dataset into train and test 72 | train_data, test_data = random_split( 73 | data, 74 | [1.0 - self.test_split_fraction, self.test_split_fraction], 75 | generator=torch.Generator().manual_seed(self.seed) 76 | ) 77 | train_data, test_data = list(train_data), list(test_data) 78 | 79 | self.train_dataset = SFTDataset( 80 | data=train_data, 81 | tokenizer=self.tokenizer, 82 | prompt_template=prompt_template, 83 | max_seq_length=self.max_seq_length, 84 | mask_prompt=self.mask_prompt, 85 | ignore_index=self.ignore_index, 86 | ) 87 | self.test_dataset = SFTDataset( 88 | data=test_data, 89 | tokenizer=self.tokenizer, 90 | prompt_template=prompt_template, 91 | max_seq_length=self.max_seq_length, 92 | mask_prompt=self.mask_prompt, 93 | ignore_index=self.ignore_index, 94 | ) 95 | 96 | def train_dataloader(self) -> DataLoader: 97 | return DataLoader( 98 | self.train_dataset, 99 | batch_size=self.batch_size, 100 | shuffle=True, 101 | generator=torch.Generator().manual_seed(self.seed), 102 | num_workers=self.num_workers, 103 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 104 | ) 105 | 106 | def val_dataloader(self) -> DataLoader: 107 | return DataLoader( 108 | self.test_dataset, 109 | batch_size=self.batch_size, 110 | shuffle=False, 111 | num_workers=self.num_workers, 112 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 113 | ) 114 | -------------------------------------------------------------------------------- /lit_gpt/multiple_negative_ranking_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | # from ..SentenceTransformer import SentenceTransformer 5 | from lit_gpt.model import GPT 6 | 7 | 8 | def cos_sim(a: Tensor, b: Tensor) -> Tensor: 9 | """ 10 | Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. 
11 | 12 | :return: Matrix with res[i][j] = cos_sim(a[i], b[j]) 13 | """ 14 | if not isinstance(a, torch.Tensor): 15 | a = torch.tensor(a) 16 | 17 | if not isinstance(b, torch.Tensor): 18 | b = torch.tensor(b) 19 | 20 | if len(a.shape) == 1: 21 | a = a.unsqueeze(0) 22 | 23 | if len(b.shape) == 1: 24 | b = b.unsqueeze(0) 25 | 26 | a_norm = torch.nn.functional.normalize(a, p=2, dim=1) 27 | b_norm = torch.nn.functional.normalize(b, p=2, dim=1) 28 | return torch.mm(a_norm, b_norm.transpose(0, 1)) 29 | 30 | 31 | class MultipleNegativesRankingLoss(nn.Module): 32 | """ 33 | This loss expects as input a batch consisting of sentence pairs (a_1, p_1), (a_2, p_2)..., (a_n, p_n) 34 | where we assume that (a_i, p_i) are a positive pair and (a_i, p_j) for i!=j a negative pair. 35 | 36 | For each a_i, it uses all other p_j as negative samples, i.e., for a_i, we have 1 positive example (p_i) and 37 | n-1 negative examples (p_j). It then minimizes the negative log-likehood for softmax normalized scores. 38 | 39 | This loss function works great to train embeddings for retrieval setups where you have positive pairs (e.g. (query, relevant_doc)) 40 | as it will sample in each batch n-1 negative docs randomly. 41 | 42 | The performance usually increases with increasing batch sizes. 43 | 44 | For more information, see: https://arxiv.org/pdf/1705.00652.pdf 45 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 46 | 47 | You can also provide one or multiple hard negatives per anchor-positive pair by structering the data like this: 48 | (a_1, p_1, n_1), (a_2, p_2, n_2) 49 | 50 | Here, n_1 is a hard negative for (a_1, p_1). The loss will use for the pair (a_i, p_i) all p_j (j!=i) and all n_j as negatives. 51 | 52 | Example:: 53 | 54 | from sentence_transformers import SentenceTransformer, losses, InputExample 55 | from torch.utils.data import DataLoader 56 | 57 | model = SentenceTransformer('distilbert-base-uncased') 58 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1']), 59 | InputExample(texts=['Anchor 2', 'Positive 2'])] 60 | train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32) 61 | train_loss = losses.MultipleNegativesRankingLoss(model=model) 62 | """ 63 | 64 | def __init__(self, model: GPT, scale: float = 20.0, similarity_fct=cos_sim): 65 | """ 66 | :param model: SentenceTransformer model 67 | :param scale: Output of similarity function is multiplied by scale value 68 | :param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1) 69 | """ 70 | super(MultipleNegativesRankingLoss, self).__init__() 71 | self.model = model 72 | self.scale = scale 73 | self.similarity_fct = similarity_fct 74 | self.cross_entropy_loss = nn.CrossEntropyLoss() 75 | 76 | def forward(self, sentence_features: list[Tensor, Tensor], loss_type: str): 77 | embeddings_a_bsz_T_d = sentence_features[0] 78 | embeddings_b_bsz_T_d = sentence_features[1] 79 | if loss_type == "batch_negative": 80 | embeddings_a_bsz_d = embeddings_a_bsz_T_d.reshape(-1, embeddings_a_bsz_T_d.size(-1)) 81 | embeddings_b_bsz_d = embeddings_b_bsz_T_d.reshape(-1, embeddings_b_bsz_T_d.size(-1)) 82 | scores = self.similarity_fct(embeddings_a_bsz_d, embeddings_b_bsz_d) * self.scale # [b, b] 83 | labels = torch.tensor( 84 | range(len(scores)), dtype=torch.long, device=scores.device 85 | ) # Example a[i] should match with b[i] [0, 1, 2, 3, ...] 
86 | accuracy = (torch.argmax(scores, dim=1) == labels).float().mean() # we want to check the retrieval accuracy 87 | return self.cross_entropy_loss(scores, labels), accuracy.clone().detach() 88 | else: 89 | # loss_type == "single_negative" 90 | loss_and_accuracies = [self.forward([embeddings_a_bsz_T_d[i], embeddings_b_bsz_T_d[i]], "batch_negative") for i in range(embeddings_a_bsz_T_d.size(0))] 91 | losses = [x[0] for x in loss_and_accuracies] 92 | accuracies = [x[1] for x in loss_and_accuracies] 93 | return torch.stack(losses).mean(), torch.stack(accuracies).mean().clone().detach() 94 | 95 | def get_config_dict(self): 96 | return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__} -------------------------------------------------------------------------------- /tutorials/oom.md: -------------------------------------------------------------------------------- 1 | ## Dealing with out-of-memory (OOM) errors 2 | 3 | If you got this error while running a script 4 | 5 | ```bash 6 | OutOfMemoryError: CUDA out of memory. Tried to allocate 2.22 GiB. GPU 0 has a total capacity of 79.15 GiB of which 228.38 MiB is free. Including non-PyTorch memory, this process 7 | has 78.93 GiB memory in use. Of the allocated memory 76.28 GiB is allocated by PyTorch, and 2.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory 8 | is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF 9 | ``` 10 | 11 | it means that your GPU memory size wasn't big enough for the model and script configuration. 12 | 13 | Here's a few things you can try: 14 | 15 | ### Reduce the micro batch size 16 | 17 | Adjust the `--train.micro_batch_size` argument in the fine-tuning and pretraining scripts. This variable determines the number of samples loaded per iteration. 18 | 19 | A smaller value will simply load fewer samples simultaneously. The minimum value is 1. 20 | 21 | Experiment with different micro batch sizes to find a balance between memory consumption and computational efficiency. Smaller micro batch sizes consume less memory but may result in slower training convergence. Conversely, larger micro batch sizes require more memory but can accelerate training speed. 22 | 23 | ### Reduce the model's context length 24 | 25 | The context length (`block_size` in the code) plays a significant role in running models with attention. 26 | 27 | * The pretraining scripts are configured to use the full context length of the model to train. 28 | * The finetuning scripts are configured to use the longest sample length of the training data to avoid allocating unnecessary memory (`--train.max_seq_length` argument). 29 | If that's longer than the model's context length, an error is raised. If you try to run a batch that is longer than this, an error is raised. 30 | 31 | However, your hardware may not support such large context lengths. Here's what you can do: 32 | 33 | * For the pretraining scripts, you can simply reduce the `Config(block_size=...)` value. 34 | * For the finetuning scripts, you can trim the length of the samples in your dataset. 35 | Most of the `scripts/prepare_*.py` scripts expose a `--max_seq_length=...` argument. This might also be useful in cases where 36 | sample lengths are highly unbalanced, as the presence of a single very long sample would incur a larger memory usage for all other 37 | shorter samples. For example, the median length of the samples in Alpaca is 110 tokens. 
Truncating the Alpaca dataset to 256 max tokens reduces the memory requirements of a Falcon 7B model from 23.52 GB to 15.73 GB. For more information about the dataset truncation, please see the *Truncating datasets* section in the [prepare_datasets.md](prepare_datasets.md) tutorial. 38 | 39 | Keep in mind that reducing the context length will affect the modelling performance on text sequences longer than the limit. 40 | 41 | ### Use lower precision 42 | 43 | Our scripts expose the `--precision` argument, this directly impacts the memory usage. 44 | 45 | Using true lower precision (`16-true`, `bf16-true`) reduces the memory usage by half compared to `32-true`, however, 46 | the model might start producing NaNs due to the limited range of representable values. 47 | 48 | Mixed precision training (`16-mixed`, `bf16-mixed`) provides better stability but offers limited memory reduction. 49 | 50 | ### Do sharding across multiple GPUs 51 | 52 | For exceptionally large models, the aforementioned techniques might still not suffice. If you have multiple GPUs available, 53 | you can trade off memory for speed by changing the `--devices 1` argument in the scripts. Enabling this option enables a parallelism technique (FSDP), sharding the memory across different GPUs. 54 | 55 | The default configuration already uses activation checkpointing, but you can enable CPU offloading by changing the `cpu_offload=False` argument in the scripts. 56 | 57 | ### Try a different optimizer 58 | 59 | Our scripts use the [`AdamW` optimizer](https://pytorch.org/docs/main/generated/torch.optim.AdamW.html). 60 | It maintains 2 states for each trainable parameter of the model, meaning that the optimizer memory is double compared to 61 | an optimizer like [`SGD`](https://pytorch.org/docs/main/generated/torch.optim.SGD.html). 62 | 63 | You can try replacing it with your optimizer of choice that is lighter in memory requirements. Keep in mind that different optimizers have distinct optimization behaviors, so it's essential to assess their impact on the training process and model performance. 64 | An example would be the recently published [Sophia](https://arxiv.org/abs/2305.14342) or [Lion](https://arxiv.org/abs/2302.06675) optimizers. 65 | 66 | This suggestion is particularly relevant for pretraining, as the trainable parameters in the model represent a small 67 | subset of the total in the fine-tuning scripts. 68 | -------------------------------------------------------------------------------- /tutorials/convert_lit_models.md: -------------------------------------------------------------------------------- 1 | ## Converting Lit-GPT weights to Hugging Face Transformers 2 | 3 | Lit-GPT weights need to be converted to a format that Hugging Face understands with a [conversion script](../scripts/convert_lit_checkpoint.py) before our scripts can run. 4 | 5 | We provide a helpful script to convert models Lit-GPT models back to their equivalent Hugging Face Transformers format: 6 | 7 | ```sh 8 | python scripts/convert_lit_checkpoint.py \ 9 | --checkpoint_path checkpoints/repo_id/lit_model.pth \ 10 | --output_path output_path/converted.pth \ 11 | --config_path checkpoints/repo_id/config.json 12 | ``` 13 | 14 | These paths are just placeholders, you will need to customize them based on which finetuning or pretraining script you ran and it's configuration. 
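Before loading the converted weights elsewhere, a quick sanity check that the output file contains a plain state dict can save debugging time later. A minimal sketch, using the placeholder path from above; the exact key names depend on the model you converted:

```python
import torch

# Load the converted checkpoint on CPU and peek at its contents.
state_dict = torch.load("output_path/converted.pth", map_location="cpu")
print(f"{len(state_dict)} tensors in the converted checkpoint")
print(next(iter(state_dict)))  # one parameter name, e.g. an embedding weight
```
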
15 | 16 | ### Loading converted Lit-GPT checkpoints into transformers 17 | 18 | If you want to load the converted checkpoints into a `transformers` model, please make sure you copied the original `config.json` file into the folder that contains the `converted.pth` file saved via `--output_path` above. 19 | 20 | For example, 21 | 22 | ```bash 23 | cp checkpoints/repo_id/config.json output_path/config.json 24 | ``` 25 | 26 | Then, you can load the checkpoint file in a Python session as follows: 27 | 28 | ```python 29 | import torch 30 | from transformers import AutoModel 31 | 32 | 33 | state_dict = torch.load("output_path/converted.pth") 34 | model = AutoModel.from_pretrained( 35 | "output_path/", local_files_only=True, state_dict=state_dict 36 | ) 37 | ``` 38 | 39 | Alternatively, you can also load the model without copying the `config.json` file as follows: 40 | 41 | ```python 42 | model = AutoModel.from_pretrained("online_repo_id", state_dict=state_dict) 43 | ``` 44 | 45 | 46 | 47 | ### Merging LoRA weights 48 | 49 | Please note that if you want to convert a model that has been fine-tuned using an adapter like LoRA, these weights should be [merged](../scripts/merge_lora.py) to the checkpoint prior to converting. 50 | 51 | ```sh 52 | python scripts/merge_lora.py \ 53 | --checkpoint_dir checkpoints/repo_id \ 54 | --lora_path path/to/litgpt/lora_finetuned.pth \ 55 | --out_dir output_path/merged.ckpt 56 | ``` 57 | 58 |
59 |
60 | 61 | # A finetuning and conversion tutorial 62 | 63 | This section contains a reproducible example for finetuning a Lit-GPT model and converting it back into a HF `transformer` model. 64 | 65 | 1. Download a model of interest: 66 | 67 | For convenience, we first specify an environment variable (optional) to avoid copy and pasting the whole path: 68 | 69 | ```bash 70 | export repo_id=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 71 | ``` 72 | 73 | Instead of using TinyLlama, you can replace the `repo_id` target with any other model repository 74 | specifier that is currently supported by Lit-GPT. You can get a list of supported repository specifier 75 | by running `scripts/download.py` without any additional arguments. 76 | 77 | Then, we download the model we specified via `$repo_id` above: 78 | 79 | ```bash 80 | python scripts/download.py --repo_id $repo_id 81 | ``` 82 | 83 | 2. Convert the model into the Lit-GPT format: 84 | 85 | ```bash 86 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id 87 | ``` 88 | 89 | 3. Prepare a dataset for finetuning: 90 | 91 | ```bash 92 | python scripts/prepare_alpaca.py \ 93 | --checkpoint_dir checkpoints/$repo_id \ 94 | --destination_path data/alpaca 95 | ``` 96 | 97 | 4. Finetune the model: 98 | 99 | 100 | ```bash 101 | export finetuned_dir=out/lit-finetuned-model 102 | 103 | python finetune/lora.py \ 104 | --io.checkpoint_dir checkpoints/$repo_id \ 105 | --io.train_data_dir data/alpaca \ 106 | --io.val_data_dir data/alpaca \ 107 | --train.epochs 1 \ 108 | --io.out_dir $finetuned_dir 109 | ``` 110 | 111 | 5. Merge LoRA weights: 112 | 113 | Note that this step only applies if the model was finetuned with `lora.py` above and not when `full.py` was used for finetuning. 114 | 115 | ```bash 116 | python scripts/merge_lora.py \ 117 | --checkpoint_dir checkpoints/$repo_id \ 118 | --lora_path $finetuned_dir/lit_model_lora_finetuned.pth \ 119 | --out_dir $finetuned_dir/merged/ 120 | ``` 121 | 122 | 123 | 5. Convert the finetuning model back into a HF format: 124 | 125 | ```bash 126 | python scripts/convert_lit_checkpoint.py \ 127 | --checkpoint_path $finetuned_dir/merged/lit_model.pth \ 128 | --output_path out/hf-tinyllama/converted_model.pth \ 129 | --config_path checkpoints/$repo_id/lit_config.json 130 | ``` 131 | 132 | (If you used `full.py` instead of `lora.py` to finetune your model, 133 | replace `$finetuned_dir/merged/lit_model.pth` with `$finetuned_dir/lit_model_finetuned.pth`.) 134 | 135 | 136 | 6. 
Load the model into a `transformers` model: 137 | 138 | ```python 139 | import torch 140 | from transformers import AutoModel 141 | 142 | state_dict = torch.load('out/hf-tinyllama/converted_model.pth') 143 | model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", state_dict=state_dict) 144 | ``` 145 | -------------------------------------------------------------------------------- /lit_gpt/retrieval_attn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def create_dual_triangular_attention_mask(data, eos_id): 6 | bs, max_seq_length = data.size() 7 | attention_mask = torch.zeros(bs, max_seq_length, max_seq_length, dtype=torch.float32, device=data.device) 8 | 9 | for batch_idx in range(bs): 10 | sequence = data[batch_idx] 11 | # Find the indices of eos tokens 12 | eos_indices = (sequence == eos_id).nonzero(as_tuple=False).view(-1) 13 | # Handle cases where the eos_id appears less than 2 times or not at all 14 | if eos_indices.numel() < 2: 15 | # If eos_id does not appear or appears only once, fallback to standard lower triangular mask 16 | # attention_mask[batch_idx, :, :max_seq_length] = torch.tril(torch.ones(max_seq_length, max_seq_length, device=data.device)) 17 | raise ValueError(f"EOS token- {eos_id} does not appear twice in sequence") 18 | else: 19 | # Create mask for the first segment (Prefix) 20 | first_eos_idx = eos_indices[0].item() 21 | attention_mask[batch_idx, :first_eos_idx+1, :first_eos_idx+1] = torch.tril(torch.ones(first_eos_idx+1, first_eos_idx+1, device=data.device)) 22 | # Create mask for the second segment (Suffix) 23 | second_eos_idx = eos_indices[1].item() 24 | attention_mask[batch_idx, first_eos_idx+1:second_eos_idx+1, :second_eos_idx-first_eos_idx] = torch.tril(torch.ones(second_eos_idx-first_eos_idx, second_eos_idx-first_eos_idx, device=data.device)) 25 | 26 | # putting True in rest of the indices (Padding locations) 27 | attention_mask[batch_idx, second_eos_idx+1:, :] = True 28 | 29 | # Reshape the mask to include the additional dimension for heads if necessary 30 | attention_mask = attention_mask.view(bs, 1, max_seq_length, max_seq_length) 31 | return attention_mask 32 | 33 | 34 | def get_ltor_masks_and_position_ids(data, 35 | eod_token, 36 | reset_position_ids, 37 | reset_attention_mask, 38 | eod_mask_loss, 39 | attn_type="doc_block_attn"): 40 | """ 41 | Build masks and position id for left to right model. 42 | Modified from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/utils.py#L162. 43 | """ 44 | 45 | # Extract batch size and sequence length. 46 | micro_batch_size, seq_length = data.size() 47 | 48 | # Attention mask (lower triangular). 
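    # Note: depending on `attn_type`, the mask built below is either a "doc_block_attn"
    # mask (two lower-triangular blocks split at the first two EOS tokens, built by
    # `create_dual_triangular_attention_mask` above) or an "anti_causal_attn" mask
    # (upper triangular). Any other value leaves `attention_mask` unset and the
    # binarization step at the end of this function will fail.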
49 | if reset_attention_mask: 50 | att_mask_batch = micro_batch_size 51 | else: 52 | att_mask_batch = 1 53 | # attention_mask = torch.tril(torch.ones( 54 | # (att_mask_batch, seq_length, seq_length), device=data.device)).view( 55 | # att_mask_batch, 1, seq_length, seq_length) 56 | if attn_type == "doc_block_attn": 57 | attention_mask = create_dual_triangular_attention_mask(data, eod_token) 58 | if attn_type == "anti_causal_attn": 59 | attention_mask = torch.triu(torch.ones( 60 | (att_mask_batch, seq_length, seq_length), device=data.device, dtype=torch.int16)).view( 61 | att_mask_batch, 1, seq_length, seq_length) 62 | 63 | 64 | loss_mask = None 65 | position_ids = None 66 | # text = attention_mask[1, 0].tolist() 67 | # saving text as a txt file 68 | # with open("attention_mask.txt", "w") as f: 69 | # for item in text: 70 | # f.write(f"{item}\n") 71 | 72 | # # Loss mask. 73 | # loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) 74 | # if eod_mask_loss: 75 | # loss_mask[data == eod_token] = 0.0 76 | 77 | # # Position ids. 78 | # position_ids = torch.arange(seq_length, dtype=torch.long, 79 | # device=data.device) 80 | # position_ids = position_ids.unsqueeze(0).expand_as(data) 81 | # # We need to clone as the ids will be modifed based on batch index. 82 | # if reset_position_ids: 83 | # position_ids = position_ids.clone() 84 | 85 | # if reset_position_ids or reset_attention_mask: 86 | # # Loop through the batches: 87 | # for b in range(micro_batch_size): 88 | 89 | # # Find indecies where EOD token is. 90 | # eod_index = position_ids[b, data[b] == eod_token] 91 | # # Detach indecies from positions if going to modify positions. 92 | # if reset_position_ids: 93 | # eod_index = eod_index.clone() 94 | 95 | # # Loop through EOD indecies: 96 | # prev_index = 0 97 | # for j in range(eod_index.size()[0]): 98 | # i = eod_index[j] 99 | # # Mask attention loss. 100 | # if reset_attention_mask: 101 | # attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 102 | # # Reset positions. 103 | # if reset_position_ids: 104 | # position_ids[b, (i + 1):] -= (i + 1 - prev_index) 105 | # prev_index = i + 1 106 | 107 | # Convert attention mask to binary: 108 | attention_mask = (attention_mask > 0.5) 109 | 110 | return attention_mask, loss_mask, position_ids -------------------------------------------------------------------------------- /tutorials/inference.md: -------------------------------------------------------------------------------- 1 | # Inference 2 | 3 | We demonstrate how to run inference (next token prediction) with the GPT base model in the [`generate.py`](generate.py) script: 4 | 5 | ```bash 6 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b 7 | ``` 8 | 9 | Output: 10 | 11 | ```text 12 | Hello, my name is Levi Durrer, I'm an Austrian journalist - Chairman of the Press Blair Party, with 37 years in the Press Blair International, and two years in the Spectre of Austerity for the other. I'm crossing my fingers that you will feel 13 | ``` 14 | 15 | The script assumes you have downloaded and converted the weights as described [here](download_stablelm.md). 16 | 17 | This will run the 3B pre-trained model and require ~7 GB of GPU memory using the `bfloat16` datatype. 18 | 19 | ## Run interactively 20 | 21 | You can also chat with the model interactively: 22 | 23 | ```bash 24 | python chat/base.py --checkpoint_dir checkpoints/stabilityai/stablelm-tuned-alpha-3b 25 | ``` 26 | 27 | This script can work with any checkpoint. 
For the best chat-like experience, we recommend using it with a checkpoint 28 | fine-tuned for chatting such as `stabilityai/stablelm-tuned-alpha-3b` or `togethercomputer/RedPajama-INCITE-Chat-3B-v1`. 29 | 30 | ## Run a large model on one smaller device 31 | 32 | Check out our [quantization tutorial](quantize.md). 33 | 34 | ## Run a large model on multiple smaller devices 35 | 36 | We offer two scripts to leverage multiple devices for inference. 37 | 38 | ### [`generate/sequentially.py`](../generate/sequentially.py) 39 | 40 | This script allows you to run models that wouldn't fit on a single card by partitioning the transformer blocks across all your devices and running them sequentially. 41 | 42 | For instance, `meta-llama/Llama-2-70b-chat-hf` would require ~140 GB of GPU memory to load on a single device, plus the memory for activations. 43 | With 80 transformer layers, we could partition them across 8, 5, 4, or 2 devices. 44 | 45 | ```shell 46 | python generate/sequentially.py \ 47 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 48 | --max_new_tokens 256 \ 49 | --num_samples 2 50 | ``` 51 | 52 | Using A100 40GB GPUs, we need at least 4. You can control the number of devices by setting the `CUDA_VISIBLE_DEVICES=` environment variable. 53 | 54 | | Devices | Max GPU RAM | Token/sec | 55 | |---------|-------------|-----------| 56 | | 2 | OOM | - | 57 | | 4 | 35.64 GB | 7.55 | 58 | | 5 | 28.72 GB | 7.49 | 59 | | 8 | 18.35 GB | 7.47 | 60 | 61 | Note that the memory usage will also depend on the `max_new_tokens` value used. 62 | 63 | The script also supports quantization. Using 4-bit precision, we can now use 2 GPUs: 64 | 65 | ```shell 66 | python generate/sequentially.py \ 67 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 68 | --max_new_tokens 256 \ 69 | --num_samples 2 \ 70 | --quantize bnb.nf4-dq 71 | ``` 72 | 73 | | Devices | Max GPU RAM | Token/sec | 74 | |---------|-------------|-----------| 75 | | 2 | 20.00 GB | 8.63 | 76 | | 4 | 10.80 GB | 8.23 | 77 | | 5 | 8.96 GB | 8.10 | 78 | | 8 | 6.23 GB | 8.18 | 79 | 80 | Smaller devices can also be used to run inference with this technique. 81 | 82 | ### [`generate/tp.py`](../generate/tp.py) 83 | 84 | This script uses tensor parallelism (TP) to run models that wouldn't fit on a single card by sharding the MLP and attention QKV linear layers across all your devices. 85 | 86 | For instance, `meta-llama/Llama-2-70b-chat-hf` would require ~140 GB of GPU memory to load on a single device, plus the memory for activations. 87 | The requirement is that the intermediate size (for the MLP) and the QKV size (for attention) are divisible by the number of devices. 88 | With an intermediate size of 28672, we can use 2, 4, 7, or 8 devices. With a QKV size of 10240, we can use 2, 4, 5, or 8 devices. 89 | Since the script is configured to shard both, the intersection is used: we can only use 2, 4, or 8 devices (the short sketch below recomputes this). 90 | 91 | ```shell 92 | python generate/tp.py \ 93 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 94 | --max_new_tokens 256 \ 95 | --num_samples 2 96 | ``` 97 | 98 | Using A100 40GB GPUs, we need at least 4. You can control the number of devices by setting the `CUDA_VISIBLE_DEVICES=` environment variable. 99 | 100 | | Devices | Max GPU RAM | Token/sec | 101 | |---------|-------------|-----------| 102 | | 2 | OOM | - | 103 | | 4 | 35.46 GB | 9.33 | 104 | | 8 | 18.19 GB | 8.61 | 105 | 106 | Note that the memory usage will also depend on the `max_new_tokens` value used.
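To make the divisibility rule above concrete, here is a small, self-contained sketch (not part of the repository's scripts) that recomputes which device counts work for tensor parallelism; the Llama-2-70B sizes are hard-coded purely for illustration.

```python
# Both the MLP intermediate size and the attention QKV size must be divisible by
# the device count; the usable counts are the intersection of the two sets.
intermediate_size = 28672  # MLP intermediate size quoted above for Llama-2-70B
qkv_size = 10240           # attention QKV size quoted above for Llama-2-70B
max_devices = 8

mlp_ok = {n for n in range(2, max_devices + 1) if intermediate_size % n == 0}
qkv_ok = {n for n in range(2, max_devices + 1) if qkv_size % n == 0}

print("MLP allows:", sorted(mlp_ok))                 # [2, 4, 7, 8]
print("QKV allows:", sorted(qkv_ok))                 # [2, 4, 5, 8]
print("Usable together:", sorted(mlp_ok & qkv_ok))   # [2, 4, 8]
```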
107 | 108 | The script also supports quantization, using 4-bit precision, we can now use 2 GPUs 109 | 110 | ```shell 111 | python generate/tp.py \ 112 | --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ 113 | --max_new_tokens 256 \ 114 | --num_samples 2 \ 115 | --quantize bnb.nf4-dq 116 | ``` 117 | 118 | | Devices | Max GPU RAM | Token/sec | 119 | |---------|-------------|-----------| 120 | | 2 | 19.79 GB | 6.72 | 121 | | 4 | 10.73 GB | 6.48 | 122 | | 8 | 6.15 GB | 6.20 | 123 | 124 | Smaller devices can also be used to run inference with this technique. 125 | -------------------------------------------------------------------------------- /lit_gpt/data/base.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from abc import abstractmethod 3 | from functools import partial 4 | from typing import List, Dict, Union, Optional 5 | 6 | import torch 7 | from torch import Tensor 8 | from torch.utils.data import Dataset 9 | 10 | from lightning import LightningDataModule 11 | from lit_gpt import Tokenizer 12 | 13 | 14 | class LitDataModule(LightningDataModule): 15 | """Base class for all data modules in Lit-GPT.""" 16 | 17 | @abstractmethod 18 | def connect( 19 | self, 20 | tokenizer: Optional[Tokenizer] = None, 21 | batch_size: int = 1, 22 | max_seq_length: Optional[int] = None 23 | ) -> None: 24 | """All settings that can't be determined at the time of instantiation need to be passed through here 25 | before any dataloaders can be accessed. 26 | """ 27 | 28 | def setup(self, stage: str = "") -> None: 29 | # Stub is to redefine the default signature, because the concept of 'stage' does not exist in Lit-GPT 30 | pass 31 | 32 | 33 | class SFTDataset(Dataset): 34 | """An in-memory dataset for supervised finetuning with `input_ids` and `labels`. 35 | 36 | Args: 37 | data: A list of samples (dicts). The target/label must be stored under the key 'output' and the instruction 38 | or other data can be stored under any key as long as it is compatible with the given prompt template. 39 | tokenizer: The tokenizer to use. Should match the one that was used to pretrain the model. 40 | prompt_template: A prompt template (format string or callable). 41 | max_seq_length: Truncate sequences that are longer than this value. By default, no truncation is applied. 42 | mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``). 43 | ignore_index: The index to use for elements to be ignored in the label. 44 | 45 | Returns a dict with two keys: 46 | input_ids: The encoded prompt + response 47 | labels: Same as input_ids, unless ``mask_prompt=True`` in which case the 'prompt' part is replaced with 48 | the ``ignore_index``. 
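
    Example (illustrative only; assumes a ``Tokenizer`` instance named ``tokenizer`` is available):

        dataset = SFTDataset(
            data=[{"instruction": "Say hi", "output": "Hi!"}],
            tokenizer=tokenizer,
            prompt_template="{instruction}: ",
        )
        sample = dataset[0]  # dict with "input_ids" and "labels" tensors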
49 | """ 50 | def __init__( 51 | self, 52 | data: List[Dict[str, str]], 53 | tokenizer: Tokenizer, 54 | prompt_template: Union[str, callable], 55 | max_seq_length: int = -1, 56 | mask_prompt: bool = True, 57 | ignore_index: int = -1, 58 | ) -> None: 59 | self.data = data 60 | self.tokenizer = tokenizer 61 | self.prompt_template = prompt_template 62 | self.max_seq_length = max_seq_length 63 | self.mask_prompt = mask_prompt 64 | self.ignore_index = ignore_index 65 | 66 | def __len__(self) -> int: 67 | return len(self.data) 68 | 69 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: 70 | example = self.data[idx] 71 | prompt = apply_prompt_template(self.prompt_template, example) 72 | prompt_and_response = prompt + example["output"] 73 | encoded_prompt = self.tokenizer.encode(prompt, max_length=self.max_seq_length) 74 | encoded_prompt_and_response = self.tokenizer.encode( 75 | prompt_and_response, 76 | eos=True, 77 | max_length=self.max_seq_length, 78 | ) 79 | 80 | # The labels are the full prompt with response, but with the prompt masked out 81 | labels = encoded_prompt_and_response.clone() 82 | if self.mask_prompt: 83 | labels[: len(encoded_prompt)] = self.ignore_index 84 | 85 | return {"input_ids": encoded_prompt_and_response.type(torch.int64), "labels": labels.type(torch.int64)} 86 | 87 | 88 | def apply_prompt_template(template: Union[str, callable], example: Dict[str, str]) -> str: 89 | if isinstance(template, str): 90 | prompt = template.format(**example) 91 | else: 92 | prompt = template(example) 93 | return prompt 94 | 95 | 96 | def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1): 97 | """Returns the collate function for supervised finetuning (needed in the DataLoader). 98 | 99 | The collate function gets a list of dicts with keys `input_ids` and `labels`. 100 | It returns a dict with batched `input_ids` and `labels`. Also pads short sequences to the longest element in 101 | the batch. Optionally truncates all sequences to the specified maximum length. 102 | """ 103 | return partial(_sft_collate_fn, max_seq_length=max_seq_length, pad_id=pad_id, ignore_index=ignore_index) 104 | 105 | 106 | def _sft_collate_fn( 107 | samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1 108 | ) -> Dict[str, Tensor]: 109 | 110 | batched = {} 111 | for key in ("input_ids", "labels"): 112 | pad_value = pad_id if key == "input_ids" else ignore_index 113 | 114 | # Pad right based on the longest sequence 115 | batched[key] = torch.nn.utils.rnn.pad_sequence( 116 | [sample[key] for sample in samples], batch_first=True, padding_value=pad_value 117 | ) 118 | 119 | # Truncate if needed 120 | if max_seq_length > 0: 121 | batched[key] = batched[key][:, :max_seq_length] 122 | 123 | return batched 124 | -------------------------------------------------------------------------------- /lit_gpt/data/lima.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | """Implementation derived from https://github.com/tloen/alpaca-lora""" 3 | import os 4 | 5 | from typing import Optional, List 6 | 7 | import torch 8 | from torch.utils.data import random_split, DataLoader 9 | from lit_gpt.data import LitDataModule, SFTDataset, get_sft_collate_fn 10 | from lit_gpt.data.alpaca import prompt_template 11 | from lit_gpt.tokenizer import Tokenizer 12 | 13 | 14 | class LIMA(LitDataModule): 15 | """LIMA data module for supervised finetuning. 16 | 17 | Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 18 | """ 19 | 20 | def __init__( 21 | self, 22 | mask_prompt: bool = False, 23 | test_split_fraction: float = 0.1, 24 | ignore_index: int = -1, 25 | seed: int = 42, 26 | include_multiturn_conversations: bool = False, 27 | data_repo_id: str = "GAIR/lima", 28 | access_token: Optional[str] = os.getenv("HF_TOKEN"), 29 | num_workers: int = 4, 30 | ) -> None: 31 | super().__init__() 32 | if access_token is None: 33 | raise ValueError( 34 | "LIMA requires authentication, please set the `HF_TOKEN=your_token` environment" 35 | " variable or pass --access_token=your_token. You can find your token by visiting" 36 | " https://huggingface.co/settings/tokens" 37 | ) 38 | self.mask_prompt = mask_prompt 39 | self.test_split_fraction = test_split_fraction 40 | self.ignore_index = ignore_index 41 | self.seed = seed 42 | self.num_workers = num_workers 43 | 44 | self.access_token = access_token 45 | self.data_repo_id = data_repo_id 46 | self.include_multiturn_conversations = include_multiturn_conversations 47 | 48 | self.tokenizer: Optional[Tokenizer] = None 49 | self.batch_size = 1 50 | self.max_seq_length = -1 51 | self.train_dataset: Optional[SFTDataset] = None 52 | self.test_dataset: Optional[SFTDataset] = None 53 | 54 | def connect( 55 | self, 56 | tokenizer: Optional[Tokenizer] = None, 57 | batch_size: int = 1, 58 | max_seq_length: Optional[int] = None 59 | ) -> None: 60 | self.tokenizer = tokenizer 61 | self.batch_size = batch_size 62 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 63 | 64 | def prepare_data(self) -> None: 65 | from datasets import load_dataset 66 | 67 | load_dataset(self.data_repo_id, token=self.access_token) 68 | 69 | def setup(self, stage: str = "") -> None: 70 | from datasets import load_dataset 71 | 72 | dataset = load_dataset(self.data_repo_id, token=self.access_token) 73 | data = format_dataset(dataset["train"], self.include_multiturn_conversations) 74 | 75 | # Partition the dataset into train and test 76 | train_data, test_data = random_split( 77 | data, 78 | [1.0 - self.test_split_fraction, self.test_split_fraction], 79 | generator=torch.Generator().manual_seed(self.seed) 80 | ) 81 | train_data, test_data = list(train_data), list(test_data) 82 | 83 | self.train_dataset = SFTDataset( 84 | data=train_data, 85 | tokenizer=self.tokenizer, 86 | prompt_template=prompt_template, 87 | max_seq_length=self.max_seq_length, 88 | mask_prompt=self.mask_prompt, 89 | ignore_index=self.ignore_index, 90 | ) 91 | self.test_dataset = SFTDataset( 92 | data=test_data, 93 | tokenizer=self.tokenizer, 94 | prompt_template=prompt_template, 95 | max_seq_length=self.max_seq_length, 96 | mask_prompt=self.mask_prompt, 97 | ignore_index=self.ignore_index, 98 | ) 99 | 100 | def train_dataloader(self) -> DataLoader: 101 | return DataLoader( 102 | self.train_dataset, 103 | batch_size=self.batch_size, 104 | shuffle=True, 105 | generator=torch.Generator().manual_seed(self.seed), 106 | num_workers=self.num_workers, 107 | 
collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), 108 | ) 109 | 110 | def val_dataloader(self) -> DataLoader: 111 | return DataLoader( 112 | self.test_dataset, 113 | batch_size=self.batch_size, 114 | shuffle=False, 115 | num_workers=self.num_workers, 116 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) 117 | ) 118 | 119 | 120 | def format_dataset(dataset_partition: dict, include_multi_turn_conversations: bool) -> List[dict]: 121 | formatted_ds = [] 122 | 123 | for entry in dataset_partition: 124 | convo = entry["conversations"] 125 | if include_multi_turn_conversations: 126 | for i in range(0, len(convo) - 1, 2): 127 | formatted_ds.append({"instruction": convo[i], "input": "", "output": convo[i + 1]}) 128 | else: 129 | formatted_ds.append({"instruction": convo[0], "input": "", "output": convo[1]}) 130 | 131 | return formatted_ds 132 | -------------------------------------------------------------------------------- /tutorials/download_phi.md: -------------------------------------------------------------------------------- 1 | ## Download [phi](https://arxiv.org/abs/2309.05463) weights 2 | 3 | ### Phi 2 4 | 5 | Microsoft Research [released](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) Phi 2, which is a 2.7 billion parameter model trained on "textbook-quality" data with knowledge distillation from Phi 1.5. The model achieves sota results among base LLMs with less than 13B parameters and matches or outperforms models up to 25x larger on complex benchmarks, e.g. it achieves better performance compared to 25x larger Llama-2-70B model on multi-step reasoning tasks, i.e., coding and math. Phi 2 was trained on 1.4T tokens and has not undergone any RLHF alignment nor has it been instruct fine-tuned. Phi 2 shares the same architecture with Phi 1.5 and has context length of 2048 tokens. 6 | The model weights are released under [*Microsoft Research license*](https://huggingface.co/microsoft/phi-2#license). 7 | 8 | To download the model weights and convert them to the lit-gpt format, run 9 | 10 | ```bash 11 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 12 | 13 | python scripts/download.py --repo_id microsoft/phi-2 --from_safetensors True 14 | 15 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-2 16 | ``` 17 | 18 | > [!WARNING] 19 | > Phi-2 used [dropout](https://huggingface.co/microsoft/phi-2/blob/cb2f453/config.json#L26) during training which we don't model, so training will not be equal. 20 | 21 | Inference the model in instruct mode: 22 | 23 | ```bash 24 | python chat/base.py --checkpoint_dir checkpoints/microsoft/phi-2 25 | ``` 26 | ```text 27 | >> Prompt: Write a detailed analogy between mathematics and a lighthouse. 28 | >> Reply: Mathematics is like a lighthouse. Mathematics provides a method to guide us through the sometimes chaotic and confusing waters of life. It provides a structured approach to problems which can help us find our way and provide direction. Just as a lighthouse keeps watch over the sea, mathematics can provide us with the tools to try and make sense of the world. Furthermore, just as a lighthouse keeps a watchful eye on the horizon, mathematics can help us reach our goals by showing us the way. 
29 | ``` 30 | 31 | > [!NOTE] 32 | > In order to obtain appropriate answers, you may need to tweak the [input prompt](https://github.com/Lightning-AI/lit-gpt/blob/74b8df0c3f07fc31d9d1a49e870a1f7955329ad8/chat/base.py#L359). For example, we found that using `"Instruct:{prompt}\nOutput:\n"` instead of `"Instruct:{prompt}\nOutput:"` makes the model generate longer answers in some cases. 33 | 34 | Free generation mode: 35 | ```bash 36 | python generate/base.py --prompt "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\nBob:" --checkpoint_dir checkpoints/microsoft/phi-2 37 | ``` 38 | which yields 39 | ```text 40 | Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions? 41 | Bob: Well, one possible reason could be stress. Have you been feeling overwhelmed lately? 42 | Alice: Yes, I've been juggling multiple deadlines and it's been quite taxing. 43 | Carol: Stress can definitely impact your ability to concentrate. Maybe you need 44 | ``` 45 | 46 | ### Phi 1.5 47 | 48 | A team at Microsoft Research has made available Phi 1.5, a 1.3 billion parameter model optimized for common sense reasoning in natural language that shows performance on par with models 5x its size, especially in grade-school mathematics and basic coding. This model retains characteristics of larger LLMs, and a significant reduction in toxic and biased generations was achieved by avoiding web data. It's also worth highlighting that while this model performs well on language understanding and common sense reasoning tasks, it is a base model that has not undergone any supervised instruction finetuning or finetuning with RLHF. 49 | 50 | The model was trained on the same data sources (7B tokens) as its [phi-1](https://arxiv.org/abs/2306.11644) predecessor, which include 51 | 52 | - a Python code subset from [The Stack](https://arxiv.org/abs/2211.15533) v1.2 53 | - Q&A texts from [StackOverflow](https://archive.org/download/stackexchange) 54 | - code from DeepMind [code_contests](https://github.com/deepmind/code_contests) 55 | - synthetic Python textbooks and exercises generated by [gpt-3.5-turbo-0301](https://platform.openai.com/docs/models/gpt-3-5) 56 | 57 | In addition, to create phi-1.5, the authors included additional textbook-quality synthetic text (roughly 20B tokens) in natural language, which was created using the [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) approach. 58 | 59 | The model weights are released under a [*Microsoft Research license*](https://huggingface.co/microsoft/phi-1_5/blob/main/README.md#license). 60 | 61 | In order to use the phi-1.5 model checkpoint, which requires about 3 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: 62 | 63 | ```bash 64 | pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' 65 | 66 | python scripts/download.py --repo_id microsoft/phi-1_5 67 | 68 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-1_5 69 | ``` 70 | 71 | You're done!
To execute the model just run: 72 | 73 | ```bash 74 | pip install tokenizers 75 | 76 | python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/microsoft/phi-1_5 77 | ``` 78 | -------------------------------------------------------------------------------- /lit_gpt/doc_block_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def get_ltor_masks_and_position_ids(data, 6 | eod_token, 7 | reset_position_ids, 8 | reset_attention_mask, 9 | eod_mask_loss): 10 | """ 11 | Build masks and position id for left to right model. 12 | Modified from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/utils.py#L162. 13 | """ 14 | 15 | # Extract batch size and sequence length. 16 | micro_batch_size, seq_length = data.size() 17 | 18 | # Attention mask (lower triangular). 19 | if reset_attention_mask: 20 | att_mask_batch = micro_batch_size 21 | else: 22 | att_mask_batch = 1 23 | attention_mask = torch.tril(torch.ones( 24 | (att_mask_batch, seq_length, seq_length), device=data.device)).view( 25 | att_mask_batch, 1, seq_length, seq_length) 26 | 27 | # Loss mask. 28 | loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) 29 | if eod_mask_loss: 30 | loss_mask[data == eod_token] = 0.0 31 | 32 | # Position ids. 33 | position_ids = torch.arange(seq_length, dtype=torch.long, 34 | device=data.device) 35 | position_ids = position_ids.unsqueeze(0).expand_as(data) 36 | # We need to clone as the ids will be modifed based on batch index. 37 | if reset_position_ids: 38 | position_ids = position_ids.clone() 39 | 40 | if reset_position_ids or reset_attention_mask: 41 | # Loop through the batches: 42 | for b in range(micro_batch_size): 43 | 44 | # Find indecies where EOD token is. 45 | eod_index = position_ids[b, data[b] == eod_token] 46 | # Detach indecies from positions if going to modify positions. 47 | if reset_position_ids: 48 | eod_index = eod_index.clone() 49 | 50 | # Loop through EOD indecies: 51 | prev_index = 0 52 | for j in range(eod_index.size()[0]): 53 | i = eod_index[j] 54 | # Mask attention loss. 55 | if reset_attention_mask: 56 | attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 57 | # Reset positions. 58 | if reset_position_ids: 59 | position_ids[b, (i + 1):] -= (i + 1 - prev_index) 60 | prev_index = i + 1 61 | 62 | # Convert attention mask to binary: 63 | attention_mask = (attention_mask > 0.5) 64 | 65 | return attention_mask, loss_mask, position_ids 66 | 67 | 68 | def get_cache_attn_masks(data, 69 | cache_token, 70 | reset_position_ids, 71 | reset_attention_mask=True, 72 | cache_mask_loss=True): 73 | """ 74 | Build attention masks for cache tokens. 75 | """ 76 | 77 | # Extract batch size and sequence length. 78 | micro_batch_size, seq_length = data.size() 79 | 80 | # Attention mask (lower triangular). 81 | if reset_attention_mask: 82 | att_mask_batch = micro_batch_size 83 | else: 84 | att_mask_batch = 1 85 | attention_mask = torch.tril(torch.ones( 86 | (att_mask_batch, seq_length, seq_length), device=data.device)).view( 87 | att_mask_batch, 1, seq_length, seq_length) 88 | 89 | # Loss mask. 90 | loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) 91 | # Masking the loss on cache tokens. 92 | if cache_mask_loss: 93 | loss_mask[data == cache_token] = 0.0 94 | 95 | # Position ids. 
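    # The position ids below are a plain 0..seq_length-1 range per sample; resetting
    # them at cache-token boundaries is not implemented for this mask type
    # (`reset_position_ids=True` raises NotImplementedError in the loop further down).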
96 | position_ids = torch.arange(seq_length, dtype=torch.long, 97 | device=data.device) 98 | position_ids = position_ids.unsqueeze(0).expand_as(data) 99 | # We need to clone as the ids will be modifed based on batch index. 100 | if reset_position_ids: 101 | position_ids = position_ids.clone() 102 | 103 | if reset_position_ids or reset_attention_mask: 104 | # Loop through the batches: 105 | for b in range(micro_batch_size): 106 | 107 | # Find indecies where EOD token is. 108 | eod_index = position_ids[b, data[b] == cache_token] 109 | # Detach indecies from positions if going to modify positions. 110 | if reset_position_ids: 111 | eod_index = eod_index.clone() 112 | 113 | # Loop through EOD indecies: 114 | prev_index = 0 115 | for j in range(eod_index.size()[0]): 116 | i = eod_index[j] 117 | # Mask attention loss. 118 | if reset_attention_mask: 119 | # TODO: Attend to all cache tokens when there's a block of consecutive cache tokens. 120 | attention_mask[b, 0, (i + 1):, :i] = 0 # Overlapped attn_mask at the *single* cache_token position. 121 | # Reset positions. 122 | if reset_position_ids: 123 | raise NotImplementedError 124 | # position_ids[b, (i + 1):] -= (i + 1 - prev_index) 125 | # prev_index = i + 1 126 | 127 | # Convert attention mask to binary: 128 | attention_mask = (attention_mask > 0.5) 129 | 130 | return attention_mask, loss_mask, position_ids -------------------------------------------------------------------------------- /scripts/prepare_retrieval_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | # saves the openwebtext dataset to a binary file for training. following was helpful: 4 | # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py 5 | import json 6 | import os 7 | import sys 8 | from pathlib import Path 9 | from typing import Union 10 | from functools import partial 11 | 12 | import numpy as np 13 | from tqdm import tqdm 14 | 15 | # support running without installing as a package 16 | wd = Path(__file__).parent.parent.resolve() 17 | sys.path.append(str(wd)) 18 | 19 | from lit_gpt import Tokenizer 20 | import torch 21 | 22 | 23 | def prepare( 24 | destination_path: Path = Path("/fs/cml-projects/llm-pretraining/llm-retrieval/data/orca_retrieval"), 25 | checkpoint_dir: Path = Path("/fs/cml-projects/llm-pretraining/llm-retrieval/checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"), 26 | seed: int = 42, 27 | cache_dir: Path = Path("/fs/cml-projects/llm-pretraining/llm-retrieval/data/cache"), 28 | test_size: Union[float, int, None] = 0.0005, 29 | max_seq_length: int = None, 30 | data_name: str = "openwebtext", 31 | data_type: str = "pretrain" 32 | ) -> None: 33 | np.random.seed(seed) 34 | from datasets import load_dataset # huggingface datasets 35 | 36 | destination_path.mkdir(parents=True, exist_ok=True) 37 | 38 | if max_seq_length is None: 39 | with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: 40 | config = json.load(file) 41 | max_seq_length = config["block_size"] 42 | 43 | tokenizer = Tokenizer(checkpoint_dir) 44 | 45 | # number of workers in .map() call 46 | # good number to use is ~order number of cpu cores // 2 47 | num_proc = os.cpu_count() // 2 48 | 49 | # number of workers in load_dataset() call 50 | # best number might be different from num_proc above as it also depends on HW speed. 
51 | # it is better than 1 usually though 52 | num_proc_load_dataset = num_proc 53 | 54 | # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) 55 | dataset = load_dataset(data_name, num_proc=num_proc_load_dataset, cache_dir=cache_dir) 56 | test_size = 10000 / len(dataset['train']) # picking 10000 samples for test set 57 | # owt by default only contains the 'train' split, so create a test split 58 | split_dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed, shuffle=True) 59 | val_dataset = split_dataset.pop("test") # rename the test split to val 60 | 61 | def process_instruction_data(examples, max_length=1024): 62 | # writing for batched examples 63 | query_ids = [] 64 | corpus_ids = [] 65 | lens = [] 66 | for question, response in zip(examples["question"], examples["response"]): 67 | query_id = tokenizer.encode(question, max_length=max_length, bos=False, eos=False).tolist() # not adding bos, eos for now 68 | corpus_id = tokenizer.encode(response, max_length=max_length).tolist() # not adding bos, eos for now 69 | if len(query_id) <= max_length and len(corpus_id) <= max_length: 70 | query_ids.append(query_id) 71 | corpus_ids.append(corpus_id) 72 | 73 | return {"query": query_ids, "corpus": corpus_ids, 'query_len': [len(q) for q in query_ids], 'corpus_len': [len(c) for c in corpus_ids]} 74 | 75 | def process_pretrain_data(examples, max_length=1024): 76 | # writing for batched examples 77 | query_ids = [] 78 | corpus_ids = [] 79 | lens = [] 80 | for text in examples["text"]: 81 | # splitting the text at random points and make query and corpus 82 | tokenized_text = tokenizer.encode(text, max_length=max_length, bos=False, eos=False).tolist() 83 | if len(tokenized_text) > 8: # making a random choice that the query and corpus are not too small 84 | pos = np.random.randint(5, len(tokenized_text)) 85 | query_id = tokenized_text[:pos] 86 | corpus_id = tokenized_text[pos:] 87 | query_ids.append(query_id) 88 | corpus_ids.append(corpus_id) 89 | 90 | return {"query": query_ids, "corpus": corpus_ids, 'query_len': [len(q) for q in query_ids], 'corpus_len': [len(c) for c in corpus_ids]} 91 | 92 | # tokenize the dataset 93 | if data_type == "pretrain": 94 | tokenize_func = partial(process_pretrain_data, max_length=max_seq_length) 95 | elif data_type == "instruction": 96 | tokenize_func = partial(process_instruction_data, max_length=max_seq_length) 97 | else: 98 | raise ValueError(f"Invalid data_type: {data_type}; Please choose from 'pretrain' or 'instruction'") 99 | tokenized = val_dataset.map(tokenize_func, desc="tokenizing the splits", batched=True, num_proc=num_proc) 100 | # removing all columns except query and corpus 101 | tokenized = tokenized.remove_columns([col for col in tokenized.column_names if col not in ["query", "corpus", "query_len", "corpus_len"]]) 102 | tokenized = tokenized.add_column("qrel", range(len(tokenized))) 103 | # saving as hf dataset 104 | tokenized.save_to_disk(destination_path) 105 | 106 | 107 | if __name__ == "__main__": 108 | from jsonargparse import CLI 109 | 110 | CLI(prepare) 111 | -------------------------------------------------------------------------------- /scripts/prepare_longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | """Implementation derived from https://github.com/tloen/alpaca-lora""" 4 | 5 | import json 6 | import sys 7 | from pathlib import Path 8 | from typing import Optional 9 | 10 | import torch 11 | from tqdm import tqdm 12 | 13 | # support running without installing as a package 14 | wd = Path(__file__).parent.parent.resolve() 15 | sys.path.append(str(wd)) 16 | 17 | from lit_gpt.tokenizer import Tokenizer 18 | from lit_gpt.utils import CLI 19 | from scripts.prepare_alpaca import download_if_missing 20 | 21 | 22 | def prepare( 23 | destination_path: Path = Path("data/longform"), 24 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 25 | mask_inputs: bool = False, # as in alpaca-lora 26 | ignore_index: int = -1, 27 | max_seq_length: Optional[int] = None, 28 | ) -> None: 29 | """Prepare the Alpaca dataset for instruction tuning. 30 | 31 | The output is a training and test dataset saved as `train.pt` and `test.pt`, 32 | which stores the preprocessed and tokenized prompts and labels. 33 | """ 34 | if max_seq_length is None: 35 | with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: 36 | config = json.load(file) 37 | max_seq_length = config["block_size"] 38 | 39 | destination_path.mkdir(parents=True, exist_ok=True) 40 | 41 | train_file_name = "train.json" 42 | # val_file_name = "val.json" 43 | test_file_name = "test.json" 44 | 45 | train_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" 46 | # val_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/val.json" 47 | test_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" 48 | 49 | train_file_path = destination_path / train_file_name 50 | print("Loading train data file...") 51 | download_if_missing(train_file_path, train_file_url) 52 | with open(train_file_path, "r", encoding="utf-8") as file: 53 | train_data = json.load(file) 54 | 55 | test_file_path = destination_path / test_file_name 56 | print("Loading test data file...") 57 | download_if_missing(test_file_path, test_file_url) 58 | with open(test_file_path, "r", encoding="utf-8") as file: 59 | test_data = json.load(file) 60 | 61 | print("Loading tokenizer...") 62 | tokenizer = Tokenizer(checkpoint_dir) 63 | 64 | print(f"train has {len(train_data):,} samples") 65 | print(f"test has {len(test_data):,} samples") 66 | 67 | print("Processing train set ...") 68 | train_data = [ 69 | prepare_sample( 70 | example=sample, 71 | tokenizer=tokenizer, 72 | max_length=max_seq_length, 73 | mask_inputs=mask_inputs, 74 | ignore_index=ignore_index, 75 | ) 76 | for sample in tqdm(train_data) 77 | ] 78 | torch.save(train_data, destination_path / "train.pt") 79 | 80 | print("Processing test set ...") 81 | test_data = [ 82 | prepare_sample( 83 | example=sample, 84 | tokenizer=tokenizer, 85 | max_length=max_seq_length, 86 | mask_inputs=mask_inputs, 87 | ignore_index=ignore_index, 88 | ) 89 | for sample in tqdm(test_data) 90 | ] 91 | torch.save(test_data, destination_path / "test.pt") 92 | 93 | 94 | def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: 95 | """Processes a single sample. 96 | 97 | Each sample in the dataset consists of: 98 | - instruction: A string describing the task 99 | - input: A string holding a special input value for the instruction. 100 | This only applies to some samples, and in others this is empty. 
101 | - output: The response string 102 | 103 | This function processes this data to produce a prompt text and a label for 104 | supervised training. The prompt text is formed as a single message including both 105 | the instruction and the input. The label/target is the same message but with the 106 | response attached. 107 | 108 | Finally, both the prompt and the label get tokenized. If desired, all tokens 109 | in the label that correspond to the original input prompt get masked out (default). 110 | """ 111 | full_prompt = generate_prompt(example) 112 | full_prompt_and_response = full_prompt + example["output"] 113 | encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) 114 | encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) 115 | 116 | # The labels are the full prompt with response, but with the prompt masked out 117 | labels = encoded_full_prompt_and_response.clone() 118 | if mask_inputs: 119 | labels[: len(encoded_full_prompt)] = ignore_index 120 | 121 | return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} 122 | 123 | 124 | def generate_prompt(example: dict) -> str: 125 | """Generates a standardized message to prompt the model with an instruction and a 126 | 'response' field.""" 127 | 128 | return ( 129 | "Below is an instruction that describes a task, paired with an input that provides further context. " 130 | "Write a response that appropriately completes the request.\n\n" 131 | f"### Instruction:\n{example['input']}\n\n### Response:" 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | CLI(prepare) 137 | -------------------------------------------------------------------------------- /eval/factmem_rephrase.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Sample script 3 | 4 | export OPENAI_API_KEY= 5 | 6 | python lit-gpt-dev/eval/factmem_rephrase.py --model tomg-group-umd/tinyllama_1b_redpajama_wiki2k_200B_tld3-step-00009536 --dataset "tomg-group-umd/RedPajama-Data-V2" --subset sample-100B --split train --num_samples 1000 7 | 8 | 9 | ''' 10 | 11 | import time 12 | import argparse 13 | import os 14 | import jsonlines 15 | import json 16 | from tqdm import tqdm 17 | import torch 18 | 19 | import datasets 20 | from transformers import AutoTokenizer, AutoModelForCausalLM 21 | from transformers import set_seed 22 | 23 | def str2bool(v): 24 | """Human friendly boolean cmdline flag parser.""" 25 | if isinstance(v, bool): 26 | return v 27 | if v.lower() in ("yes", "true", "t", "y", "1"): 28 | return True 29 | elif v.lower() in ("no", "false", "f", "n", "0"): 30 | return False 31 | else: 32 | raise argparse.ArgumentTypeError( 33 | f"Boolean value expected. Got: {str(v)}, " f"which cannot be converted to a boolean." 
34 | ) 35 | 36 | 37 | def process_raw_data(raw_data, dataset): 38 | if dataset == "HuggingFaceTB/cosmopedia": 39 | return raw_data["prompt"] + raw_data["text"] 40 | elif dataset == "stingning/ultrachat": 41 | return "\n\n".join(raw_data["data"]) 42 | elif dataset == "tomg-group-umd/RedPajama-Data-V2": 43 | return raw_data["raw_content"] 44 | else: 45 | try: 46 | return raw_data["text"] 47 | except: 48 | raise NotImplementedError(f"{dataset}") 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--model", default=None, required=True) 54 | 55 | parser.add_argument("--dataset", default=None,required=True) 56 | parser.add_argument("--num_samples", default=500, type=int) 57 | parser.add_argument("--max_length", default=256, type=int) 58 | parser.add_argument("--min_length", default=64, type=int) 59 | parser.add_argument("--save_file_name", default=None, type=str) 60 | parser.add_argument("--seed", default=5, type=int) 61 | 62 | 63 | 64 | 65 | parser.add_argument("--dataset_type", default="huggingface") # huggingface, huggingface_disk 66 | parser.add_argument("--subset", default=None) 67 | parser.add_argument("--split", default=None) 68 | 69 | parser.add_argument("--run_prelim_eval", type=str2bool, default=True) 70 | args = parser.parse_args() 71 | 72 | if args.save_file_name is None: 73 | args.save_file_name = f"rephrase_ppl_expts/rephrased/{args.num_samples}_{args.min_length}_{args.max_length}/rephrased.jsonl" 74 | 75 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 76 | if torch.cuda.is_available(): 77 | print(f"Available GPUs: {torch.cuda.device_count()}") 78 | 79 | 80 | tokenizer = AutoTokenizer.from_pretrained(args.model) 81 | tokenizer.pad_token = tokenizer.eos_token 82 | 83 | 84 | 85 | if args.dataset_type == "huggingface": 86 | raw_dataset = datasets.load_dataset(args.dataset, args.subset, split=args.split, streaming=True) 87 | elif args.dataset_type == "huggingface_disk": 88 | raw_dataset = datasets.load_from_disk(args.dataset) 89 | else: 90 | raise NotImplementedError(f"{args.dataset_type}") 91 | # TODO Add support for our hfds and pkds 92 | 93 | print(raw_dataset) 94 | raw_dataset_iterator = iter(raw_dataset) 95 | 96 | 97 | final_strings = [] 98 | count = 0 99 | with torch.no_grad(): 100 | with tqdm(total=args.num_samples) as pbar: 101 | while count < args.num_samples: 102 | torch.cuda.empty_cache() 103 | raw_data = next(raw_dataset_iterator) 104 | 105 | full_sequence = process_raw_data(raw_data, args.dataset) 106 | inputs = tokenizer(full_sequence, truncation=True, max_length=args.max_length, return_tensors="pt") 107 | 108 | if inputs.input_ids.shape[1] <= args.min_length or inputs.input_ids.shape[1] >= args.max_length : 109 | continue 110 | else: 111 | 112 | final_strings.append(full_sequence) 113 | 114 | pbar.update(1) 115 | count += 1 116 | 117 | 118 | 119 | import openai 120 | 121 | # Load your OpenAI API key from the environment variable 122 | openai.api_key = os.getenv('OPENAI_API_KEY') 123 | 124 | # List of strings you want to rephrase 125 | 126 | def rephrase_strings(strings): 127 | rephrased = [] 128 | for string in strings: 129 | completion = openai.chat.completions.create( 130 | model="gpt-3.5-turbo-1106", 131 | max_tokens=args.max_length, 132 | messages=[ 133 | { 134 | "role": "user", 135 | "content": f"Rephrase this sentence: {string}", 136 | }, 137 | ], 138 | ) 139 | rephrased_text = completion.choices[0].message.content 140 | 141 | rephrased.append({ 142 | "original_text": string, 143 | 
"rephrased_text": rephrased_text 144 | }) 145 | return rephrased 146 | 147 | # Rephrase the strings 148 | rephrased_strings = rephrase_strings(final_strings) 149 | 150 | # Save the rephrased strings to a JSON file 151 | os.makedirs(os.path.dirname(args.save_file_name), exist_ok=True) 152 | with jsonlines.open(args.save_file_name, "w") as linewriter: 153 | for row in tqdm(rephrased_strings): 154 | linewriter.write(row) 155 | 156 | -------------------------------------------------------------------------------- /scripts/prepare_csv.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import logging 5 | import sys 6 | from pathlib import Path 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | from torch.utils.data import random_split 11 | from tqdm import tqdm 12 | 13 | # support running without installing as a package 14 | wd = Path(__file__).parent.parent.resolve() 15 | logger = logging.getLogger(__name__) 16 | sys.path.append(str(wd)) 17 | 18 | from lit_gpt.tokenizer import Tokenizer 19 | from lit_gpt.utils import CLI 20 | 21 | 22 | def prepare( 23 | csv_path: Path, 24 | destination_path: Path = Path("data/csv"), 25 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), 26 | test_split_fraction: float = 0.1, 27 | seed: int = 42, 28 | mask_inputs: bool = False, 29 | ignore_index: int = -1, 30 | max_seq_length: Optional[int] = None, 31 | columns: Tuple[str, ...] = ("instruction", "input", "output"), 32 | ) -> None: 33 | """Prepare a CSV dataset for instruction tuning. 34 | 35 | The output is a training and test dataset saved as `train.pt` and `test.pt`, 36 | which stores the preprocessed and tokenized prompts and labels. 
37 | """ 38 | if max_seq_length is None: 39 | with open(checkpoint_dir / "lit_config.json", "r") as file: 40 | config = json.load(file) 41 | max_seq_length = config["block_size"] 42 | 43 | destination_path.mkdir(parents=True, exist_ok=True) 44 | logger.info("Loading data file ...") 45 | import pandas as pd 46 | 47 | df = pd.read_csv(csv_path, dtype=str).fillna("") 48 | if not (df.columns.values == columns).all(): 49 | raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") 50 | data = json.loads(df.to_json(orient="records", indent=4)) 51 | 52 | print("Loading tokenizer...") 53 | tokenizer = Tokenizer(checkpoint_dir) 54 | 55 | # Partition the dataset into train and test 56 | train_set, test_set = random_split( 57 | data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) 58 | ) 59 | train_set, test_set = list(train_set), list(test_set) 60 | 61 | print(f"train has {len(train_set):,} samples") 62 | print(f"test has {len(test_set):,} samples") 63 | 64 | print("Processing train split ...") 65 | train_set = [ 66 | prepare_sample( 67 | example=sample, 68 | tokenizer=tokenizer, 69 | max_length=max_seq_length, 70 | mask_inputs=mask_inputs, 71 | ignore_index=ignore_index, 72 | ) 73 | for sample in tqdm(train_set) 74 | ] 75 | torch.save(train_set, destination_path / "train.pt") 76 | 77 | print("Processing test split ...") 78 | test_set = [ 79 | prepare_sample( 80 | example=sample, 81 | tokenizer=tokenizer, 82 | max_length=max_seq_length, 83 | mask_inputs=mask_inputs, 84 | ignore_index=ignore_index, 85 | ) 86 | for sample in tqdm(test_set) 87 | ] 88 | torch.save(test_set, destination_path / "test.pt") 89 | 90 | 91 | def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: 92 | """Processes a single sample. 93 | 94 | Each sample in the dataset consists of: 95 | - instruction: A string describing the task 96 | - input: A string holding a special input value for the instruction. 97 | This only applies to some samples, and in others this is empty. 98 | - output: The response string 99 | 100 | This function processes this data to produce a prompt text and a label for 101 | supervised training. The prompt text is formed as a single message including both 102 | the instruction and the input. The label/target is the same message but with the 103 | response attached. 104 | 105 | Finally, both the prompt and the label get tokenized. If desired, all tokens 106 | in the label that correspond to the original input prompt get masked out (default). 
107 | """ 108 | full_prompt = generate_prompt(example) 109 | full_prompt_and_response = full_prompt + example["output"] 110 | encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) 111 | encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) 112 | 113 | # The labels are the full prompt with response, but with the prompt masked out 114 | labels = encoded_full_prompt_and_response.clone() 115 | if mask_inputs: 116 | labels[: len(encoded_full_prompt)] = ignore_index 117 | 118 | return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} 119 | 120 | 121 | def generate_prompt(example: dict) -> str: 122 | """Generates a standardized message to prompt the model with an instruction, optional input and a 123 | 'response' field.""" 124 | 125 | if example["input"]: 126 | return ( 127 | "Below is an instruction that describes a task, paired with an input that provides further context. " 128 | "Write a response that appropriately completes the request.\n\n" 129 | f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" 130 | ) 131 | return ( 132 | "Below is an instruction that describes a task. " 133 | "Write a response that appropriately completes the request.\n\n" 134 | f"### Instruction:\n{example['instruction']}\n\n### Response:" 135 | ) 136 | 137 | 138 | if __name__ == "__main__": 139 | CLI(prepare) 140 | --------------------------------------------------------------------------------