├── source
│ ├── model
│ │ ├── llama2
│ │ │ ├── chatllms
│ │ │ │ ├── __init__.py
│ │ │ │ ├── server
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── train
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── training.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── stream_server.py
│ │ │ │ │ ├── apply_lora.py
│ │ │ │ │ └── logger_utils.py
│ │ │ │ ├── data
│ │ │ │ │ ├── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── convert_alpaca.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── data_loader.py
│ │ │ │ ├── evaluation
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── __version__.py
│ │ │ │ ├── model
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── sample_generate_callback.py
│ │ │ │ │ ├── compute_metrics.py
│ │ │ │ │ └── save_peft_model_callback.py
│ │ │ │ └── configs
│ │ │ │   ├── __init__.py
│ │ │ │   ├── lora_args.py
│ │ │ │   ├── quant_args.py
│ │ │ │   ├── model_args.py
│ │ │ │   ├── infer_args.py
│ │ │ │   ├── train_args.py
│ │ │ │   ├── gen_args.py
│ │ │ │   └── data_args.py
│ │ │ ├── assets
│ │ │ │ └── wechat.jpg
│ │ │ ├── scripts
│ │ │ │ ├── server
│ │ │ │ │ ├── gradio_base_webserver.sh
│ │ │ │ │ ├── run_inference.sh
│ │ │ │ │ ├── gradio_webserver.sh
│ │ │ │ │ ├── gradio_qlora_webserver.sh
│ │ │ │ │ └── apply_lora_to_base_model.sh
│ │ │ │ ├── eval.sh
│ │ │ │ ├── run.sh
│ │ │ │ ├── full_finetune
│ │ │ │ │ ├── full-finetune_ds.sh
│ │ │ │ │ └── full-finetune.sh
│ │ │ │ ├── lora_finetune
│ │ │ │ │ ├── lora-finetune.sh
│ │ │ │ │ └── lora-finetune_ds.sh
│ │ │ │ ├── test_qlora_finetune.sh
│ │ │ │ ├── qlora_finetune
│ │ │ │ │ ├── finetune_baichuan_7b_vicuna_zh.sh
│ │ │ │ │ ├── finetune_llama2_7b_alpaca_zh.sh
│ │ │ │ │ ├── finetune_llama_7b_alpaca_zh.sh
│ │ │ │ │ └── finetune_baichuan_7b_alpaca_zh.sh
│ │ │ │ ├── ds_config
│ │ │ │ │ ├── default_offload_opt_param.json
│ │ │ │ │ └── ds_config_zero3_auto.json
│ │ │ │ └── clean_data.sh
│ │ │ ├── data
│ │ │ │ ├── run_test.yaml
│ │ │ │ ├── belle_group.yaml
│ │ │ │ ├── vicuna_zh.yaml
│ │ │ │ ├── alpaca_zh.yaml
│ │ │ │ ├── alpaca_zh_pcyn.yaml
│ │ │ │ ├── vicuna_zh_pcyn.yaml
│ │ │ │ ├── dataset_info.yaml
│ │ │ │ └── dataset_info.py
│ │ │ ├── examples
│ │ │ │ ├── clean_sharegpt
│ │ │ │ │ ├── clean_evol_instruct.py
│ │ │ │ │ └── merge.py
│ │ │ │ ├── vllm
│ │ │ │ │ ├── vllm_demo.py
│ │ │ │ │ └── apil_chient.py
│ │ │ │ ├── finetune_llm
│ │ │ │ │ ├── baichuan7b_demo.py
│ │ │ │ │ └── finetune_llama_with_qlora.py
│ │ │ │ ├── format_data
│ │ │ │ │ ├── merge.py
│ │ │ │ │ ├── convert_oasst1.py
│ │ │ │ │ ├── convert_vicuna.py
│ │ │ │ │ └── convert_alpaca.py
│ │ │ │ └── test_convdataset.py
│ │ │ ├── requirements.txt
│ │ │ ├── chatbot.py
│ │ │ ├── server
│ │ │ │ ├── multi_chat.py
│ │ │ │ ├── single_chat.py
│ │ │ │ └── gradio_base_webserver.py
│ │ │ ├── train_qlora.py
│ │ │ ├── train.py
│ │ │ └── cli_demo.py
│ │ ├── flan-t5
│ │ │ ├── sample_ablate.sh
│ │ │ ├── run_ft.sh
│ │ │ ├── run_lora.sh
│ │ │ ├── run_prefix.sh
│ │ │ ├── utils.py
│ │ │ └── flan_seq2seq.py
│ │ ├── deepspeed.json
│ │ ├── flan_t5_predict.py
│ │ ├── gpt_predict.py
│ │ └── llama2_predict.py
│ └── arch
│   ├── self_knowledge
│   │ └── sk.py
│   ├── passage_relevance
│   │ └── pr.py
│   └── task_decomposition
│     └── td.py
├── ra-isf.png
├── evaluation.png
├── retrieval_contriever
│ ├── requirements.txt
│ ├── README.md
│ ├── example_scripts
│ │ ├── contriever.sh
│ │ └── mcontriever.sh
│ ├── evaluate_retrieved_passages.py
│ ├── preprocess.py
│ ├── src
│ │ ├── index.py
│ │ ├── inbatch.py
│ │ ├── dist_utils.py
│ │ ├── slurm.py
│ │ ├── normalize_text.py
│ │ ├── moco.py
│ │ └── contriever.py
│ └── generate_passage_embeddings.py
├── requirement.txt
├── run.sh
├── test.py
├── config.py
└── contriever_config.py
/source/model/llama2/chatllms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/server/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/train/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/data/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ra-isf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/ra-isf.png
--------------------------------------------------------------------------------
/evaluation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/evaluation.png
--------------------------------------------------------------------------------
/retrieval_contriever/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.11.0
2 | transformers==4.18.0
3 | beir==1.0.0
4 |
--------------------------------------------------------------------------------
/source/model/llama2/assets/wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OceannTwT/ra-isf/HEAD/source/model/llama2/assets/wechat.jpg
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/gradio_base_webserver.sh:
--------------------------------------------------------------------------------
1 | python gradio_base_webserver.py \
2 | --model_name_or_path /home/robin/work_dir/llm/llm_pretrain_model/baichuan
3 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/__version__.py:
--------------------------------------------------------------------------------
1 | """Version information."""
2 |
3 | # The following line *must* be the last in the module, exactly as formatted:
4 | __version__ = '0.1.0'
5 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/run_inference.sh:
--------------------------------------------------------------------------------
1 | # generated_chat_vicuna
2 | CUDA_VISIBLE_DEVICES=0 python single_chat.py \
3 | --model_name_or_path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/merged_model
--------------------------------------------------------------------------------
/source/model/llama2/data/run_test.yaml:
--------------------------------------------------------------------------------
1 | 100PoisonMpts:
2 | hf_hub_url: 'damo/100PoisonMpts'
3 | local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json
4 | dataset_format: alpaca
5 | multi_turn: False
6 |
--------------------------------------------------------------------------------
/source/model/flan-t5/sample_ablate.sh:
--------------------------------------------------------------------------------
1 | sample_fraction=(0.025 0.05 0.1 0.25 0.5)
2 |
3 | for (( sf=0; sf<5; sf=sf+1 )) do
4 | python flan_classification.py --train_sample_fraction ${sample_fraction[$sf]} & wait
5 | done
6 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/gradio_webserver.sh:
--------------------------------------------------------------------------------
1 | python gradio_webserver.py \
2 | --model_name_or_path decapoda-research/llama-7b-hf \
3 | --lora_model_name_or_path work_dir/oasst1-llama-7b/checkpoint-414/adapter_model
4 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/eval.sh:
--------------------------------------------------------------------------------
1 | python chatllms/evaluation/evaluate_zh.py \
2 | --model_name_or_path ~/checkpoints/baichuan7b \
3 | --split test \
4 | --data_path ~/prompt_data/ceval-exam \
5 | --output_dir ./work_dir/ceval_output
6 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/gradio_qlora_webserver.sh:
--------------------------------------------------------------------------------
1 | python gradio_qlora_webserver.py \
2 | --model_name_or_path decapoda-research/llama-7b-hf \
3 | --lora_model_name_or_path ./work_dir/oasst1-llama-7b/checkpoint-831/adapter_model \
4 | --quant_type nf4 \
5 | --double_quant \
6 | --bits 4 \
7 | --fp16
8 |
--------------------------------------------------------------------------------
/source/model/flan-t5/run_ft.sh:
--------------------------------------------------------------------------------
1 | epochs=(2 5 10)
2 | lora_r=(2 4 8 16)
3 | dropout=(0.1 0.2)
4 |
5 | for (( epoch=0; epoch<3; epoch=epoch+1 )) do
6 | for ((r=0; r<4; r=r+1 )) do
7 | for (( d=0; d<2; d=d+1 )) do
8 | python flan_seq2seq.py --lora_r ${lora_r[$r]} --epochs ${epochs[$epoch]} --dropout ${dropout[$d]} & wait
9 | done
10 | done
11 | done
12 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/server/apply_lora_to_base_model.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python chatllms/utils/apply_lora.py \
2 | --base-model-path ~/checkpoints/baichuan7b/ \
3 | --lora-model-path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/checkpoint-15000 \
4 | --target-model-path ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu/merged_model
--------------------------------------------------------------------------------
/source/model/flan-t5/run_lora.sh:
--------------------------------------------------------------------------------
1 | epochs=(2 5 10)
2 | lora_r=(2 4 8 16)
3 | dropout=(0.1 0.2)
4 |
5 | for (( epoch=0; epoch<3; epoch=epoch+1 )) do
6 | for ((r=0; r<4; r=r+1 )) do
7 | for (( d=0; d<2; d=d+1 )) do
8 | python flan_seq2seq.py --peft_method "lora" --lora_r ${lora_r[$r]} --epochs ${epochs[$epoch]} --dropout ${dropout[$d]} & wait
9 | done
10 | done
11 | done
12 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .load_pretrain_model import load_model_tokenizer
2 | from .mmlueval_callback import MMLUEvalCallback
3 | from .sample_generate_callback import SampleGenerateCallback
4 | from .save_peft_model_callback import SavePeftModelCallback
5 |
6 | __all__ = [
7 | 'load_model_tokenizer', 'MMLUEvalCallback', 'SampleGenerateCallback',
8 | 'SavePeftModelCallback'
9 | ]
10 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.21.0
2 | deepspeed==0.10.1
3 | flash-attn==2.1.1
4 | jsonlines==3.1.0
5 | nltk==3.8.1
6 | numpy==1.24.4
7 | openai==0.27.8
8 | protobuf==4.24.0
9 | safetensors==0.3.2
10 | sentence-transformers==2.2.2
11 | sentencepiece==0.1.99
12 | spacy==2.2.4
13 | tiktoken==0.5.1
14 | tokenizers==0.15.0
15 | torch==2.0.1+cu118
16 | tqdm==4.66.1
17 | transformers==4.35.2
18 | uvicorn==0.23.2
19 | vllm==0.2.1.post1
20 |
--------------------------------------------------------------------------------
/source/model/flan-t5/run_prefix.sh:
--------------------------------------------------------------------------------
1 | epochs=(5 10 15 20)
2 | prefix_tokens=(10 25 50 100)
3 | prefix_projection=(0 1)
4 |
5 | for (( epoch=0; epoch<4; epoch=epoch+1 )) do
6 | for ((pt=0; pt<4; pt=pt+1 )) do
7 | for (( proj=0; proj<2; proj=proj+1 )) do
8 | python flan_seq2seq.py --prefix_tokens ${prefix_tokens[$pt]} --epochs ${epochs[$epoch]} --prefix_projection ${prefix_projection[$proj]} & wait
9 | done
10 | done
11 | done
12 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/clean_sharegpt/clean_evol_instruct.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from clean_sharegpt import get_clean_data, json_dump
4 |
5 | if __name__ == '__main__':
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--in-file', type=str)
8 | parser.add_argument('--out-file', type=str)
9 | args = parser.parse_args()
10 |
11 | clean_data2 = get_clean_data(args)
12 | json_dump(clean_data2, args.out_file)
13 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # nohup sh scripts/finetune/finetune_baichuan_7b_olcc.sh > run2.log 2>&1 &
3 | # nohup sh scripts/multiturn/full-finetune_alpaca_ds.sh > run2.log 2>&1 &
4 | nohup sh scripts/qlora_finetune/multiturn_llama_finetune.sh > run_vicuna_llama_1gpu.log 2>&1 &
5 | nohup sh scripts/qlora_finetune/multiturn_baichuan_finetune.sh > run_vicuna_baichuan_1gpu.log 2>&1 &
6 | nohup sh scripts/qlora_finetune/finetune_baichuan_7b_olcc.sh > run_zh_baichuan_1gpu.log 2>&1 &
7 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | python main.py --engine "llama2-13b" \
2 |     --base_model_path {base_model_path} \
3 |     --self_knowledge_model_path {self_knowledge_model_path} \
4 |     --passage_relevance_model_path {passage_relevance_model_path} \
5 |     --task_decomposition_model_path {task_decomposition_model_path} \
6 |     --data_path {data_path} \
7 |     --n_docs {number of documents to retrieve per question} \
8 |     --model_name_or_path {contriever_model_path} \
9 |     --passages_embedding "wikipedia_embeddings/*"
--------------------------------------------------------------------------------
/source/model/llama2/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 | accelerate
3 | accelerate @ git+https://github.com/huggingface/accelerate.git
4 | bitsandbytes==0.39.0
5 | datasets
6 | deepspeed
7 | einops==0.6.1
8 | evaluate>=0.4.0
9 | gradio
10 | jieba
11 | nltk>=3.8.1
12 | numpy
13 | peft
14 | peft @ git+https://github.com/huggingface/peft.git
15 | rouge-chinese
16 | rouge-score>=0.1.2
17 | sentencepiece
18 | tokenizers
19 | torch
20 | transformers>=4.28.0
21 | transformers @ git+https://github.com/huggingface/transformers.git
22 | wandb==0.15.3
23 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_args import DataArguments
2 | from .gen_args import GenerationArguments
3 | from .infer_args import ModelInferenceArguments
4 | from .lora_args import LoraArguments
5 | from .model_args import ModelArguments
6 | from .quant_args import QuantArguments
7 | from .train_args import TrainingArguments
8 |
9 | __all__ = [
10 | 'DataArguments', 'GenerationArguments', 'ModelArguments',
11 | 'TrainingArguments', 'ModelInferenceArguments', 'LoraArguments',
12 | 'QuantArguments'
13 | ]
14 |
--------------------------------------------------------------------------------
/source/model/deepspeed.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_micro_batch_size_per_gpu": "auto",
3 | "zero_allow_untested_optimizer": true,
4 | "fp16": {
5 | "enabled": "auto",
6 | "loss_scale": 0,
7 | "initial_scale_power": 16,
8 | "loss_scale_window": 1000,
9 | "hysteresis": 2,
10 | "min_loss_scale": 1
11 | },
12 | "zero_optimization": {
13 | "stage": 2,
14 | "allgather_partitions": true,
15 | "allgather_bucket_size": 5e8,
16 | "overlap_comm": false,
17 | "reduce_scatter": true,
18 | "reduce_bucket_size": 5e8,
19 | "contiguous_gradients" : true
20 | }
21 | }
--------------------------------------------------------------------------------
/retrieval_contriever/README.md:
--------------------------------------------------------------------------------
1 | ## Retrieval using Contriever
2 |
3 | We use [Contriever](https://github.com/facebookresearch/contriever) as the retriever.
4 |
5 | ## References
6 |
7 | ```bibtex
8 | @misc{izacard2021contriever,
9 | title={Unsupervised Dense Information Retrieval with Contrastive Learning},
10 | author={Gautier Izacard and Mathilde Caron and Lucas Hosseini and Sebastian Riedel and Piotr Bojanowski and Armand Joulin and Edouard Grave},
11 | year={2021},
12 | url = {https://arxiv.org/abs/2112.09118},
13 | doi = {10.48550/ARXIV.2112.09118},
14 | }
15 | ```
16 |
17 |
--------------------------------------------------------------------------------
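A minimal sketch of how Contriever embeddings can be computed and scored by dot product. It mirrors /test.py, but uses plain `transformers` with explicit mean pooling instead of the repo's `retrieval_contriever.src.contriever.Contriever` wrapper; the public `facebook/contriever-msmarco` checkpoint below is an assumption standing in for a local checkpoint path.

```python
# Sketch only: assumes the public facebook/contriever-msmarco checkpoint;
# the repo's own wrapper in retrieval_contriever/src/contriever.py pools internally.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
model = AutoModel.from_pretrained('facebook/contriever-msmarco')

query = 'Where was Marie Curie born?'
passages = [
    'Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.',
    'Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie.',
]


def mean_pooling(token_embeddings, mask):
    # Zero out padding positions, then average over the sequence length.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]


inputs = tokenizer([query] + passages, padding=True, truncation=True,
                   return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])

# Rank passages by dot-product similarity with the query embedding.
scores = embeddings[1:] @ embeddings[0]
print(scores)
```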
/source/model/llama2/chatllms/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .conv_dataset import make_conversation_data_module
2 | from .data_loader import make_supervised_data_module
3 | from .data_utils import (extract_alpaca_prompt_dataset,
4 | extract_default_prompt_dataset,
5 | extract_random_prompt_dataset)
6 | from .sft_dataset import make_instruction_data_module
7 |
8 | __all__ = [
9 | 'make_conversation_data_module', 'make_supervised_data_module',
10 | 'make_instruction_data_module', 'extract_random_prompt_dataset',
11 | 'extract_alpaca_prompt_dataset', 'extract_default_prompt_dataset'
12 | ]
13 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/vllm/vllm_demo.py:
--------------------------------------------------------------------------------
1 | from vllm import LLM, SamplingParams
2 |
3 | prompts = [
4 | 'Hello, my name is',
5 | 'The president of the United States is',
6 | 'The capital of France is',
7 | 'The future of AI is',
8 | ]
9 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
10 |
11 | llm = LLM(model='decapoda-research/llama-7b-hf', gpu_memory_utilization=0.9)
12 |
13 | # Print the outputs.
14 | for i in range(10):
15 | outputs = llm.generate(prompts, sampling_params)
16 | for output in outputs:
17 | prompt = output.prompt
18 | generated_text = output.outputs[0].text
19 | print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}')
20 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/lora_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 |
4 | @dataclass
5 | class LoraArguments:
6 |     # Rank of the LoRA update: number of columns of matrix A and rows of matrix B
7 | lora_r: int = field(default=64, metadata={'help': 'Lora R dimension.'})
8 |     # Scaling factor applied to the LoRA update
9 | lora_alpha: float = field(default=16, metadata={'help': ' Lora alpha.'})
10 |     # Dropout, a regularization method that mimics ensemble learning
11 | lora_dropout: float = field(default=0.0,
12 | metadata={'help': 'Lora dropout.'})
13 |     # Memory available per GPU, in MB. The default is the 80GB of a high-end A100.
14 | max_memory_MB: int = field(default=80000,
15 | metadata={'help': 'Free memory per gpu.'})
16 | lora_weight_path: str = ''
17 | bias: str = 'none'
18 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/full_finetune/full-finetune_ds.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 train.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --data_path ~/prompt_data/InstructionWild/instinwild_en.json \
4 | --output_dir work_dir/alpaca_full-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "no" \
10 | --save_strategy "steps" \
11 | --save_steps 500 \
12 | --save_total_limit 5 \
13 | --learning_rate 2e-5 \
14 | --weight_decay 0. \
15 | --warmup_ratio 0.03 \
16 | --lr_scheduler_type "cosine" \
17 | --logging_steps 1 \
18 | --deepspeed "scripts/ds_config/ds_config_zero3_auto.json"
19 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/lora_finetune/lora-finetune.sh:
--------------------------------------------------------------------------------
1 | python train_lora.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --dataset_name 100PoisonMpts \
4 | --output_dir work_dir/lora-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "no" \
10 | --save_strategy "steps" \
11 | --save_steps 500 \
12 | --save_total_limit 5 \
13 | --learning_rate 1e-4 \
14 | --weight_decay 0. \
15 | --warmup_ratio 0.03 \
16 | --optim "adamw_torch" \
17 | --lr_scheduler_type "cosine" \
18 | --model_max_length 1024 \
19 | --logging_steps 1 \
20 | --do_train \
21 | --do_eval \
22 | --gradient_checkpointing True
23 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/full_finetune/full-finetune.sh:
--------------------------------------------------------------------------------
1 | python train.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --dataset_name share_gpt \
4 | --output_dir work_dir/full-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "steps" \
10 | --save_strategy "steps" \
11 | --eval_steps 1000 \
12 | --save_steps 1000 \
13 | --save_total_limit 5 \
14 | --logging_steps 1 \
15 | --learning_rate 2e-5 \
16 | --weight_decay 0. \
17 | --warmup_ratio 0.03 \
18 | --optim "adamw_torch" \
19 | --lr_scheduler_type "cosine" \
20 | --gradient_checkpointing True \
21 | --model_max_length 128 \
22 | --trust_remote_code \
23 | --do_train \
24 | --do_eval
25 |
--------------------------------------------------------------------------------
/source/model/flan_t5_predict.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import os
4 | import json
5 |
6 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
7 |
8 |
9 | def model_init(args):
10 | model_path = args.model_path
11 | device = torch.device("cuda:0")
12 | model = AutoModelForSeq2SeqLM.from_pretrained(
13 | model_path,
14 | torch_dtype=torch.float16,
15 | ).to(device)
16 | tokenizer = AutoTokenizer.from_pretrained(model_path)
17 | return model, tokenizer, device
18 |
19 |
20 | def predict(args, prompt, model, tokenizer):
21 | inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
22 | generate_ids = model.generate(**inputs, temperature=args.temperature)
23 |     # Seq2seq generation returns only decoder tokens, so decode the full output.
24 |     infer_res = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
25 | return infer_res
26 |
--------------------------------------------------------------------------------
/source/model/gpt_predict.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from openai import OpenAI
4 |
5 | def predict(args, prompt):
6 |     # Guard against the wrong engine before spending an API call.
7 |     if args.engine == 'llama2-13b':
8 |         raise NotImplementedError(
9 |             'Wrong engine for the GPT-3.5 predictor: {}'.format(args.engine))
10 |     my_key = args.api_key
11 |     max_length = 256
12 |     temperature = 0.0
13 |     top_p = 1
14 |     frequency_penalty = 0
15 |     presence_penalty = 0
16 |     client = OpenAI(api_key=my_key)
17 |     response = client.completions.create(
18 |         model="gpt-3.5-turbo-instruct",  # text-davinci-003 is deprecated
19 |         prompt=prompt,
20 |         max_tokens=max_length,
21 |         temperature=temperature,
22 |         top_p=top_p,
23 |         frequency_penalty=frequency_penalty,
24 |         presence_penalty=presence_penalty,
25 |     )
26 |     return response.choices[0].text
--------------------------------------------------------------------------------
/source/model/llama2_predict.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import os
4 | import json
5 | from transformers import LlamaTokenizer, LlamaForCausalLM, AutoConfig
6 |
7 | def model_init(model_path):
8 | # model_path = args.model_path
9 | device = torch.device("cuda:0")
10 | model = LlamaForCausalLM.from_pretrained(
11 | model_path,
12 | torch_dtype=torch.float16,
13 | ).to(device)
14 | tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=False)
15 | return model, tokenizer
16 |
17 | def predict(args, prompt, model, tokenizer):
18 | inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
19 | generate_ids = model.generate(**inputs, max_length=args.max_length, temperature=args.temperature)
20 | generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
21 | infer_res = tokenizer.decode(generate_ids)
22 | return infer_res
23 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/clean_sharegpt/merge.py:
--------------------------------------------------------------------------------
1 | """
2 | Merge two conversation files into one
3 |
4 | Usage: python3 merge.py --in-file file1.json file2.json --out-file merged.json
5 | """
6 |
7 | import argparse
8 |
9 | from clean_sharegpt import json_dump, json_load
10 |
11 | if __name__ == '__main__':
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--in-file', type=str, required=True, nargs='+')
14 | parser.add_argument('--out-file', type=str, default='merged.json')
15 | args = parser.parse_args()
16 |
17 | new_content = []
18 | for in_file in args.in_file:
19 | content = json_load(in_file)
20 | print(f'in-file: {in_file}, len: {len(content)}')
21 | new_content.extend(content)
22 |
23 | print(f'#out: {len(new_content)}')
24 | print(f'Save new_content to {args.out_file}')
25 | json_dump(new_content, args.out_file)
26 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/lora_finetune/lora-finetune_ds.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 train_lora.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --data_path ~/prompt_data/InstructionWild/instinwild_en.json \
4 | --output_dir work_dir/alpaca_full-finetune \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy "no" \
10 | --save_strategy "steps" \
11 | --save_steps 500 \
12 | --save_total_limit 5 \
13 | --learning_rate 2e-5 \
14 | --weight_decay 0. \
15 | --warmup_ratio 0.03 \
16 | --optim "adamw_torch" \
17 | --lr_scheduler_type "cosine" \
18 | --model_max_length 2048 \
19 | --logging_steps 1 \
20 | --do_train \
21 | --do_eval \
22 | --gradient_checkpointing True \
23 | --deepspeed "scripts/ds_config/ds_config_zero3_auto.json"
24 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/finetune_llm/baichuan7b_demo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 |
4 |
5 | def main(load_in_8bit=True, model_path=''):
6 | tokenizer = AutoTokenizer.from_pretrained(
7 | pretrained_model_name_or_path=model_path, trust_remote_code=True)
8 | model = AutoModelForCausalLM.from_pretrained(
9 | pretrained_model_name_or_path=model_path,
10 | load_in_8bit=load_in_8bit,
11 | torch_dtype=torch.float16,
12 | device_map='auto',
13 | trust_remote_code=True)
14 | inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt')
15 | inputs = inputs.to('cuda:0')
16 | pred = model.generate(**inputs, max_new_tokens=64)
17 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
18 |
19 |
20 | if __name__ == '__main__':
21 | load_in_8bit = True
22 | model_path = '/home/robin/work_dir/llm/llm_pretrain_model/baichuan'
23 | main(load_in_8bit, model_path)
24 |
--------------------------------------------------------------------------------
/source/model/llama2/chatbot.py:
--------------------------------------------------------------------------------
1 | import openai
2 | import gradio as gr
3 |
4 |
5 | if __name__ == "__main__":
6 | openai.api_key = "Your API key"
7 |
8 | messages = [
9 | {"role": "system", "content": "You are a helpful and kind AI Assistant."},
10 | ]
11 |
12 | def chatbot(input):
13 | if input:
14 | messages.append({"role": "user", "content": input})
15 | chat = openai.ChatCompletion.create(
16 | model="gpt-3.5-turbo", messages=messages
17 | )
18 | reply = chat.choices[0].message.content
19 | messages.append({"role": "assistant", "content": reply})
20 | return reply
21 |
22 | inputs = gr.inputs.Textbox(lines=7, label="Chat with AI")
23 | outputs = gr.outputs.Textbox(label="Reply")
24 |
25 | gr.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="AI Chatbot",
26 | description="Ask anything you want",
27 | theme="compact").launch(share=True)
--------------------------------------------------------------------------------
/source/model/llama2/data/belle_group.yaml:
--------------------------------------------------------------------------------
1 | belle_0.5m:
2 | hf_hub_url: BelleGroup/train_0.5M_CN
3 | local_path: ''
4 | dataset_format: alpaca
5 | multi_turn: False
6 |
7 | belle_1m:
8 | hf_hub_url: BelleGroup/train_1M_CN
9 | local_path: ''
10 | dataset_format: alpaca
11 | multi_turn: False
12 |
13 | belle_2m:
14 | hf_hub_url: BelleGroup/train_2M_CN
15 | local_path: ''
16 | dataset_format: alpaca
17 | multi_turn: False
18 |
19 | belle_dialog:
20 | hf_hub_url: BelleGroup/generated_chat_0.4M
21 | local_path: ''
22 | dataset_format: belle_dialog
23 | multi_turn: False
24 |
25 | belle_math:
26 | hf_hub_url: BelleGroup/school_math_0.25M
27 | local_path: ''
28 | dataset_format: alpaca
29 | multi_turn: False
30 |
31 | belle_multiturn:
32 | hf_hub_url: BelleGroup/multi_turn_0.5M
33 | local_path: ''
34 | dataset_format: belle_multiturn
35 | multi_turn: True
36 | columns:
37 | prompt: instruction
38 | query: ''
39 | response: output
40 | history: history
41 |
--------------------------------------------------------------------------------
/source/arch/self_knowledge/sk.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import string
5 |
6 | import os
7 | import argparse
8 | import csv
9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Self_Knowledge_Model():
20 | def __init__(self, model, tokenizer):
21 | self.model = model
22 | self.tokenizer = tokenizer
23 |
24 |     def find_known(self, context, query, temperature=1.0):
25 |         inputs = self.tokenizer(context + query, return_tensors="pt").to('cuda')
26 |         generate_ids = self.model.generate(**inputs, max_length=512, temperature=temperature)
27 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
28 |         result = self.tokenizer.decode(generate_ids)
29 |         if result == "know":
30 |             return True
31 |         elif result == "unknow":
32 |             return False
33 |         else:
34 |             print(f"Invalid output on SKM query: {context + query}")
35 |             return False
36 |
--------------------------------------------------------------------------------
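A hedged usage sketch for the sub-model classes under source/arch: they expect an already-loaded model and tokenizer, for example as returned by `model_init` in source/model/llama2_predict.py. The import paths and checkpoint path below are illustrative assumptions; the actual wiring lives in main.py, which is not shown in this dump.

```python
# Illustrative only: module paths and the checkpoint location are assumptions.
from source.model.llama2_predict import model_init
from source.arch.self_knowledge.sk import Self_Knowledge_Model

# Placeholder path to a fine-tuned self-knowledge checkpoint.
model, tokenizer = model_init('/path/to/self_knowledge_model')
skm = Self_Knowledge_Model(model, tokenizer)

context = 'Answer whether you know the answer to the question.\n'
query = 'Where was Marie Curie born?'
print(skm.find_known(context, query))  # True if the model answers "know"
```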
/source/arch/passage_relevance/pr.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import string
5 |
6 | import os
7 | import argparse
8 | import csv
9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Passage_Relevance_Model():
20 | def __init__(self, model, tokenizer):
21 | self.model = model
22 | self.tokenizer = tokenizer
23 |
24 |     def find_relevance(self, context, query, passage, temperature=1.0):
25 |         inputs = self.tokenizer(context + query + "\nPassage: " + passage, return_tensors="pt").to('cuda')
26 |         generate_ids = self.model.generate(**inputs, max_length=512, temperature=temperature)
27 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
28 |         result = self.tokenizer.decode(generate_ids)
29 |         if result == "relevance":
30 |             return True
31 |         elif result == "irrelevance":
32 |             return False
33 |         else:
34 |             print(f"Invalid output on PRM query: {context + query}")
35 |             return False
36 |
--------------------------------------------------------------------------------
/source/arch/task_decomposition/td.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import string
5 |
6 | import os
7 | import argparse
8 | import csv
9 | import json
10 | import logging
11 | import pickle
12 | import time
13 | import glob
14 |
15 | import numpy as np
16 | import torch
17 | import transformers
18 |
19 | class Task_Decomposition_Model():
20 | def __init__(self, model, tokenizer):
21 | self.model = model
22 | self.tokenizer = tokenizer
23 | self.query_list = list()
24 |
25 |     def decompose(self, context, query, temperature=1.0):
26 |         inputs = self.tokenizer(context + query, return_tensors="pt").to('cuda')
27 |         generate_ids = self.model.generate(**inputs, max_length=512, temperature=temperature)
28 |         generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
29 |         result = self.tokenizer.decode(generate_ids)
30 |         try:
31 |             data = json.loads(result)
32 |             for q in data['query']:
33 |                 self.query_list.append(q)
34 |         except json.JSONDecodeError:
35 |             print(f"Invalid format on TDM query: {context + query}, json_string: {result}")
36 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/test_qlora_finetune.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train_qlora.py \
2 | --model_name_or_path facebook/opt-125m \
3 | --dataset_name olcc \
4 | --output_dir ./work_dir/run_test \
5 | --num_train_epochs 3 \
6 | --max_train_samples 100 \
7 | --max_eval_samples 100 \
8 | --per_device_train_batch_size 4 \
9 | --per_device_eval_batch_size 4 \
10 | --gradient_accumulation_steps 8 \
11 | --evaluation_strategy steps \
12 | --eval_steps 50 \
13 | --save_strategy steps \
14 | --save_total_limit 5 \
15 | --save_steps 100 \
16 | --logging_strategy steps \
17 | --logging_steps 1 \
18 | --learning_rate 0.0002 \
19 | --warmup_ratio 0.03 \
20 | --weight_decay 0.0 \
21 | --lr_scheduler_type constant \
22 | --adam_beta2 0.999 \
23 | --max_grad_norm 0.3 \
24 | --max_new_tokens 32 \
25 | --lora_r 64 \
26 | --lora_alpha 16 \
27 | --lora_dropout 0.1 \
28 | --double_quant \
29 | --quant_type nf4 \
30 | --fp16 \
31 | --bits 4 \
32 | --gradient_checkpointing \
33 | --trust_remote_code \
34 | --do_train \
35 | --do_eval \
36 | --sample_generate \
37 | --data_seed 42 \
38 | --seed 0
39 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/qlora_finetune/finetune_baichuan_7b_vicuna_zh.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=1 python train_qlora.py \
2 | --model_name_or_path ~/checkpoints/baichuan7b \
3 | --dataset_cfg ./data/vicuna_zh_pcyn.yaml \
4 | --output_dir ./work_dir/vicuna_zh-baichuan-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 2 \
7 | --per_device_eval_batch_size 2 \
8 | --gradient_accumulation_steps 16 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --data_seed 42 \
36 | --seed 0
37 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/qlora_finetune/finetune_llama2_7b_alpaca_zh.sh:
--------------------------------------------------------------------------------
1 | python train_qlora.py \
2 | --model_name_or_path meta-llama/Llama-2-7b-hf \
3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \
4 | --output_dir ./work_dir/alpaca_zh_llama2-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --sample_generate \
36 | --data_seed 42 \
37 | --seed 0
38 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/qlora_finetune/finetune_llama_7b_alpaca_zh.sh:
--------------------------------------------------------------------------------
1 | python train_qlora.py \
2 | --model_name_or_path decapoda-research/llama-7b-hf \
3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \
4 | --output_dir ./work_dir/alpaca_zh-baichuan-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --sample_generate \
36 | --data_seed 42 \
37 | --seed 0
38 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/quant_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 |
4 | @dataclass
5 | class QuantArguments:
6 |     # Use 8-bit Adam; this could be swapped for LION or Sophia, and DeepSpeed also offers several 1-bit optimizers.
7 | adam8bit: bool = field(default=False, metadata={'help': 'Use 8-bit adam.'})
8 |     # Whether to use double (nested) quantization
9 | double_quant: bool = field(
10 | default=True,
11 | metadata={
12 | 'help':
13 | 'Compress the quantization statistics through double quantization.'
14 | })
15 |     # Quantization data type: either `fp4` or `nf4`
16 | quant_type: str = field(
17 | default='nf4',
18 | metadata={
19 | 'help':
20 | 'Quantization data type to use. Should be one of `fp4` or `nf4`.'
21 | })
22 |     # Bit width to use; defaults to 4.
23 | bits: int = field(default=4, metadata={'help': 'How many bits to use.'})
24 |
25 | def __post_init__(self):
26 | if self.bits is not None:
27 | assert self.bits in [
28 | 4, 8
29 | ], 'We only accept 4-bit or 8-bit quantization.'
30 |
31 | if self.quant_type is not None:
32 | assert self.quant_type in [
33 | 'nf4', 'fp4'
34 | ], 'We only accept `nf4` or `fp4` quantization type.'
35 |
--------------------------------------------------------------------------------
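A minimal sketch of how these config dataclasses are typically parsed together, assuming `transformers.HfArgumentParser`; the exact wiring inside train_qlora.py is not shown in this dump, so treat the snippet as an assumption rather than the repo's code.

```python
# Sketch only: parse the chatllms config dataclasses from CLI flags such as
# those passed in scripts/qlora_finetune/*.sh (run with the same flags).
from transformers import HfArgumentParser

from chatllms.configs import (LoraArguments, ModelArguments, QuantArguments,
                              TrainingArguments)

parser = HfArgumentParser(
    (ModelArguments, TrainingArguments, LoraArguments, QuantArguments))
model_args, training_args, lora_args, quant_args = \
    parser.parse_args_into_dataclasses()

print(model_args.model_name_or_path, quant_args.bits, lora_args.lora_r)
```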
/source/model/llama2/scripts/qlora_finetune/finetune_baichuan_7b_alpaca_zh.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train_qlora.py \
2 | --model_name_or_path ~/checkpoints/baichuan7b \
3 | --dataset_cfg ./data/alpaca_zh_pcyn.yaml \
4 | --output_dir ./work_dir/alpaca_zh-baichuan-7b \
5 | --num_train_epochs 3 \
6 | --per_device_train_batch_size 4 \
7 | --per_device_eval_batch_size 4 \
8 | --gradient_accumulation_steps 8 \
9 | --evaluation_strategy steps \
10 | --eval_steps 1000 \
11 | --save_strategy steps \
12 | --save_total_limit 10 \
13 | --save_steps 1000 \
14 | --logging_strategy steps \
15 | --logging_steps 5 \
16 | --learning_rate 0.0002 \
17 | --warmup_ratio 0.03 \
18 | --weight_decay 0.0 \
19 | --lr_scheduler_type constant \
20 | --adam_beta2 0.999 \
21 | --max_grad_norm 0.3 \
22 | --lora_r 64 \
23 | --lora_alpha 16 \
24 | --lora_dropout 0.1 \
25 | --double_quant \
26 | --quant_type nf4 \
27 | --fp16 \
28 | --bits 4 \
29 | --model_max_length 1024 \
30 | --gradient_checkpointing \
31 | --trust_remote_code True \
32 | --use_auth_token True \
33 | --do_train \
34 | --do_eval \
35 | --sample_generate \
36 | --data_seed 42 \
37 | --seed 0
38 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoTokenizer
3 | from retrieval_contriever.src.contriever import Contriever
4 |
5 | tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/contriever-msmarco')
6 | model = Contriever.from_pretrained('/root/autodl-tmp/contriever-msmarco')
7 |
8 | sentences = [
9 | "Where was Marie Curie born?",
10 | "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
11 | "Born in Paris on 15 May 1859, 111111 Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
12 | ]
13 |
14 | # Apply tokenizer
15 | inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
16 |
17 | # Compute token embeddings
18 | outputs = model(**inputs)
19 |
20 | # Mean pooling
21 | def mean_pooling(token_embeddings, mask):
22 | token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
23 | sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
24 | return sentence_embeddings
25 | # embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
26 | embeddings = outputs
27 | # print(embeddings[0])
28 | # print(embeddings[1])
29 | score1 = embeddings[0] @ embeddings[1]
30 | score2 = embeddings[0] @ embeddings[2]
31 | print(score1)
32 | print(score2)
--------------------------------------------------------------------------------
/source/model/llama2/examples/format_data/merge.py:
--------------------------------------------------------------------------------
1 | """
2 | Merge two conversation files into one
3 |
4 | Usage: python3 merge.py --in-file file1.json file2.json --out-file merged.json
5 | """
6 |
7 | import argparse
8 | import json
9 |
10 | from datasets import load_dataset
11 |
12 |
13 | def json_load(in_file):
14 | with open(in_file, 'r') as f:
15 | json_data = json.load(f)
16 | return json_data
17 |
18 |
19 | def json_dump(obj, path):
20 | with open(path, 'w', encoding='utf-8') as f:
21 | json.dump(obj, f, indent=2, ensure_ascii=False)
22 |
23 |
24 | def merge_datasets(in_file_list, out_file):
25 |
26 | new_content = []
27 | for in_file in in_file_list:
28 | content = load_dataset('json', data_files=in_file)['train']
29 |
30 | print(f'in-file: {in_file}, len: {len(content)}')
31 | new_content.extend(content)
32 |
33 | print(f'#out: {len(new_content)}')
34 | print(f'Save new_content to {out_file}')
35 | json_dump(new_content, out_file)
36 |
37 |
38 | if __name__ == '__main__':
39 | parser = argparse.ArgumentParser()
40 | parser.add_argument('--in-file', type=str, required=True, nargs='+')
41 | parser.add_argument('--out-file', type=str, default='merged.json')
42 | args = parser.parse_args()
43 |
44 | merge_datasets(args.in_file, args.out_file)
45 |
--------------------------------------------------------------------------------
/source/model/llama2/data/vicuna_zh.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /home/robin/prompt_data/COIG/train_vicuna.json
5 | dataset_format: sharegpt
6 | multi_turn: True
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /home/robin/prompt_data/CValues-Comparison/train_vicuna.json
11 | dataset_format: sharegpt
12 | multi_turn: True
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /home/robin/prompt_data/CValues-Comparison/test_vicuna.json
17 | dataset_format: sharegpt
18 | multi_turn: True
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /home/robin/prompt_data/olcc/olcc_vicuna.json
23 | dataset_format: sharegpt
24 | multi_turn: True
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: ''
28 | local_path: /home/robin/prompt_data/100PoisonMpts/train_vicuna.json
29 | dataset_format: sharegpt
30 | multi_turn: True
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json
35 | dataset_format: sharegpt
36 | multi_turn: True
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json
41 | dataset_format: sharegpt
42 | multi_turn: True
43 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/model_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 |
5 | @dataclass
6 | class ModelArguments:
7 | model_name_or_path: Optional[str] = field(
8 | default='facebook/opt-125m',
9 | metadata={
10 | 'help':
11 | ("The model checkpoint for weights initialization. Don't set if you want to\
12 | train a model from scratch.")
13 | },
14 | )
15 | tokenizer_name: Optional[str] = field(
16 | default=None,
17 | metadata={
18 | 'help':
19 | 'Pretrained tokenizer name or path if not the same as model_name'
20 | })
21 | model_revision: str = field(
22 | default='main',
23 | metadata={
24 | 'help':
25 | 'The specific model version to use (can be a branch name, tag name or commit id).'
26 | },
27 | )
28 | trust_remote_code: Optional[bool] = field(
29 | default=False,
30 | metadata={
31 | 'help':
32 | 'Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained.'
33 | })
34 | use_auth_token: Optional[bool] = field(
35 | default=False,
36 | metadata={
37 | 'help':
38 | 'Enables using Huggingface auth token from Git Credentials.'
39 | })
40 |
--------------------------------------------------------------------------------
/source/model/llama2/data/alpaca_zh.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /home/robin/prompt_data/COIG/train_alpaca.json
5 | dataset_format: alpaca
6 | multi_turn: False
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json
11 | dataset_format: alpaca
12 | multi_turn: False
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json
17 | dataset_format: alpaca
18 | multi_turn: False
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json
23 | dataset_format: alpaca
24 | multi_turn: False
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: 'damo/100PoisonMpts'
28 | local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json
29 | dataset_format: alpaca
30 | multi_turn: False
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json
35 | dataset_format: alpaca
36 | multi_turn: False
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json
41 | dataset_format: alpaca
42 | multi_turn: False
43 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/ds_config/default_offload_opt_param.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupDecayLR",
16 | "params": {
17 | "total_num_steps": "auto",
18 | "warmup_min_lr": "auto",
19 | "warmup_max_lr": "auto",
20 | "warmup_num_steps": "auto"
21 | }
22 | },
23 | "zero_optimization": {
24 | "stage": 3,
25 | "offload_optimizer": {
26 | "device": "cpu",
27 | "pin_memory": true
28 | },
29 | "offload_param": {
30 | "device": "cpu",
31 | "pin_memory": true
32 | },
33 | "overlap_comm": true,
34 | "contiguous_gradients": true,
35 | "sub_group_size": 1e9,
36 | "reduce_bucket_size": "auto",
37 | "stage3_prefetch_bucket_size": "auto",
38 | "stage3_param_persistence_threshold": "auto",
39 | "stage3_max_live_parameters": 1e9,
40 | "stage3_max_reuse_distance": 1e9,
41 | "stage3_gather_16bit_weights_on_model_save": false
42 | },
43 | "gradient_accumulation_steps": "auto",
44 | "gradient_clipping": "auto",
45 | "steps_per_print": 5,
46 | "train_batch_size": "auto",
47 | "train_micro_batch_size_per_gpu": "auto",
48 | "wall_clock_breakdown": false
49 | }
50 |
--------------------------------------------------------------------------------
/source/model/llama2/data/alpaca_zh_pcyn.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /userhome/jianzhnie/prompt_data/COIG/train_alpaca.json
5 | dataset_format: alpaca
6 | multi_turn: False
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_alpaca.json
11 | dataset_format: alpaca
12 | multi_turn: False
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_alpaca.json
17 | dataset_format: alpaca
18 | multi_turn: False
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_alpaca.json
23 | dataset_format: alpaca
24 | multi_turn: False
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: ''
28 | local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_alpaca.json
29 | dataset_format: alpaca
30 | multi_turn: False
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json
35 | dataset_format: alpaca
36 | multi_turn: False
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json
41 | dataset_format: alpaca
42 | multi_turn: False
43 |
--------------------------------------------------------------------------------
/source/model/llama2/data/vicuna_zh_pcyn.yaml:
--------------------------------------------------------------------------------
1 | # This file describes the datasets used in the experiments.
2 | coig:
3 | hf_hub_url: BAAI/COIG
4 | local_path: /userhome/jianzhnie/prompt_data/COIG/train_vicuna.json
5 | dataset_format: sharegpt
6 | multi_turn: True
7 |
8 | cvalues_comparison_train:
9 | hf_hub_url: ''
10 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_vicuna.json
11 | dataset_format: sharegpt
12 | multi_turn: True
13 |
14 | cvalues_comparison_test:
15 | hf_hub_url: ''
16 | local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_vicuna.json
17 | dataset_format: sharegpt
18 | multi_turn: True
19 |
20 | olcc:
21 | hf_hub_url: ''
22 | local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_vicuna.json
23 | dataset_format: sharegpt
24 | multi_turn: True
25 |
26 | 100PoisonMpts:
27 | hf_hub_url: ''
28 | local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_vicuna.json
29 | dataset_format: sharegpt
30 | multi_turn: True
31 |
32 | safety_prompt_part1:
33 | hf_hub_url: ''
34 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json
35 | dataset_format: sharegpt
36 | multi_turn: True
37 |
38 | safety_prompt_part2:
39 | hf_hub_url: ''
40 | local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json
41 | dataset_format: sharegpt
42 | multi_turn: True
43 |
--------------------------------------------------------------------------------
/retrieval_contriever/example_scripts/contriever.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --cpus-per-task=5
3 | #SBATCH --nodes=4
4 | #SBATCH --ntasks-per-node=8
5 | #SBATCH --gres=gpu:8
6 | #SBATCH --time=72:00:00
7 | #SBATCH --job-name=contriever
8 | #SBATCH --output=/private/home/gizacard/contriever/logtrain/%A
9 | #SBATCH --partition=learnlab
10 | #SBATCH --mem=450GB
11 | #SBATCH --signal=USR1@140
12 | #SBATCH --open-mode=append
13 |
14 |
15 | port=$(shuf -i 15000-16000 -n 1)
16 | TDIR="/private/home/gizacard/contriever/encoded-data"
17 | TRAINDATASETS="${TDIR}/wikisub/ ${TDIR}/cc-netsub/"
18 |
19 | rmin=0.05
20 | rmax=0.5
21 | T=0.05
22 | QSIZE=131072
23 | MOM=0.9995
24 | POOL=average
25 | AUG=delete
26 | PAUG=0.1
27 | LC=0.
28 | mo=bert-base-uncased
29 | mp=none
30 |
31 | name=$SLURM_JOB_ID-$POOL-rmin$rmin-rmax$rmax-T$T-$QSIZE-$MOM-$mo-$AUG-$PAUG
32 |
33 | srun ~oceanntwt/anaconda3/envs/contriever/bin/python3 train.py \
34 | --model_path $mp \
35 | --sampling_coefficient $LC \
36 | --retriever_model_id $mo --pooling $POOL \
37 | --augmentation $AUG --prob_augmentation $PAUG \
38 | --train_data $TRAINDATASETS --loading_mode split \
39 | --ratio_min $rmin --ratio_max $rmax --chunk_length 256 \
40 | --momentum $MOM --queue_size $QSIZE --temperature $T \
41 | --warmup_steps 20000 --total_steps 500000 --lr 0.00005 \
42 | --name $name \
43 | --scheduler linear \
44 | --optim adamw \
45 | --per_gpu_batch_size 64 \
46 | --output_dir /checkpoint/oceanntwt/contriever/$name \
47 | --main_port $port \
48 |
49 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/ds_config/ds_config_zero3_auto.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupDecayLR",
24 | "params": {
25 | "total_num_steps": "auto",
26 | "warmup_min_lr": "auto",
27 | "warmup_max_lr": "auto",
28 | "warmup_num_steps": "auto"
29 | }
30 | },
31 | "zero_optimization": {
32 | "stage": 3,
33 | "offload_optimizer": {
34 | "device": "cpu",
35 | "pin_memory": true
36 | },
37 | "offload_param": {
38 | "device": "cpu",
39 | "pin_memory": true
40 | },
41 | "overlap_comm": true,
42 | "contiguous_gradients": true,
43 | "allgather_partitions": true,
44 | "allgather_bucket_size": 5e8,
45 | "sub_group_size": 1e9,
46 | "reduce_bucket_size": "auto",
47 | "stage3_prefetch_bucket_size": "auto",
48 | "stage3_param_persistence_threshold": "auto",
49 | "stage3_max_live_parameters": 1e9,
50 | "stage3_max_reuse_distance": 1e9,
51 | "stage3_gather_16bit_weights_on_model_save": true
52 | },
53 | "train_batch_size": "auto",
54 | "train_micro_batch_size_per_gpu": "auto",
55 | "gradient_accumulation_steps": "auto",
56 | "gradient_clipping": "auto",
57 | "steps_per_print": 5,
58 | "wall_clock_breakdown": false
59 | }
60 |
--------------------------------------------------------------------------------
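
Note on the ZeRO-3 config above: most fields are set to "auto" so that DeepSpeed inherits them from the HuggingFace training arguments. A minimal sketch (values and paths are illustrative, not taken from this repo) of how such a config file is usually wired into the HF Trainer:

    from transformers import TrainingArguments

    # The "auto" entries (lr, betas, batch sizes, warmup steps, ...) are resolved
    # by the HF Trainer from these arguments when the JSON file is passed here.
    training_args = TrainingArguments(
        output_dir='work_dirs/llama2-sft',  # illustrative output path
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=2e-5,
        warmup_steps=100,
        bf16=True,
        deepspeed='scripts/ds_config/ds_config_zero3_auto.json',
    )
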
/source/model/llama2/chatllms/configs/infer_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 |
5 | @dataclass
6 | class ModelInferenceArguments:
7 | cache_dir: Optional[str] = field(default=None)
8 | model_name_or_path: Optional[str] = field(
9 | default='facebook/opt-125m',
10 | metadata={'help': 'Path to pre-trained model'})
11 | model_revision: str = field(
12 | default='main',
13 | metadata={
14 | 'help':
15 | 'The specific model version to use (can be a branch name, tag name or commit id).'
16 | },
17 | )
18 | trust_remote_code: Optional[bool] = field(
19 | default=False,
20 | metadata={
21 | 'help':
22 | 'Allow executing custom model code from the Hub in AutoModelForCausalLM.from_pretrained.'
23 | })
24 | use_auth_token: Optional[bool] = field(
25 | default=False,
26 | metadata={
27 | 'help':
28 | 'Enables using Huggingface auth token from Git Credentials.'
29 | })
30 | model_max_length: int = field(
31 | default=2048,
32 | metadata={
33 | 'help':
34 | 'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
35 | },
36 | )
37 | low_cpu_mem_usage: bool = field(
38 | default=True,
39 | metadata={'help': 'Whether to use low cpu memory usage mode.'})
40 | fp16: bool = field(default=False,
41 | metadata={'help': 'Whether to use fp16.'})
42 | prompt_template: str = field(
43 | default='default',
44 | metadata={
45 | 'help':
46 | 'Prompt template name, e.g. vanilla, alpaca, llama2, vicuna.'
47 | })
48 | source_prefix: Optional[str] = field(
49 | default=None,
50 | metadata={'help': 'Prefix to prepend to every source text.'})
51 |
--------------------------------------------------------------------------------
/retrieval_contriever/example_scripts/mcontriever.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --cpus-per-task=5
3 | #SBATCH --nodes=8
4 | #SBATCH --ntasks-per-node=8
5 | #SBATCH --gres=gpu:8
6 | #SBATCH --time=72:00:00
7 | #SBATCH --job-name=mcontriever
8 | #SBATCH --output=/private/home/oceanntwt/contriever/logtrain/%A
9 | #SBATCH --partition=learnlab
10 | #SBATCH --mem=450GB
11 | #SBATCH --signal=USR1@140
12 | #SBATCH --open-mode=append
13 |
14 |
15 | port=$(shuf -i 15000-16000 -n 1)
16 |
17 | TDIR=/private/home/oceanntwt/contriever/encoded-data/bert-base-multilingual-cased/
18 | TRAINDATASETS="${TDIR}fr_XX ${TDIR}en_XX ${TDIR}ar_AR ${TDIR}bn_IN ${TDIR}fi_FI ${TDIR}id_ID ${TDIR}ja_XX ${TDIR}ko_KR ${TDIR}ru_RU ${TDIR}sw_KE ${TDIR}hu_HU ${TDIR}he_IL ${TDIR}it_IT ${TDIR}km_KM ${TDIR}ms_MY ${TDIR}nl_XX ${TDIR}no_XX ${TDIR}pl_PL ${TDIR}pt_XX ${TDIR}sv_SE ${TDIR}te_IN ${TDIR}th_TH ${TDIR}tr_TR ${TDIR}vi_VN ${TDIR}zh_CN ${TDIR}zh_TW ${TDIR}es_XX ${TDIR}de_DE ${TDIR}da_DK"
19 |
20 | rmin=0.1
21 | rmax=0.5
22 | T=0.05
23 | QSIZE=32768
24 | MOM=0.999
25 | POOL=average
26 | AUG=none
27 | PAUG=0.
28 | LC=0.
29 | mo=bert-base-multilingual-cased
30 | mp=none
31 |
32 | name=$SLURM_JOB_ID-$POOL-rmin$rmin-rmax$rmax-T$T-$QSIZE-$MOM-$mo-$AUG-$PAUG
33 |
34 | srun ~oceanntwt/anaconda3/envs/pytorch10/bin/python3 ~oceanntwt/contriever/train.py \
35 | --model_path $mp \
36 | --sampling_coefficient $LC \
37 | --augmentation $AUG --prob_augmentation $PAUG \
38 | --retriever_model_id $mo --pooling $POOL \
39 | --train_data $TRAINDATASETS --loading_mode split \
40 | --ratio_min $rmin --ratio_max $rmax --chunk_length 256 \
41 | --momentum $MOM --queue_size $QSIZE --temperature $T \
42 | --warmup_steps 20000 --total_steps 500000 --lr 0.00005 \
43 | --name $name \
44 | --scheduler linear \
45 | --optim adamw \
46 | --per_gpu_batch_size 64 \
47 | --output_dir /checkpoint/oceanntwt/contriever/xling/$name \
48 | --main_port $port \
49 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/utils/stream_server.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers to support streaming generate output.
3 | Borrowed from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/callbacks.py
4 | """
5 | import traceback
6 | from queue import Queue
7 | from threading import Thread
8 |
9 | import transformers
10 |
11 |
12 | class Stream(transformers.StoppingCriteria):
13 | def __init__(self, callback_func=None):
14 | self.callback_func = callback_func
15 |
16 | def __call__(self, input_ids, scores) -> bool:
17 | if self.callback_func is not None:
18 | self.callback_func(input_ids[0])
19 | return False
20 |
21 |
22 | class Iteratorize:
23 | """
24 | Transforms a function that takes a callback
25 | into a lazy iterator (generator).
26 | """
27 | def __init__(self, func, kwargs=None, callback=None):
28 | self.mfunc = func
29 | self.c_callback = callback
30 | self.q = Queue()
31 | self.sentinel = object()
32 | self.kwargs = kwargs if kwargs is not None else {}
33 | self.stop_now = False
34 |
35 | def _callback(val):
36 | if self.stop_now:
37 | raise ValueError
38 | self.q.put(val)
39 |
40 | def gentask():
41 | try:
42 | ret = self.mfunc(callback=_callback, **self.kwargs)
43 | except ValueError:
44 | ret = None
45 | except Exception:
46 | traceback.print_exc()
47 | ret = None
48 |
49 | self.q.put(self.sentinel)
50 | if self.c_callback:
51 | self.c_callback(ret)
52 |
53 | self.thread = Thread(target=gentask)
54 | self.thread.start()
55 |
56 | def __iter__(self):
57 | return self
58 |
59 | def __next__(self):
60 | obj = self.q.get(True, None)
61 | if obj is self.sentinel:
62 | raise StopIteration
63 | else:
64 | return obj
65 |
66 | def __enter__(self):
67 | return self
68 |
69 | def __exit__(self, exc_type, exc_val, exc_tb):
70 | self.stop_now = True
71 |
--------------------------------------------------------------------------------
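
A minimal usage sketch for the Iteratorize helper above: it runs a callback-style producer on a background thread and exposes the callback values as a lazy iterator. The fake_generate function below is hypothetical (a stand-in for a generate() call that accepts a streaming callback), and the import assumes the llama2 package root is on PYTHONPATH:

    from chatllms.utils.stream_server import Iteratorize

    def fake_generate(callback=None, prompt=''):
        # Hypothetical producer: emit "tokens" one by one through the callback.
        for token in (prompt + ' -> streamed output').split():
            callback(token)

    with Iteratorize(fake_generate, kwargs={'prompt': 'hello world'}) as stream:
        for token in stream:
            print(token, end=' ', flush=True)
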
/source/model/llama2/chatllms/data/data_loader.py:
--------------------------------------------------------------------------------
1 | from transformers.tokenization_utils import PreTrainedTokenizer
2 |
3 | from .conv_dataset import ConversationDataset, VicunaDataset
4 | from .data_utils import make_data_module
5 | from .sft_dataset import (DataCollatorForSupervisedDataset,
6 | SFTInstructionDataset)
7 |
8 |
9 | def make_supervised_data_module(tokenizer: PreTrainedTokenizer, args):
10 | train_dataset, eval_dataset, multi_turn = make_data_module(args)
11 | max_seq_length = tokenizer.model_max_length
12 | dataset_cls = (VicunaDataset if args.conversation_template == 'vicuna' else
13 | ConversationDataset)
14 |
15 | if not multi_turn:
16 | train_dataset = SFTInstructionDataset(
17 | train_dataset,
18 | tokenizer=tokenizer,
19 | max_seq_len=max_seq_length,
20 | ) if args.do_train else None
21 |
22 | eval_dataset = SFTInstructionDataset(
23 | eval_dataset,
24 | tokenizer=tokenizer,
25 | max_seq_len=max_seq_length,
26 | ) if args.do_eval else None
27 |
28 | else:
29 | train_dataset = dataset_cls(
30 | train_dataset,
31 | tokenizer=tokenizer,
32 | max_seq_length=max_seq_length,
33 | ) if args.do_train else None
34 | eval_dataset = dataset_cls(
35 | eval_dataset,
36 | tokenizer=tokenizer,
37 | max_seq_length=max_seq_length,
38 | ) if args.do_eval else None
39 |
40 | print(
41 | f'train_dataset: {type(train_dataset)}, multi-turn: {multi_turn}, #length: {len(train_dataset)}'
42 | ) if args.do_train else None
43 | print(
44 | f'eval_dataset: {type(eval_dataset)}, multi-turn: {multi_turn}, #length: {len(eval_dataset)}'
45 | ) if args.do_eval else None
46 |
47 | print('Adding data collator: ', DataCollatorForSupervisedDataset)
48 | data_collator = DataCollatorForSupervisedDataset(
49 | tokenizer=tokenizer, predict_with_generate=args.predict_with_generate)
50 |
51 | return {
52 | 'train_dataset': train_dataset,
53 | 'eval_dataset': eval_dataset,
54 | 'data_collator': data_collator
55 | }
56 |
--------------------------------------------------------------------------------
/source/model/flan-t5/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datasets
3 |
4 | from datasets import load_dataset, concatenate_datasets, DatasetDict
5 | from transformers import AutoTokenizer
6 | from sklearn.model_selection import train_test_split
7 | from typing import List, Union
8 |
9 |
10 | def clean_text(
11 | texts: List[Union[str, None]], labels: List[Union[str, None]]
12 | ) -> pd.DataFrame:
13 | """
14 | The News Group dataset needs to be preprocessed as it has a lot of
15 | entries with NULL text and/or NULL labels.
16 | In this function we simply filter out the NULL entries, and
17 | return a new dataframe with clean texts and labels.
18 | """
19 | new_texts, new_labels = [], []
20 | for text, label in zip(texts, labels):
21 | if isinstance(text, str) and isinstance(label, str):
22 | new_texts.append(text)
23 | new_labels.append(label)
24 | new_ids = [i for i in range(len(new_texts))]
25 | df = pd.DataFrame(data={"id": new_ids, "text": new_texts, "label": new_labels})
26 |
27 | return df
28 |
29 | def get_data(tokenizer: AutoTokenizer) -> List[Union[DatasetDict, int, int]]:
30 | dataset_id = "nq_open"
31 | # Load dataset from the hub
32 | dataset = load_dataset(dataset_id)
33 |
34 | print(f"Train dataset size: {len(dataset['train'])}")
35 | print(f"Test dataset size: {len(dataset['validation'])}") # if validate
36 |
37 | tokenized_inputs = concatenate_datasets([dataset["train"]]).map(
38 | lambda x: tokenizer(x["question"], truncation=True),
39 | batched=True,
40 | remove_columns=["question", "answer"],
41 | )
42 |
43 | max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
44 | print(f"Max source length: {max_source_length}")
45 |
46 | tokenized_targets = concatenate_datasets([dataset["train"], dataset["validation"]]).map(
47 | lambda x: tokenizer(x["answer"], truncation=True),
48 | batched=True,
49 | remove_columns=["question", "answer"],
50 | )
51 |
52 | max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
53 | print(f"Max target length: {max_target_length}")
54 |
55 | return dataset, max_source_length, max_target_length
56 |
--------------------------------------------------------------------------------
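
A quick illustrative check of clean_text() above: pairs where either the text or the label is not a string are dropped and ids are re-assigned. The toy values are made up, and the import assumes the flan-t5 directory is the working directory:

    from utils import clean_text

    texts = ['how tall is the eiffel tower', None, 'who wrote hamlet']
    labels = ['330 m', 'placeholder', None]
    df = clean_text(texts, labels)
    print(df)  # only the first (text, label) pair survives, with id 0
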
/retrieval_contriever/evaluate_retrieved_passages.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import argparse
8 | import json
9 | import logging
10 | import glob
11 |
12 | import numpy as np
13 | import torch
14 |
15 | import src.utils
16 |
17 | from src.evaluation import calculate_matches
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 | def validate(data, workers_num):
22 | match_stats = calculate_matches(data, workers_num)
23 | top_k_hits = match_stats.top_k_hits
24 |
25 | #logger.info('Validation results: top k documents hits %s', top_k_hits)
26 | top_k_hits = [v / len(data) for v in top_k_hits]
27 | #logger.info('Validation results: top k documents hits accuracy %s', top_k_hits)
28 | return top_k_hits
29 |
30 |
31 | def main(opt):
32 | logger = src.utils.init_logger(opt, stdout_only=True)
33 | datapaths = glob.glob(opt.data)
34 | r20, r100 = [], []
35 | for path in datapaths:
36 | data = []
37 | with open(path, 'r') as fin:
38 | for line in fin:
39 | data.append(json.loads(line))
40 | #data = json.load(fin)
41 | answers = [ex['answers'] for ex in data]
42 | top_k_hits = validate(data, opt.validation_workers)
43 | message = f"Evaluate results from {path}:"
44 | for k in [5, 10, 20, 100]:
45 | if k <= len(top_k_hits):
46 | recall = 100 * top_k_hits[k-1]
47 | if k == 20:
48 | r20.append(f"{recall:.1f}")
49 | if k == 100:
50 | r100.append(f"{recall:.1f}")
51 | message += f' R@{k}: {recall:.1f}'
52 | logger.info(message)
53 | print(datapaths)
54 | print('\t'.join(r20))
55 | print('\t'.join(r100))
56 |
57 |
58 | if __name__ == '__main__':
59 | parser = argparse.ArgumentParser()
60 |
61 | parser.add_argument('--data', required=True, type=str, default=None)
62 | parser.add_argument('--validation_workers', type=int, default=16,
63 | help="Number of parallel processes to validate results")
64 |
65 | args = parser.parse_args()
66 | main(args)
67 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def parse_arguments():
5 | parser = argparse.ArgumentParser()
6 |
7 | parser.add_argument(
8 | "--iteration_max_time", type=int, default=3, help="maxinum iteration in RA-iSF."
9 | )
10 | parser.add_argument(
11 | "--temperature", type=float, default=0, help=""
12 | )
13 | parser.add_argument(
14 | "--max_length", type=int, default=256, help="maxinum generation of base model"
15 | )
16 | parser.add_argument(
17 | "--type_list_file", default="./src/format/entity_type_list.txt", type=str, help='file path'
18 | )
19 | parser.add_argument(
20 | "--prompt_id", default='324', help='string'
21 | )
22 | parser.add_argument(
23 | "--infer_num", default='5', help='string'
24 | )
25 | parser.add_argument(
26 | "--engine", default='llama2-13b', help="llama2-7b, llama2-13b, gpt-3.5",
27 | choices=["llama2-7b", "llama2-13b", "gpt-3.5"]
28 | )
29 | parser.add_argument(
30 | "--api_key", default="", help="gpt3.5 api key"
31 | )
32 | parser.add_argument(
33 | "--base_model_path", default='/root/autodl-tmp/llama-7b-hf', help="your local model path"
34 | )
35 | parser.add_argument(
36 | "--self_knowledge_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel self-knowledge path"
37 | )
38 | parser.add_argument(
39 | "--passage_relevance_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel passage_relevance path"
40 | )
41 | parser.add_argument(
42 | "--task_decomposition_model_path", default='/root/autodl-tmp/llama-7b-hf', help="submodel task_decomposition path"
43 | )
44 | parser.add_argument(
45 | "--data_path", default='/root/workspace/ra-isf/dataset/natural_question/nq_open.json', help="your local data path"
46 | )
47 | parser.add_argument(
48 | "--output_path", default='/root/workspace/ra-isf/output/output.json', help="your local output file data path"
49 | )
50 | parser.add_argument(
51 | "--test_start", default='0', help='string, number'
52 | )
53 | parser.add_argument(
54 | "--test_end", default='full', help='string, number'
55 | )
56 | parsed_args = parser.parse_args()
57 | return parsed_args
58 |
59 |
60 | args = parse_arguments()
61 |
--------------------------------------------------------------------------------
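
Because config.py calls parse_arguments() at import time and exposes the result as a module-level args object, other scripts can simply import it. A small hedged sketch of consuming these options (converting the string-typed range options is an assumption based on their defaults '0' and 'full'):

    from config import args  # note: importing this module parses sys.argv

    print(args.engine)            # e.g. 'llama2-13b'
    start = int(args.test_start)  # numeric options are passed as strings
    end = None if args.test_end == 'full' else int(args.test_end)
    print(f'evaluating examples from {start} to {end if end is not None else "the end"}')
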
/contriever_config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | def parse_retriever_arguments():
4 | parser = argparse.ArgumentParser()
5 |
6 | parser.add_argument(
7 | "--data",
8 | # required=True,
9 | type=str,
10 | default=None,
11 | help=".json file containing question and answers, similar format to reader data",
12 | )
13 | parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
14 | parser.add_argument("--passages_embeddings", type=str, default=None, help="Glob path to encoded passages")
15 | parser.add_argument(
16 | "--output_dir", type=str, default=None, help="Results are written to outputdir with data suffix"
17 | )
18 | parser.add_argument("--n_docs", type=int, default=100, help="Number of documents to retrieve per questions")
19 | parser.add_argument(
20 | "--validation_workers", type=int, default=32, help="Number of parallel processes to validate results"
21 | )
22 | parser.add_argument("--per_gpu_batch_size", type=int, default=64, help="Batch size for question encoding")
23 | parser.add_argument(
24 | "--save_or_load_index", action="store_true", help="If enabled, save index and load index if it exists"
25 | )
26 | parser.add_argument(
27 | "--model_name_or_path", type=str, default="/root/autodl-tmp/contriever-msmarco", help="path to directory containing model weights and config file"
28 | )
29 | parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
30 | parser.add_argument("--question_maxlength", type=int, default=512, help="Maximum number of tokens in a question")
31 | parser.add_argument(
32 | "--indexing_batch_size", type=int, default=1000000, help="Batch size of the number of passages indexed"
33 | )
34 | parser.add_argument("--projection_size", type=int, default=768)
35 | parser.add_argument(
36 | "--n_subquantizers",
37 | type=int,
38 | default=0,
39 | help="Number of subquantizer used for vector quantization, if 0 flat index is used",
40 | )
41 | parser.add_argument("--n_bits", type=int, default=8, help="Number of bits per subquantizer")
42 | parser.add_argument("--lang", nargs="+")
43 | parser.add_argument("--dataset", type=str, default="none")
44 | parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
45 | parser.add_argument("--normalize_text", action="store_true", help="normalize text")
46 | parsed_args = parser.parse_args()
47 | return parsed_args
48 |
49 | c_args = parse_retriever_arguments()
--------------------------------------------------------------------------------
/retrieval_contriever/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import os
4 | import argparse
5 | import torch
6 |
7 | import transformers
8 | from src.normalize_text import normalize
9 |
10 |
11 | def save(tensor, split_path):
12 | if not os.path.exists(os.path.dirname(split_path)):
13 | os.makedirs(os.path.dirname(split_path))
14 | with open(split_path, 'wb') as fout:
15 | torch.save(tensor, fout)
16 |
17 | def apply_tokenizer(path, tokenizer, normalize_text=False):
18 | alltokens = []
19 | lines = []
20 | with open(path, "r", encoding="utf-8") as fin:
21 | for k, line in enumerate(fin):
22 | if normalize_text:
23 | line = normalize(line)
24 |
25 | lines.append(line)
26 | if len(lines) > 1000000:
27 | tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids']
28 | tokens = [torch.tensor(x, dtype=torch.int) for x in tokens]
29 | alltokens.extend(tokens)
30 | lines = []
31 |
32 | tokens = tokenizer.batch_encode_plus(lines, add_special_tokens=False)['input_ids']
33 | tokens = [torch.tensor(x, dtype=torch.int) for x in tokens]
34 | alltokens.extend(tokens)
35 |
36 | alltokens = torch.cat(alltokens)
37 | return alltokens
38 |
39 | def tokenize_file(args):
40 | filename = os.path.basename(args.datapath)
41 | savepath = os.path.join(args.outdir, f"{filename}.pkl")
42 | if os.path.exists(savepath):
43 | if args.overwrite:
44 | print(f"File {savepath} already exists, overwriting")
45 | else:
46 | print(f"File {savepath} already exists, exiting")
47 | return
48 | try:
49 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=True)
50 | except Exception:
51 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer, local_files_only=False)
52 | print(f"Encoding {args.datapath}...")
53 | tokens = apply_tokenizer(args.datapath, tokenizer, normalize_text=args.normalize_text)
54 |
55 | print(f"Saving at {savepath}...")
56 | save(tokens, savepath)
57 |
58 |
59 | if __name__ == '__main__':
60 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
61 | parser.add_argument("--datapath", type=str)
62 | parser.add_argument("--outdir", type=str)
63 | parser.add_argument("--tokenizer", type=str)
64 | parser.add_argument("--overwrite", action="store_true")
65 | parser.add_argument("--normalize_text", action="store_true")
66 |
67 | args, _ = parser.parse_known_args()
68 | tokenize_file(args)
69 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/vllm/apil_chient.py:
--------------------------------------------------------------------------------
1 | """Example Python client for vllm.entrypoints.api_server"""
2 |
3 | import argparse
4 | import json
5 | from typing import Iterable, List
6 |
7 | import requests
8 |
9 |
10 | def clear_line(n: int = 1) -> None:
11 | LINE_UP = '\033[1A'
12 | LINE_CLEAR = '\x1b[2K'
13 | for _ in range(n):
14 | print(LINE_UP, end=LINE_CLEAR, flush=True)
15 |
16 |
17 | def post_http_request(prompt: str,
18 | api_url: str,
19 | n: int = 1,
20 | stream: bool = False) -> requests.Response:
21 | headers = {'User-Agent': 'Test Client'}
22 | pload = {
23 | 'prompt': prompt,
24 | 'n': n,
25 | 'use_beam_search': True,
26 | 'temperature': 0.0,
27 | 'max_tokens': 16,
28 | 'stream': stream,
29 | }
30 | response = requests.post(api_url, headers=headers, json=pload, stream=True)
31 | return response
32 |
33 |
34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
35 | for chunk in response.iter_lines(chunk_size=8192,
36 | decode_unicode=False,
37 | delimiter=b'\0'):
38 | if chunk:
39 | data = json.loads(chunk.decode('utf-8'))
40 | output = data['text']
41 | yield output
42 |
43 |
44 | def get_response(response: requests.Response) -> List[str]:
45 | data = json.loads(response.content)
46 | output = data['text']
47 | return output
48 |
49 |
50 | if __name__ == '__main__':
51 | parser = argparse.ArgumentParser()
52 | parser.add_argument('--host', type=str, default='localhost')
53 | parser.add_argument('--port', type=int, default=8000)
54 | parser.add_argument('--n', type=int, default=4)
55 | parser.add_argument('--prompt', type=str, default='San Francisco is a')
56 | parser.add_argument('--stream', action='store_true')
57 | args = parser.parse_args()
58 | prompt = args.prompt
59 | api_url = f'http://{args.host}:{args.port}/generate'
60 | n = args.n
61 | stream = args.stream
62 |
63 | print(f'Prompt: {prompt!r}\n', flush=True)
64 | response = post_http_request(prompt, api_url, n, stream)
65 |
66 | if stream:
67 | num_printed_lines = 0
68 | for h in get_streaming_response(response):
69 | clear_line(num_printed_lines)
70 | num_printed_lines = 0
71 | for i, line in enumerate(h):
72 | num_printed_lines += 1
73 | print(f'Beam candidate {i}: {line!r}', flush=True)
74 | else:
75 | output = get_response(response)
76 | for i, line in enumerate(output):
77 | print(f'Beam candidate {i}: {line!r}', flush=True)
78 |
--------------------------------------------------------------------------------
/source/model/llama2/scripts/clean_data.sh:
--------------------------------------------------------------------------------
1 | # sharegpt
2 | python clean_sharegpt.py \
3 | --in-file /userhome/jianzhnie/prompt_data/anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json \
4 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_clean.json
5 |
6 | python split_long_conversation.py \
7 | --in-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_clean.json \
8 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_split.json \
9 | --model-name-or-path /userhome/jianzhnie/checkpoints/llama7b
10 |
11 | python clean_evol_instruct.py \
12 | --in-file /userhome/jianzhnie/prompt_data/WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json \
13 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/evol_instruct_clean.json
14 |
15 | python merge.py \
16 | --in-file /userhome/jianzhnie/prompt_data/sharegpt/sharegpt_split.json /userhome/jianzhnie/prompt_data/sharegpt/evol_instruct_clean.json \
17 | --out-file /userhome/jianzhnie/prompt_data/sharegpt/evol_sharegpt_merge.json
18 |
19 | # chinese data
20 | python chatllms/data/utils/convert_alpaca.py \
21 | --in-file ./prompt_data/chinese_data/alpaca_data_zh_51k.json \
22 | --out-file ./prompt_data/chinese_data/alpaca_vicuna.json
23 |
24 | python chatllms/data/utils/convert_alpaca.py \
25 | --in-file ./prompt_data/InstructionWild/instinwild_ch.json \
26 | --out-file ./prompt_data/chinese_data/instinwild_ch_vicuna.json
27 |
28 | python chatllms/data/utils/convert_alpaca.py \
29 | --in-file ./prompt_data/InstructionWild/instinwild_en.json \
30 | --out-file ./prompt_data/chinese_data/instinwild_en_vicuna.json
31 |
32 | python chatllms/data/utils/convert_alpaca.py \
33 | --in-file ./prompt_data/databricks-dolly-15k/databricks-dolly-15k.jsonl \
34 | --out-file ./prompt_data/chinese_data/dolly-15k_vicuna.json
35 |
36 | python merge.py \
37 | --in-file /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/alpaca_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/dolly-15k_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/instinwild_ch_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/instinwild_en_vicuna.json /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/olcc.json \
38 | --out-file /userhome/jianzhnie/llm/Chinese-Guanaco/prompt_data/chinese_data/vicuna_merge.json
39 |
40 |
41 | # belle-group
42 | python chatllms/data/utils/convert_alpaca.py \
43 | --in-file ./prompt_data/belle_group/generated_chat_0.4M/generated_chat_0.4M.json \
44 | --out-file ./prompt_data/belle_group/generated_chat_vicuna.json
45 |
46 |
47 | python chatllms/data/utils/convert_alpaca.py \
48 | --in-file ./prompt_data/belle_group/school_math_0.25M/school_math_0.25M.json \
49 | --out-file ./prompt_data/belle_group/school_math_vicuna.json
50 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/test_convdataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | import numpy as np
5 |
6 | sys.path.append('../')
7 | from typing import Any, Dict
8 |
9 | import transformers
10 |
11 | from chatllms.data.conv_dataset import ConversationDataset, UltraChatDataset
12 | from chatllms.data.data_utils import (DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN,
13 | DEFAULT_PAD_TOKEN, DEFAULT_UNK_TOKEN)
14 |
15 | if __name__ == '__main__':
16 | # Load the raw data from the specified data_path
17 | data_path = '/home/robin/work_dir/llm/FastChat/data/dummy_conversation.json'
18 | with open(data_path, 'r') as file:
19 | raw_data = json.load(file)
20 |
21 | model_name_or_path = '/home/robin/checkpoints/baichuan7b'
22 | model_name_or_path = 'facebook/opt-125m'
23 | sources = [example['conversations'] for example in raw_data]
24 | tokenizer = transformers.AutoTokenizer.from_pretrained(
25 | model_name_or_path,
26 | model_max_length=64,
27 | padding_side='right',
28 | use_fast=False,
29 | add_special_tokens=False,
30 | tokenizer_type='llama' if 'llama' in model_name_or_path else 'gpt2',
31 | )
32 | # Define a dictionary to store any missing special tokens along with their default values
33 | special_tokens_dict: Dict[str, Any] = {}
34 |
35 | # Check if each special token is present. If not, add it to the special_tokens_dict with its default value.
36 | if tokenizer.pad_token is None:
37 | special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
38 | if tokenizer.eos_token is None:
39 | special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
40 | if tokenizer.bos_token is None:
41 | special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
42 | if tokenizer.unk_token is None:
43 | special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN
44 |
45 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
46 |
47 | print(tokenizer.bos_token)
48 | # # Apply the conversation function to the raw data
49 | dataset = ConversationDataset(sources, tokenizer, 64)
50 |
51 | for idx, data in enumerate(dataset):
52 | print('==' * 10)
53 | input_ids = data['input_ids']
54 | input_txt = tokenizer.decode(input_ids)
55 | print(input_txt)
56 | targets = data['labels']
57 | input_ids = np.array(input_ids)
58 | target_text = tokenizer.decode(targets)
59 | print(target_text)
60 | if idx > 10:
61 | break
62 |
63 | dataset = UltraChatDataset(sources, tokenizer, 128)
64 | for idx, data in enumerate(dataset):
65 | input_ids = data['input_ids']
66 | labels = data['labels']
67 | input_txt = tokenizer.decode(input_ids)
68 | target_text = tokenizer.decode(labels)
69 | print(input_txt)
70 | print(target_text)
71 | if idx > 10:
72 | break
73 |
--------------------------------------------------------------------------------
/source/model/llama2/server/multi_chat.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from threading import Thread
3 |
4 | import torch
5 | import transformers
6 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
7 | TextIteratorStreamer)
8 |
9 | sys.path.append('../')
10 | from chatllms.configs import GenerationArguments, ModelInferenceArguments
11 | from chatllms.utils.model_utils import get_logits_processor
12 |
13 |
14 | def main(model_server_args, generation_args):
15 | """
16 | Multi-turn chat demo: the full conversation history is concatenated into the prompt on every turn (no structured memory management).
17 | """
18 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
19 | model = AutoModelForCausalLM.from_pretrained(
20 | model_server_args.model_name_or_path,
21 | cache_dir=model_server_args.cache_dir,
22 | trust_remote_code=True,
23 | low_cpu_mem_usage=True,
24 | torch_dtype=torch.float16,
25 | device_map='auto').to(device).eval()
26 | tokenizer = AutoTokenizer.from_pretrained(
27 | model_server_args.model_name_or_path,
28 | trust_remote_code=True,
29 | use_fast=False,
30 | )
31 | # Keep the full conversation history
32 | historys = tokenizer.bos_token
33 | print('User: ', end='', flush=True)
34 | user_input = input('')
35 | while True:
36 | user_input = '{}'.format(user_input).strip()
37 | historys = historys + user_input
38 | inputs = tokenizer(historys,
39 | return_tensors='pt',
40 | add_special_tokens=False)
41 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
42 |
43 | # Create a TextIteratorStreamer object to stream the response from the model
44 | streamer = TextIteratorStreamer(tokenizer,
45 | timeout=60.0,
46 | skip_prompt=True,
47 | skip_special_tokens=True)
48 |
49 | # Set the arguments for the model's generate() method
50 | gen_kwargs = dict(
51 | inputs,
52 | streamer=streamer,
53 | logits_processor=get_logits_processor(),
54 | **generation_args.to_dict(),
55 | )
56 |
57 | # Start a separate thread to generate the response asynchronously
58 | thread = Thread(target=model.generate, kwargs=gen_kwargs)
59 | thread.start()
60 |
61 | # Print the model name and the response as it is generated
62 | print('Assistant: ', end='', flush=True)
63 | response = ''
64 | for new_text in streamer:
65 | print(new_text, end='', flush=True)
66 | response += new_text
67 |
68 | historys = historys + response
69 | print('\n')
70 | print('User: ', end='', flush=True)
71 | user_input = input('')
72 |
73 |
74 | if __name__ == '__main__':
75 | parser = transformers.HfArgumentParser(
76 | (ModelInferenceArguments, GenerationArguments))
77 | model_server_args, generation_args = parser.parse_args_into_dataclasses()
78 | main(model_server_args, generation_args)
79 |
--------------------------------------------------------------------------------
/source/model/llama2/examples/format_data/convert_oasst1.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import random
5 |
6 |
7 | def json_dump(obj, path):
8 | with open(path, 'w', encoding='utf-8') as f:
9 | json.dump(obj, f, indent=2, ensure_ascii=False)
10 |
11 |
12 | def json_load(in_file):
13 | with open(in_file, 'r') as f:
14 | json_data = json.load(f)
15 | return json_data
16 |
17 |
18 | def convert_oasst1_data(data_dir, output_dir):
19 | '''
20 | For OASST1, because it's in a tree structure, where every user input might get multiple replies,
21 | we have to save every path from the root node to the assistant reply (including both leaf nodes and intermediate nodes).
22 | This results in some of the messages being duplicated among different paths (instances).
23 | Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path.
24 | '''
25 | conversations = []
26 | with open(os.path.join(data_dir, '2023-04-12_oasst_ready.trees.jsonl'),
27 | 'r') as fin:
28 | for line in fin:
29 | conversations.append(json.loads(line))
30 |
31 | output_path = os.path.join(output_dir, 'oasst1_data.jsonl')
32 |
33 | # traverse the conversation tree and collect all valid sequences
34 | def dfs(reply, messages, valid_sequences):
35 | if reply['role'] == 'assistant':
36 | messages.append({'role': 'assistant', 'content': reply['text']})
37 | valid_sequences.append(messages[:])
38 | for child in reply['replies']:
39 | dfs(child, messages, valid_sequences)
40 | messages.pop()
41 | elif reply['role'] == 'prompter':
42 | messages.append({'role': 'user', 'content': reply['text']})
43 | for child in reply['replies']:
44 | dfs(child, messages, valid_sequences)
45 | messages.pop()
46 | else:
47 | raise ValueError(f"Unknown role: {reply['role']}")
48 |
49 | with open(output_path, 'w') as fout:
50 | example_cnt = 0
51 | for _, conversation in enumerate(conversations):
52 | valid_sequences = []
53 | dfs(conversation['prompt'], [], valid_sequences)
54 | for sequence in valid_sequences:
55 | fout.write(
56 | json.dumps({
57 | 'dataset': 'oasst1',
58 | 'id': f'oasst1_{example_cnt}',
59 | 'messages': sequence
60 | }) + '\n')
61 | example_cnt += 1
62 |
63 |
64 | if __name__ == '__main__':
65 | arg_parser = argparse.ArgumentParser()
66 | arg_parser.add_argument('--raw_data_dir',
67 | type=str,
68 | default='data/downloads')
69 | arg_parser.add_argument('--output_dir', type=str, default='data/processed')
70 | arg_parser.add_argument('--seed', type=int, default=42)
71 | args = arg_parser.parse_args()
72 | random.seed(args.seed)
73 |
74 | convert_oasst1_data(data_dir=args.raw_data_dir, output_dir=args.output_dir)
75 |
--------------------------------------------------------------------------------
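
The traversal described in the convert_oasst1_data docstring above is the key step: every root-to-assistant path in the reply tree becomes one training sequence. Since the inner dfs is not importable, here is a standalone re-statement of the same logic on a hypothetical toy tree:

    def collect_paths(reply, messages, valid_sequences):
        role = 'assistant' if reply['role'] == 'assistant' else 'user'
        messages.append({'role': role, 'content': reply['text']})
        if reply['role'] == 'assistant':
            valid_sequences.append(messages[:])   # record one root-to-assistant path
        for child in reply.get('replies', []):
            collect_paths(child, messages, valid_sequences)
        messages.pop()

    toy_tree = {
        'role': 'prompter', 'text': 'Hi!',
        'replies': [
            {'role': 'assistant', 'text': 'Hello.', 'replies': []},
            {'role': 'assistant', 'text': 'Hey there.', 'replies': []},
        ],
    }
    sequences = []
    collect_paths(toy_tree, [], sequences)
    print(len(sequences))  # 2 sequences, one per assistant reply
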
/source/model/llama2/chatllms/configs/train_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 | from transformers import TrainingArguments
5 |
6 |
7 | @dataclass
8 | class TrainingArguments(TrainingArguments):
9 | # Cache directory.
10 | cache_dir: Optional[str] = field(default=None)
11 | # Full finetuning without adapters (i.e. without LoRA or QLoRA).
12 | full_finetune: bool = field(
13 | default=False,
14 | metadata={'help': 'Finetune the entire model without adapters.'})
15 | # Whether to run training (almost always True).
16 | do_train: bool = field(
17 | default=True,
18 | metadata={'help': 'To train or not to train, that is the question?'})
19 | # Whether to run evaluation.
20 | do_eval: bool = field(
21 | default=False,
22 | metadata={'help': 'Whether to run evaluation on the eval dataset.'})
23 | # Whether to run the MMLU evaluation.
24 | do_mmlu_eval: Optional[bool] = field(
25 | default=False,
26 | metadata={'help': 'Whether to run the MMLU evaluation.'})
27 | # Default MMLU dataset: `mmlu-zs` for zero-shot or `mmlu-fs` for few-shot.
28 | mmlu_dataset: Optional[str] = field(
29 | default='mmlu-fs',
30 | metadata={
31 | 'help':
32 | 'MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot.'
33 | })
34 | # Default MMLU split: `eval` for evaluation or `test` for testing.
35 | mmlu_split: Optional[str] = field(
36 | default='eval', metadata={'help': 'The MMLU split to run on'})
37 | # Maximum number of MMLU samples to evaluate on.
38 | max_mmlu_samples: Optional[int] = field(
39 | default=None,
40 | metadata={
41 | 'help':
42 | 'If set, only evaluates on `max_mmlu_samples` of the MMLU dataset.'
43 | })
44 | # Maximum length of the MMLU source text (see the code for whether this counts characters or tokens).
45 | mmlu_source_max_len: int = field(
46 | default=2048,
47 | metadata={'help': 'Maximum source sequence length for mmlu.'})
48 | # Whether to do sample generation during evaluation.
49 | sample_generate: bool = field(
50 | default=False,
51 | metadata={'help': 'If do sample generation on evaluation.'})
52 | # NVIDIA paged optimizer: lets training continue through occasional OOM spikes.
53 | optim: str = field(default='paged_adamw_32bit',
54 | metadata={'help': 'The optimizer to be used'})
55 | # Gradient clipping max norm.
56 | max_grad_norm: float = field(
57 | default=0.3,
58 | metadata={
59 | 'help':
60 | 'Gradient clipping max norm. This is tuned and works well for all models tested.'
61 | })
62 | # Gradient checkpointing: set to True to reduce GPU memory usage.
63 | # With tight GPU memory this should be True, at the cost of longer training time.
64 | gradient_checkpointing: bool = field(
65 | default=True,
66 | metadata={'help': 'Use gradient checkpointing. You want to use this.'})
67 | predict_with_generate: bool = field(
68 | default=False,
69 | metadata={
70 | 'help':
71 | 'Whether to use generate() to compute generative metrics during evaluation.'
72 | })
73 | model_max_length: int = field(
74 | default=1024,
75 | metadata={
76 | 'help':
77 | 'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
78 | },
79 | )
80 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/data/utils/convert_alpaca.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert alpaca dataset into sharegpt format.
3 |
4 | Usage: python3 -m chatllms.data.utils.convert_alpaca --in-file alpaca_data.json --out-file alpaca_vicuna.json
5 | """
6 |
7 | import argparse
8 | import json
9 | from typing import Any, Dict, List
10 |
11 | from datasets import load_dataset
12 |
13 |
14 | def json_dump(obj, path):
15 | with open(path, 'w', encoding='utf-8') as f:
16 | json.dump(obj, f, indent=2, ensure_ascii=False)
17 |
18 |
19 | def json_load(in_file):
20 | with open(in_file, 'r') as f:
21 | json_data = json.load(f)
22 | return json_data
23 |
24 |
25 | def valid_keys(keys):
26 | for k in ['instruction', 'input', 'output']:
27 | if k not in keys:
28 | return False
29 | return True
30 |
31 |
32 | def convert_alpaca_vicuna(raw_data: List[Dict[str, Any]]):
33 | collect_data = []
34 | for i, content in enumerate(raw_data):
35 | if not valid_keys(content.keys()):
36 | continue
37 |
38 | if len(content['input'].strip()) > 1:
39 | q, a = content['instruction'] + '\nInput:\n' + content[
40 | 'input'], content['output']
41 | else:
42 | q, a = content['instruction'], content['output']
43 |
44 | collect_data.append({
45 | 'id':
46 | f'alpaca_{i}',
47 | 'conversations': [
48 | {
49 | 'from': 'human',
50 | 'value': q
51 | },
52 | {
53 | 'from': 'gpt',
54 | 'value': a
55 | },
56 | ],
57 | })
58 | print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}')
59 | return collect_data
60 |
61 |
62 | def convert_dolly_vicuna(raw_data: List[Dict[str, Any]]):
63 | collect_data = []
64 | for i, content in enumerate(raw_data):
65 | if len(content['context'].strip()) > 1:
66 | q, a = content['instruction'] + '\nInput:\n' + content[
67 | 'context'], content['response']
68 | else:
69 | q, a = content['instruction'], content['response']
70 |
71 | collect_data.append({
72 | 'id':
73 | f'alpaca_{i}',
74 | 'conversations': [
75 | {
76 | 'from': 'human',
77 | 'value': q
78 | },
79 | {
80 | 'from': 'gpt',
81 | 'value': a
82 | },
83 | ],
84 | })
85 | print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}')
86 | return collect_data
87 |
88 |
89 | def main():
90 | parser = argparse.ArgumentParser()
91 | parser.add_argument('--in-file', type=str)
92 | parser.add_argument('--out-file', type=str)
93 | args = parser.parse_args()
94 |
95 | raw_data = load_dataset('json', data_files=args.in_file)['train']
96 | new_data = convert_alpaca_vicuna(raw_data)
97 |
98 | # new_data = convert_dolly_vicuna(raw_data)
99 | # new_data = convert_alpaca_vicuna(raw_data)
100 | json_dump(new_data, args.out_file)
101 |
102 |
103 | if __name__ == '__main__':
104 | main()
105 |
--------------------------------------------------------------------------------
/retrieval_contriever/src/index.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import os
8 | import pickle
9 | from typing import List, Tuple
10 |
11 | import faiss
12 | import numpy as np
13 | from tqdm import tqdm
14 |
15 | class Indexer(object):
16 |
17 | def __init__(self, vector_sz, n_subquantizers=0, n_bits=8):
18 | if n_subquantizers > 0:
19 | self.index = faiss.IndexPQ(vector_sz, n_subquantizers, n_bits, faiss.METRIC_INNER_PRODUCT)
20 | else:
21 | self.index = faiss.IndexFlatIP(vector_sz)
22 | #self.index_id_to_db_id = np.empty((0), dtype=np.int64)
23 | self.index_id_to_db_id = []
24 |
25 | def index_data(self, ids, embeddings):
26 | self._update_id_mapping(ids)
27 | embeddings = embeddings.astype('float32')
28 | if not self.index.is_trained:
29 | self.index.train(embeddings)
30 | self.index.add(embeddings)
31 |
32 | print(f'Total data indexed {len(self.index_id_to_db_id)}')
33 |
34 | def search_knn(self, query_vectors: np.array, top_docs: int, index_batch_size: int = 2048) -> List[Tuple[List[object], List[float]]]:
35 | query_vectors = query_vectors.astype('float32')
36 | result = []
37 | nbatch = (len(query_vectors)-1) // index_batch_size + 1
38 | for k in tqdm(range(nbatch)):
39 | start_idx = k*index_batch_size
40 | end_idx = min((k+1)*index_batch_size, len(query_vectors))
41 | q = query_vectors[start_idx: end_idx]
42 | scores, indexes = self.index.search(q, top_docs)
43 | # convert to external ids
44 | db_ids = [[str(self.index_id_to_db_id[i]) for i in query_top_idxs] for query_top_idxs in indexes]
45 | result.extend([(db_ids[i], scores[i]) for i in range(len(db_ids))])
46 | return result
47 |
48 | def serialize(self, dir_path):
49 | index_file = os.path.join(dir_path, 'index.faiss')
50 | meta_file = os.path.join(dir_path, 'index_meta.faiss')
51 | print(f'Serializing index to {index_file}, meta data to {meta_file}')
52 |
53 | faiss.write_index(self.index, index_file)
54 | with open(meta_file, mode='wb') as f:
55 | pickle.dump(self.index_id_to_db_id, f)
56 |
57 | def deserialize_from(self, dir_path):
58 | index_file = os.path.join(dir_path, 'index.faiss')
59 | meta_file = os.path.join(dir_path, 'index_meta.faiss')
60 | print(f'Loading index from {index_file}, meta data from {meta_file}')
61 |
62 | self.index = faiss.read_index(index_file)
63 | print(f'Loaded index of type {type(self.index)} and size {self.index.ntotal}')
64 |
65 | with open(meta_file, "rb") as reader:
66 | self.index_id_to_db_id = pickle.load(reader)
67 | assert len(
68 | self.index_id_to_db_id) == self.index.ntotal, 'Deserialized index_id_to_db_id should match faiss index size'
69 |
70 | def _update_id_mapping(self, db_ids: List):
71 | #new_ids = np.array(db_ids, dtype=np.int64)
72 | #self.index_id_to_db_id = np.concatenate((self.index_id_to_db_id, new_ids), axis=0)
73 | self.index_id_to_db_id.extend(db_ids)
--------------------------------------------------------------------------------
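
A self-contained sketch of the Indexer API above using random toy embeddings (the import path follows the repo's retrieval_contriever.src layout and is an assumption about how the package sits on PYTHONPATH):

    import numpy as np
    from retrieval_contriever.src.index import Indexer

    index = Indexer(vector_sz=768)                      # flat inner-product index
    ids = [f'doc{i}' for i in range(1000)]
    embeddings = np.random.rand(1000, 768).astype('float32')
    index.index_data(ids, embeddings)

    queries = np.random.rand(2, 768).astype('float32')
    for db_ids, scores in index.search_knn(queries, top_docs=5):
        print(db_ids, scores)
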
/retrieval_contriever/src/inbatch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.nn as nn
5 | import numpy as np
6 | import math
7 | import random
8 | import transformers
9 | import logging
10 | import torch.distributed as dist
11 |
12 | from retrieval_contriever.src import contriever, dist_utils, utils
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | class InBatch(nn.Module):
18 | def __init__(self, opt, retriever=None, tokenizer=None):
19 | super(InBatch, self).__init__()
20 |
21 | self.opt = opt
22 | self.norm_doc = opt.norm_doc
23 | self.norm_query = opt.norm_query
24 | self.label_smoothing = opt.label_smoothing
25 | if retriever is None or tokenizer is None:
26 | retriever, tokenizer = self._load_retriever(
27 | opt.retriever_model_id, pooling=opt.pooling, random_init=opt.random_init
28 | )
29 | self.tokenizer = tokenizer
30 | self.encoder = retriever
31 |
32 | def _load_retriever(self, model_id, pooling, random_init):
33 | cfg = utils.load_hf(transformers.AutoConfig, model_id)
34 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_id)
35 |
36 | if "xlm" in model_id:
37 | model_class = contriever.XLMRetriever
38 | else:
39 | model_class = contriever.Contriever
40 |
41 | if random_init:
42 | retriever = model_class(cfg)
43 | else:
44 | retriever = utils.load_hf(model_class, model_id)
45 |
46 | if "bert-" in model_id:
47 | if tokenizer.bos_token_id is None:
48 | tokenizer.bos_token = "[CLS]"
49 | if tokenizer.eos_token_id is None:
50 | tokenizer.eos_token = "[SEP]"
51 |
52 | retriever.config.pooling = pooling
53 |
54 | return retriever, tokenizer
55 |
56 | def get_encoder(self):
57 | return self.encoder
58 |
59 | def forward(self, q_tokens, q_mask, k_tokens, k_mask, stats_prefix="", iter_stats={}, **kwargs):
60 |
61 | bsz = len(q_tokens)
62 | labels = torch.arange(0, bsz, dtype=torch.long, device=q_tokens.device)
63 |
64 | qemb = self.encoder(input_ids=q_tokens, attention_mask=q_mask, normalize=self.norm_query)
65 | kemb = self.encoder(input_ids=k_tokens, attention_mask=k_mask, normalize=self.norm_doc)
66 |
67 | gather_fn = dist_utils.gather
68 |
69 | gather_kemb = gather_fn(kemb)
70 |
71 | labels = labels + dist_utils.get_rank() * len(kemb)
72 |
73 | scores = torch.einsum("id, jd->ij", qemb / self.opt.temperature, gather_kemb)
74 |
75 | loss = torch.nn.functional.cross_entropy(scores, labels, label_smoothing=self.label_smoothing)
76 |
77 | # log stats
78 | if len(stats_prefix) > 0:
79 | stats_prefix = stats_prefix + "/"
80 | iter_stats[f"{stats_prefix}loss"] = (loss.item(), bsz)
81 |
82 | predicted_idx = torch.argmax(scores, dim=-1)
83 | accuracy = 100 * (predicted_idx == labels).float().mean()
84 | stdq = torch.std(qemb, dim=0).mean().item()
85 | stdk = torch.std(kemb, dim=0).mean().item()
86 | iter_stats[f"{stats_prefix}accuracy"] = (accuracy, bsz)
87 | iter_stats[f"{stats_prefix}stdq"] = (stdq, bsz)
88 | iter_stats[f"{stats_prefix}stdk"] = (stdk, bsz)
89 |
90 | return loss, iter_stats
91 |
--------------------------------------------------------------------------------
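
The forward pass above implements an in-batch contrastive (InfoNCE-style) objective: each query is scored against every document embedding in the gathered batch, and the matching document is the positive class for cross-entropy. A standalone single-process sketch of the same computation with toy embeddings (no distributed gather; normalization is optional in the real module):

    import torch
    import torch.nn.functional as F

    temperature = 0.05
    qemb = F.normalize(torch.randn(4, 768), dim=-1)  # toy query embeddings
    kemb = F.normalize(torch.randn(4, 768), dim=-1)  # toy document embeddings

    scores = torch.einsum('id,jd->ij', qemb / temperature, kemb)  # [4, 4] similarities
    labels = torch.arange(scores.size(0))                         # positives on the diagonal
    loss = F.cross_entropy(scores, labels)
    accuracy = (scores.argmax(dim=-1) == labels).float().mean()
    print(loss.item(), accuracy.item())
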
/source/model/llama2/examples/format_data/convert_vicuna.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | from datasets import load_dataset
5 |
6 | sys.path.append('../../')
7 |
8 | from chatllms.data.data_utils import extract_default_prompt_dataset
9 |
10 |
11 | def json_dump(obj, path):
12 | with open(path, 'w', encoding='utf-8') as f:
13 | json.dump(obj, f, indent=2, ensure_ascii=False)
14 |
15 |
16 | def json_load(in_file):
17 | with open(in_file, 'r') as f:
18 | json_data = json.load(f)
19 | return json_data
20 |
21 |
22 | def valid_keys(keys):
23 | for k in ['input', 'output']:
24 | if k not in keys:
25 | return False
26 | return True
27 |
28 |
29 | def remove_unused_columns(dataset):
30 | """Remove columns not named 'input' or 'output'."""
31 | dataset = dataset.remove_columns([
32 | col for col in dataset.column_names if col not in ['input', 'output']
33 | ])
34 | return dataset
35 |
36 |
37 | def convert_alpaca_vicuna(in_file: str, out_file: str = None):
38 | raw_dataset = load_dataset('json', data_files=in_file)['train']
39 | raw_dataset = raw_dataset.map(extract_default_prompt_dataset)
40 |
41 | collect_data = []
42 | for i, content in enumerate(raw_dataset):
43 | prompt = content['input']
44 | response = content['output']
45 |
46 | collect_data.append({
47 | 'id':
48 | f'alpaca_{i}',
49 | 'conversations': [
50 | {
51 | 'from': 'human',
52 | 'value': prompt
53 | },
54 | {
55 | 'from': 'gpt',
56 | 'value': response
57 | },
58 | ],
59 | })
60 | print(f'Original: {len(raw_dataset)}, Converted: {len(collect_data)}')
61 | json_dump(collect_data, out_file)
62 | return collect_data
63 |
64 |
65 | if __name__ == '__main__':
66 | in_file = '/home/robin/prompt_data/100PoisonMpts/train_alpaca.json'
67 | out_file = '/home/robin/prompt_data/100PoisonMpts/train_vicuna.json'
68 | collect_data = convert_alpaca_vicuna(in_file, out_file)
69 |
70 | data_path = '/home/robin/prompt_data/CValues-Comparison/test_alpaca.json'
71 | out_path = '/home/robin/prompt_data/CValues-Comparison/test_vicuna.json'
72 | convert_alpaca_vicuna(data_path, out_file=out_path)
73 |
74 | data_path = '/home/robin/prompt_data/CValues-Comparison/train_alpaca.json'
75 | out_path = '/home/robin/prompt_data/CValues-Comparison/train_vicuna.json'
76 | convert_alpaca_vicuna(data_path, out_file=out_path)
77 |
78 | data_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json'
79 | out_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_vicnua.json'
80 | convert_alpaca_vicuna(data_path, out_file=out_path)
81 |
82 | data_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json'
83 | out_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json'
84 | convert_alpaca_vicuna(data_path, out_file=out_path)
85 |
86 | data_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json'
87 | out_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json'
88 | convert_alpaca_vicuna(data_path, out_file=out_path)
89 |
90 | data_path = '/home/robin/prompt_data/COIG/train_alpaca.json'
91 | out_path = '/home/robin/prompt_data/COIG/train_vicuna.json'
92 | convert_alpaca_vicuna(data_path, out_file=out_path)
93 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/configs/gen_args.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass, field
2 | from typing import Any, Dict, Optional
3 |
4 |
5 | @dataclass
6 | class GenerationArguments:
7 | """
8 | Arguments pertaining to specify the model generation parameters.
9 | """
10 | # generation parameters
11 | # Whether to use the KV cache during generation.
12 | use_cache: Optional[bool] = field(default=True)
13 | # Length arguments
14 | # Maximum number of newly generated tokens.
15 | max_new_tokens: Optional[int] = field(
16 | default=1024,
17 | metadata={
18 | 'help':
19 | 'Maximum number of new tokens to be generated in evaluation or prediction loops'
20 | 'if predict_with_generate is set.'
21 | })
22 | # Minimum number of newly generated tokens.
23 | min_new_tokens: Optional[int] = field(
24 | default=0,
25 | metadata={'help': 'Minimum number of new tokens to generate.'})
26 | # Maximum total number of tokens; overridden by max_new_tokens.
27 | max_length: Optional[int] = field(
28 | default=None,
29 | metadata={
30 | 'help':
31 | 'The maximum length the generated tokens can have. It can be overridden by max_new_tokens.'
32 | })
33 | # Generation strategy
34 | # Whether to sample; otherwise greedy decoding is used.
35 | do_sample: Optional[bool] = field(
36 | default=True,
37 | metadata={
38 | 'help':
39 | 'Whether or not to use sampling, use greedy decoding otherwise.'
40 | })
41 | # Number of beams for beam search.
42 | num_beams: Optional[int] = field(
43 | default=1,
44 | metadata={
45 | 'help': 'Number of beams for beam search. 1 means no beam search.'
46 | })
47 | # Number of beam groups for group beam search.
48 | num_beam_groups: Optional[int] = field(default=1)
49 | # Degeneration penalty (alpha) used by contrastive search.
50 | penalty_alpha: Optional[float] = field(default=None)
51 | # Hyperparameters for logit manipulation
52 | # Softmax temperature used to modulate the next-token distribution.
53 | temperature: Optional[float] = field(
54 | default=1.0,
55 | metadata={
56 | 'help': 'The value used to modulate the next token probabilities.'
57 | })
58 | # Top-k sampling: keep only the k highest-probability tokens.
59 | top_k: Optional[int] = field(
60 | default=50,
61 | metadata={
62 | 'help':
63 | 'The number of highest probability vocabulary tokens to keep for top-k filtering.'
64 | })
65 | # Nucleus (top-p) sampling: sample from the smallest set of tokens whose cumulative probability reaches top_p.
66 | top_p: Optional[float] = field(
67 | default=1.0,
68 | metadata={
69 | 'help':
70 | 'The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept.'
71 | })
72 | # Typical-p sampling parameter.
73 | typical_p: Optional[float] = field(default=1.0)
74 | # Diversity penalty (for group beam search).
75 | diversity_penalty: Optional[float] = field(default=0.0)
76 | # Repetition penalty factor.
77 | repetition_penalty: Optional[float] = field(
78 | default=1.0,
79 | metadata={
80 | 'help':
81 | 'The parameter for repetition penalty. 1.0 means no penalty.'
82 | })
83 | # Length penalty factor for beam-based generation.
84 | length_penalty: Optional[float] = field(
85 | default=1.0,
86 | metadata={
87 | 'help':
88 | 'Exponential penalty to the length that is used with beam-based generation.'
89 | })
90 | # Size of n-grams that must not repeat.
91 | # Usually left unset since sampling gives enough diversity; set it to 2 if the output repeats a lot.
92 | no_repeat_ngram_size: Optional[int] = field(default=0)
93 |
94 | def to_dict(self) -> Dict[str, Any]:
95 | args = asdict(self)
96 | if args.get('max_new_tokens', None):
97 | args.pop('max_length', None)
98 | return args
99 |
--------------------------------------------------------------------------------
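
GenerationArguments.to_dict() above drops max_length whenever max_new_tokens is set, so the two length controls never conflict when the dict is splatted into model.generate(). A tiny check (the import mirrors how multi_chat.py pulls in these configs):

    from chatllms.configs import GenerationArguments

    gen_args = GenerationArguments(max_new_tokens=256, max_length=2048)
    gen_kwargs = gen_args.to_dict()
    assert 'max_length' not in gen_kwargs and gen_kwargs['max_new_tokens'] == 256
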
/source/model/llama2/chatllms/utils/apply_lora.py:
--------------------------------------------------------------------------------
1 | """
2 | Apply the LoRA weights on top of a base model.
3 |
4 | Usage:
5 | python3 apply_lora.py --base_model_path ~/model_weights/llama-7b --target_model_path ~/model_weights/baize-7b \
6 | --lora_path project-baize/baize-lora-7B
7 |
8 | Dependency:
9 | pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b
10 | """
11 | import argparse
12 | from typing import Tuple
13 |
14 | import torch
15 | from peft import PeftModel
16 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
17 |
18 |
19 | def apply_lora(
20 | base_model_path: str,
21 | lora_model_path: str,
22 | target_model_path: str = None,
23 | cache_dir: str = None,
24 | use_auth_token: bool = True,
25 | trust_remote_code: bool = True,
26 | ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
27 | """Applies the LoRA adapter to a base model and saves the resulting target model (optional).
28 |
29 | Args:
30 | base_model_path (str): The path to the base model to which the LoRA adapter will be applied.
31 | lora_model_path (str): The path to the LoRA adapter.
32 | target_model_path (str): The path where the merged model will be saved (if provided).
33 | cache_dir (str): The path to the cache directory.
34 | use_auth_token (bool): Whether to use an authentication token when downloading the model.
35 | trust_remote_code (bool): Whether to trust remote code when downloading the model.
36 |
37 | Returns:
38 | Tuple[AutoModelForCausalLM, AutoTokenizer]: A tuple containing the target model and its tokenizer.
39 |
40 | """
41 | # Load the base model and tokenizer
42 | print(f'Loading the base model from {base_model_path}')
43 | # Set configuration kwargs for tokenizer.
44 | config_kwargs = {
45 | 'cache_dir': cache_dir,
46 | 'use_auth_token': use_auth_token,
47 | 'trust_remote_code': trust_remote_code,
48 | }
49 |
50 | base_model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
51 | base_model_path,
52 | device_map='auto',
53 | torch_dtype=torch.float16,
54 | low_cpu_mem_usage=True,
55 | **config_kwargs,
56 | )
57 |
58 | # Load the tokenizer
59 | print(f'Loading the tokenizer from {base_model_path}')
60 | # use_fast=False is required for compatibility with Transformers' LlamaTokenizer.
61 | tokenizer = AutoTokenizer.from_pretrained(
62 | base_model_path,
63 | use_fast=False,
64 | **config_kwargs,
65 | )
66 |
67 | # Load the LoRA adapter
68 | print(f'Loading the LoRA adapter from {lora_model_path}')
69 | model: PreTrainedModel = PeftModel.from_pretrained(base_model,
70 | lora_model_path)
71 | print('Applying the LoRA to base model')
72 | model = model.merge_and_unload()
73 |
74 | if target_model_path is not None:
75 | print(f'Saving the target model to {target_model_path}')
76 | model.save_pretrained(target_model_path)
77 | tokenizer.save_pretrained(target_model_path)
78 |
79 | return model, tokenizer
80 |
81 |
82 | if __name__ == '__main__':
83 | parser = argparse.ArgumentParser()
84 | parser.add_argument('--base-model-path', type=str, required=True)
85 | parser.add_argument('--target-model-path', type=str, default=None)
86 | parser.add_argument('--lora-model-path', type=str, required=True)
87 | args = parser.parse_args()
88 |
89 | apply_lora(base_model_path=args.base_model_path,
90 | lora_model_path=args.lora_model_path,
91 | target_model_path=args.target_model_path)
92 |
--------------------------------------------------------------------------------
/retrieval_contriever/src/dist_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.distributed as dist
5 |
6 |
7 | class Gather(torch.autograd.Function):
8 | @staticmethod
9 | def forward(ctx, x: torch.tensor):
10 | output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
11 | dist.all_gather(output, x)
12 | return tuple(output)
13 |
14 | @staticmethod
15 | def backward(ctx, *grads):
16 | all_gradients = torch.stack(grads)
17 | dist.all_reduce(all_gradients)
18 | return all_gradients[dist.get_rank()]
19 |
20 |
21 | def gather(x: torch.Tensor):
22 | if not dist.is_initialized():
23 | return x
24 | x_gather = Gather.apply(x)
25 | x_gather = torch.cat(x_gather, dim=0)
26 | return x_gather
27 |
28 |
29 | @torch.no_grad()
30 | def gather_nograd(x: torch.Tensor):
31 | if not dist.is_initialized():
32 | return x
33 | x_gather = [torch.ones_like(x) for _ in range(dist.get_world_size())]
34 | dist.all_gather(x_gather, x, async_op=False)
35 |
36 | x_gather = torch.cat(x_gather, dim=0)
37 | return x_gather
38 |
39 |
40 | @torch.no_grad()
41 | def varsize_gather_nograd(x: torch.Tensor):
42 | """gather tensors of different sizes along the first dimension"""
43 | if not dist.is_initialized():
44 | return x
45 |
46 | # determine max size
47 | size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
48 | allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
49 | dist.all_gather(allsizes, size)
50 | max_size = max([size.cpu().max() for size in allsizes])
51 |
52 | padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device)
53 | padded[: x.shape[0]] = x
54 | output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())]
55 | dist.all_gather(output, padded)
56 |
57 | output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)]
58 | output = torch.cat(output, dim=0)
59 |
60 | return output
61 |
62 |
63 | @torch.no_grad()
64 | def get_varsize(x: torch.Tensor):
65 | """gather tensors of different sizes along the first dimension"""
66 | if not dist.is_initialized():
67 | return [x.shape[0]]
68 |
69 | # determine max size
70 | size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
71 | allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
72 | dist.all_gather(allsizes, size)
73 | allsizes = torch.cat(allsizes)
74 | return allsizes
75 |
76 |
77 | def get_rank():
78 | if not dist.is_available():
79 | return 0
80 | if not dist.is_initialized():
81 | return 0
82 | return dist.get_rank()
83 |
84 |
85 | def is_main():
86 | return get_rank() == 0
87 |
88 |
89 | def get_world_size():
90 | if not dist.is_initialized():
91 | return 1
92 | else:
93 | return dist.get_world_size()
94 |
95 |
96 | def barrier():
97 | if dist.is_initialized():
98 | dist.barrier()
99 |
100 |
101 | def average_main(x):
102 | if not dist.is_initialized():
103 | return x
104 | if dist.is_initialized() and dist.get_world_size() > 1:
105 | dist.reduce(x, 0, op=dist.ReduceOp.SUM)
106 | if is_main():
107 | x = x / dist.get_world_size()
108 | return x
109 |
110 |
111 | def sum_main(x):
112 | if not dist.is_initialized():
113 | return x
114 | if dist.is_initialized() and dist.get_world_size() > 1:
115 | dist.reduce(x, 0, op=dist.ReduceOp.SUM)
116 | return x
117 |
118 |
119 | def weighted_average(x, count):
120 | if not dist.is_initialized():
121 | if isinstance(x, torch.Tensor):
122 | x = x.item()
123 | return x, count
124 | t_loss = torch.tensor([x * count]).cuda()
125 | t_total = torch.tensor([count]).cuda()
126 | t_loss = sum_main(t_loss)
127 | t_total = sum_main(t_total)
128 | return (t_loss / t_total).item(), t_total.item()
129 |
--------------------------------------------------------------------------------
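A small sketch of how the differentiable `gather` above is typically used to widen the in-batch negative pool under DDP. The call is a no-op in a single process; the distributed setup itself is assumed to be handled elsewhere (e.g. by `src/slurm.py`).

    import torch
    from src import dist_utils

    def gather_embeddings(local_emb: torch.Tensor) -> torch.Tensor:
        # Without torch.distributed initialized this returns local_emb unchanged.
        # Under DDP it concatenates embeddings from every rank while keeping
        # gradients flowing back to the local shard.
        return dist_utils.gather(local_emb)

    if __name__ == '__main__':
        emb = torch.randn(8, 768, requires_grad=True)
        print(gather_embeddings(emb).shape)  # (8 * world_size, 768); (8, 768) single-process
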
/source/model/llama2/chatllms/model/sample_generate_callback.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from dataclasses import dataclass
3 | from typing import Any, Dict
4 |
5 | from transformers import PreTrainedTokenizer, TrainerCallback
6 |
7 |
8 | @dataclass
9 | class SampleGenerateCallback(TrainerCallback):
10 | """
11 | A callback that generates text samples from a pre-trained language model during training.
12 |
13 | Args:
14 | tokenizer (PreTrainedTokenizer): The tokenizer used to preprocess inputs.
15 |         generation_config: The generation settings passed to model.generate(); logger: the logger used to record the generated samples.
16 | """
17 | def __init__(self, tokenizer: PreTrainedTokenizer,
18 | generation_config: argparse.Namespace, logger: None):
19 | self.tokenizer = tokenizer
20 | self.generation_config = generation_config
21 | self.logger = logger
22 |
23 | # Define input prompts to generate text from
24 | self.sample_inputs = [
25 | '用一句话描述地球为什么是独一无二的。',
26 | '中国是否应该推出刺激政策救楼市?',
27 | '如何更好地融入新工作圈子',
28 | '帮我把这段文字转换成鲁迅作品里的语气:昨天上午,算几个数学问题时越算越难受,有想要撕掉草稿纸的冲动思维也变得缓慢,见字忘意,感觉大脑里是一团浆糊,阻力很大。'
29 | '我怀疑自己抑郁又犯了,站起身离开了书桌。走出大门,开始跑步,运动,希望能借此缓解。我不想再吃药,我担心不吃药是否能恢复。稍微运动后,大吃了一顿,路上不停的对自己说,我可以.',
30 | '回来后,感觉似乎确实好一些。',
31 | '给我写一篇大模型的新闻稿',
32 | '你觉得人类哪些工作岗位会被AI替代?',
33 | '请帮我写一封中式婚礼请帖,用于邀请亲朋好友参加我的婚礼!',
34 | '帮我写一篇八百字以上的作文,主题是:当代青年面对时代的挑战如何肩负起民族复兴的伟大任务',
35 | '请仿照李荣浩的风格写一首表现爱情的歌曲,以“辣椒酱”为题。',
36 | '秦王朝时期十大将军是?其主要功绩是什么?',
37 | '帮我写一段广告,关于房产销售的,我们的房子首付低,赠送面积大,还免两年物业费!',
38 | '请帮我设计一个时长为3天的北京旅游行程,行程的内容不要太紧凑,使用地铁作为交通工具,并前往前门、天安门、天坛公园、鸟巢游览,同时预留一天的时间游玩环球影城。',
39 | '一个笼子里面有若干只鸡和兔子,总共有50只脚和18个头,求鸡和兔子各有多少只?',
40 | '生成一篇短篇小说,故事情节为一个年轻人在旅途中遇到了一位神秘的老人,老人告诉他一个令人意想不到的秘密,最终年轻人的生活因此发生了翻天覆地的变化。',
41 | '导师想要我论文的一作,我应该怎么办?',
42 | '我现在很无聊,可以讲点有趣的事情吗?',
43 | '一项工程,甲、乙两队合作20天完成,乙丙两队合作60天完成,丙丁两队合作30完成,甲丁合作多少天完成?',
44 | '如果一位孕妇走上了公交车,但是车上没有空位了。请模拟一位热心乘客给孕妇让座的对话。',
45 | '桃花潭水深千尺,不及汪伦送我情。体现的是怎样的心情?',
46 | '编写一个简单的自动化脚本,用于批量操作文件或目录。脚本功能可以自由选择,如复制、压缩、重命名、删除等。脚本语言可使用Python、Shell、Perl等,代码长度不少于100行。',
47 | '音乐可以洗涤人的灵魂吗?',
48 | ]
49 |
50 | def on_evaluate(self, args: Any, state: Dict[str, Any], control: Any,
51 | **kwargs: Any) -> None:
52 | """
53 | Generates text samples from the language model during evaluation.
54 |
55 | Args:
56 | args (Any): Trainer arguments, not used in this method.
57 | state (Dict[str, Any]): Trainer state dictionary, not used in this method.
58 | control (Any): Trainer control object, not used in this method.
59 | kwargs (Dict[str, Any]): Keyword arguments passed to the method, including the pre-trained
60 | language model (under the key 'model') and any additional parameters needed for generation.
61 |
62 | Returns:
63 | None
64 | """
65 | logger = self.logger
66 | logger.info('Generating sample text during evaluation...')
67 |
68 | # Check if the pre-trained language model is available
69 | if 'model' in kwargs:
70 | model = kwargs['model']
71 |
72 | # Generate text for each input prompt
73 | for instruction in self.sample_inputs:
74 | # Preprocess input prompt and convert to tensor
75 | inputs = f'{instruction}\n\n### Response: '
76 | inputs = self.tokenizer(inputs, return_tensors='pt')
77 | inputs = inputs.to(model.device)
78 |
79 | # Generate text from input prompt
80 | generation_output = model.generate(
81 | **inputs,
82 | generation_config=self.generation_config,
83 | )
84 |
85 | # Decode generated text and log it
86 | generated_text = self.tokenizer.decode(generation_output[0])
87 | logger.info(f'Input prompt: {instruction}')
88 | logger.info(f'Generated text: {generated_text}')
89 |
90 | else:
91 | logger.info(
92 | 'Pre-trained language model not found in kwargs, skipping.')
93 |
--------------------------------------------------------------------------------
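A sketch of attaching the callback above to a `transformers.Trainer`, mirroring the wiring in `train_qlora.py` further below; the trainer and logger are assumed to be built elsewhere.

    import logging
    from transformers import GenerationConfig, Trainer
    from chatllms.model import SampleGenerateCallback

    def add_sample_generation(trainer: Trainer, logger: logging.Logger) -> None:
        # Log a handful of generations on every evaluation pass.
        trainer.add_callback(
            SampleGenerateCallback(
                tokenizer=trainer.tokenizer,
                generation_config=GenerationConfig(max_new_tokens=256, do_sample=True),
                logger=logger,
            ))
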
/source/model/llama2/chatllms/model/compute_metrics.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Dict, List, Tuple, Union
3 |
4 | import jieba
5 | import numpy as np
6 | from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
7 | from rouge_chinese import Rouge
8 | from transformers import PreTrainedTokenizer
9 |
10 |
11 | @dataclass
12 | class ComputeMetrics:
13 | """
14 | Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
15 | Borrowed from: https://github.com/THUDM/ChatGLM-6B/blob/0c2806fea82683349194e21996dd6b3acc3c265b/ptuning/main.py#L307
16 |
17 | """
18 | def __init__(self, tokenizer: PreTrainedTokenizer) -> None:
19 | """
20 | Initialize the ComputeMetrics class with a pre-trained tokenizer object.
21 |
22 | Args:
23 | tokenizer (PreTrainedTokenizer): A pre-trained tokenizer object to be used for decoding tokenized sequences.
24 | """
25 | self.tokenizer = tokenizer
26 |
27 | def __call__(
28 | self, eval_preds: List[Union[np.ndarray, Tuple[np.ndarray]]]
29 | ) -> Dict[str, float]:
30 | """
31 | Computes evaluation metrics for model predictions.
32 |
33 | Args:
34 |             eval_preds: A tuple of (predictions, labels) arrays produced by the Trainer during evaluation.
35 |
36 | Returns:
37 | Dict[str, float]: A dictionary containing the average of each computed metric over all prediction-label pairs.
38 | """
39 |
40 | # Extract predictions and labels from input
41 | preds, labels = eval_preds
42 | if isinstance(preds, tuple):
43 | preds = preds[0]
44 |
45 |         # Replace IGNORE_INDEX (-100) in preds/labels with pad_token_id, since -100 cannot be decoded.
46 |         preds = np.where(preds != -100, preds,
47 |                          self.tokenizer.pad_token_id)
48 |         labels = np.where(labels != -100, labels,
49 |                           self.tokenizer.pad_token_id)
50 |
51 | score_dict = {
52 |             'rouge-1': [],  # the numeral one
53 |             'rouge-2': [],
54 |             'rouge-l': [],  # the letter 'l'
55 | 'bleu-4': []
56 | }
57 |
58 | # Calculate metrics for each prediction-label pair
59 | for pred, label in zip(preds, labels):
60 | pred = pred[(pred == self.tokenizer.bos_token_id
61 | ).nonzero()[0][0]:] # remove the query
62 | hypothesis = list(
63 | jieba.cut(self.tokenizer.decode(pred,
64 | skip_special_tokens=True)))
65 | reference = list(
66 | jieba.cut(
67 | self.tokenizer.decode(label, skip_special_tokens=True)))
68 |
69 | # If there are no words in the hypothesis, set all scores to 0
70 | if len(' '.join(hypothesis).split()) == 0:
71 | result = {
72 | 'rouge-1': {
73 | 'f': 0.0
74 | },
75 | 'rouge-2': {
76 | 'f': 0.0
77 | },
78 | 'rouge-l': {
79 | 'f': 0.0
80 | }
81 | }
82 | else:
83 | rouge = Rouge()
84 | scores = rouge.get_scores(' '.join(hypothesis),
85 | ' '.join(reference))
86 | result = scores[0]
87 |
88 | # Append scores to score_dict
89 | for k, v in result.items():
90 | score_dict[k].append(round(v['f'] * 100, 4))
91 |
92 | # Calculate BLEU-4 score and append it to score_dict
93 | bleu_score = sentence_bleu(
94 | [list(label)],
95 | list(pred),
96 | smoothing_function=SmoothingFunction().method3)
97 | score_dict['bleu-4'].append(round(bleu_score * 100, 4))
98 |
99 | # Calculate average of each metric over all prediction-label pairs and return as a dictionary
100 | return {k: float(np.mean(v)) for k, v in score_dict.items()}
101 |
--------------------------------------------------------------------------------
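A sketch of plugging `ComputeMetrics` into a `Seq2SeqTrainer`; the model, tokenizer and datasets are assumed to be prepared elsewhere, and `predict_with_generate=True` is needed so the metrics receive generated token ids rather than logits.

    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
    from chatllms.model.compute_metrics import ComputeMetrics

    def build_eval_trainer(model, tokenizer, train_dataset, eval_dataset,
                           output_dir='./outputs'):
        args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            predict_with_generate=True,  # decode generated ids for ROUGE/BLEU
            per_device_eval_batch_size=4,
        )
        return Seq2SeqTrainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=ComputeMetrics(tokenizer),  # ROUGE-1/2/L and BLEU-4
        )
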
/retrieval_contriever/src/slurm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from logging import getLogger
8 | import os
9 | import sys
10 | import torch
11 | import socket
12 | import signal
13 | import subprocess
14 |
15 |
16 | logger = getLogger()
17 |
18 | def sig_handler(signum, frame):
19 | logger.warning("Signal handler called with signal " + str(signum))
20 | prod_id = int(os.environ['SLURM_PROCID'])
21 | logger.warning("Host: %s - Global rank: %i" % (socket.gethostname(), prod_id))
22 | if prod_id == 0:
23 | logger.warning("Requeuing job " + os.environ['SLURM_JOB_ID'])
24 | os.system('scontrol requeue ' + os.environ['SLURM_JOB_ID'])
25 | else:
26 | logger.warning("Not the main process, no need to requeue.")
27 | sys.exit(-1)
28 |
29 |
30 | def term_handler(signum, frame):
31 | logger.warning("Signal handler called with signal " + str(signum))
32 | logger.warning("Bypassing SIGTERM.")
33 |
34 |
35 | def init_signal_handler():
36 | """
37 | Handle signals sent by SLURM for time limit / pre-emption.
38 | """
39 | signal.signal(signal.SIGUSR1, sig_handler)
40 | signal.signal(signal.SIGTERM, term_handler)
41 |
42 |
43 | def init_distributed_mode(params):
44 | """
45 | Handle single and multi-GPU / multi-node / SLURM jobs.
46 | Initialize the following variables:
47 | - local_rank
48 | - global_rank
49 | - world_size
50 | """
51 |     is_slurm_job = 'SLURM_JOB_ID' in os.environ and 'WORLD_SIZE' not in os.environ
52 | has_local_rank = hasattr(params, 'local_rank')
53 |
54 | # SLURM job without torch.distributed.launch
55 | if is_slurm_job and has_local_rank:
56 |
57 | assert params.local_rank == -1 # on the cluster, this is handled by SLURM
58 |
59 | # local rank on the current node / global rank
60 | params.local_rank = int(os.environ['SLURM_LOCALID'])
61 | params.global_rank = int(os.environ['SLURM_PROCID'])
62 | params.world_size = int(os.environ['SLURM_NTASKS'])
63 |
64 | # define master address and master port
65 | hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', os.environ['SLURM_JOB_NODELIST']])
66 | params.main_addr = hostnames.split()[0].decode('utf-8')
67 | assert 10001 <= params.main_port <= 20000 or params.world_size == 1
68 |
69 | # set environment variables for 'env://'
70 | os.environ['MASTER_ADDR'] = params.main_addr
71 | os.environ['MASTER_PORT'] = str(params.main_port)
72 | os.environ['WORLD_SIZE'] = str(params.world_size)
73 | os.environ['RANK'] = str(params.global_rank)
74 | is_distributed = True
75 |
76 |
77 | # multi-GPU job (local or multi-node) - jobs started with torch.distributed.launch
78 | elif has_local_rank and params.local_rank != -1:
79 |
80 | assert params.main_port == -1
81 |
82 | # read environment variables
83 | params.global_rank = int(os.environ['RANK'])
84 | params.world_size = int(os.environ['WORLD_SIZE'])
85 |
86 | is_distributed = True
87 |
88 | # local job (single GPU)
89 | else:
90 | params.local_rank = 0
91 | params.global_rank = 0
92 | params.world_size = 1
93 | is_distributed = False
94 |
95 | # set GPU device
96 | torch.cuda.set_device(params.local_rank)
97 |
98 | # initialize multi-GPU
99 | if is_distributed:
100 |
101 | # http://pytorch.apachecn.org/en/0.3.0/distributed.html#environment-variable-initialization
102 | # 'env://' will read these environment variables:
103 | # MASTER_PORT - required; has to be a free port on machine with rank 0
104 | # MASTER_ADDR - required (except for rank 0); address of rank 0 node
105 | # WORLD_SIZE - required; can be set either here, or in a call to init function
106 | # RANK - required; can be set either here, or in a call to init function
107 |
108 | #print("Initializing PyTorch distributed ...")
109 | torch.distributed.init_process_group(
110 | init_method='env://',
111 | backend='nccl',
112 | #world_size=params.world_size,
113 | #rank=params.global_rank,
114 | )
--------------------------------------------------------------------------------
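A sketch of the argparse fields `init_distributed_mode` expects; the defaults shown correspond to the single-GPU path, and a CUDA device is assumed to be available.

    import argparse
    from src import slurm

    def parse_args() -> argparse.Namespace:
        parser = argparse.ArgumentParser()
        # -1 means "not launched by torch.distributed.launch"; SLURM fills ranks itself.
        parser.add_argument('--local_rank', type=int, default=-1)
        # Only read on the SLURM path; must fall in [10001, 20000] for multi-task jobs.
        parser.add_argument('--main_port', type=int, default=-1)
        return parser.parse_args()

    if __name__ == '__main__':
        params = parse_args()
        slurm.init_distributed_mode(params)  # sets local_rank / global_rank / world_size
        print(params.global_rank, params.world_size)
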
/source/model/llama2/train_qlora.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 |
5 | import torch
6 | import transformers
7 | from transformers import GenerationConfig, Trainer, set_seed
8 |
9 | from chatllms.configs import (DataArguments, GenerationArguments,
10 | LoraArguments, ModelArguments, QuantArguments,
11 | TrainingArguments)
12 | from chatllms.data import make_supervised_data_module
13 | from chatllms.model import (MMLUEvalCallback, SampleGenerateCallback,
14 | SavePeftModelCallback, load_model_tokenizer)
15 | from chatllms.train.training import train_and_evaluate
16 | from chatllms.utils.logger_utils import get_root_logger
17 | from chatllms.utils.model_utils import (check_training_finished,
18 | print_trainable_parameters,
19 | verify_dtypes)
20 |
21 | torch.backends.cuda.matmul.allow_tf32 = True
22 |
23 |
24 | def main():
25 | parser = transformers.HfArgumentParser(
26 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments,
27 | QuantArguments, GenerationArguments))
28 | (model_args, data_args, training_args, lora_args, quant_args,
29 | generation_args) = parser.parse_args_into_dataclasses()
30 | # Check arguments (do not check finetuning_args since it may be loaded from checkpoints)
31 | data_args.init_for_training()
32 | training_args.generation_config = GenerationConfig(**vars(generation_args))
33 |
34 | args = argparse.Namespace(**vars(model_args), **vars(data_args),
35 | **vars(training_args), **vars(lora_args),
36 | **vars(quant_args))
37 | # init the logger before other steps
38 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
39 | if not os.path.exists(args.output_dir):
40 | os.makedirs(args.output_dir)
41 | log_file = os.path.join(args.output_dir, f'{timestamp}.log')
42 | logger = get_root_logger(log_file=log_file, log_level='INFO')
43 |
44 | # Log on each process the small summary:
45 | logger.info(
46 | f'Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}'
47 | +
48 |         f', distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}'
49 | )
50 | logger.info('Training/evaluation parameters %s', args)
51 | # Check if training was already completed.
52 | checkpoint_dir, completed_training = check_training_finished(args, logger)
53 | args.resume_checkpoint = checkpoint_dir
54 |
55 | # load model and tokenizer
56 | model, tokenizer = load_model_tokenizer(
57 | args=args,
58 | checkpoint_dir=checkpoint_dir,
59 | is_trainable=args.do_train,
60 | logger=logger,
61 | )
62 | logger.info('Loaded model...')
63 |
64 | logger.info('Printing trainable parameters...')
65 | print_trainable_parameters(args, model)
66 |
67 | set_seed(args.seed)
68 |
69 | # Verify dtypes
70 | logger.info('Verifying dtypes...')
71 | verify_dtypes(model)
72 |
73 | data_module = make_supervised_data_module(tokenizer=tokenizer, args=args)
74 | trainer = Trainer(model=model,
75 | tokenizer=tokenizer,
76 | args=training_args,
77 | **data_module)
78 | # Add callback to save adapter model.
79 | if not args.full_finetune:
80 | trainer.add_callback(SavePeftModelCallback)
81 |
82 | # Add callback to generate samples.
83 | if args.sample_generate:
84 | trainer.add_callback(
85 | SampleGenerateCallback(
86 | tokenizer=tokenizer,
87 | generation_config=GenerationConfig(**vars(generation_args)),
88 | logger=logger,
89 | ))
90 |
91 | if args.do_mmlu_eval:
92 | eval_callback = MMLUEvalCallback(
93 | trainer=trainer,
94 | tokenizer=tokenizer,
95 | data_dir='./data',
96 | args=args,
97 | )
98 | trainer.add_callback(eval_callback)
99 |
100 | assert args.do_train or args.do_eval or args.do_predict
101 | if args.do_train or args.do_eval:
102 | train_and_evaluate(trainer, args, logger)
103 |
104 |
105 | if __name__ == '__main__':
106 | main()
107 |
--------------------------------------------------------------------------------
/source/model/llama2/chatllms/model/save_peft_model_callback.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any, Dict
3 |
4 | from transformers import (PreTrainedModel, TrainerCallback, TrainerControl,
5 |                           TrainerState, TrainingArguments)
6 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
7 |
8 |
9 | class SavePeftModelCallback(TrainerCallback):
10 | """
11 | Callback to save PEFT model checkpoints during training.
12 |
13 | Saves both the full model and the adapter model to separate directories
14 | within the checkpoint directory.
15 | """
16 |     def save_model(self, args: TrainingArguments, state: TrainerState,
17 |                    kwargs: Dict[str, Any]) -> None:
18 |         """
19 |         Saves the PEFT (adapter) model checkpoint.
20 | 
21 |         Args:
22 |             args (TrainingArguments): The training arguments passed to the Trainer.
23 |             state (TrainerState): The current state of training.
24 |             kwargs (Dict[str, Any]): Additional keyword arguments, including the
25 |                 model being trained (under the key 'model').
26 | 
27 |         Note: any full `pytorch_model.bin` in the checkpoint folder is removed so only adapter weights remain.
28 |         """
29 | print('+' * 20, 'Saving PEFT Model Checkpoint CallBack', '+' * 20)
30 |
31 | # Get the checkpoint directory for saving models.
32 | if state.best_model_checkpoint is not None:
33 | # If best model checkpoint exists, use its directory as the checkpoint folder
34 | checkpoint_dir = os.path.join(state.best_model_checkpoint,
35 | 'adapter_model')
36 | else:
37 | # Otherwise, create a new checkpoint folder using the output directory and current global step
38 | checkpoint_dir = os.path.join(
39 | args.output_dir,
40 | f'{PREFIX_CHECKPOINT_DIR}-{state.global_step}')
41 |
42 | # Create path for the PEFT model
43 | peft_model_path = os.path.join(checkpoint_dir, 'adapter_model')
44 | model: PreTrainedModel = kwargs['model']
45 | model.save_pretrained(peft_model_path)
46 |
47 | # Create path for the PyTorch model binary file and remove it if it already exists
48 | pytorch_model_path = os.path.join(checkpoint_dir, 'pytorch_model.bin')
49 | if os.path.exists(pytorch_model_path):
50 | os.remove(pytorch_model_path)
51 |
52 |     def on_save(self, args: TrainingArguments, state: TrainerState,
53 |                 control: TrainerControl,
54 |                 **kwargs: Dict[str, Any]) -> TrainerControl:
55 |         """
56 |         Callback method that calls save_model() and returns the `control` argument.
57 | 
58 |         Args:
59 |             args (TrainingArguments): The training arguments passed to the Trainer.
60 |             state (TrainerState): The current state of training.
61 |             control (TrainerControl): \
62 |                 The current state of the TrainerCallback's control flow.
63 |             kwargs (Dict[str, Any]): A dictionary of additional keyword arguments.
64 | 
65 |         Returns:
66 |             TrainerControl: The current state of the TrainerCallback's control flow.
67 | 
68 |         Note:
69 |             The adapter checkpoint is written on every save event.
70 |         """
71 | self.save_model(args, state, kwargs)
72 | return control
73 |
74 |     def on_train_end(self, args: TrainingArguments, state: TrainerState,
75 |                      control: TrainerControl, **kwargs: Dict[str,
76 |                                                               Any]) -> None:
77 |         """
78 |         Callback method that marks training as finished by creating a 'completed'
79 |         file in the output directory.
80 | 
81 |         Args:
82 |             args (TrainingArguments): The training arguments passed to the Trainer.
83 |             state (TrainerState): The current state of training.
84 |             control (TrainerControl): \
85 |                 The current state of the TrainerCallback's control flow.
86 |             kwargs (Dict[str, Any]): A dictionary of additional keyword arguments.
87 | 
88 |         The marker file lets later runs detect that this job already finished.
89 |         """
90 |
91 | # Define a helper function to create a 'completed' file in the output directory
92 | def touch(fname, times=None):
93 | with open(fname, 'a'):
94 | os.utime(fname, times)
95 |
96 | # Create the 'completed' file in the output directory
97 | touch(os.path.join(args.output_dir, 'completed'))
98 |
--------------------------------------------------------------------------------
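A sketch of registering the callback above for adapter-only runs, mirroring the `full_finetune` check in `train_qlora.py`.

    from transformers import Trainer
    from chatllms.model import SavePeftModelCallback

    def maybe_add_peft_saving(trainer: Trainer, full_finetune: bool) -> None:
        # Only adapter (PEFT) runs need the extra callback; full fine-tuning
        # already saves complete checkpoints through the Trainer itself.
        if not full_finetune:
            trainer.add_callback(SavePeftModelCallback)
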
/source/model/llama2/server/single_chat.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | import sys
4 | from threading import Thread
5 | from typing import List
6 |
7 | import torch
8 | import transformers
9 | from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel,
10 | PreTrainedTokenizer, TextIteratorStreamer)
11 |
12 | sys.path.append('../')
13 | from chatllms.configs import GenerationArguments, ModelInferenceArguments
14 | from chatllms.utils.model_utils import get_logits_processor
15 |
16 |
17 | def generate_response(query: str, tokenizer: PreTrainedTokenizer,
18 | model: PreTrainedModel,
19 |                       generation_args: dict) -> str:
20 |     """
21 |     Generates a response to the given query with the loaded causal language model and prints it to the console.
22 | 
23 |     Args:
24 |         query (str): The input query for which a response is to be generated.
25 |         tokenizer (PreTrainedTokenizer): The tokenizer used to convert the raw text into input tokens.
26 |         model (PreTrainedModel): The pre-trained causal language model used to generate the response.
27 |         generation_args (dict): A dictionary containing the arguments to be passed to the generate() method of the model.
28 | 
29 |     Returns:
30 |         str: The generated response text.
31 |     """
32 |
33 | # Convert the query and history into input IDs
34 | inputs = tokenizer(query, return_tensors='pt', add_special_tokens=False)
35 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
36 |
37 | # Create a TextIteratorStreamer object to stream the response from the model
38 | streamer = TextIteratorStreamer(tokenizer,
39 | timeout=60.0,
40 | skip_prompt=True,
41 | skip_special_tokens=True)
42 |
43 | # Set the arguments for the model's generate() method
44 | gen_kwargs = dict(
45 | **inputs,
46 | streamer=streamer,
47 | logits_processor=get_logits_processor(),
48 | **generation_args.to_dict(),
49 | )
50 |
51 | # Start a separate thread to generate the response asynchronously
52 | thread = Thread(target=model.generate, kwargs=gen_kwargs)
53 | thread.start()
54 |
55 | # Print the model name and the response as it is generated
56 | print('Assistant: ', end='', flush=True)
57 | response = ''
58 | for new_text in streamer:
59 | print(new_text, end='', flush=True)
60 | response += new_text
61 | # Update the history with the current query and response and return it
62 | return response
63 |
64 |
65 | def main():
66 | """
67 |     Single-turn chat: the model keeps no memory of previous conversation turns.
68 |     Run the conversational agent loop with console input/output.
69 | 
70 |     Command-line arguments (parsed via HfArgumentParser):
71 |         ModelInferenceArguments: Arguments for loading the model.
72 |         GenerationArguments: Arguments for model.generate().
73 | 
74 | Returns:
75 | None
76 | """
77 |
78 | # Parse command-line arguments
79 | parser = transformers.HfArgumentParser(
80 | (ModelInferenceArguments, GenerationArguments))
81 | model_server_args, generation_args = parser.parse_args_into_dataclasses()
82 |
83 | # Load the pretrained language model.
84 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
85 |
86 | model = AutoModelForCausalLM.from_pretrained(
87 | model_server_args.model_name_or_path,
88 | trust_remote_code=True,
89 | low_cpu_mem_usage=True,
90 | torch_dtype=torch.float16,
91 |         device_map='auto').eval()  # device_map='auto' already places the weights; avoid .to() on a dispatched model
92 |
93 | tokenizer = AutoTokenizer.from_pretrained(
94 | model_server_args.model_name_or_path,
95 | trust_remote_code=True,
96 | use_fast=False,
97 | )
98 |
99 | os_name = platform.system()
100 | clear_command = 'cls' if os_name == 'Windows' else 'clear'
101 | # Set the arguments for the model's generate() method
102 | print('欢迎使用 CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序')
103 | input_pattern = '{}'
104 | while True:
105 | query = input('\nUser: ')
106 | if query.strip() == 'stop':
107 | break
108 |
109 | if query.strip() == 'clear':
110 | os.system(clear_command)
111 | print('History has been removed.')
112 | print('欢迎使用CLI 对话系统,输入内容即可对话,clear 清空对话历史,stop 终止程序')
113 | continue
114 |
115 | query = input_pattern.format(query)
116 | # Perform prediction and printing
117 | generate_response(query, tokenizer, model, generation_args)
118 |
119 |
120 | if __name__ == '__main__':
121 | main()
122 |
--------------------------------------------------------------------------------
/source/model/llama2/server/gradio_base_webserver.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import gradio as gr
4 | import torch
5 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
6 |
7 | from chatllms.utils.apply_lora import apply_lora
8 |
9 |
10 | def args_parser():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--model_name_or_path',
13 | default=None,
14 | type=str,
15 | required=True,
16 | help='Path to pre-trained model')
17 | parser.add_argument('--lora_model_name_or_path',
18 | default=None,
19 | type=str,
20 |                         help='Path to the LoRA adapter weights')
21 | parser.add_argument('--no_cuda',
22 | action='store_true',
23 | help='Avoid using CUDA when available')
24 | parser.add_argument('--load_8bit',
25 | action='store_true',
26 |                         help='Whether to load the model in 8-bit precision')
27 | args = parser.parse_args()
28 |
29 | args.device = torch.device(
30 | 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
31 | return args
32 |
33 |
34 | def main(args):
35 | if args.lora_model_name_or_path is not None:
36 |         # apply_lora() does not take a load_8bit argument; it merges the adapter in fp16.
37 |         model, tokenizer = apply_lora(args.model_name_or_path,
38 |                                       args.lora_model_name_or_path)
39 | else:
40 | tokenizer = AutoTokenizer.from_pretrained(
41 | pretrained_model_name_or_path=args.model_name_or_path,
42 | trust_remote_code=True)
43 | model = AutoModelForCausalLM.from_pretrained(
44 | pretrained_model_name_or_path=args.model_name_or_path,
45 | load_in_8bit=args.load_8bit,
46 | torch_dtype=torch.float16,
47 | device_map='auto',
48 | trust_remote_code=True)
49 |
50 | def evaluate(
51 | input=None,
52 | temperature=0.8,
53 | top_p=0.75,
54 | top_k=40,
55 | max_new_tokens=128,
56 | **kwargs,
57 | ):
58 | inputs = tokenizer(input, return_tensors='pt')
59 | inputs = inputs.to(args.device)
60 | generation_config = GenerationConfig(
61 | temperature=temperature,
62 | top_p=top_p,
63 | top_k=top_k,
64 | do_sample=True,
65 | no_repeat_ngram_size=6,
66 | repetition_penalty=1.8,
67 | **kwargs,
68 | )
69 | # Without streaming
70 | with torch.no_grad():
71 | generation_output = model.generate(
72 | **inputs,
73 | generation_config=generation_config,
74 | return_dict_in_generate=True,
75 | output_scores=True,
76 | max_new_tokens=max_new_tokens,
77 | )
78 | s = generation_output.sequences[0]
79 | output = tokenizer.decode(s, skip_special_tokens=True)
80 | yield output
81 |
82 |     description = 'Baichuan7B is a 7B-parameter LLaMA-style model finetuned to follow instructions.'
83 | server = gr.Interface(
84 | fn=evaluate,
85 | inputs=[
86 | gr.components.Textbox(lines=2, label='Input', placeholder='none'),
87 | gr.components.Slider(minimum=0,
88 | maximum=1,
89 | value=0.1,
90 | label='Temperature'),
91 | gr.components.Slider(minimum=0,
92 | maximum=1,
93 | value=0.75,
94 | label='Top p'),
95 | gr.components.Slider(minimum=0,
96 | maximum=100,
97 | step=1,
98 | value=40,
99 | label='Top k'),
100 | gr.components.Slider(minimum=1,
101 | maximum=2000,
102 | step=1,
103 | value=128,
104 | label='Max tokens'),
105 | ],
106 |         outputs=[gr.components.Textbox(
107 | lines=5,
108 | label='Output',
109 | )],
110 | title='Baichuan7B',
111 | description=description,
112 | )
113 |
114 | server.queue().launch(server_name='0.0.0.0', share=False)
115 |
116 |
117 | if __name__ == '__main__':
118 | args = args_parser()
119 | main(args)
120 |
--------------------------------------------------------------------------------
/source/model/llama2/data/dataset_info.yaml:
--------------------------------------------------------------------------------
1 | # The dataset_info.yaml file contains the information of the datasets used in the experiments.
2 | alpaca:
3 | hf_hub_url: tatsu-lab/alpaca
4 | local_path: tatsu-lab/alpaca/alpaca.json
5 | dataset_format: alpaca
6 | multi_turn: False
7 |
8 | alpaca-clean:
9 | hf_hub_url: yahma/alpaca-cleaned
10 | local_path: ''
11 | dataset_format: alpaca
12 | multi_turn: False
13 |
14 | coig:
15 | hf_hub_url: BAAI/COIG
16 |   local_path: /home/robin/prompt_data/COIG/train_alpaca.json
17 | dataset_format: alpaca
18 | multi_turn: False
19 |
20 | dolly-15k:
21 | hf_hub_url: databricks/databricks-dolly-15k
22 | local_path: databricks/databricks-dolly-15k
23 | dataset_format: dolly
24 | multi_turn: False
25 |
26 | cvalues_comparison_train:
27 | hf_hub_url: ''
28 | local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json
29 | dataset_format: alpaca
30 | multi_turn: False
31 |
32 | cvalues_comparison_test:
33 | hf_hub_url: ''
34 | local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json
35 | dataset_format: alpaca
36 | multi_turn: False
37 |
38 | guanaco:
39 | hf_hub_url: JosephusCheung/GuanacoDataset
40 | local_path: ''
41 | dataset_format: guanaco
42 | multi_turn: False
43 |
44 | hh-rlhf:
45 | hf_hub_url: Anthropic/hh-rlhf
46 | local_path: ''
47 | dataset_format: hh-rlhf
48 | multi_turn: False
49 |
50 | huatuogpt:
51 | hf_hub_url: FreedomIntelligence/HuatuoGPT-sft-data-v1
52 | local_path: /home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.jsonl
53 | dataset_format: alpaca
54 | multi_turn: False
55 |
56 | openassistant-guanaco:
57 | hf_hub_url: timdettmers/openassistant-guanaco
58 | local_path: /home/robin/prompt_data/timdettmers/openassistant-guanaco
59 | dataset_format: alpaca
60 | multi_turn: False
61 |
62 | olcc:
63 | hf_hub_url: ''
64 | local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json
65 | dataset_format: alpaca
66 | multi_turn: False
67 |
68 | 100PoisonMpts:
69 | hf_hub_url: 'damo/100PoisonMpts'
70 | local_path: /home/robin/prompt_data/100PoisonMpts/train.jsonl
71 | dataset_format: 100PoisonMpts
72 | multi_turn: False
73 |
74 | safety_prompt_part1:
75 | hf_hub_url: ''
76 | local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json
77 | dataset_format: alpaca
78 | multi_turn: False
79 |
80 | safety_prompt_part2:
81 | hf_hub_url: ''
82 | local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json
83 | dataset_format: alpaca
84 | multi_turn: False
85 |
86 | # Belle Group
87 | belle_0.5m:
88 | hf_hub_url: BelleGroup/train_0.5M_CN
89 | local_path: ''
90 | dataset_format: alpaca
91 | multi_turn: False
92 |
93 | belle_1m:
94 | hf_hub_url: BelleGroup/train_1M_CN
95 | local_path: ''
96 | dataset_format: alpaca
97 | multi_turn: False
98 |
99 | belle_2m:
100 | hf_hub_url: BelleGroup/train_2M_CN
101 | local_path: ''
102 | dataset_format: alpaca
103 | multi_turn: False
104 |
105 | belle_dialog:
106 | hf_hub_url: BelleGroup/generated_chat_0.4M
107 | local_path: ''
108 | dataset_format: belle_dialog
109 | multi_turn: False
110 |
111 | belle_math:
112 | hf_hub_url: BelleGroup/school_math_0.25M
113 | local_path: ''
114 | dataset_format: alpaca
115 | multi_turn: False
116 |
117 | belle_multiturn:
118 | hf_hub_url: BelleGroup/multi_turn_0.5M
119 | local_path: ''
120 | dataset_format: belle_multiturn
121 | multi_turn: True
122 | columns:
123 | prompt: instruction
124 | query: ''
125 | response: output
126 | history: history
127 |
128 | # firefly
129 | firefly:
130 | hf_hub_url: YeungNLP/firefly-train-1.1M
131 | local_path: ''
132 | dataset_format: alpaca
133 | multi_turn: False
134 | columns:
135 | prompt: input
136 | query: ''
137 | response: target
138 | history: ''
139 |
140 | # CodeAlpaca
141 | codealpaca:
142 | hf_hub_url: sahil2801/CodeAlpaca-20k
143 | local_path: ''
144 | dataset_format: codealpaca
145 | multi_turn: False
146 |
147 | # alpacacot
148 | alpaca_cot:
149 | hf_hub_url: QingyiSi/Alpaca-CoT
150 | local_path: ''
151 | multi_turn: False
152 |
153 | webqa:
154 | hf_hub_url: suolyer/webqa
155 | local_path: ''
156 | dataset_format: webqa
157 | multi_turn: False
158 | columns:
159 | prompt: input
160 | query: ''
161 | response: output
162 | history: ''
163 |
164 | # multi-turn datasets
165 | evol_instruct:
166 | hf_hub_url: WizardLM/WizardLM_evol_instruct_V2_196k
167 | local_path: WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json
168 | dataset_format: sharegpt
169 | multi_turn: True
170 |
171 | share_gpt:
172 | hf_hub_url: ''
173 | local_path: /home/robin/prompt_data/sharegpt/sharegpt_split.json
174 | dataset_format: sharegpt
175 | multi_turn: True
176 |
--------------------------------------------------------------------------------
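A sketch of how one of the entries above is consumed: `DataArguments.init_for_training` (in `chatllms/configs/data_args.py` below) loads this YAML and turns each block into a `DatasetAttr`. A minimal standalone parse, with the file path as a placeholder, looks like this:

    import yaml

    # Hypothetical path; point this at data/dataset_info.yaml inside the repo.
    with open('data/dataset_info.yaml', 'r') as f:
        datasets_info = yaml.safe_load(f)

    attr = datasets_info['alpaca']
    print(attr['hf_hub_url'])      # tatsu-lab/alpaca
    print(attr['dataset_format'])  # alpaca
    print(attr.get('multi_turn'))  # False
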
/source/model/llama2/chatllms/train/training.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import math
4 | import os
5 | from typing import Any, Dict
6 |
7 | import numpy as np
8 | import transformers
9 | from torch.utils.data import Dataset
10 |
11 |
12 | def train_and_evaluate(trainer: transformers.Trainer, args: argparse.Namespace,
13 | logger: None) -> None:
14 | """
15 | Trains and evaluates a machine learning model.
16 |
17 | Args:
18 | trainer (Trainer): The training object to use for training and evaluation.
19 | args (argparse.Namespace): The command line arguments for the current run.
20 | Returns:
21 | None
22 | """
23 | # Create dictionary to store metrics
24 | all_metrics: Dict[str, Any] = {'run_name': args.run_name}
25 |
26 | # Training
27 | if args.do_train:
28 | logger.info('=' * 80)
29 | logger.info('*** Train ***')
30 | logger.info('=' * 80)
31 | train_result = trainer.train(
32 | resume_from_checkpoint=args.resume_checkpoint)
33 | metrics = train_result.metrics
34 |
35 | metrics['train_samples'] = len(trainer.train_dataset)
36 |
37 | # Log and save training metrics
38 | trainer.log_metrics('train', metrics)
39 | trainer.save_metrics('train', metrics)
40 | trainer.save_state()
41 |
42 | # Update metrics dictionary with training metrics
43 | all_metrics.update(metrics)
44 |
45 | # Evaluation
46 | if args.do_eval:
47 | logger.info('=' * 80)
48 | logger.info('*** Evaluate ***')
49 | logger.info('=' * 80)
50 |
51 | # Evaluate the trained model and obtain evaluation metrics
52 | metrics = trainer.evaluate(metric_key_prefix='eval')
53 |
54 | try:
55 | perplexity = math.exp(metrics['eval_loss'])
56 | except OverflowError:
57 | perplexity = float('inf')
58 |
59 | metrics['perplexity'] = perplexity
60 | metrics['eval_samples'] = len(trainer.eval_dataset)
61 | # Log and save evaluation metrics
62 | trainer.log_metrics('eval', metrics)
63 | trainer.save_metrics('eval', metrics)
64 |
65 | # Update metrics dictionary with evaluation metrics
66 | all_metrics.update(metrics)
67 |
68 | # Save all metrics to a json file
69 | if args.do_train or args.do_eval:
70 | with open(os.path.join(args.output_dir, 'metrics.json'), 'w') as fout:
71 | fout.write(json.dumps(all_metrics))
72 |
73 |
74 | def predict_and_save(trainer: transformers.Trainer,
75 | tokenizer: transformers.PreTrainedTokenizer,
76 | predict_dataset: Dataset, args: argparse.Namespace,
77 | logger: None) -> None:
78 | """
79 | Make predictions on new data, save them to a file along with input examples,
80 | and update the overall metrics.
81 | """
82 | logger.info('=' * 80)
83 | logger.info('*** Predict ***')
84 | logger.info('=' * 80)
85 | data_dict = predict_dataset.dataset
86 |
87 | # Make predictions on the test dataset
88 | prediction_output = trainer.predict(test_dataset=predict_dataset,
89 | metric_key_prefix='predict')
90 |
91 | # Get the predictions and metrics
92 | prediction_metrics = prediction_output.metrics
93 | predictions = prediction_output.predictions
94 |
95 | # Replace -100 values with pad token ID and decode predictions
96 | predictions = np.where(predictions != -100, predictions,
97 | tokenizer.pad_token_id)
98 | predictions = tokenizer.batch_decode(predictions,
99 | skip_special_tokens=True,
100 | clean_up_tokenization_spaces=True)
101 |
102 | data_dict = predict_dataset.dataset
103 | # Create dictionary to store metrics
104 | all_metrics: Dict[str, Any] = {'run_name': args.run_name}
105 | # Write predictions and input examples to file
106 | with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
107 | for i, example in enumerate(data_dict):
108 | example['prediction_with_input'] = predictions[i].strip()
109 | example['prediction'] = predictions[i].replace(
110 | example['input'], '').strip()
111 | fout.write(json.dumps(example) + '\n')
112 |
113 | # Print and log the prediction metrics
114 | print(prediction_metrics)
115 | trainer.log_metrics('predict', prediction_metrics)
116 | trainer.save_metrics('predict', prediction_metrics)
117 |
118 | # Update the overall metrics
119 | all_metrics.update(prediction_metrics)
120 |
121 | # Save the overall metrics to a file
122 | with open(os.path.join(args.output_dir, 'eval_metrics.json'), 'w') as fout:
123 | fout.write(json.dumps(all_metrics))
124 |
--------------------------------------------------------------------------------
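A sketch of calling `train_and_evaluate` once a `Trainer` is in place. The function only reads a few attributes from the argument namespace, so a minimal namespace like the one below (with placeholder values) is enough; in the real scripts it is the merged dataclass namespace.

    import argparse
    import logging
    from transformers import Trainer
    from chatllms.train.training import train_and_evaluate

    def run(trainer: Trainer, output_dir: str) -> None:
        logger = logging.getLogger('chatllms')
        args = argparse.Namespace(
            run_name='demo-run',       # recorded in metrics.json
            do_train=True,
            do_eval=True,
            resume_checkpoint=None,    # or a checkpoint path to resume from
            output_dir=output_dir,     # metrics.json is written here
        )
        train_and_evaluate(trainer, args, logger)
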
/source/model/llama2/chatllms/configs/data_args.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 | from typing import List, Optional
4 |
5 | import yaml
6 |
7 |
8 | @dataclass
9 | class DatasetAttr(object):
10 |
11 | dataset_name: Optional[str] = None
12 | hf_hub_url: Optional[str] = None
13 | local_path: Optional[str] = None
14 | dataset_format: Optional[str] = None
15 | load_from_local: bool = False
16 | multi_turn: Optional[bool] = False
17 |
18 | def __repr__(self) -> str:
19 | rep = (f'dataset_name: {self.dataset_name} || '
20 | f'hf_hub_url: {self.hf_hub_url} || '
21 | f'local_path: {self.local_path} \n'
22 |                f'dataset_format: {self.dataset_format} || '
23 | f'load_from_local: {self.load_from_local} || '
24 | f'multi_turn: {self.multi_turn}')
25 | return rep
26 |
27 | def __post_init__(self):
28 | self.prompt_column = 'instruction'
29 | self.query_column = 'input'
30 | self.response_column = 'output'
31 | self.history_column = None
32 |
33 |
34 | @dataclass
35 | class DataArguments:
36 | dataset_cfg: Optional[str] = field(
37 | default='./data/alpaca_zh.yaml',
38 | metadata={
39 | 'help':
40 | 'Path to dataset infos, please refer to `./data/README.md` to see how to prepare your datasets for training.'
41 | })
42 | instruction_template: str = field(
43 | default='default',
44 | metadata={
45 | 'help':
46 | 'Which template to use for constructing prompts in training and inference.'
47 | })
48 | conversation_template: str = field(
49 | default='default',
50 | metadata={
51 | 'help':
52 | 'Which template to use for constructing prompts in multi-turn dataset training and inference.'
53 | })
54 |     # Size (number of examples) of the validation split.
55 | eval_dataset_size: Optional[float] = field(
56 | default=0.1, metadata={'help': 'Size of validation dataset.'})
57 |     # Maximum number of training samples, mainly for quickly debugging the training code.
58 | max_train_samples: Optional[int] = field(
59 | default=None,
60 | metadata={
61 | 'help':
62 | 'For debugging purposes or quicker training, truncate the number of training examples to this '
63 | 'value if set.'
64 | },
65 | )
66 |     # Similar to max_train_samples; mainly for quickly debugging the training code.
67 | max_eval_samples: Optional[int] = field(
68 | default=None,
69 | metadata={
70 | 'help':
71 | 'For debugging purposes or quicker training, truncate the number of evaluation examples to this '
72 | 'value if set.'
73 | },
74 | )
75 |
76 | def init_for_training(self): # support mixing multiple datasets
77 | assert self.dataset_cfg is not None and os.path.exists(
78 | self.dataset_cfg
79 | ), f'{self.dataset_cfg} does not exist!, please check the path.'
80 | datasets_info = yaml.safe_load(open(self.dataset_cfg, 'r'))
81 | self.dataset_names = list(datasets_info.keys())
82 | self.dataset_attr_list: List[DatasetAttr] = []
83 | for i, name in enumerate(self.dataset_names):
84 | dataset_attr = DatasetAttr()
85 | dataset_attr.dataset_name = name
86 | dataset_attr.dataset_format = datasets_info[name].get(
87 | 'dataset_format', None)
88 | dataset_attr.hf_hub_url = datasets_info[name].get(
89 | 'hf_hub_url', None)
90 | dataset_attr.local_path = datasets_info[name].get(
91 | 'local_path', None)
92 | dataset_attr.multi_turn = datasets_info[name].get(
93 | 'multi_turn', False)
94 |
95 | if datasets_info[name]['local_path'] and os.path.exists(
96 | datasets_info[name]['local_path']):
97 | dataset_attr.load_from_local = True
98 | else:
99 | dataset_attr.load_from_local = False
100 |                 print(
101 |                     'Warning: local_path {} for {} does not exist! Will load the data from {}'
102 |                     .format(dataset_attr.local_path, name,
103 |                             dataset_attr.hf_hub_url))
104 |
105 | if 'columns' in datasets_info[name]:
106 | dataset_attr.prompt_column = datasets_info[name][
107 | 'columns'].get('prompt', None)
108 | dataset_attr.query_column = datasets_info[name]['columns'].get(
109 | 'query', None)
110 | dataset_attr.response_column = datasets_info[name][
111 | 'columns'].get('response', None)
112 | dataset_attr.history_column = datasets_info[name][
113 | 'columns'].get('history', None)
114 |
115 | self.dataset_attr_list.append(dataset_attr)
116 |
--------------------------------------------------------------------------------
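A sketch of exercising `DataArguments` directly; the config path is a placeholder for any YAML in the format of `data/dataset_info.yaml` shown earlier.

    from chatllms.configs.data_args import DataArguments

    data_args = DataArguments(dataset_cfg='data/alpaca_zh.yaml')  # hypothetical path
    data_args.init_for_training()

    for attr in data_args.dataset_attr_list:
        # __repr__ prints the name, hub url, local path, format and multi_turn flag.
        print(attr)
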
/source/model/llama2/chatllms/utils/logger_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import torch.distributed as dist
4 |
5 | logger_initialized: dict = {}
6 |
7 |
8 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
9 | """Initialize and get a logger by name.
10 |
11 | If the logger has not been initialized, this method will initialize the
12 | logger by adding one or two handlers, otherwise the initialized logger will
13 | be directly returned. During initialization, a StreamHandler will always be
14 | added. If `log_file` is specified and the process rank is 0, a FileHandler
15 | will also be added.
16 |
17 | Args:
18 | name (str): Logger name.
19 | log_file (str | None): The log filename. If specified, a FileHandler
20 | will be added to the logger.
21 | log_level (int): The logger level. Note that only the process of
22 | rank 0 is affected, and other processes will set the level to
23 | "Error" thus be silent most of the time.
24 | file_mode (str): The file mode used in opening log file.
25 | Defaults to 'w'.
26 |
27 | Returns:
28 | logging.Logger: The expected logger.
29 | """
30 | logger = logging.getLogger(name)
31 | if name in logger_initialized:
32 | return logger
33 | # handle hierarchical names
34 | # e.g., logger "a" is initialized, then logger "a.b" will skip the
35 | # initialization since it is a child of "a".
36 | for logger_name in logger_initialized:
37 | if name.startswith(logger_name):
38 | return logger
39 |
40 | # handle duplicate logs to the console
41 | # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET)
42 | # to the root logger. As logger.propagate is True by default, this root
43 | # level handler causes logging messages from rank>0 processes to
44 | # unexpectedly show up on the console, creating much unwanted clutter.
45 | # To fix this issue, we set the root logger's StreamHandler, if any, to log
46 | # at the ERROR level.
47 | for handler in logger.root.handlers:
48 | if type(handler) is logging.StreamHandler:
49 | handler.setLevel(logging.ERROR)
50 |
51 | stream_handler = logging.StreamHandler()
52 | handlers = [stream_handler]
53 |
54 | if dist.is_available() and dist.is_initialized():
55 | rank = dist.get_rank()
56 | else:
57 | rank = 0
58 |
59 | # only rank 0 will add a FileHandler
60 | if rank == 0 and log_file is not None:
61 | # Here, the default behaviour of the official logger is 'a'. Thus, we
62 | # provide an interface to change the file mode to the default
63 | # behaviour.
64 | file_handler = logging.FileHandler(log_file, file_mode)
65 | handlers.append(file_handler)
66 |
67 | formatter = logging.Formatter(
68 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
69 | for handler in handlers:
70 | handler.setFormatter(formatter)
71 | handler.setLevel(log_level)
72 | logger.addHandler(handler)
73 |
74 | if rank == 0:
75 | logger.setLevel(log_level)
76 | else:
77 | logger.setLevel(logging.ERROR)
78 |
79 | logger_initialized[name] = True
80 |
81 | return logger
82 |
83 |
84 | def print_log(msg, logger=None, level=logging.INFO):
85 | """Print a log message.
86 |
87 | Args:
88 | msg (str): The message to be logged.
89 | logger (logging.Logger | str | None): The logger to be used.
90 | Some special loggers are:
91 |
92 | - "silent": no message will be printed.
93 | - other str: the logger obtained with `get_root_logger(logger)`.
94 | - None: The `print()` method will be used to print log messages.
95 | level (int): Logging level. Only available when `logger` is a Logger
96 | object or "root".
97 | """
98 | if logger is None:
99 | print(msg)
100 | elif isinstance(logger, logging.Logger):
101 | logger.log(level, msg)
102 | elif logger == 'silent':
103 | pass
104 | elif isinstance(logger, str):
105 | _logger = get_logger(logger)
106 | _logger.log(level, msg)
107 | else:
108 | raise TypeError(
109 | 'logger should be either a logging.Logger object, str, '
110 | f'"silent" or None, but got {type(logger)}')
111 |
112 |
113 | def get_root_logger(log_file=None, log_level=logging.INFO):
114 | """Get root logger.
115 |
116 | Args:
117 | log_file (str, optional): File path of log. Defaults to None.
118 | log_level (int, optional): The level of logger.
119 | Defaults to logging.INFO.
120 |
121 | Returns:
122 | :obj:`logging.Logger`: The obtained logger
123 | """
124 | logger = get_logger(name='chatllms',
125 | log_file=log_file,
126 | log_level=log_level)
127 |
128 | return logger
129 |
--------------------------------------------------------------------------------
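A sketch of using the logger helpers above. Passing a file path to `get_root_logger` adds a FileHandler on rank 0; `None` keeps console-only logging.

    import logging
    from chatllms.utils.logger_utils import get_root_logger, print_log

    logger = get_root_logger(log_file=None, log_level=logging.INFO)
    logger.info('starting up')

    # print_log routes a message to print(), a named logger, or drops it ('silent').
    print_log('plain print, no logger attached')
    print_log('goes through the chatllms logger', logger=logger)
    print_log('suppressed entirely', logger='silent')
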
/source/model/llama2/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import pathlib
4 | from typing import Tuple
5 |
6 | import torch
7 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
8 | HfArgumentParser, PreTrainedModel,
9 | PreTrainedTokenizer, Trainer)
10 |
11 | from chatllms.configs import DataArguments, ModelArguments, TrainingArguments
12 | from chatllms.data import make_supervised_data_module
13 | from chatllms.utils.model_utils import (add_special_tokens_if_missing,
14 | safe_save_model_for_hf_trainer)
15 |
16 |
17 | def load_model_tokenizer(args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
18 | """
19 | Load a pre-trained model and tokenizer for natural language processing tasks.
20 |
21 | Args:
22 | args: An object containing the input arguments.
23 |
24 | Returns:
25 | A tuple containing the loaded model and tokenizer.
26 | """
27 | # Determine the torch data type based on the input arguments
28 | torch_dtype = torch.float16 if args.fp16 else (
29 | torch.bfloat16 if args.bf16 else torch.float32)
30 |
31 | config_kwargs = {
32 | 'cache_dir': args.cache_dir,
33 | 'use_auth_token': args.use_auth_token,
34 | 'trust_remote_code': args.trust_remote_code,
35 | }
36 |
37 | # Load the pre-trained model
38 | print(f'Loading Model from {args.model_name_or_path}...')
39 | model = AutoModelForCausalLM.from_pretrained(
40 | args.model_name_or_path,
41 | torch_dtype=torch_dtype,
42 | **config_kwargs,
43 | )
44 |
45 | # Enable model parallelism
46 | setattr(model, 'model_parallel', True)
47 | setattr(model, 'is_parallelizable', True)
48 |
49 | if args.gradient_checkpointing:
50 | logging.warning('Using gradient checkpointing...')
51 | model.enable_input_require_grads()
52 | model.config.use_cache = False # Turn off when gradient checkpointing is enabled
53 |
54 | # Load the tokenizer
55 | print(f'Loading tokenizer from {args.model_name_or_path}...')
56 | tokenizer = AutoTokenizer.from_pretrained(
57 | args.model_name_or_path,
58 | padding_side='right',
59 | model_max_length=args.model_max_length,
60 | use_fast=False,
61 | tokenizer_type='llama' if 'llama' in args.model_name_or_path else None,
62 | **config_kwargs,
63 | )
64 |
65 | return model, tokenizer
66 |
67 |
68 | def train() -> None:
69 | """
70 | Trains a language model using Hugging Face's Transformers library.
71 |
72 |     Command-line arguments (parsed via HfArgumentParser):
73 | model_args (ModelArguments): The arguments for the model configuration.
74 | data_args (DataArguments): The arguments for the data configuration.
75 | training_args (TrainingArguments): The arguments for the training configuration.
76 |
77 | Returns:
78 | None
79 |
80 | """
81 | parser = HfArgumentParser(
82 | (ModelArguments, DataArguments, TrainingArguments))
83 | (model_args, data_args,
84 | training_args) = parser.parse_args_into_dataclasses()
85 | data_args.init_for_training()
86 | args = argparse.Namespace(**vars(model_args), **vars(data_args),
87 | **vars(training_args))
88 | # load model and tokenizer
89 | logging.warning('Loading model and tokenizer...')
90 | model, tokenizer = load_model_tokenizer(args=args)
91 | logging.warning('Successfully loaded model and tokenizer.')
92 |
93 | if 'llama' in args.model_name_or_path or 'baichuan' in args.model_name_or_path:
94 | logging.warning(
95 | f'Adding special tokens for {args.model_name_or_path}.')
96 | add_special_tokens_if_missing(tokenizer, model)
97 |
98 | if 'baichuan' in args.model_name_or_path:
99 | # Tie the weights
100 | model.tie_weights()
101 |
102 | # Create a supervised dataset and Trainer, then train the model
103 | logging.warning('Creating a supervised dataset and DataCollator...')
104 | data_module = make_supervised_data_module(tokenizer=tokenizer, args=args)
105 |
106 | # Initialize the Trainer object and start training
107 | logging.warning('Initializing Trainer object.')
108 | trainer = Trainer(
109 | model=model,
110 | tokenizer=tokenizer,
111 | args=training_args,
112 | **data_module,
113 | )
114 |
115 | logging.warning('Start Training...')
116 | if list(pathlib.Path(training_args.output_dir).glob('checkpoint-*')):
117 | trainer.train(resume_from_checkpoint=True)
118 | else:
119 | trainer.train()
120 |
121 | logging.warning(f'Saving Model to {training_args.output_dir}')
122 | trainer.save_state()
123 | # Save the trained model
124 | safe_save_model_for_hf_trainer(trainer=trainer,
125 | output_dir=training_args.output_dir)
126 |
127 | logging.warning('Done.')
128 |
129 |
130 | if __name__ == '__main__':
131 | train()
132 |
--------------------------------------------------------------------------------
/retrieval_contriever/generate_passage_embeddings.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import os
8 |
9 | import argparse
10 | import csv
11 | import logging
12 | import pickle
13 |
14 | import numpy as np
15 | import torch
16 |
17 | import transformers
18 |
19 | import src.slurm
20 | import src.contriever
21 | import src.utils
22 | import src.data
23 | import src.normalize_text
24 |
25 |
26 | def embed_passages(args, passages, model, tokenizer):
27 | total = 0
28 | allids, allembeddings = [], []
29 | batch_ids, batch_text = [], []
30 | with torch.no_grad():
31 | for k, p in enumerate(passages):
32 | batch_ids.append(p["id"])
33 | if args.no_title or not "title" in p:
34 | text = p["text"]
35 | else:
36 | text = p["title"] + " " + p["text"]
37 | if args.lowercase:
38 | text = text.lower()
39 | if args.normalize_text:
40 | text = src.normalize_text.normalize(text)
41 | batch_text.append(text)
42 |
43 | if len(batch_text) == args.per_gpu_batch_size or k == len(passages) - 1:
44 |
45 | encoded_batch = tokenizer.batch_encode_plus(
46 | batch_text,
47 | return_tensors="pt",
48 | max_length=args.passage_maxlength,
49 | padding=True,
50 | truncation=True,
51 | )
52 |
53 | encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
54 | embeddings = model(**encoded_batch)
55 |
56 | embeddings = embeddings.cpu()
57 | total += len(batch_ids)
58 | allids.extend(batch_ids)
59 | allembeddings.append(embeddings)
60 |
61 | batch_text = []
62 | batch_ids = []
63 | if k % 100000 == 0 and k > 0:
64 | print(f"Encoded passages {total}")
65 |
66 | allembeddings = torch.cat(allembeddings, dim=0).numpy()
67 | return allids, allembeddings
68 |
69 |
70 | def main(args):
71 | model, tokenizer, _ = src.contriever.load_retriever(args.model_name_or_path)
72 | print(f"Model loaded from {args.model_name_or_path}.", flush=True)
73 | model.eval()
74 | model = model.cuda()
75 | if not args.no_fp16:
76 | model = model.half()
77 |
78 | passages = src.data.load_passages(args.passages)
79 |
80 | shard_size = len(passages) // args.num_shards
81 | start_idx = args.shard_id * shard_size
82 | end_idx = start_idx + shard_size
83 | if args.shard_id == args.num_shards - 1:
84 | end_idx = len(passages)
85 |
86 | passages = passages[start_idx:end_idx]
87 | print(f"Embedding generation for {len(passages)} passages from idx {start_idx} to {end_idx}.")
88 |
89 | allids, allembeddings = embed_passages(args, passages, model, tokenizer)
90 |
91 | save_file = os.path.join(args.output_dir, args.prefix + f"_{args.shard_id:02d}")
92 | os.makedirs(args.output_dir, exist_ok=True)
93 | print(f"Saving {len(allids)} passage embeddings to {save_file}.")
94 | with open(save_file, mode="wb") as f:
95 | pickle.dump((allids, allembeddings), f)
96 |
97 | print(f"Total passages processed {len(allids)}. Written to {save_file}.")
98 |
99 |
100 | if __name__ == "__main__":
101 | parser = argparse.ArgumentParser()
102 |
103 | parser.add_argument("--passages", type=str, default=None, help="Path to passages (.tsv file)")
104 | parser.add_argument("--output_dir", type=str, default="wikipedia_embeddings", help="dir path to save embeddings")
105 | parser.add_argument("--prefix", type=str, default="passages", help="prefix path to save embeddings")
106 | parser.add_argument("--shard_id", type=int, default=0, help="Id of the current shard")
107 | parser.add_argument("--num_shards", type=int, default=1, help="Total number of shards")
108 | parser.add_argument(
109 | "--per_gpu_batch_size", type=int, default=512, help="Batch size for the passage encoder forward pass"
110 | )
111 | parser.add_argument("--passage_maxlength", type=int, default=512, help="Maximum number of tokens in a passage")
112 | parser.add_argument(
113 | "--model_name_or_path", type=str, help="path to directory containing model weights and config file"
114 | )
115 | parser.add_argument("--no_fp16", action="store_true", help="inference in fp32")
116 | parser.add_argument("--no_title", action="store_true", help="title not added to the passage body")
117 | parser.add_argument("--lowercase", action="store_true", help="lowercase text before encoding")
118 | parser.add_argument("--normalize_text", action="store_true", help="lowercase text before encoding")
119 |
120 | args = parser.parse_args()
121 |
122 | src.slurm.init_distributed_mode(args)
123 |
124 | main(args)
125 |
--------------------------------------------------------------------------------
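The script above writes each shard as a pickled `(ids, embeddings)` tuple. A minimal sketch for loading the shards back into one matrix, assuming the default `--output_dir` and `--prefix` values:

import glob
import pickle

import numpy as np

ids, embs = [], []
for shard in sorted(glob.glob("wikipedia_embeddings/passages_*")):
    with open(shard, "rb") as f:
        shard_ids, shard_embs = pickle.load(f)  # the (allids, allembeddings) tuple written above
    ids.extend(shard_ids)
    embs.append(shard_embs)
embeddings = np.concatenate(embs, axis=0)  # shape: (num_passages, hidden_size)
print(len(ids), embeddings.shape)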
/source/model/llama2/cli_demo.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | from threading import Thread
4 | from typing import List, Tuple
5 |
6 | import torch
7 | import transformers
8 | from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel,
9 | PreTrainedTokenizer, TextIteratorStreamer)
10 |
11 | from chatllms.configs import GenerationArguments, ModelInferenceArguments
12 | from chatllms.utils.model_utils import get_logits_processor
13 | from chatllms.utils.template import PromptTemplate
14 |
15 |
16 | def generate_response(
17 | query: str,
18 | history: List[Tuple[str, str]],
19 | prefix: str,
20 | prompt_template: PromptTemplate,
21 | tokenizer: PreTrainedTokenizer,
22 | model: PreTrainedModel,
23 |     generation_args: GenerationArguments,
24 | ) -> List[Tuple[str, str]]:
25 | """
26 |     Generates a response to the given query with the loaded causal language model and prints it to the console.
27 |
28 | Args:
29 | query (str): The input query for which a response is to be generated.
30 | history (List[Tuple[str, str]]): A list of previous queries and their responses.
31 | prefix (str): The prefix string added to the beginning of each input sequence.
32 | prompt_template (PromptTemplate): The prompt template used to generate the input sequence to the model.
33 | tokenizer (PreTrainedTokenizer): The tokenizer used to convert the raw text into input tokens.
34 |         model (PreTrainedModel): The causal language model used to generate the response.
35 |         generation_args (GenerationArguments): Generation settings forwarded to the model's generate() method via to_dict().
36 |
37 | Returns:
38 | List[Tuple[str, str]]: A list of all the previous queries and their responses, including the current one.
39 | """
40 |
41 | # Convert the query and history into input IDs
42 | input_text = prompt_template.get_prompt(query, history, prefix)
43 | inputs = tokenizer(input_text, return_tensors='pt')
44 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
45 |
46 | # Create a TextIteratorStreamer object to stream the response from the model
47 | streamer = TextIteratorStreamer(tokenizer,
48 | timeout=60.0,
49 | skip_prompt=True,
50 | skip_special_tokens=True)
51 |
52 | # Set the arguments for the model's generate() method
53 | gen_kwargs = dict(
54 | inputs,
55 | streamer=streamer,
56 | logits_processor=get_logits_processor(),
57 | **generation_args.to_dict(),
58 | )
59 |
60 | # Start a separate thread to generate the response asynchronously
61 | thread = Thread(target=model.generate, kwargs=gen_kwargs)
62 | thread.start()
63 |
64 | # Print the model name and the response as it is generated
65 | print('Assistant: ', end='', flush=True)
66 | response = ''
67 | for new_text in streamer:
68 | print(new_text, end='', flush=True)
69 | response += new_text
70 | print()
71 |
72 | # Update the history with the current query and response and return it
73 | history.append((query, response))
74 | return history
75 |
76 |
77 | def main():
78 | os_name = platform.system()
79 | clear_command = 'cls' if os_name == 'Windows' else 'clear'
80 |
81 | # Parse command-line arguments
82 | parser = transformers.HfArgumentParser(
83 | (ModelInferenceArguments, GenerationArguments))
84 | model_server_args, generation_args = parser.parse_args_into_dataclasses()
85 |
86 |     # Load the model and tokenizer. device_map='auto' lets accelerate place the
87 |     # weights, so the model is not moved to a device manually afterwards.
88 |
89 | model = AutoModelForCausalLM.from_pretrained(
90 | model_server_args.model_name_or_path,
91 | trust_remote_code=True,
92 | low_cpu_mem_usage=True,
93 | torch_dtype=torch.float16,
94 |         device_map='auto').eval()
95 |
96 | tokenizer = AutoTokenizer.from_pretrained(
97 | model_server_args.model_name_or_path,
98 | trust_remote_code=True,
99 | use_fast=False,
100 | )
101 |
102 | prompt_template = PromptTemplate(model_server_args.prompt_template)
103 | prefix = model_server_args.source_prefix if model_server_args.source_prefix else ''
104 |     history: List[Tuple[str, str]] = []
105 |     print('Welcome to the CLI chat. Type a message to talk, "clear" to reset the history, "stop" to exit.')
106 | while True:
107 | try:
108 | query = input('\nUser: ')
109 | except UnicodeDecodeError:
110 | print(
111 | 'Detected decoding error at the inputs, please set the terminal encoding to utf-8.'
112 | )
113 | continue
114 | if query.strip() == 'stop':
115 | break
116 |
117 | if query.strip() == 'clear':
118 | # Clear the conversation history
119 | history = []
120 | os.system(clear_command)
121 |             print('Welcome to the CLI chat. Type a message to talk, "clear" to reset the history, "stop" to exit.')
122 | continue
123 |
124 | # Perform prediction and printing
125 | history = generate_response(query, history, prefix, prompt_template,
126 | tokenizer, model, generation_args)
127 |
128 |
129 | if __name__ == '__main__':
130 | main()
131 |
--------------------------------------------------------------------------------
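generate_response() streams tokens by running model.generate in a background thread and iterating over a TextIteratorStreamer. A self-contained sketch of the same pattern; the model name and prompt are placeholders, not part of this project:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")        # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate,
                kwargs=dict(inputs, streamer=streamer, max_new_tokens=32))
thread.start()
for piece in streamer:           # pieces arrive as generate() produces them
    print(piece, end="", flush=True)
thread.join()
print()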
/retrieval_contriever/src/normalize_text.py:
--------------------------------------------------------------------------------
1 | """
2 | adapted from chemdataextractor.text.normalize
3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 | Tools for normalizing text.
5 | https://github.com/mcs07/ChemDataExtractor
6 | :copyright: Copyright 2016 by Matt Swain.
7 | :license: MIT
8 |
9 | Permission is hereby granted, free of charge, to any person obtaining
10 | a copy of this software and associated documentation files (the
11 | 'Software'), to deal in the Software without restriction, including
12 | without limitation the rights to use, copy, modify, merge, publish,
13 | distribute, sublicense, and/or sell copies of the Software, and to
14 | permit persons to whom the Software is furnished to do so, subject to
15 | the following conditions:
16 |
17 | The above copyright notice and this permission notice shall be
18 | included in all copies or substantial portions of the Software.
19 |
20 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | """
28 |
29 | #: Control characters.
30 | CONTROLS = {
31 | '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
32 | '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
33 | }
34 | # There are further control characters, but they are instead replaced with a space by unicode normalization
35 | # '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
36 |
37 |
38 | #: Hyphen and dash characters.
39 | HYPHENS = {
40 | '-', # \u002d Hyphen-minus
41 | '‐', # \u2010 Hyphen
42 | '‑', # \u2011 Non-breaking hyphen
43 | '⁃', # \u2043 Hyphen bullet
44 | '‒', # \u2012 figure dash
45 | '–', # \u2013 en dash
46 | '—', # \u2014 em dash
47 | '―', # \u2015 horizontal bar
48 | }
49 |
50 | #: Minus characters.
51 | MINUSES = {
52 | '-', # \u002d Hyphen-minus
53 | '−', # \u2212 Minus
54 | '-', # \uff0d Full-width Hyphen-minus
55 | '⁻', # \u207b Superscript minus
56 | }
57 |
58 | #: Plus characters.
59 | PLUSES = {
60 | '+', # \u002b Plus
61 | '+', # \uff0b Full-width Plus
62 | '⁺', # \u207a Superscript plus
63 | }
64 |
65 | #: Slash characters.
66 | SLASHES = {
67 | '/', # \u002f Solidus
68 | '⁄', # \u2044 Fraction slash
69 | '∕', # \u2215 Division slash
70 | }
71 |
72 | #: Tilde characters.
73 | TILDES = {
74 | '~', # \u007e Tilde
75 | '˜', # \u02dc Small tilde
76 | '⁓', # \u2053 Swung dash
77 | '∼', # \u223c Tilde operator #in mbert vocab
78 | '∽', # \u223d Reversed tilde
79 | '∿', # \u223f Sine wave
80 | '〜', # \u301c Wave dash #in mbert vocab
81 | '~', # \uff5e Full-width tilde #in mbert vocab
82 | }
83 |
84 | #: Apostrophe characters.
85 | APOSTROPHES = {
86 | "'", # \u0027
87 | '’', # \u2019
88 | '՚', # \u055a
89 | 'Ꞌ', # \ua78b
90 | 'ꞌ', # \ua78c
91 | ''', # \uff07
92 | }
93 |
94 | #: Single quote characters.
95 | SINGLE_QUOTES = {
96 | "'", # \u0027
97 | '‘', # \u2018
98 | '’', # \u2019
99 | '‚', # \u201a
100 | '‛', # \u201b
101 |
102 | }
103 |
104 | #: Double quote characters.
105 | DOUBLE_QUOTES = {
106 | '"', # \u0022
107 | '“', # \u201c
108 | '”', # \u201d
109 | '„', # \u201e
110 | '‟', # \u201f
111 | }
112 |
113 | #: Accent characters.
114 | ACCENTS = {
115 | '`', # \u0060
116 | '´', # \u00b4
117 | }
118 |
119 | #: Prime characters.
120 | PRIMES = {
121 | '′', # \u2032
122 | '″', # \u2033
123 | '‴', # \u2034
124 | '‵', # \u2035
125 | '‶', # \u2036
126 | '‷', # \u2037
127 | '⁗', # \u2057
128 | }
129 |
130 | #: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
131 | QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
132 |
133 | def normalize(text):
134 | for control in CONTROLS:
135 | text = text.replace(control, '')
136 | text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')
137 |
138 | for hyphen in HYPHENS | MINUSES:
139 | text = text.replace(hyphen, '-')
140 | text = text.replace('\u00ad', '')
141 |
142 | for double_quote in DOUBLE_QUOTES:
143 | text = text.replace(double_quote, '"') # \u0022
144 | for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
145 | text = text.replace(single_quote, "'") # \u0027
146 | text = text.replace('′', "'") # \u2032 prime
147 | text = text.replace('‵', "'") # \u2035 reversed prime
148 | text = text.replace('″', "''") # \u2033 double prime
149 | text = text.replace('‶', "''") # \u2036 reversed double prime
150 | text = text.replace('‴', "'''") # \u2034 triple prime
151 | text = text.replace('‷', "'''") # \u2037 reversed triple prime
152 | text = text.replace('⁗', "''''") # \u2057 quadruple prime
153 |
154 | text = text.replace('…', '...').replace(' . . . ', ' ... ') # \u2026
155 |
156 | for slash in SLASHES:
157 | text = text.replace(slash, '/')
158 |
159 | #for tilde in TILDES:
160 | # text = text.replace(tilde, '~')
161 |
162 | return text
163 |
--------------------------------------------------------------------------------
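For reference, a quick illustration of what normalize() does to quotes, dashes and ellipses; the import path is assumed from its usage in generate_passage_embeddings.py:

from src import normalize_text

print(normalize_text.normalize("“Smart quotes” — and … more"))
# -> "Smart quotes" - and ... more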
/retrieval_contriever/src/moco.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.nn as nn
5 | import logging
6 | import copy
7 | import transformers
8 |
9 | from src import contriever, dist_utils, utils
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | class MoCo(nn.Module):
15 | def __init__(self, opt):
16 | super(MoCo, self).__init__()
17 |
18 | self.queue_size = opt.queue_size
19 | self.momentum = opt.momentum
20 | self.temperature = opt.temperature
21 | self.label_smoothing = opt.label_smoothing
22 | self.norm_doc = opt.norm_doc
23 | self.norm_query = opt.norm_query
24 | self.moco_train_mode_encoder_k = opt.moco_train_mode_encoder_k # apply the encoder on keys in train mode
25 |
26 | retriever, tokenizer = self._load_retriever(
27 | opt.retriever_model_id, pooling=opt.pooling, random_init=opt.random_init
28 | )
29 |
30 | self.tokenizer = tokenizer
31 | self.encoder_q = retriever
32 | self.encoder_k = copy.deepcopy(retriever)
33 |
34 | for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
35 | param_k.data.copy_(param_q.data)
36 | param_k.requires_grad = False
37 |
38 | # create the queue
39 | self.register_buffer("queue", torch.randn(opt.projection_size, self.queue_size))
40 | self.queue = nn.functional.normalize(self.queue, dim=0)
41 |
42 | self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))
43 |
44 | def _load_retriever(self, model_id, pooling, random_init):
45 | cfg = utils.load_hf(transformers.AutoConfig, model_id)
46 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_id)
47 |
48 | if "xlm" in model_id:
49 | model_class = contriever.XLMRetriever
50 | else:
51 | model_class = contriever.Contriever
52 |
53 | if random_init:
54 | retriever = model_class(cfg)
55 | else:
56 | retriever = utils.load_hf(model_class, model_id)
57 |
58 | if "bert-" in model_id:
59 | if tokenizer.bos_token_id is None:
60 | tokenizer.bos_token = "[CLS]"
61 | if tokenizer.eos_token_id is None:
62 | tokenizer.eos_token = "[SEP]"
63 |
64 | retriever.config.pooling = pooling
65 |
66 | return retriever, tokenizer
67 |
68 | def get_encoder(self, return_encoder_k=False):
69 | if return_encoder_k:
70 | return self.encoder_k
71 | else:
72 | return self.encoder_q
73 |
74 | def _momentum_update_key_encoder(self):
75 | """
76 | Update of the key encoder
77 | """
78 | for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
79 | param_k.data = param_k.data * self.momentum + param_q.data * (1.0 - self.momentum)
80 |
81 | @torch.no_grad()
82 | def _dequeue_and_enqueue(self, keys):
83 | # gather keys before updating queue
84 | keys = dist_utils.gather_nograd(keys.contiguous())
85 |
86 | batch_size = keys.shape[0]
87 |
88 | ptr = int(self.queue_ptr)
89 | assert self.queue_size % batch_size == 0, f"{batch_size}, {self.queue_size}" # for simplicity
90 |
91 | # replace the keys at ptr (dequeue and enqueue)
92 | self.queue[:, ptr : ptr + batch_size] = keys.T
93 | ptr = (ptr + batch_size) % self.queue_size # move pointer
94 |
95 | self.queue_ptr[0] = ptr
96 |
97 | def _compute_logits(self, q, k):
98 | l_pos = torch.einsum("nc,nc->n", [q, k]).unsqueeze(-1)
99 | l_neg = torch.einsum("nc,ck->nk", [q, self.queue.clone().detach()])
100 |
101 | logits = torch.cat([l_pos, l_neg], dim=1)
102 | return logits
103 |
104 |     def forward(self, q_tokens, q_mask, k_tokens, k_mask, stats_prefix="", iter_stats=None, **kwargs):
105 |         iter_stats = {} if iter_stats is None else iter_stats  # avoid sharing a mutable default dict
106 |         bsz = q_tokens.size(0)
107 | q = self.encoder_q(input_ids=q_tokens, attention_mask=q_mask, normalize=self.norm_query)
108 |
109 | # compute key features
110 | with torch.no_grad(): # no gradient to keys
111 | self._momentum_update_key_encoder() # update the key encoder
112 |
113 | if not self.encoder_k.training and not self.moco_train_mode_encoder_k:
114 | self.encoder_k.eval()
115 |
116 | k = self.encoder_k(input_ids=k_tokens, attention_mask=k_mask, normalize=self.norm_doc)
117 |
118 | logits = self._compute_logits(q, k) / self.temperature
119 |
120 | # labels: positive key indicators
121 | labels = torch.zeros(bsz, dtype=torch.long).cuda()
122 |
123 | loss = torch.nn.functional.cross_entropy(logits, labels, label_smoothing=self.label_smoothing)
124 |
125 | self._dequeue_and_enqueue(k)
126 |
127 | # log stats
128 | if len(stats_prefix) > 0:
129 | stats_prefix = stats_prefix + "/"
130 | iter_stats[f"{stats_prefix}loss"] = (loss.item(), bsz)
131 |
132 | predicted_idx = torch.argmax(logits, dim=-1)
133 | accuracy = 100 * (predicted_idx == labels).float().mean()
134 | stdq = torch.std(q, dim=0).mean().item()
135 | stdk = torch.std(k, dim=0).mean().item()
136 | iter_stats[f"{stats_prefix}accuracy"] = (accuracy, bsz)
137 | iter_stats[f"{stats_prefix}stdq"] = (stdq, bsz)
138 | iter_stats[f"{stats_prefix}stdk"] = (stdk, bsz)
139 |
140 | return loss, iter_stats
141 |
--------------------------------------------------------------------------------
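Two details of the MoCo class that are easy to miss are the momentum (EMA) update of the key encoder and the circular queue pointer. A toy numeric illustration of both, with made-up values:

import torch

# EMA update: the key encoder trails the query encoder very slowly.
m = 0.999
param_q, param_k = torch.tensor([1.0]), torch.tensor([0.0])
param_k = param_k * m + param_q * (1.0 - m)
print(param_k)          # tensor([0.0010])

# Circular queue pointer used by _dequeue_and_enqueue.
queue_size, ptr, batch_size = 8, 6, 2
ptr = (ptr + batch_size) % queue_size
print(ptr)              # 0 (wraps around)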
/source/model/llama2/examples/format_data/convert_alpaca.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from datasets import load_dataset
4 |
5 |
6 | def json_dump(obj, path):
7 | with open(path, 'w', encoding='utf-8') as f:
8 | json.dump(obj, f, indent=2, ensure_ascii=False)
9 |
10 |
11 | def json_load(in_file):
12 | with open(in_file, 'r') as f:
13 | json_data = json.load(f)
14 | return json_data
15 |
16 |
17 | def convert_100PoisonMpts(in_file, out_file):
18 | raw_data = load_dataset('json', data_files=in_file)['train']
19 | new_content = []
20 | for i, raw_text in enumerate(raw_data):
21 | prompt = raw_text['prompt']
22 | response = raw_text['answer']
23 | if len(prompt) <= 5 or len(response) <= 5:
24 | continue
25 | new_content.append({
26 | 'instruction': prompt,
27 | 'input': '',
28 | 'output': response,
29 | })
30 |
31 | print(f'#out: {len(new_content)}')
32 | json_dump(new_content, out_file)
33 |
34 |
35 | def convert_Cvalues(in_file, out_file):
36 | raw_data = load_dataset('json', data_files=in_file)['train']
37 | new_content = []
38 | for i, raw_text in enumerate(raw_data):
39 | prompt = raw_text['prompt']
40 | response = raw_text['pos_resp']
41 | if len(prompt) <= 5 or len(response) <= 5:
42 | continue
43 | new_content.append({
44 | 'instruction': prompt,
45 | 'input': '',
46 | 'output': response,
47 | })
48 |
49 | print(f'#out: {len(new_content)}')
50 | json_dump(new_content, out_file)
51 |
52 |
53 | def convert_huatuogpt(in_file, out_file):
54 | raw_data = load_dataset('json', data_files=in_file)['train']
55 | new_content = []
56 | for i, raw_text in enumerate(raw_data):
57 | data = raw_text['data']
58 | prompt = data[0].replace('问:', '')
59 | response = data[1].replace('答:', '')
60 | if len(prompt) <= 5 or len(response) <= 5:
61 | continue
62 | new_content.append({
63 | 'instruction': prompt,
64 | 'input': '',
65 | 'output': response,
66 | })
67 | print(f'#out: {len(new_content)}')
68 | json_dump(new_content, out_file)
69 |
70 |
71 | def convert_safety_attack(in_file, out_file):
72 | field_list = [
73 | 'Reverse_Exposure', 'Goal_Hijacking', 'Prompt_Leaking',
74 | 'Unsafe_Instruction_Topic', 'Role_Play_Instruction',
75 | 'Inquiry_With_Unsafe_Opinion'
76 | ]
77 | new_content = []
78 |     for field in field_list:
79 |         raw_data = load_dataset('json', field=field,
80 |                                 data_files=in_file)['train']
81 | for i, raw_text in enumerate(raw_data):
82 | prompt = raw_text['prompt']
83 | response = raw_text['response']
84 | if len(prompt) <= 5 or len(response) <= 5:
85 | continue
86 | new_content.append({
87 | 'instruction': prompt,
88 | 'input': '',
89 | 'output': response,
90 | })
91 | print(f'#out: {len(new_content)}')
92 | json_dump(new_content, out_file)
93 |
94 |
95 | def convert_safety_scenarios(in_file, out_file):
96 |
97 | field_list = [
98 | 'Unfairness_And_Discrimination', 'Crimes_And_Illegal_Activities',
99 | 'Insult', 'Mental_Health', 'Physical_Harm', 'Privacy_And_Property',
100 | 'Ethics_And_Morality'
101 | ]
102 | new_content = []
103 |     for field in field_list:
104 |         raw_data = load_dataset('json', data_files=in_file,
105 |                                 field=field)['train']
106 | for i, raw_text in enumerate(raw_data):
107 | prompt = raw_text['prompt']
108 | response = raw_text['response']
109 | if len(prompt) <= 5 or len(response) <= 5:
110 | continue
111 | new_content.append({
112 | 'instruction': prompt,
113 | 'input': '',
114 | 'output': response,
115 | })
116 | print(f'#out: {len(new_content)}')
117 | json_dump(new_content, out_file)
118 |
119 |
120 | if __name__ == '__main__':
121 |
122 | data_path = '/home/robin/prompt_data/100PoisonMpts/train.jsonl'
123 | out_path = '/home/robin/prompt_data/100PoisonMpts/train_alpaca.jsonl'
124 | convert_100PoisonMpts(data_path, out_file=out_path)
125 |
126 | data_path = '/home/robin/prompt_data/CValues-Comparison/test.jsonl'
127 | out_path = '/home/robin/prompt_data/CValues-Comparison/test_alpaca.json'
128 | convert_Cvalues(data_path, out_file=out_path)
129 |
130 | data_path = '/home/robin/prompt_data/CValues-Comparison/train.jsonl'
131 | out_path = '/home/robin/prompt_data/CValues-Comparison/train_alpaca.json'
132 | convert_Cvalues(data_path, out_file=out_path)
133 |
134 | data_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_sft_data_v1.jsonl'
135 | out_path = '/home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json'
136 | convert_huatuogpt(data_path, out_file=out_path)
137 |
138 | data_path = '/home/robin/prompt_data/Safety-Prompts/instruction_attack_scenarios.json'
139 | out_path = '/home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json'
140 | convert_safety_attack(data_path, out_file=out_path)
141 |
142 | data_path = '/home/robin/prompt_data/Safety-Prompts/typical_safety_scenarios.json'
143 | out_path = '/home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json'
144 | convert_safety_scenarios(data_path, out_file=out_path)
145 |
--------------------------------------------------------------------------------
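All of the converters above drop pairs where either side is five characters or shorter and emit records in the Alpaca instruction format; a single converted record looks like this (content is purely illustrative):

example = {
    'instruction': 'Explain what machine learning is.',
    'input': '',
    'output': 'Machine learning is ...',
}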
/source/model/llama2/data/dataset_info.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 |
4 | def get_dataset_info(dataset_dir):
5 | """
6 |     Returns a mapping from dataset names to their Hugging Face Hub IDs and/or
7 |     local file paths, along with per-dataset options (multi-turn flag, column mappings).
8 |
9 |     Args:
10 |         dataset_dir (str): The local directory where locally stored datasets are kept.
11 |
12 |     Returns:
13 |         dict: The dataset info dictionary for all supported datasets.
14 | """
15 | dataset_info = {
16 | 'alpaca': {
17 | 'hf_hub_url': 'tatsu-lab/alpaca',
18 | 'local_path': 'tatsu-lab/alpaca/alpaca.json',
19 | 'multi_turn': False
20 | },
21 | 'alpaca-clean': {
22 | 'hf_hub_url': 'yahma/alpaca-cleaned',
23 | 'local_path': '',
24 | 'multi_turn': False
25 | },
26 | 'chip2': {
27 | 'hf_hub_url': 'laion/OIG',
28 | 'local_path': '',
29 | 'multi_turn': False
30 | },
31 | 'self-instruct': {
32 | 'hf_hub_url': 'yizhongw/self_instruct',
33 | 'local_path': '',
34 | 'multi_turn': False
35 | },
36 | 'guanaco': {
37 | 'hf_hub_url': 'JosephusCheung/GuanacoDataset',
38 | 'local_path': '',
39 | 'multi_turn': False
40 | },
41 | 'hh-rlhf': {
42 | 'hf_hub_url': 'Anthropic/hh-rlhf',
43 | 'local_path': '',
44 | 'multi_turn': False
45 | },
46 | 'longformer': {
47 | 'hf_hub_url': 'akoksal/LongForm',
48 | 'local_path': '',
49 | 'multi_turn': False
50 | },
51 | 'openassistant-guanaco': {
52 | 'hf_hub_url':
53 | 'timdettmers/openassistant-guanaco',
54 | 'local_path':
55 | join(dataset_dir,
56 | 'timdettmers/openassistant_best_replies_train.jsonl'),
57 | 'multi_turn':
58 | False
59 | },
60 | 'evol_instruct': {
61 | 'hf_hub_url':
62 | 'WizardLM/WizardLM_evol_instruct_V2_196k',
63 | 'local_path':
64 | join(dataset_dir, 'WizardLM/WizardLM_evol_instruct_V2_143k.json'),
65 | 'multi_turn':
66 | False
67 | },
68 | 'dolly-15k': {
69 | 'hf_hub_url': 'databricks/databricks-dolly-15k',
70 | 'local_path': join(dataset_dir, 'databricks/databricks-dolly-15k'),
71 | 'multi_turn': False
72 | },
73 | 'olcc': {
74 | 'hf_hub_url': 'yizhongw/olcc',
75 | 'local_path': join(dataset_dir, 'olcc/olcc_alpaca.json'),
76 | 'multi_turn': False
77 | },
78 | 'share_gpt': {
79 | 'hf_hub_url': '',
80 | 'local_path': join(dataset_dir, 'sharegpt/sharegpt_split.json'),
81 | 'multi_turn': True
82 | },
83 | '100PoisonMpts': {
84 | 'hf_hub_url': '',
85 | 'local_path': join(dataset_dir, '100PoisonMpts/train.jsonl'),
86 | 'multi_turn': False
87 | },
88 | 'belle_0.5m': {
89 | 'hf_hub_url': 'BelleGroup/train_0.5M_CN',
90 | 'local_path': '',
91 | 'multi_turn': False
92 | },
93 | 'belle_1m': {
94 | 'hf_hub_url': 'BelleGroup/train_1M_CN',
95 | 'local_path': '',
96 | 'multi_turn': False
97 | },
98 | 'belle_2m': {
99 | 'hf_hub_url': 'BelleGroup/train_2M_CN',
100 | 'local_path': '',
101 | 'multi_turn': False
102 | },
103 | 'belle_dialog': {
104 | 'hf_hub_url': 'BelleGroup/generated_chat_0.4M',
105 | 'local_path': '',
106 | 'multi_turn': False
107 | },
108 | 'belle_math': {
109 | 'hf_hub_url': 'BelleGroup/school_math_0.25M',
110 | 'local_path': '',
111 | 'multi_turn': False
112 | },
113 | 'belle_multiturn': {
114 | 'hf_hub_url': 'BelleGroup/multi_turn_0.5M',
115 | 'local_path': '',
116 | 'multi_turn': True,
117 | 'columns': {
118 | 'prompt': 'instruction',
119 | 'query': '',
120 | 'response': 'output',
121 | 'history': 'history'
122 | }
123 | },
124 | 'firefly': {
125 | 'hf_hub_url': 'YeungNLP/firefly-train-1.1M',
126 | 'local_path': '',
127 | 'multi_turn': False,
128 | 'columns': {
129 | 'prompt': 'input',
130 | 'query': '',
131 | 'response': 'target',
132 | 'history': ''
133 | }
134 | },
135 | 'codealpaca': {
136 | 'hf_hub_url': 'sahil2801/CodeAlpaca-20k',
137 | 'local_path': '',
138 | 'multi_turn': False
139 | },
140 | 'alpaca_cot': {
141 | 'hf_hub_url': 'QingyiSi/Alpaca-CoT',
142 | 'local_path': '',
143 | 'multi_turn': False
144 | },
145 | 'webqa': {
146 | 'hf_hub_url': 'suolyer/webqa',
147 | 'local_path': '',
148 | 'multi_turn': False,
149 | 'columns': {
150 | 'prompt': 'input',
151 | 'query': '',
152 | 'response': 'output',
153 | 'history': ''
154 | }
155 | },
156 | 'novel_tokens512_50k': {
157 | 'hf_hub_url': 'zxbsmk/webnovel_cn',
158 | 'local_path': '',
159 | 'multi_turn': False
160 | }
161 | }
162 |
163 | return dataset_info
164 |
--------------------------------------------------------------------------------
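A small lookup sketch for get_dataset_info(); the import path and dataset directory are hypothetical and depend on how the module sits on your PYTHONPATH:

from dataset_info import get_dataset_info

info = get_dataset_info('/path/to/datasets')
print(info['olcc']['local_path'])   # /path/to/datasets/olcc/olcc_alpaca.json
print(info['olcc']['multi_turn'])   # False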
/retrieval_contriever/src/contriever.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
3 | import os
4 | import torch
5 | import transformers
6 | from transformers import BertModel, XLMRobertaModel
7 |
8 | from retrieval_contriever.src import utils
9 |
10 |
11 | class Contriever(BertModel):
12 | def __init__(self, config, pooling="average", **kwargs):
13 | super().__init__(config, add_pooling_layer=False)
14 | if not hasattr(config, "pooling"):
15 | self.config.pooling = pooling
16 |
17 | def forward(
18 | self,
19 | input_ids=None,
20 | attention_mask=None,
21 | token_type_ids=None,
22 | position_ids=None,
23 | head_mask=None,
24 | inputs_embeds=None,
25 | encoder_hidden_states=None,
26 | encoder_attention_mask=None,
27 | output_attentions=None,
28 | output_hidden_states=None,
29 | normalize=False,
30 | ):
31 |
32 | model_output = super().forward(
33 | input_ids=input_ids,
34 | attention_mask=attention_mask,
35 | token_type_ids=token_type_ids,
36 | position_ids=position_ids,
37 | head_mask=head_mask,
38 | inputs_embeds=inputs_embeds,
39 | encoder_hidden_states=encoder_hidden_states,
40 | encoder_attention_mask=encoder_attention_mask,
41 | output_attentions=output_attentions,
42 | output_hidden_states=output_hidden_states,
43 | )
44 |
45 | last_hidden = model_output["last_hidden_state"]
46 | last_hidden = last_hidden.masked_fill(~attention_mask[..., None].bool(), 0.0)
47 |
48 | if self.config.pooling == "average":
49 | emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
50 | elif self.config.pooling == "cls":
51 | emb = last_hidden[:, 0]
52 |
53 | if normalize:
54 | emb = torch.nn.functional.normalize(emb, dim=-1)
55 | return emb
56 |
57 |
58 | class XLMRetriever(XLMRobertaModel):
59 | def __init__(self, config, pooling="average", **kwargs):
60 | super().__init__(config, add_pooling_layer=True)
61 | if not hasattr(config, "pooling"):
62 | self.config.pooling = pooling
63 |
64 | def forward(
65 | self,
66 | input_ids=None,
67 | attention_mask=None,
68 | token_type_ids=None,
69 | position_ids=None,
70 | head_mask=None,
71 | inputs_embeds=None,
72 | encoder_hidden_states=None,
73 | encoder_attention_mask=None,
74 | output_attentions=None,
75 | output_hidden_states=None,
76 | normalize=False,
77 | ):
78 |
79 | model_output = super().forward(
80 | input_ids=input_ids,
81 | attention_mask=attention_mask,
82 | token_type_ids=token_type_ids,
83 | position_ids=position_ids,
84 | head_mask=head_mask,
85 | inputs_embeds=inputs_embeds,
86 | encoder_hidden_states=encoder_hidden_states,
87 | encoder_attention_mask=encoder_attention_mask,
88 | output_attentions=output_attentions,
89 | output_hidden_states=output_hidden_states,
90 | )
91 |
92 | last_hidden = model_output["last_hidden_state"]
93 | last_hidden = last_hidden.masked_fill(~attention_mask[..., None].bool(), 0.0)
94 | if self.config.pooling == "average":
95 | emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
96 | elif self.config.pooling == "cls":
97 | emb = last_hidden[:, 0]
98 | if normalize:
99 | emb = torch.nn.functional.normalize(emb, dim=-1)
100 | return emb
101 |
102 |
103 | def load_retriever(model_path, pooling="average", random_init=False):
104 | # try: check if model exists locally
105 | path = os.path.join(model_path, "checkpoint.pth")
106 | if os.path.exists(path):
107 | pretrained_dict = torch.load(path, map_location="gpu")
108 | opt = pretrained_dict["opt"]
109 |         # Determine which base encoder the checkpoint was trained from.
110 |         if hasattr(opt, "retriever_model_id"):
111 |             retriever_model_id = opt.retriever_model_id
112 |         else:
113 |             # Older checkpoints do not record the encoder id, so fall back to
114 |             # the multilingual BERT encoder.
115 |             # retriever_model_id = "bert-base-uncased"
116 |             retriever_model_id = "bert-base-multilingual-cased"
117 | tokenizer = utils.load_hf(transformers.AutoTokenizer, retriever_model_id)
118 | cfg = utils.load_hf(transformers.AutoConfig, retriever_model_id)
119 | if "xlm" in retriever_model_id:
120 | model_class = XLMRetriever
121 | else:
122 | model_class = Contriever
123 | retriever = model_class(cfg)
124 | pretrained_dict = pretrained_dict["model"]
125 |
126 | if any("encoder_q." in key for key in pretrained_dict.keys()): # test if model is defined with moco class
127 | pretrained_dict = {k.replace("encoder_q.", ""): v for k, v in pretrained_dict.items() if "encoder_q." in k}
128 | elif any("encoder." in key for key in pretrained_dict.keys()): # test if model is defined with inbatch class
129 | pretrained_dict = {k.replace("encoder.", ""): v for k, v in pretrained_dict.items() if "encoder." in k}
130 | retriever.load_state_dict(pretrained_dict, strict=False)
131 | else:
132 | retriever_model_id = model_path
133 | if "xlm" in retriever_model_id:
134 | model_class = XLMRetriever
135 | else:
136 | model_class = Contriever
137 | cfg = utils.load_hf(transformers.AutoConfig, model_path)
138 | tokenizer = utils.load_hf(transformers.AutoTokenizer, model_path)
139 | retriever = utils.load_hf(model_class, model_path)
140 |
141 | return retriever, tokenizer, retriever_model_id
142 |
--------------------------------------------------------------------------------
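A hedged usage sketch for load_retriever(): model_path can be a directory containing a MoCo/in-batch checkpoint.pth or a plain Hugging Face identifier. "facebook/contriever" is the public checkpoint this class mirrors and is used here only as an example; the import path follows the usage in generate_passage_embeddings.py:

import torch

from src.contriever import load_retriever

model, tokenizer, _ = load_retriever("facebook/contriever")
model.eval()

batch = tokenizer(["Where was Marie Curie born?"],
                  padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    emb = model(**batch)        # mean-pooled ("average") sentence embedding
print(emb.shape)                # (1, hidden_size)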
/source/model/flan-t5/flan_seq2seq.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import numpy as np
4 | import pandas as pd
5 | import pickle
6 | import nltk
7 |
8 | nltk.download("punkt")
9 |
10 | from transformers import (
11 | AutoTokenizer,
12 | AutoModelForSeq2SeqLM,
13 | DataCollatorForSeq2Seq,
14 | Seq2SeqTrainingArguments,
15 | Seq2SeqTrainer,
16 | )
17 | from peft import (
18 | get_peft_model,
19 | TaskType,
20 | LoraConfig,
21 | PrefixTuningConfig,
22 | )
23 |
24 | from utils import get_data
25 |
26 |
27 | def main(args):
28 | model_name_or_path = args.pretrained_ckpt
29 |
30 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
31 |
32 | # loading dataset
33 | dataset, max_source_length, max_target_length = get_data(tokenizer)
34 |
35 | def preprocess_function(sample, padding="max_length"):
36 | # add prefix to the input for t5
37 | inputs = ["query: " + item for item in sample["question"]]
38 |
39 | # tokenize inputs
40 | model_inputs = tokenizer(
41 | inputs, max_length=max_source_length, padding=padding, truncation=True
42 | )
43 |
44 | labels = tokenizer(
45 | text_target=sample["answer"],
46 | max_length=max_target_length,
47 | padding=padding,
48 | truncation=True,
49 | )
50 |
51 | if padding == "max_length":
52 | labels["input_ids"] = [
53 | [(l if l != tokenizer.pad_token_id else -100) for l in label]
54 | for label in labels["input_ids"]
55 | ]
56 |
57 | model_inputs["labels"] = labels["input_ids"]
58 | return model_inputs
59 |
60 | tokenized_dataset = dataset.map(
61 | preprocess_function, batched=True, remove_columns=["question", "answer"]
62 | )
63 | print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
64 |
65 | print("Getting PEFT method")
66 |     results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}"  # default for plain "sft"; overridden below
67 | if args.peft_method == "lora":
68 | peft_config = LoraConfig(
69 | task_type=TaskType.SEQ_2_SEQ_LM,
70 | inference_mode=False,
71 | r=args.lora_r,
72 | lora_alpha=32,
73 | lora_dropout=args.dropout,
74 | target_modules=["q", "v"],
75 | )
76 | results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}_r-{args.lora_r}_dropout-{args.dropout}"
77 |
78 | elif args.peft_method == "prefix":
79 | peft_config = PrefixTuningConfig(
80 | task_type=TaskType.SEQ_2_SEQ_LM,
81 | inference_mode=False,
82 | num_virtual_tokens=args.prefix_tokens,
83 | prefix_projection=True if args.prefix_projection else False,
84 | )
85 | results_dir = f"experiments/summarization_{args.peft_method}_epochs-{args.epochs}_prefixTokens-{args.prefix_tokens}_useProjection-{args.prefix_projection}"
86 |
87 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
88 | if args.peft_method != "sft":
89 | model = get_peft_model(model, peft_config)
90 |         model.print_trainable_parameters()  # prints the summary itself; returns None
91 |
92 | # Define training args
93 | training_args = Seq2SeqTrainingArguments(
94 | do_train=True,
95 | do_eval=True,
96 | evaluation_strategy="epoch",
97 | logging_strategy="epoch",
98 | save_strategy="no",
99 | per_device_eval_batch_size=8,
100 | per_device_train_batch_size=8,
101 | gradient_accumulation_steps=1,
102 | output_dir=results_dir,
103 | auto_find_batch_size=True,
104 | learning_rate=1e-3,
105 | num_train_epochs=args.epochs,
106 | logging_dir=f"{results_dir}/logs",
107 | report_to="none",
108 | )
109 |
110 | # we want to ignore tokenizer pad token in the loss
111 | label_pad_token_id = -100
112 | # Data collator
113 | data_collator = DataCollatorForSeq2Seq(
114 | tokenizer,
115 | model=model,
116 | label_pad_token_id=label_pad_token_id,
117 | pad_to_multiple_of=8,
118 | )
119 |
120 | print(f"training_args = {training_args}")
121 | # Create Trainer instance
122 | trainer = Seq2SeqTrainer(
123 | model=model,
124 | args=training_args,
125 | train_dataset=tokenized_dataset["train"],
126 | eval_dataset=tokenized_dataset["validation"],
127 | data_collator=data_collator,
128 | )
129 | model.config.use_cache = False
130 |
131 | trainer_stats = trainer.train()
132 | train_loss = trainer_stats.training_loss
133 | eval_stats = trainer.evaluate()
134 | eval_loss = eval_stats["eval_loss"]
135 | print(f"Training loss:{train_loss}|Val loss:{eval_loss}")
136 |
137 | peft_model_id = f"{results_dir}/assets"
138 | trainer.model.save_pretrained(peft_model_id)
139 | tokenizer.save_pretrained(peft_model_id)
140 |
141 | with open(f"{results_dir}/results.pkl", "wb") as handle:
142 | run_result = [
143 | args.epochs,
144 | args.prefix_tokens,
145 | args.prefix_projection,
146 | train_loss,
147 | eval_loss,
148 | ]
149 | pickle.dump(run_result, handle)
150 | print("Experiment over")
151 |
152 |
153 | if __name__ == "__main__":
154 | parser = argparse.ArgumentParser()
155 | parser.add_argument("--pretrained_ckpt", default="google/flan-t5-large")
156 | parser.add_argument("--peft_method", default="sft")
157 | parser.add_argument("--lora_r", default=16, type=int)
158 | parser.add_argument("--epochs", default=1, type=int)
159 | parser.add_argument("--prefix_tokens", default=20, type=int)
160 | parser.add_argument("--prefix_projection", default=1, type=int)
161 | parser.add_argument("--dropout", default=0.1, type=float)
162 | parser.add_argument("--p_tokens", default=20, type=int)
163 | parser.add_argument("--p_hidden", default=100, type=int)
164 | parser.add_argument("--prompt_tokens", default=20, type=int)
165 |
166 | args = parser.parse_args()
167 | main(args)
168 |
--------------------------------------------------------------------------------
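A tiny illustration of the label masking done in preprocess_function above: pad positions in the labels are replaced by -100 so the seq2seq cross-entropy loss ignores them (T5 tokenizers use pad id 0):

pad_token_id = 0
label = [4, 17, 250, pad_token_id, pad_token_id]
print([(l if l != pad_token_id else -100) for l in label])   # [4, 17, 250, -100, -100]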
/source/model/llama2/examples/finetune_llm/finetune_llama_with_qlora.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import torch
4 | import transformers
5 | from datasets import load_dataset
6 | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
7 | from transformers import (AutoModelForCausalLM, AutoTokenizer,
8 | BitsAndBytesConfig, DataCollatorForLanguageModeling,
9 | LlamaTokenizer, Trainer, TrainingArguments)
10 |
11 | DEFAULT_PAD_TOKEN = '[PAD]'
12 | DEFAULT_EOS_TOKEN = '</s>'
13 | DEFAULT_BOS_TOKEN = '<s>'
14 | DEFAULT_UNK_TOKEN = '<unk>'
15 |
16 |
17 | def print_trainable_parameters(model: AutoModelForCausalLM) -> None:
18 | """
19 | Prints the number of trainable parameters in the model.
20 | """
21 | trainable_params, all_param = 0, 0
22 | for _, param in model.named_parameters():
23 | all_param += param.numel()
24 | if param.requires_grad:
25 | trainable_params += param.numel()
26 |     print(f'trainable params: {trainable_params} || '
27 |           f'all params: {all_param} || '
28 |           f'trainable%: {100 * trainable_params / all_param}')
29 |
30 |
31 | def smart_tokenizer_and_embedding_resize(
32 | special_tokens_dict: Dict,
33 | tokenizer: transformers.PreTrainedTokenizer,
34 | model: transformers.PreTrainedModel,
35 | ):
36 | """Resize tokenizer and embedding.
37 |
38 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
39 | """
40 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
41 | model.resize_token_embeddings(len(tokenizer))
42 |
43 | if num_new_tokens > 0:
44 | input_embeddings = model.get_input_embeddings().weight.data
45 | output_embeddings = model.get_output_embeddings().weight.data
46 |
47 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
48 | dim=0, keepdim=True)
49 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
50 | dim=0, keepdim=True)
51 |
52 | input_embeddings[-num_new_tokens:] = input_embeddings_avg
53 | output_embeddings[-num_new_tokens:] = output_embeddings_avg
54 |
55 |
56 | if __name__ == '__main__':
57 | model_id = 'decapoda-research/llama-7b-hf'
58 | bnb_config = BitsAndBytesConfig(
59 | load_in_4bit=True,
60 | bnb_4bit_use_double_quant=True,
61 | bnb_4bit_quant_type='nf4',
62 | bnb_4bit_compute_dtype=torch.bfloat16,
63 | )
64 | """
65 | - load_in_4bit: The model will be loaded in the memory with 4-bit precision.
66 | - bnb_4bit_use_double_quant: We will do the double quantization proposed by QLoRa.
67 | - bnb_4bit_quant_type: This is the type of quantization. “nf4” stands for 4-bit NormalFloat.
68 | - bnb_4bit_compute_dtype: While we load and store the model in 4-bit,
69 | we will partially dequantize it when needed and do all the computations with a 16-bit precision (bfloat16).
70 | """
71 | # So now we can load the model in 4-bit:
72 | model = AutoModelForCausalLM.from_pretrained(
73 | model_id, quantization_config=bnb_config, device_map={'': 0})
74 |
75 | # Then, we enable gradient checkpointing, to reduce the memory footprint of the model:
76 | model.gradient_checkpointing_enable()
77 | # Then, we load the tokenizer:
78 | if model.config.model_type == 'llama':
79 | # Due to the name of Transformers' LlamaTokenizer, we have to do this
80 | tokenizer = LlamaTokenizer.from_pretrained(
81 | model_id,
82 | padding_side='right',
83 | use_fast=True,
84 | )
85 | else:
86 | tokenizer = AutoTokenizer.from_pretrained(
87 | model_id,
88 | padding_side='right',
89 | use_fast=True,
90 | )
91 | # Preprocessing the GPT model for LoRa
92 | model = prepare_model_for_kbit_training(model)
93 | # This is where we use PEFT. We prepare the model for LoRa, adding trainable adapters for each layer.
94 | config = LoraConfig(
95 | r=8,
96 | lora_alpha=32,
97 | target_modules=['q_proj', 'v_proj'],
98 | lora_dropout=0.05,
99 | bias='none',
100 | task_type='CAUSAL_LM',
101 | )
102 | # We can now add the adapters to the model:
103 | model = get_peft_model(model, config)
104 | # We can now print the number of trainable parameters in the model:
105 | print_trainable_parameters(model)
106 |
107 | # Get your dataset ready
108 | # For this demo, I use the “english_quotes” dataset. This is a dataset made of famous quotes distributed under a CC BY 4.0 license.
109 | data = load_dataset('Abirate/english_quotes')
110 | data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)
111 |
112 |     # Add special tokens to the tokenizer if missing (ideally before the dataset is tokenized above).
113 | special_tokens_dict: Dict[str, str] = {}
114 | if tokenizer.pad_token is None:
115 | special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
116 | if tokenizer.eos_token is None:
117 | special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
118 | if tokenizer.bos_token is None:
119 | special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
120 | if tokenizer.unk_token is None:
121 | special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN
122 |
123 | smart_tokenizer_and_embedding_resize(
124 | special_tokens_dict=special_tokens_dict,
125 | tokenizer=tokenizer,
126 | model=model,
127 | )
128 |
129 | trainer = Trainer(
130 | model=model,
131 | train_dataset=data['train'],
132 | args=TrainingArguments(
133 | per_device_train_batch_size=4,
134 | gradient_accumulation_steps=8,
135 | warmup_steps=2,
136 | max_steps=1000,
137 | learning_rate=2e-4,
138 | fp16=True,
139 | logging_steps=1,
140 | output_dir='outputs',
141 | optim='paged_adamw_8bit',
142 | ),
143 | data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
144 | )
145 | model.config.use_cache = False # silence the warnings. Please re-enable for inference!
146 | trainer.train()
147 |
--------------------------------------------------------------------------------