├── python └── llm │ ├── .gitignore │ ├── version.txt │ ├── portable-zip │ ├── .gitignore │ ├── chat.bat │ ├── setup.md │ └── README-ui.md │ ├── example │ ├── CPU │ │ ├── Speculative-Decoding │ │ │ ├── EAGLE │ │ │ │ └── requirements.txt │ │ │ ├── README.md │ │ │ └── Self-Speculation │ │ │ │ └── README.md │ │ ├── QLoRA-FineTuning │ │ │ └── alpaca-qlora │ │ │ │ ├── templates │ │ │ │ ├── alpaca_short.json │ │ │ │ ├── alpaca.json │ │ │ │ ├── alpaca_legacy.json │ │ │ │ └── vigogne.json │ │ │ │ └── finetune_one_node_two_sockets.sh │ │ ├── PyTorch-Models │ │ │ ├── README.md │ │ │ └── Model │ │ │ │ ├── README.md │ │ │ │ └── yuan2 │ │ │ │ └── yuan2-2B-instruct │ │ │ │ └── config.json │ │ ├── HF-Transformers-AutoModels │ │ │ ├── README.md │ │ │ └── Model │ │ │ │ ├── README.md │ │ │ │ └── yuan2 │ │ │ │ └── yuan2-2B-instruct │ │ │ │ └── config.json │ │ ├── Deepspeed-AutoTP │ │ │ ├── run.sh │ │ │ └── install.sh │ │ └── Applications │ │ │ └── streaming-llm │ │ │ └── streaming_llm │ │ │ └── __init__.py │ └── GPU │ │ ├── Speculative-Decoding │ │ ├── EAGLE │ │ │ └── requirements.txt │ │ └── README.md │ │ ├── vLLM-Serving │ │ ├── fp8_kv.png │ │ └── max_length.png │ │ ├── Deepspeed-AutoTP-FastAPI │ │ └── prompt │ │ │ └── 32.txt │ │ ├── Pipeline-Parallel-Serving │ │ └── prompt │ │ │ ├── 32.txt │ │ │ └── 128.txt │ │ ├── HuggingFace │ │ ├── Multimodal │ │ │ └── README.md │ │ ├── LLM │ │ │ ├── README.md │ │ │ └── yuan2 │ │ │ │ └── yuan2-2B-instruct │ │ │ │ └── config.json │ │ └── README.md │ │ ├── LLM-Finetuning │ │ ├── LoRA │ │ │ ├── chatglm_finetune │ │ │ │ ├── deepspeed_config.json │ │ │ │ ├── lora_finetuning_chatglm3_6b_on_alpaca_with_1_arc_card.sh │ │ │ │ ├── lora_finetuning_chatglm3_6b_on_advertise_gen_with_1_arc_card.sh │ │ │ │ ├── lora_finetuning_chatglm3_6b_on_alpaca_with_2_arc_cards.sh │ │ │ │ └── lora_finetuning_chatglm3_6b_on_advertise_gen_with_2_arc_cards.sh │ │ │ ├── deepspeed_zero3_config.json │ │ │ ├── lora_finetune_llama2_7b_arc_1_card.sh │ │ │ ├── lora_finetune_llama2_7b_pvc_1110_4_card.sh │ │ │ ├── lora_finetune_llama2_7b_pvc_1550_4_card.sh │ │ │ └── lora_finetune_llama2_7b_pvc_1550_1_tile.sh │ │ ├── QLoRA │ │ │ ├── alpaca-qlora │ │ │ │ ├── deepspeed_zero2.json │ │ │ │ ├── deepspeed_zero3.json │ │ │ │ ├── qlora_finetune_gemma_2b_arc_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_arc_1_card.sh │ │ │ │ ├── qlora_finetune_qwen15_7b_arc_1_card.sh │ │ │ │ ├── qlora_finetune_baichuan2_7b_arc_1_card.sh │ │ │ │ ├── qlora_finetune_llama3_8b_arc_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_arc_2_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_flex_170_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_pvc_1100_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_13b_pvc_1550_1_tile.sh │ │ │ │ ├── qlora_finetune_chatglm3_6b_arc_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_pvc_1100_4_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_pvc_1550_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_7b_pvc_1550_4_card.sh │ │ │ │ ├── qlora_finetune_llama2_13b_pvc_1550_1_card.sh │ │ │ │ ├── qlora_finetune_llama2_13b_pvc_1550_4_card.sh │ │ │ │ └── qlora_finetune_llama2_7b_flex_170_3_card.sh │ │ │ └── README.md │ │ ├── common │ │ │ ├── templates │ │ │ │ ├── alpaca_short.json │ │ │ │ ├── alpaca.json │ │ │ │ ├── alpaca_legacy.json │ │ │ │ └── vigogne.json │ │ │ └── utils │ │ │ │ └── __init__.py │ │ ├── axolotl │ │ │ ├── default_config.yaml │ │ │ └── requirements-xpu.txt │ │ ├── HF-PEFT │ │ │ └── alpaca-lora │ │ │ │ └── templates │ │ │ │ ├── alpaca_short.json │ │ │ │ ├── alpaca.json │ │ │ │ ├── alpaca_legacy.json │ │ │ │ └── vigogne.json │ │ 
├── ReLora │ │ │ ├── relora_finetune_llama2_7b_arc_1_card.sh │ │ │ ├── relora_finetune_llama2_7b_arc_2_card.sh │ │ │ ├── relora_finetune_llama2_7b_pvc_1550_1_card.sh │ │ │ └── relora_finetune_llama2_7b_pvc_1550_4_card.sh │ │ └── QA-LoRA │ │ │ └── qalora_finetune_llama2_7b_arc_1_card.sh │ │ └── PyTorch-Models │ │ ├── Model │ │ ├── README.md │ │ └── yuan2 │ │ │ └── yuan2-2B-instruct │ │ │ └── config.json │ │ └── README.md │ ├── dev │ └── benchmark │ │ ├── all-in-one │ │ ├── prompt │ │ │ └── continuation │ │ │ │ ├── 32.txt │ │ │ │ └── 256.txt │ │ ├── run-igpu.sh │ │ ├── run-spr.sh │ │ ├── run-arc.sh │ │ ├── run-max-gpu.sh │ │ ├── run-pipeline-parallel-arc.sh │ │ ├── run-deepspeed-arc.sh │ │ ├── run-deepspeed-pvc.sh │ │ └── run-deepspeed-spr.sh │ │ ├── LongBench │ │ ├── test_and_eval.sh │ │ ├── config │ │ │ ├── ablation_c512_w32_k7_maxpool.json │ │ │ ├── ablation_c1024_w32_k7_maxpool.json │ │ │ ├── ablation_c2048_w32_k7_maxpool.json │ │ │ ├── ablation_c4096_w32_k7_maxpool.json │ │ │ ├── dataset2maxlen.json │ │ │ ├── model2maxlen.json │ │ │ └── model2path.json │ │ └── config.yaml │ │ └── ceval │ │ ├── run.sh │ │ └── evaluators │ │ └── evaluator.py │ ├── src │ └── ipex_llm │ │ ├── cli │ │ └── prompts │ │ │ └── chat-with-llm.txt │ │ ├── transformers │ │ ├── load_config.yaml │ │ ├── models │ │ │ └── __init__.py │ │ ├── gguf │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── model_implement │ │ │ │ │ ├── baichuan │ │ │ │ │ └── __init__.py │ │ │ │ │ └── yuan2 │ │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── npu_models │ │ │ └── __init__.py │ │ ├── npu_pipeline_model │ │ │ └── __init__.py │ │ ├── awq │ │ │ └── __init__.py │ │ └── patches.py │ │ ├── vllm │ │ ├── __init__.py │ │ ├── cpu │ │ │ ├── __init__.py │ │ │ ├── engine │ │ │ │ └── __init__.py │ │ │ ├── ipex_llm_v1_wrapper.py │ │ │ └── ipex_llm_wrapper.py │ │ └── xpu │ │ │ ├── __init__.py │ │ │ ├── ipex_llm_v1_wrapper.py │ │ │ ├── engine │ │ │ └── __init__.py │ │ │ └── ipex_llm_wrapper.py │ │ ├── serving │ │ ├── __init__.py │ │ ├── fastchat │ │ │ └── __init__.py │ │ └── fastapi │ │ │ └── __init__.py │ │ ├── utils │ │ ├── modules.py │ │ └── common │ │ │ └── __init__.py │ │ ├── gptq │ │ ├── __init__.py │ │ └── convert │ │ │ └── __init__.py │ │ ├── ggml │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── bloom │ │ │ └── __init__.py │ │ │ ├── generation │ │ │ └── __init__.py │ │ │ ├── starcoder │ │ │ └── __init__.py │ │ │ ├── llama │ │ │ └── __init__.py │ │ │ └── gptneox │ │ │ └── __init__.py │ │ ├── langchain │ │ ├── __init__.py │ │ └── vllm │ │ │ └── __init__.py │ │ ├── llamaindex │ │ └── __init__.py │ │ └── models.py │ ├── test │ ├── benchmark │ │ ├── perplexity │ │ │ └── fp16.csv │ │ ├── harness │ │ │ └── fp16.csv │ │ ├── igpu-perf │ │ │ ├── 1024-128_436.yaml │ │ │ ├── 32-32_int4_fp16_436.yaml │ │ │ ├── 2048-256_int4_fp16_436.yaml │ │ │ ├── 1024-128_int4_fp16_436.yaml │ │ │ ├── 3072-384_int4_fp16_436.yaml │ │ │ ├── 1024-128_int4_fp16_loadlowbit_436.yaml │ │ │ ├── 1024-128_438.yaml │ │ │ ├── 32-32_int4_fp16_438.yaml │ │ │ ├── 1024-128_445.yaml │ │ │ ├── 2048-256_int4_fp16_438.yaml │ │ │ ├── 3072-384_int4_fp16_438.yaml │ │ │ ├── 4096-512_int4_fp16_438.yaml │ │ │ ├── 1024-128_int4_fp16_438.yaml │ │ │ ├── 32-32_int4_fp16_445.yaml │ │ │ ├── 2048-256_int4_fp16_445.yaml │ │ │ ├── 3072-384_int4_fp16_445.yaml │ │ │ ├── 4096-512_int4_fp16_445.yaml │ │ │ ├── 1024-128_443.yaml │ │ │ ├── 1024-128_int4_fp16_445.yaml │ │ │ ├── 1024-128_int4_fp16_loadlowbit_438.yaml │ │ │ ├── 32-32_int4_fp16_443.yaml │ │ │ ├── 1024-128_int4_fp16_loadlowbit_445.yaml │ │ │ ├── 
2048-256_int4_fp16_443.yaml │ │ │ ├── 3072-384_int4_fp16_443.yaml │ │ │ ├── 4096-512_int4_fp16_443.yaml │ │ │ ├── 1024-128_int4_fp16_443.yaml │ │ │ ├── 1024-128_int4_fp16_loadlowbit_443.yaml │ │ │ ├── 3072-384_int4_fp16.yaml │ │ │ ├── 4096-512_int4_fp16.yaml │ │ │ ├── 1024-128_int4_fp16_loadlowbit.yaml │ │ │ └── 32-32_int4_fp16.yaml │ │ ├── arc-perf-transformers-436.yaml │ │ ├── arc-perf-transformers-443.yaml │ │ ├── arc-perf-transformers-436-batch2.yaml │ │ ├── arc-perf-transformers-443-batch2.yaml │ │ ├── arc-perf-transformers-443-batch4.yaml │ │ ├── arc-perf-transformers-440.yaml │ │ ├── arc-perf-transformers-445.yaml │ │ ├── arc-perf-transformers-445-batch2.yaml │ │ ├── arc-perf-transformers-445-batch4.yaml │ │ ├── arc-perf-transformers-436-batch4.yaml │ │ ├── stable-version-arc-stress-test-sym_int4.yaml │ │ ├── stable-version-arc-stress-test-fp8.yaml │ │ ├── stable-version-cpu-stress-test.yaml │ │ ├── stable-version-cpu-perf-test.yaml │ │ ├── cpu-perf-test.yaml │ │ └── core-perf-test.yaml │ ├── run-llm-install-tests.sh │ ├── run-llm-langchain-tests.sh │ ├── run-llm-llamaindex-tests.sh │ ├── __init__.py │ ├── run-llm-inference-tests.sh │ ├── run-llm-check-function.sh │ ├── run-llm-llamaindex-tests-gpu.sh │ ├── run-llm-langchain-tests-gpu.sh │ ├── install │ │ └── test_install.py │ └── run-llm-convert-tests.sh │ ├── scripts │ └── env-check.bat │ └── tpp │ ├── README.md │ └── licenses │ ├── LICENSE-go-isatty.txt │ ├── LICENSE-tablewriter.txt │ ├── LICENSE-hm.txt │ ├── LICENSE-gin-contrib-cors.txt │ ├── LICENSE-go-urn.txt │ ├── LICENSE-gorgonia.org-vecf32.txt │ ├── LICENSE-mimetype.txt │ ├── LICENSE-uniseg.txt │ ├── LICENSE-d4l3k-go-bfloat16.txt │ ├── LICENSE-gorgonia.org-vecf64.txt │ ├── LICENSE-validator.txt │ ├── LICENSE-gin.txt │ ├── LICENSE-go-runewidth.txt │ ├── LICENSE-sse.txt │ ├── LICENSE-go-playground-locales.txt │ ├── LICENSE-agnivade-levenshtein.txt │ ├── LICENSE-gin-contrib-sse.txt │ ├── LICENSE-float16.txt │ ├── LICENSE-go-playground-universal-translator.txt │ ├── LICENSE-go-toml.txt │ ├── LICENSE-go-codec.txt │ └── LICENSE-zlib.txt ├── docker └── llm │ ├── serving │ ├── cpu │ │ ├── kubernetes │ │ │ ├── clean.sh │ │ │ └── models-pv.yaml │ │ └── docker │ │ │ ├── start-vllm-service.sh │ │ │ ├── model_adapter.py.patch │ │ │ └── start-notebook.sh │ └── xpu │ │ └── docker │ │ ├── setvars.sh │ │ ├── README.md │ │ └── start-lightweight_serving-service.sh │ ├── inference-cpp │ ├── start-open-webui.sh │ ├── start-ollama.sh │ └── start-llama-cpp.sh │ ├── finetune │ ├── qlora │ │ └── cpu │ │ │ └── kubernetes │ │ │ ├── templates │ │ │ ├── ipex-llm-finetuning-namespace.yaml │ │ │ ├── nfs-pvc.yaml │ │ │ └── nfs-pv.yaml │ │ │ ├── Chart.yaml │ │ │ └── values.yaml │ ├── lora │ │ └── cpu │ │ │ ├── kubernetes │ │ │ ├── templates │ │ │ │ ├── ipex-llm-lora-finetuning-namespace.yaml │ │ │ │ ├── nfs-pvc.yaml │ │ │ │ └── nfs-pv.yaml │ │ │ ├── Chart.yaml │ │ │ └── values.yaml │ │ │ └── docker │ │ │ └── requirements.txt │ └── xpu │ │ └── start-qlora-finetuning-on-xpu.sh │ └── sources │ ├── README.md │ └── Dockerfile ├── .github ├── CODEOWNERS └── ISSUE_TEMPLATE │ └── 🐛bug-report.md ├── docs └── mddocs │ ├── Overview │ ├── install.md │ └── KeyFeatures │ │ ├── gpu_supports.md │ │ └── README.md │ ├── PythonAPI │ └── README.md │ └── DockerGuides │ └── README.md ├── pyproject.toml ├── SECURITY.md ├── .readthedocs.yml ├── apps └── ipynb2py.sh └── .gitignore /python/llm/.gitignore: -------------------------------------------------------------------------------- 1 | libs/ 2 | 
-------------------------------------------------------------------------------- /python/llm/version.txt: -------------------------------------------------------------------------------- 1 | 2.3.0.dev0 2 | -------------------------------------------------------------------------------- /docker/llm/serving/cpu/kubernetes/clean.sh: -------------------------------------------------------------------------------- 1 | kubectl delete -f deployment.yaml -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | /.github/workflows/* @glorysdj @liu-shaojun @intel-analytics/CICD 2 | -------------------------------------------------------------------------------- /python/llm/portable-zip/.gitignore: -------------------------------------------------------------------------------- 1 | python-embed 2 | bigdl-llm.zip 3 | *.log 4 | *.json -------------------------------------------------------------------------------- /docker/llm/inference-cpp/start-open-webui.sh: -------------------------------------------------------------------------------- 1 | cd /llm/open-webui/backend 2 | bash start.sh > open-webui.log 3 | -------------------------------------------------------------------------------- /python/llm/example/CPU/Speculative-Decoding/EAGLE/requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.28.0 2 | anthropic==0.5.0 3 | wandb 4 | -------------------------------------------------------------------------------- /python/llm/example/GPU/Speculative-Decoding/EAGLE/requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.28.0 2 | anthropic==0.5.0 3 | wandb 4 | -------------------------------------------------------------------------------- /python/llm/example/GPU/vLLM-Serving/fp8_kv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/ipex-llm/HEAD/python/llm/example/GPU/vLLM-Serving/fp8_kv.png -------------------------------------------------------------------------------- /python/llm/example/GPU/vLLM-Serving/max_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/ipex-llm/HEAD/python/llm/example/GPU/vLLM-Serving/max_length.png -------------------------------------------------------------------------------- /docker/llm/serving/xpu/docker/setvars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export CCL_DG2_ALLREDUCE=1 4 | export LD_LIBRARY_PATH=/opt/intel/1ccl-wks/lib:$LD_LIBRARY_PATH 5 | -------------------------------------------------------------------------------- /docker/llm/finetune/qlora/cpu/kubernetes/templates/ipex-llm-finetuning-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: ipex-llm-qlora-finetuning 5 | -------------------------------------------------------------------------------- /docker/llm/finetune/lora/cpu/kubernetes/templates/ipex-llm-lora-finetuning-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: ipex-llm-lora-finetuning 5 | -------------------------------------------------------------------------------- 
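The Kubernetes namespace manifests just above (`ipex-llm-qlora-finetuning` / `ipex-llm-lora-finetuning`) and the `clean.sh` one-liner earlier in this dump only show fragments of the deployment flow. A minimal sketch of how they are typically wired together is given below; the Helm release name and the assumption that a rendered `deployment.yaml` exists for the serving case are illustrative and not taken from the repository.

```bash
# Sketch only: run from docker/llm/finetune/qlora/cpu/kubernetes/ (see the tree above).
# Assumes kubectl and helm are installed; the release name "ipex-llm-qlora" is made up here.

# 1. Create the namespace defined in templates/ipex-llm-finetuning-namespace.yaml
kubectl apply -f templates/ipex-llm-finetuning-namespace.yaml

# 2. Install the chart; values.yaml carries the NFS server/path and the data/model sub-paths
helm install ipex-llm-qlora . -f values.yaml

# 3. Tear down again; serving/cpu/kubernetes/clean.sh does the analogous
#    "kubectl delete -f deployment.yaml" for the serving deployment
helm uninstall ipex-llm-qlora
kubectl delete namespace ipex-llm-qlora-finetuning
```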
/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/prompt/32.txt: -------------------------------------------------------------------------------- 1 | Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun 2 | -------------------------------------------------------------------------------- /python/llm/example/GPU/Pipeline-Parallel-Serving/prompt/32.txt: -------------------------------------------------------------------------------- 1 | Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun 2 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/prompt/continuation/32.txt: -------------------------------------------------------------------------------- 1 | Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. 2 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-igpu.sh: -------------------------------------------------------------------------------- 1 | source /opt/intel/oneapi/setvars.sh 2 | 3 | export SYCL_CACHE_PERSISTENT=1 4 | export BIGDL_LLM_XMX_DISABLED=1 5 | 6 | python run.py # make sure config YAML file 7 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/test_and_eval.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | export HF_ENDPOINT=https://hf-mirror.com 4 | 5 | SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) 6 | python ${SHELL_FOLDER}/pred.py 7 | python ${SHELL_FOLDER}/eval.py -------------------------------------------------------------------------------- /docker/llm/finetune/lora/cpu/kubernetes/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: trusted-fintune-service 3 | description: A Helm chart for IPEX-LLM Finetuning Service on Kubernetes 4 | type: application 5 | version: 1.1.27 6 | appVersion: "1.16.0" 7 | -------------------------------------------------------------------------------- /docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ipex_llm-fintune-service 3 | description: A Helm chart for IPEX-LLM Finetune Service on Kubernetes 4 | type: application 5 | version: 1.1.27 6 | appVersion: "1.16.0" 7 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/ceval/run.sh: -------------------------------------------------------------------------------- 1 | export IPEX_LLM_LAST_LM_HEAD=0 2 | 3 | python eval.py \ 4 | --model_path "path to model" \ 5 | --eval_type validation \ 6 | --device xpu \ 7 | --eval_data_path data \ 8 | --qtype sym_int4 -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/ablation_c512_w32_k7_maxpool.json: -------------------------------------------------------------------------------- 1 | { 2 | "window_sizes": 32, 3 | "default_max_capacity_prompts": 512, 4 | "specific_max_capcity_prompts": {}, 5 | "kernel_sizes": 7, 6 | "pooling": "maxpool" 7 | } -------------------------------------------------------------------------------- 
/python/llm/dev/benchmark/all-in-one/run-spr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ipex-llm-init -t 3 | export OMP_NUM_THREADS=48 4 | 5 | # set following parameters according to the actual specs of the test machine 6 | numactl -C 0-47 -m 0 python $(dirname "$0")/run.py -------------------------------------------------------------------------------- /python/llm/portable-zip/chat.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | 4 | :: execute chat script 5 | set PYTHONUNBUFFERED=1 6 | 7 | set /p modelpath="Please enter the model path: " 8 | .\python-embed\python.exe .\chat.py --model-path="%modelpath%" 9 | 10 | pause -------------------------------------------------------------------------------- /docker/llm/inference-cpp/start-ollama.sh: -------------------------------------------------------------------------------- 1 | # init ollama first 2 | mkdir -p /llm/ollama 3 | cd /llm/ollama 4 | init-ollama 5 | export OLLAMA_NUM_GPU=999 6 | export ZES_ENABLE_SYSMAN=1 7 | 8 | # start ollama service 9 | (./ollama serve > ollama.log) & 10 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/ablation_c1024_w32_k7_maxpool.json: -------------------------------------------------------------------------------- 1 | { 2 | "window_sizes": 32, 3 | "default_max_capacity_prompts": 1024, 4 | "specific_max_capcity_prompts": {}, 5 | "kernel_sizes": 7, 6 | "pooling": "maxpool" 7 | } -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/ablation_c2048_w32_k7_maxpool.json: -------------------------------------------------------------------------------- 1 | { 2 | "window_sizes": 32, 3 | "default_max_capacity_prompts": 2048, 4 | "specific_max_capcity_prompts": {}, 5 | "kernel_sizes": 7, 6 | "pooling": "maxpool" 7 | } -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/ablation_c4096_w32_k7_maxpool.json: -------------------------------------------------------------------------------- 1 | { 2 | "window_sizes": 32, 3 | "default_max_capacity_prompts": 4096, 4 | "specific_max_capcity_prompts": {}, 5 | "kernel_sizes": 7, 6 | "pooling": "maxpool" 7 | } -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/cli/prompts/chat-with-llm.txt: -------------------------------------------------------------------------------- 1 | A chat between a curious human USER and an artificial intelligence assistant ChatLLM. The assistant gives helpful, detailed, and polite answers to the human's questions. 2 | 3 | USER: Hello, ChatLLM. 4 | ChatLLM: Hello. 5 | USER: 6 | -------------------------------------------------------------------------------- /docker/llm/inference-cpp/start-llama-cpp.sh: -------------------------------------------------------------------------------- 1 | # init llama-cpp first 2 | mkdir -p /llm/llama-cpp 3 | cd /llm/llama-cpp 4 | init-llama-cpp 5 | 6 | # change the model_path to run 7 | model="/models/mistral-7b-v0.1.Q4_0.gguf" 8 | ./llama-cli -m $model -n 32 --prompt "What is AI?" 
-t 8 -e -ngl 999 --color 9 | -------------------------------------------------------------------------------- /docs/mddocs/Overview/install.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Installation 2 | 3 | Here, we provide instructions on how to install `ipex-llm` and best practices for setting up your environment. Please refer to the appropriate guide based on your device: 4 | 5 | - [CPU](./install_cpu.md) 6 | - [GPU](./install_gpu.md) 7 | -------------------------------------------------------------------------------- /docker/llm/finetune/lora/cpu/docker/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | appdirs 3 | bitsandbytes 4 | black 5 | black[jupyter] 6 | datasets 7 | fire 8 | peft==0.2.0 9 | #git+https://github.com/huggingface/peft.git 10 | #git+https://github.com/huggingface/transformers.git 11 | gradio 12 | sentencepiece 13 | scipy 14 | -------------------------------------------------------------------------------- /python/llm/example/GPU/HuggingFace/Multimodal/README.md: -------------------------------------------------------------------------------- 1 | # Running HuggingFace multimodal model using IPEX-LLM on Intel GPU 2 | 3 | This folder contains examples of running multimodal models model on IPEX-LLM. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. 4 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/perplexity/fp16.csv: -------------------------------------------------------------------------------- 1 | Index,Model,Precision,ppl_result 2 | 0,Llama-2-7b-chat-hf,fp16,4.7019 3 | 1,chatglm2-6b,fp16,22.321 4 | 2,chatglm3-6b,fp16,30.1281 5 | 3,Baichuan2-7B-Chat,fp16,10.7676 6 | 4,mpt-7b-chat,fp16,5.7882 7 | 5,falcon-7b-instruct-with-patch,fp16,5.2532 8 | 6,Mistral-7B-v0.1,fp16,3.6597 9 | -------------------------------------------------------------------------------- /docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pvc.yaml: -------------------------------------------------------------------------------- 1 | kind: PersistentVolumeClaim 2 | apiVersion: v1 3 | metadata: 4 | name: nfs-pvc 5 | namespace: ipex-llm-lora-finetuning 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 10Gi 12 | storageClassName: nfs 13 | -------------------------------------------------------------------------------- /docker/llm/finetune/qlora/cpu/kubernetes/templates/nfs-pvc.yaml: -------------------------------------------------------------------------------- 1 | kind: PersistentVolumeClaim 2 | apiVersion: v1 3 | metadata: 4 | name: nfs-pvc 5 | namespace: ipex-llm-qlora-finetuning 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 10Gi 12 | storageClassName: nfs 13 | -------------------------------------------------------------------------------- /docs/mddocs/Overview/KeyFeatures/gpu_supports.md: -------------------------------------------------------------------------------- 1 | # GPU Supports 2 | 3 | IPEX-LLM not only supports running large language models for inference, but also supports QLoRA finetuning on Intel GPUs. 
4 | 5 | * [Inference on GPU](./inference_on_gpu.md) 6 | * [Finetune (QLoRA)](./finetune.md) 7 | * [Multi GPUs selection](./multi_gpus_selection.md) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | name = "BigDL" 2 | version = "2.0" 3 | description = "An open source Big Data AI platform (for distributed TensorFlow, PyTorch and Keras on Apache Spark & Ray)" 4 | license = "Apache-2.0" 5 | repository = "https://github.com/intel-analytics/BigDL" 6 | documentation = "https://bigdl.readthedocs.io/en/branch-2.0/" 7 | 8 | 9 | -------------------------------------------------------------------------------- /docker/llm/serving/xpu/docker/README.md: -------------------------------------------------------------------------------- 1 | > 💡 **Tip**: For a detailed and up-to-date guide on running `vLLM` serving with `IPEX-LLM` on Intel GPUs via Docker, please refer to our official documentation: 2 | > [vllm_docker_quickstart](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md) 3 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/harness/fp16.csv: -------------------------------------------------------------------------------- 1 | Index,Model,Precision,Arc,TruthfulQA,Winogrande 2 | 0,falcon-7b-instruct-with-patch,fp16,46.16,44.08,67.96 3 | 1,Llama2-7b-guanaco-dolphin-500,fp16,56.74,46.96,74.27 4 | 2,Baichuan2-7B-Chat-LLaMAfied,fp16,52.47,48.04,69.14 5 | 3,Mistral-7B-v0.1,fp16,59.98,42.15,78.37 6 | 4,mpt-7b-chat,fp16,46.50,40.16,68.43 -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-arc.sh: -------------------------------------------------------------------------------- 1 | source /opt/intel/oneapi/setvars.sh 2 | 3 | export USE_XETLA=OFF 4 | export SYCL_CACHE_PERSISTENT=1 5 | KERNEL_VERSION=$(uname -r) 6 | if [[ $KERNEL_VERSION != *"6.5"* ]]; then 7 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 8 | fi 9 | 10 | python run.py # make sure config YAML file -------------------------------------------------------------------------------- /docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh: -------------------------------------------------------------------------------- 1 | # need to update transformers version first 2 | # pip install transformers==4.37.0 3 | cd /llm/lightweight_serving 4 | export IPEX_LLM_NOT_USE_VLLM=True 5 | model_path="/llm/models/Llama-2-7b-chat-hf" 6 | low_bit="sym_int4" 7 | python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit -------------------------------------------------------------------------------- /docker/llm/serving/cpu/kubernetes/models-pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: models-pv 5 | labels: 6 | app: models 7 | spec: 8 | capacity: 9 | storage: 10Gi #Modify according to model size 10 | accessModes: 11 | - ReadWriteMany 12 | storageClassName: models 13 | nfs: 14 | path: YOUR_NFS_PATH 15 | server: YOUR_NFS_SERVER 16 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-max-gpu.sh: -------------------------------------------------------------------------------- 1 | source /opt/intel/oneapi/setvars.sh 2 | 3 | export 
LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so 4 | export ENABLE_SDP_FUSION=1 5 | export SYCL_CACHE_PERSISTENT=1 6 | KERNEL_VERSION=$(uname -r) 7 | if [[ $KERNEL_VERSION != *"6.5"* ]]; then 8 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 9 | fi 10 | 11 | python run.py # make sure config YAML file -------------------------------------------------------------------------------- /docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: nfs-pv-ipex-llm-lora-finetuning 5 | namespace: ipex-llm-lora-finetuning 6 | spec: 7 | capacity: 8 | storage: 15Gi 9 | accessModes: 10 | - ReadWriteOnce 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: nfs 13 | nfs: 14 | path: {{ .Values.nfsPath }} 15 | server: {{ .Values.nfsServerIp }} 16 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "offload_optimizer": { 5 | "device": "cpu" 6 | }, 7 | "contiguous_gradients": true, 8 | "overlap_comm": true 9 | }, 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "gradient_accumulation_steps": "auto" 15 | } 16 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "offload_optimizer": { 5 | "device": "cpu" 6 | }, 7 | "contiguous_gradients": true, 8 | "overlap_comm": true 9 | }, 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "gradient_accumulation_steps": "auto" 15 | } 16 | -------------------------------------------------------------------------------- /docker/llm/finetune/qlora/cpu/kubernetes/templates/nfs-pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: nfs-pv-ipex-llm-qlora-finetuning 5 | namespace: ipex-llm-qlora-finetuning 6 | spec: 7 | capacity: 8 | storage: 15Gi 9 | accessModes: 10 | - ReadWriteOnce 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: nfs 13 | nfs: 14 | path: {{ .Values.nfsPath }} 15 | server: {{ .Values.nfsServerIp }} 16 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "contiguous_gradients": true, 5 | "overlap_comm": true, 6 | "offload_optimizer": {"device": "cpu"} 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | "world_size": 2, 12 | "train_batch_size": 32, 13 | "train_micro_batch_size_per_gpu": 2, 14 | "gradient_accumulation_steps": 8 15 | } 16 | -------------------------------------------------------------------------------- /docker/llm/finetune/lora/cpu/kubernetes/values.yaml: -------------------------------------------------------------------------------- 1 | imageName: intelanalytics/ipex-llm-finetune-lora-cpu:2.2.0-SNAPSHOT 2 | trainerNum: 8 3 | 
microBatchSize: 8 4 | nfsServerIp: your_nfs_server_ip 5 | nfsPath: a_nfs_shared_folder_path_on_the_server 6 | dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory 7 | modelSubPath: Llama-2-7b-chat-hf # a subpath of the model file (dir) under nfs directory 8 | ompNumThreads: 14 9 | cpuPerPod: 42 10 | -------------------------------------------------------------------------------- /python/llm/example/GPU/PyTorch-Models/Model/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM INT4 Optimization for Large Language Model on Intel GPUs 2 | You can use `optimize_model` API to accelerate general PyTorch models on Intel GPUs. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docker/llm/sources/README.md: -------------------------------------------------------------------------------- 1 | This is used for OSPDT review. 2 | 3 | A separate Docker container layer tagged as: :-sources tag for sources of 3d party packages with MPL 1.x, MPL 2.x, GPL 1.x, GPL 2.x and GPL 3.x variants. 4 | 5 | ### Build Image 6 | ```bash 7 | docker build \ 8 | --build-arg http_proxy=.. \ 9 | --build-arg https_proxy=.. \ 10 | --build-arg no_proxy=.. \ 11 | --rm --no-cache -t intelanalytics/ipex-llm:sources . 12 | ``` 13 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/common/templates/alpaca_short.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_short.json", 3 | "description": "A shorter template to experiment with.", 4 | "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 5 | "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Report a Vulnerability 4 | 5 | Please report security issues or vulnerabilities to the [Intel® Security Center]. 6 | 7 | For more information on how Intel® works to resolve security issues, see 8 | [Vulnerability Handling Guidelines]. 
9 | 10 | [Intel® Security Center]:https://www.intel.com/security 11 | 12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html 13 | -------------------------------------------------------------------------------- /python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/alpaca_short.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_short.json", 3 | "description": "A shorter template to experiment with.", 4 | "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 5 | "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/axolotl/default_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: 'NO' 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | ipex_config: 7 | use_xpu: true 8 | machine_rank: 0 9 | main_training_function: main 10 | mixed_precision: 'no' 11 | num_machines: 1 12 | num_processes: 1 13 | rdzv_backend: static 14 | same_network: true 15 | tpu_env: [] 16 | tpu_use_cluster: false 17 | tpu_use_sudo: false 18 | use_cpu: false 19 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/alpaca_short.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_short.json", 3 | "description": "A shorter template to experiment with.", 4 | "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 5 | "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/HuggingFace/LLM/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Transformers INT4 Optimization for Large Language Model on Intel GPUs 2 | You can use IPEX-LLM to run almost every Huggingface Transformer models with INT4 optimizations on your laptops with Intel GPUs. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. 3 | -------------------------------------------------------------------------------- /python/llm/portable-zip/setup.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Portable Zip Setup Script For Windows 2 | 3 | # How to use 4 | 5 | ## Build Portable Zip without Web-UI 6 | 7 | Run `setup.bat` to generate portable zip without Web-UI. It will download and install all dependency and generate `ipex-llm.zip` for user to use. 8 | 9 | ## Build Portable Zip with Web-UI 10 | 11 | Run `setup.bat --ui` to generate portable zip with Web-UI. It will download and install all dependency and generate `ipex-llm.zip` for user to use. 
12 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-install-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_INSTALL_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/install 6 | 7 | set -e 8 | 9 | echo "# Start testing install" 10 | start=$(date "+%s") 11 | 12 | python -m pytest -s ${LLM_INSTALL_TEST_DIR} 13 | 14 | now=$(date "+%s") 15 | time=$((now-start)) 16 | 17 | echo "Bigdl-llm tests finished" 18 | echo "Time used:$time seconds" 19 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/deepspeed_zero3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "contiguous_gradients": true, 5 | "overlap_comm": true, 6 | "offload_optimizer": {"device": "cpu"} 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | "world_size":2, 12 | "train_batch_size": 2, 13 | "train_micro_batch_size_per_gpu": 1, 14 | "gradient_accumulation_steps": 1, 15 | "stage3_gather_16bit_weights_on_model_save":true 16 | } 17 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-langchain-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain 6 | set -e 7 | 8 | echo "# Start testing inference" 9 | start=$(date "+%s") 10 | 11 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR} 12 | 13 | now=$(date "+%s") 14 | time=$((now-start)) 15 | 16 | echo "Bigdl-llm langchain tests finished" 17 | echo "Time used:$time seconds" -------------------------------------------------------------------------------- /python/llm/example/CPU/Speculative-Decoding/README.md: -------------------------------------------------------------------------------- 1 | # Speculative-Decoding Examples on Intel CPU 2 | 3 | This folder contains examples of running Speculative-Decoding Examples with IPEX-LLM on Intel CPU: 4 | 5 | - [Self-Speculation](Self-Speculation): running BF16 inference for Huggingface Transformer model with ***self-speculative decoding*** with IPEX-LLM on Intel CPUs 6 | - [EAGLE](EAGLE): running speculative sampling using ***EAGLE*** (Extrapolation Algorithm for Greater Language-model Efficiency) with IPEX-LLM on Intel CPUs 7 | -------------------------------------------------------------------------------- /python/llm/example/GPU/Speculative-Decoding/README.md: -------------------------------------------------------------------------------- 1 | # Speculative-Decoding Examples on Intel GPU 2 | 3 | This folder contains examples of running Speculative-Decoding Examples with IPEX-LLM on Intel GPU: 4 | 5 | - [Self-Speculation](Self-Speculation): running BF16 inference for Huggingface Transformer model with ***self-speculative decoding*** with IPEX-LLM on Intel GPUs 6 | - [EAGLE](EAGLE): running speculative sampling using ***EAGLE*** (Extrapolation Algorithm for Greater Language-model Efficiency) with IPEX-LLM on Intel GPUs 7 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-llamaindex-tests.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/llamaindex 6 | 7 | set -e 8 | 9 | echo "# Start testing inference" 10 | start=$(date "+%s") 11 | 12 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR} 13 | 14 | now=$(date "+%s") 15 | time=$((now-start)) 16 | 17 | echo "Bigdl-llm llamaindex tests finished" 18 | echo "Time used:$time seconds" -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-pipeline-parallel-arc.sh: -------------------------------------------------------------------------------- 1 | source /opt/intel/oneapi/setvars.sh 2 | export MASTER_ADDR=127.0.0.1 3 | export MASTER_PORT=8080 4 | export FI_PROVIDER=tcp 5 | export USE_XETLA=OFF 6 | export OMP_NUM_THREADS=6 7 | if [[ $KERNEL_VERSION != *"6.5"* ]]; then 8 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 9 | fi 10 | export TORCH_LLM_ALLREDUCE=0 11 | 12 | NUM_GPUS=2 # number of used GPU 13 | CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS run.py 14 | -------------------------------------------------------------------------------- /python/llm/example/CPU/PyTorch-Models/README.md: -------------------------------------------------------------------------------- 1 | # Running PyTorch model using IPEX-LLM on Intel CPU 2 | 3 | This folder contains examples of running any PyTorch model on IPEX-LLM (with "one-line code change"): 4 | 5 | - [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations 6 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) 7 | - [Save-Load](Save-Load): examples of saving and loading low-bit models 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/PyTorch-Models/README.md: -------------------------------------------------------------------------------- 1 | # Running PyTorch model using IPEX-LLM on Intel GPU 2 | 3 | This folder contains examples of running any PyTorch model on IPEX-LLM (with "one-line code change"): 4 | 5 | - [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations 6 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) 
7 | - [Save-Load](Save-Load): examples of saving and loading low-bit models 8 | -------------------------------------------------------------------------------- /docker/llm/serving/cpu/docker/start-vllm-service.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | model="YOUR_MODEL_PATH" 3 | served_model_name="YOUR_MODEL_NAME" 4 | 5 | 6 | python -m ipex_llm.vllm.cpu.entrypoints.openai.api_server \ 7 | --served-model-name $served_model_name \ 8 | --port 8000 \ 9 | --model $model \ 10 | --trust-remote-code \ 11 | --device cpu \ 12 | --dtype bfloat16 \ 13 | --enforce-eager \ 14 | --load-in-low-bit bf16 \ 15 | --max-model-len 4096 \ 16 | --max-num-batched-tokens 10240 \ 17 | --max-num-seqs 12 \ 18 | --tensor-parallel-size 1 -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/README.md: -------------------------------------------------------------------------------- 1 | # QLoRA Finetuning with IPEX-LLM 2 | 3 | We provide [Alpaca-QLoRA example](./alpaca-qlora/), which ports [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/tree/main) to IPEX-LLM (using [QLoRA](https://arxiv.org/abs/2305.14314) algorithm) on [Intel GPU](../../README.md). 4 | 5 | Meanwhile, we also provide a [simple example](./simple-example/) to help you get started with QLoRA Finetuning using IPEX-LLM, and [TRL example](./trl-example/) to help you get started with QLoRA Finetuning using IPEX-LLM and TRL library. 6 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: 'path to your local model hub' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '1024-128' 11 | test_api: 12 | - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 14 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: 'path to your local model hub' 4 | warm_up: 3 5 | num_trials: 5 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | test_api: 12 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 14 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: 'path to your local model hub' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '2048-256' 11 | test_api: 12 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 14 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: 'path to your local model hub' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '1024-128' 11 | test_api: 12 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 14 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: 'path to your local model hub' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '3072-384' 11 | test_api: 12 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 14 | -------------------------------------------------------------------------------- /python/llm/scripts/env-check.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | REM Check Python version 4 | python -V 5 | if ERRORLEVEL 1 ( 6 | echo No Python found! Instructions on how to create an environment can be found in the README.md. 7 | goto:end 8 | ) 9 | python check.py 10 | 11 | echo ----------------------------------------------------------------- 12 | echo System Information 13 | systeminfo 14 | echo ----------------------------------------------------------------- 15 | xpu-smi discovery 16 | if ERRORLEVEL 1 ( 17 | echo xpu-smi is not installed properly. 
18 | goto:end 19 | ) 20 | 21 | :end 22 | -------------------------------------------------------------------------------- /docs/mddocs/Overview/KeyFeatures/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Key Features 2 | 3 | You may run the LLMs using `ipex-llm` through one of the following APIs: 4 | 5 | * [PyTorch API](./optimize_model.md) 6 | * [`transformers`-style API](./transformers_style_api.md) 7 | * [Hugging Face `transformers` Format](./hugging_face_format.md) 8 | * [Native Format](./native_format.md) 9 | * [LangChain API](./langchain_api.md) 10 | * [GPU Supports](./gpu_supports.md) 11 | * [Inference on GPU](./inference_on_gpu.md) 12 | * [Finetune (QLoRA)](./finetune.md) 13 | * [Multi GPUs selection](./multi_gpus_selection.md) 14 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: 'path to your local model hub' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '1024-128' 11 | test_api: 12 | - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 14 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/dataset2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "narrativeqa": 128, 3 | "qasper": 128, 4 | "multifieldqa_en": 64, 5 | "multifieldqa_zh": 64, 6 | "hotpotqa": 32, 7 | "2wikimqa": 32, 8 | "musique": 32, 9 | "dureader": 128, 10 | "gov_report": 512, 11 | "qmsum": 512, 12 | "multi_news": 512, 13 | "vcsum": 512, 14 | "trec": 64, 15 | "triviaqa": 32, 16 | "samsum": 128, 17 | "lsht": 64, 18 | "passage_count": 32, 19 | "passage_retrieval_en": 32, 20 | "passage_retrieval_zh": 32, 21 | "lcc": 64, 22 | "repobench-p": 64 23 | } -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '1024-128' 12 | test_api: 13 | - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-20.04 10 | tools: 11 | python: "3.7" 12 | apt_packages: 13 | - graphviz 14 | jobs: 15 | pre_install: 16 | - wget https://raw.githubusercontent.com/analytics-zoo/gha-cicd-env/main/python-requirements/requirements-doc.txt 17 | 18 | sphinx: 19 | configuration: docs/readthedocs/source/conf.py 20 | 21 | python: 22 | install: 23 | - requirements: ./requirements-doc.txt 24 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 3 6 | num_trials: 5 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '32-32' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/example/GPU/Pipeline-Parallel-Serving/prompt/128.txt: -------------------------------------------------------------------------------- 1 | In a distant future, humanity has expanded across the galaxy, establishing colonies on numerous planets. The interstellar community thrives under the guidance of the United Galactic Federation, which ensures peace and prosperity. However, a new threat emerges from the unknown regions of space, challenging the stability and security of the galaxy. Brave explorers and seasoned warriors must unite to uncover the secrets of this mysterious force and protect the future of all sentient beings. Please continue the above story as long as possible, preferably more than 1000 tokens. -------------------------------------------------------------------------------- /python/llm/tpp/README.md: -------------------------------------------------------------------------------- 1 | Third Party Software notices and information 2 | ------------------------------------------------------------- 3 | “Third Party Software” mean the files (if any) listed in the “third-party-programs.txt” or other similarly-named text file that may be included with the software. Third Party Software, even if included with the distribution of the software, may be governed by separate license terms, including without limitation, third party license terms, open source software notices and terms, and/or other Intel software license terms. These separate license terms solely govern your use of the Third Party Software. 
-------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '1024-128' 12 | test_api: 13 | - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '2048-256' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '3072-384' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '4096-512' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/model2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "llama2-7b-chat-4k": 4096, 3 | "longchat-v1.5-7b-32k": 4096, 4 | "xgen-7b-8k": 4096, 5 | "internlm-7b-8k": 4096, 6 | "chatglm2-6b": 4096, 7 | "chatglm2-6b-32k": 4096, 8 | "chatglm3-6b-32k": 4096, 9 | "chatglm4-9b": 4096, 10 | "vicuna-v1.5-7b-16k": 4096, 11 | "mistral-7B-instruct-v0.2": 4096, 12 | "mistral-7B-instruct-v0.1": 4096, 13 | "mixtral-8x7B-instruct-v0.1": 4096, 14 | "llama-2-7B-32k-instruct": 4096, 15 | "lwm-text-chat-1m": 4096, 16 | "lwm-text-1m": 4096, 17 | "qwen2-7b-instruct": 4096 18 | } 19 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '1024-128' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 3 6 | num_trials: 5 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '32-32' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/finetune_one_node_two_sockets.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export SOCKET_CORES=48 3 | 4 | source ipex-llm-init -t 5 | mpirun -n 2 \ 6 | --bind-to socket \ 7 | -genv OMP_NUM_THREADS=$SOCKET_CORES \ 8 | -genv KMP_AFFINITY="granularity=fine,none" \ 9 | -genv KMP_BLOCKTIME=1 \ 10 | python alpaca_qlora_finetuning_cpu.py \ 11 | --gradient_checkpointing False \ 12 | --batch_size 128 \ 13 | --micro_batch_size 8 \ 14 | --max_steps -1 \ 15 | --base_model "meta-llama/Llama-2-7b-hf" \ 16 | --data_path "yahma/alpaca-cleaned" \ 17 | --output_dir "./ipex-llm-qlora-alpaca" 18 | 19 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/load_config.yaml: -------------------------------------------------------------------------------- 1 | # TODO: move this to a different repo 2 | repo_id: 3 | # - 'THUDM/chatglm-6b' 4 | # - 'THUDM/chatglm2-6b' 5 | - 'meta-llama/Llama-2-7b-chat-hf' 6 | # - 'baichuan-inc/Baichuan2-7B-Chat' 7 | # - 'Qwen/Qwen-7B-Chat' 8 | # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now 9 | local_model_hub: '/mnt/disk1/models' 10 | low_bit: 11 | - 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 12 | - 'bf16' 13 | device: 14 | #- 'cpu' 15 | - 'xpu' 16 | load_low_bit_model: False 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-436.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '2048-256' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '3072-384' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '4096-512' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /docker/llm/finetune/xpu/start-qlora-finetuning-on-xpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export USE_XETLA=OFF 4 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 5 | source /opt/intel/oneapi/setvars.sh 6 | 7 | if [ -d "./model" ]; 8 | then 9 | MODEL_PARAM="--repo-id-or-model-path ./model" # otherwise, default to download from HF repo 10 | fi 11 | 12 | if [ -d "./data/alpaca-cleaned" ]; 13 | then 14 | DATA_PARAM="--dataset ./data/alpaca-cleaned" # otherwise, default to download from HF dataset 15 | fi 16 | 17 | # QLoRA example dir 18 | cd /LLM-Finetuning/QLoRA/simple-example/ 19 | 20 | python qlora_finetuning.py $MODEL_PARAM $DATA_PARAM 21 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config.yaml: -------------------------------------------------------------------------------- 1 | model_name: 2 | # - "mistral-7B-instruct-v0.2" 3 | - "llama2-7b-chat-4k" 4 | # - "chatglm4-9b" 5 | # - "qwen2-7b-instruct" 6 | 7 | full_kv: True 8 | optimize_model: True 9 | dtype: 'fp16' 10 | low_bit: 'sym_int4' 11 | 12 | e: False 13 | 14 | compress_kv: 15 | - "ablation_c512_w32_k7_maxpool" 16 | - "ablation_c1024_w32_k7_maxpool" 17 | 18 | datasets: 19 | - "multi_news" 20 | - "qasper" 21 | - "hotpotqa" 22 | - "trec" 23 | - "passage_count" 24 | - "lcc" 25 | # - "multifieldqa_zh" 26 | # - "dureader" 27 | # - "vcsum" 28 | # - "lsht" 29 | # - "passage_retrieval_zh" 30 | 
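The LongBench config above only declares the models, KV-cache settings, and datasets to cover; a harness is expected to expand it into individual runs. The following is a minimal illustrative sketch (not a file from this repository) of how such a config could be consumed, assuming PyYAML is available; `run_benchmark` is a hypothetical placeholder for whatever a single evaluation invocation looks like.

```python
# Illustrative sketch only -- not part of the repository.
# Assumes PyYAML is installed; run_benchmark() is a hypothetical placeholder.
import itertools
import yaml

def run_benchmark(model: str, dataset: str, kv_mode: str) -> None:
    # Placeholder for one LongBench evaluation run.
    print(f"model={model} dataset={dataset} kv={kv_mode}")

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Each model is evaluated with the full KV cache (if enabled) plus every listed
# KV-compression ablation, across all selected datasets.
kv_modes = (["full_kv"] if cfg.get("full_kv") else []) + list(cfg.get("compress_kv", []))
for model, kv_mode, dataset in itertools.product(cfg["model_name"], kv_modes, cfg["datasets"]):
    run_benchmark(model, dataset, kv_mode)
```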
-------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 1 7 | num_trials: 3 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '1024-128' 13 | test_api: 14 | - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '1024-128' 12 | test_api: 13 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_438.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'stabilityai/stablelm-zephyr-3b' 3 | #- 'google/gemma-7b-it' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '1024-128' 12 | test_api: 13 | - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.1-8B-Instruct' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api) 16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 3 7 | num_trials: 5 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '32-32' 13 | test_api: 14 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 15 | cpu_embedding: True # whether to put embedding on CPU (only available now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /apps/ipynb2py.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Usage ################################ 4 | # ./ipynb2py notebook_path (without the .ipynb extension) 5 | # Example: 6 | # ipynb2py notebooks/neural_networks/rnn 7 | ######################################### 8 | if [ $# -ne "1" ]; then 9 | echo "Usage: ./ipynb2py notebook_path" 10 | else 11 | cp $1.ipynb $1.tmp.ipynb 12 | sed -i 's/%%/#/' $1.tmp.ipynb 13 | sed -i 's/%pylab/#/' $1.tmp.ipynb 14 | 15 | jupyter nbconvert $1.tmp.ipynb --to python 16 | 17 | mv $1.tmp.py $1.py 18 | sed -i '1i# -*- coding: utf-8 -*-' $1.py 19 | sed -i '1i#!/usr/bin/python' $1.py # insert the shebang as the first line 20 | rm $1.tmp.ipynb 21 | fi 22 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-436-batch2.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 2 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api) 16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-443-batch2.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.1-8B-Instruct' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e.
symmetric int4) 8 | batch_size: 2 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-443-batch4.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.1-8B-Instruct' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 4 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: 'path to your local model hub' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '1024-128' 12 | test_api: 13 | - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 15 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 1 7 | num_trials: 3 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | # - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 1 7 | num_trials: 3 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '3072-384' 13 | test_api: 14 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | # - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 1 7 | num_trials: 3 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '4096-512' 13 | test_api: 14 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-440.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen1.5-MoE-A2.7B-Chat' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 8 | batch_size: 1 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 17 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 1 7 | num_trials: 3 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '1024-128' 13 | test_api: 14 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /python/llm/test/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/cpu/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/xpu/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_443.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'google/gemma-2-2b-it' 3 | - 'google/gemma-2-9b-it' 4 | - 'meta-llama/Llama-3.1-8B-Instruct' 5 | local_model_hub: 'path to your local model hub' 6 | warm_up: 1 7 | num_trials: 3 8 | num_beams: 1 # default to greedy search 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 10 | batch_size: 1 # default to 1 11 | in_out_pairs: 12 | - '1024-128' 13 | test_api: 14 | - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/serving/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/serving/fastchat/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-445.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: '/mnt/disk1/models' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 9 | batch_size: 1 # default to 1 10 | in_out_pairs: 11 | - '32-32' 12 | - '1024-128' 13 | - '2048-256' 14 | test_api: 15 | - "transformer_int4_fp16_gpu" # on Intel GPU 16 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 17 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 18 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/gguf/models/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/npu_models/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /docker/llm/sources/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | # Add deb-src entries to the sources.list 4 | RUN echo "deb-src http://archive.ubuntu.com/ubuntu/ focal main restricted universe multiverse" >> /etc/apt/sources.list 5 | 6 | # Update package lists and install dpkg-dev 7 | RUN apt-get update && apt-get install -y dpkg-dev 8 | 9 | # Create a temporary directory, adjust permissions, and download source code for the specified packages 10 | RUN mkdir -p /usr/local/src/git-source && \ 11 | chown _apt:root /usr/local/src/git-source && \ 12 | cd /usr/local/src/git-source && \ 13 | apt-get source \ 14 | git \ 15 | gnupg \ 16 | numactl \ 17 | wget \ 18 | software-properties-common -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-445-batch2.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: '/mnt/disk1/models' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 9 | batch_size: 2 # default to 1 10 | in_out_pairs: 11 | - '32-32' 12 | - '1024-128' 13 | - '2048-256' 14 | test_api: 15 | - "transformer_int4_fp16_gpu" # on Intel GPU 16 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 17 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 18 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-445-batch4.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-3.2-1B-Instruct' 3 | - 'meta-llama/Llama-3.2-3B-Instruct' 4 | local_model_hub: '/mnt/disk1/models' 5 | warm_up: 1 6 | num_trials: 3 7 | num_beams: 1 # default to greedy search 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 9 | batch_size: 4 # default to 1 10 | in_out_pairs: 11 | - '32-32' 12 | - '1024-128' 13 | - '2048-256' 14 | test_api: 15 | - "transformer_int4_fp16_gpu" # on Intel GPU 16 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 17 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 18 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/npu_pipeline_model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/arc-perf-transformers-436-batch4.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'Qwen/Qwen-VL-Chat' 3 | local_model_hub: '/mnt/disk1/models' 4 | warm_up: 1 5 | num_trials: 3 6 | num_beams: 1 # default to greedy search 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 8 | batch_size: 4 # default to 1 9 | in_out_pairs: 10 | - '32-32' 11 | - '1024-128' 12 | - '2048-256' 13 | test_api: 14 | - "transformer_int4_fp16_gpu" # on Intel GPU 15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 16 | exclude: 17 | - 'Qwen/Qwen-VL-Chat:2048' 18 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' 19 | -------------------------------------------------------------------------------- /python/llm/example/CPU/HF-Transformers-AutoModels/README.md: -------------------------------------------------------------------------------- 1 | # Running Hugging Face Transformers model using IPEX-LLM on Intel CPU 2 | 3 | This folder contains examples of running any HuggingFace `transformers` model on IPEX-LLM (using the standard AutoModel APIs): 4 | 5 | - [Model](Model): examples of running HuggingFace `transformers` models (e.g., LLaMA, Mistral, ChatGLM, Qwen, Baichuan, Mixtral, Gemma, etc.) using INT4 optimizations 6 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (INT8/INT5, etc.) on Intel CPU 7 | - [Save-Load](Save-Load): examples of saving and loading low-bit models 8 | - [Advanced-Quantizations](Advanced-Quantizations): examples of loading GGUF/AWQ/GPTQ models 9 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/common/templates/alpaca.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json", 3 | "description": "Template used by Alpaca-LoRA.", 4 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 5 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-deepspeed-arc.sh: -------------------------------------------------------------------------------- 1 | export MASTER_ADDR=127.0.0.1 2 | export FI_PROVIDER=tcp 3 | export CCL_ATL_TRANSPORT=ofi 4 | export CCL_ZE_IPC_EXCHANGE=sockets 5 | 6 | export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD} 7 | basekit_root=/opt/intel/oneapi 8 | source $basekit_root/setvars.sh --force 9 | source $basekit_root/ccl/latest/env/vars.sh --force 10 | 11 | NUM_GPUS=2 # number of used GPU 12 | export USE_XETLA=OFF 13 | if grep -q "Core" /proc/cpuinfo; then 14 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 15 | fi 16 | export TORCH_LLM_ALLREDUCE=0 # Different from PVC 17 | export BIGDL_IMPORT_IPEX=0 18 | mpirun -np $NUM_GPUS --prepend-rank python run.py 19 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-deepspeed-pvc.sh: -------------------------------------------------------------------------------- 1 | export ZE_AFFINITY_MASK="0,1" # specify the used GPU 2 | NUM_GPUS=2 # number of used GPU 3 | export MASTER_ADDR=127.0.0.1 4 | export FI_PROVIDER=tcp 5 | export CCL_ATL_TRANSPORT=ofi 6 | export CCL_ZE_IPC_EXCHANGE=sockets 7 | 8 | export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD} 9 | basekit_root=/opt/intel/oneapi 10 | source $basekit_root/setvars.sh --force 11 | source $basekit_root/ccl/latest/env/vars.sh --force 12 | 13 | export OMP_NUM_THREADS=$((56/$NUM_GPUS)) 14 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 15 | export TORCH_LLM_ALLREDUCE=1 16 | export BIGDL_IMPORT_IPEX=0 17 | mpirun -np $NUM_GPUS --prepend-rank python run.py 18 | -------------------------------------------------------------------------------- /python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/alpaca.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json", 3 | "description": "Template used by Alpaca-LoRA.", 4 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 5 | "prompt_no_input": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/alpaca.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json", 3 | "description": "Template used by Alpaca-LoRA.", 4 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 5 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/gguf/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from .api import load_gguf_model 18 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-2-7b-chat-hf' 3 | - 'THUDM/chatglm2-6b' 4 | - 'THUDM/chatglm3-6b' 5 | - 'baichuan-inc/Baichuan2-7B-Chat' 6 | - 'Qwen/Qwen-7B-Chat' 7 | local_model_hub: '/mnt/disk1/models' 8 | warm_up: 10 9 | num_trials: 100 10 | num_beams: 1 # default to greedy search 11 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 12 | batch_size: 1 # default to 1 13 | in_out_pairs: 14 | - '1024-512' 15 | - '2048-512' 16 | test_api: 17 | - "transformer_int4_gpu" # on Intel GPU 18 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 19 | exclude: 20 | - 'Qwen/Qwen-7B-Chat:2048' 21 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/common/templates/alpaca_legacy.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_legacy.json", 3 | "description": "Legacy template, used by Original Alpaca repository.", 4 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:", 5 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /docker/llm/finetune/qlora/cpu/kubernetes/values.yaml: -------------------------------------------------------------------------------- 1 | imageName: intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT 2 | trainerNum: 2 3 | microBatchSize: 8 4 | enableGradientCheckpoint: false # true will save more memory but increase latency 5 | nfsServerIp: your_nfs_server_ip 6 | nfsPath: a_nfs_shared_folder_path_on_the_server 7 | dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory 8 | modelSubPath: Llama-2-7b-chat-hf # a subpath of the model file (dir) under nfs directory 9 | omp_num_threads: 48 # configure this value based on the number of CPU cores 10 | httpProxy: "your_http_proxy_like_http://xxx:xxxx_if_needed_else_empty" 11 | httpsProxy: "your_https_proxy_like_http://xxx:xxxx_if_needed_else_empty" 12 | -------------------------------------------------------------------------------- /docs/mddocs/PythonAPI/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM API 2 | 3 | - [IPEX-LLM `transformers`-style API](./transformers.md) 4 | 5 | - [Hugging Face `transformers` AutoModel](./transformers.md#hugging-face-transformers-automodel) 6 | 7 | - AutoModelForCausalLM 8 | - AutoModel 9 | - AutoModelForSpeechSeq2Seq 10 | - AutoModelForSeq2SeqLM 11 | - AutoModelForSequenceClassification 12 | - AutoModelForMaskedLM 13 | - AutoModelForQuestionAnswering 14 | - AutoModelForNextSentencePrediction 15 | - AutoModelForMultipleChoice 16 | - AutoModelForTokenClassification 17 | 18 | - [IPEX-LLM PyTorch API](./optimize.md) 19 | 20 | - [Optimize Model](./optimize.md#optimize-model) 21 | 22 | - [Load Optimized Model](./optimize.md#load-optimized-model) -------------------------------------------------------------------------------- /python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/alpaca_legacy.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_legacy.json", 3 | "description": "Legacy template, used by Original Alpaca repository.", 4 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:", 5 | "prompt_no_input": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/alpaca_legacy.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_legacy.json", 3 | "description": "Legacy template, used by Original Alpaca repository.", 4 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:", 5 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:", 6 | "response_split": "### Response:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/common/templates/vigogne.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/vigogne.json", 3 | "description": "French template, used by Vigogne for finetuning.", 4 | "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n", 5 | "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n", 6 | "response_split": "### Réponse:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/common/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from .prompter import Prompter 18 | from .util import * 19 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/serving/fastapi/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from .api_server import FastApp 18 | from .model_worker import ModelWorker -------------------------------------------------------------------------------- /python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/vigogne.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/vigogne.json", 3 | "description": "French template, used by Vigogne for finetuning.", 4 | "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n", 5 | "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n", 6 | "response_split": "### Réponse:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-2-7b-chat-hf' 3 | - 'THUDM/chatglm2-6b' 4 | - 'THUDM/chatglm3-6b' 5 | - 'baichuan-inc/Baichuan2-7B-Chat' 6 | - 'Qwen/Qwen-7B-Chat' 7 | local_model_hub: '/mnt/disk1/models' 8 | warm_up: 10 9 | num_trials: 100 10 | num_beams: 1 # default to greedy search 11 | low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4) 12 | batch_size: 1 # default to 1 13 | in_out_pairs: 14 | - '1024-512' 15 | - '2048-512' 16 | test_api: 17 | - "transformer_int4_gpu" # on Intel GPU 18 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) 19 | exclude: 20 | - 'baichuan-inc/Baichuan2-7B-Chat:2048' 21 | - 'Qwen/Qwen-7B-Chat:2048' -------------------------------------------------------------------------------- /python/llm/example/GPU/HuggingFace/README.md: -------------------------------------------------------------------------------- 1 | # Running HuggingFace models using IPEX-LLM on Intel GPU 2 | 3 | This folder contains examples of running any HuggingFace model on IPEX-LLM: 4 | 5 | - [LLM](LLM): examples of running large language models (LLaMA, Mistral, ChatGLM, Qwen, Baichuan, Mixtral, Gemma, etc.) using IPEX-LLM optimizations 6 | - [Multimodal](Multimodal): examples of running large multimodal models (StableDiffusion models, Qwen-VL-Chat, glm-4v, etc.) using IPEX-LLM optimizations 7 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (FP8/INT8/FP4, etc.) 
8 | - [Save-Load](Save-Load): examples of saving and loading low-bit models 9 | - [Advanced-Quantizations](Advanced-Quantizations): examples of loading GGUF/AWQ/GPTQ models 10 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/vigogne.json: -------------------------------------------------------------------------------- 1 | { 2 | "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/vigogne.json", 3 | "description": "French template, used by Vigogne for finetuning.", 4 | "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n", 5 | "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n", 6 | "response_split": "### Réponse:" 7 | } 8 | -------------------------------------------------------------------------------- /python/llm/example/CPU/Deepspeed-AutoTP/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ipex-llm-init 3 | unset OMP_NUM_THREADS # deepspeed will set it for each instance automatically 4 | source /opt/intel/oneccl/env/setvars.sh 5 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib 6 | export WORLD_SIZE=2 # run 1 instance per SPR socket, thus 2 instances on 2 sockets, 96 cores 7 | export MASTER_ADDR=127.0.0.1 8 | export CCL_ZE_IPC_EXCHANGE=sockets 9 | export DS_ACCELERATOR="cpu" 10 | export CCL_WORKER_AFFINITY=auto 11 | unset KMP_AFFINITY # deepspeed will set it for each instance automatically 12 | export FI_PROVIDER=tcp 13 | export CCL_ATL_TRANSPORT=ofi 14 | export CCL_PROCESS_LAUNCHER=none 15 | 16 | deepspeed \ 17 | --bind_cores_to_rank \ 18 | --bind_core_list 0-95 \ 19 | deepspeed_autotp.py 20 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/run-deepspeed-spr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ipex-llm-init -t 3 | unset OMP_NUM_THREADS # deepspeed will set it for each instance automatically 4 | source /opt/intel/oneccl/env/setvars.sh 5 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib 6 | export WORLD_SIZE=2 # run 1 instance per SPR socket, thus 2 instances on 2 sockets, 96 cores 7 | export MASTER_ADDR=127.0.0.1 8 | export CCL_ZE_IPC_EXCHANGE=sockets 9 | export DS_ACCELERATOR="cpu" 10 | export CCL_WORKER_AFFINITY=auto 11 | unset KMP_AFFINITY # deepspeed will set it for each instance automatically 12 | export FI_PROVIDER=tcp 13 | export CCL_ATL_TRANSPORT=ofi 14 | export CCL_PROCESS_LAUNCHER=none 15 | 16 | deepspeed \ 17 | --bind_cores_to_rank \ 18 | --bind_core_list 0-95 \ 19 | run.py 20 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/stable-version-cpu-stress-test.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-2-7b-chat-hf' 3 | - 'meta-llama/Llama-2-13b-chat-hf' 4 | - 'THUDM/chatglm2-6b' 5 | - 'THUDM/chatglm3-6b' 6 | - 'baichuan-inc/Baichuan2-7B-Chat' 7 | - 'baichuan-inc/Baichuan2-13B-Chat' 8 | - 'Qwen/Qwen-14B-Chat' 9 |
local_model_hub: '/mnt/disk1/models' 10 | warm_up: 3 11 | num_trials: 50 12 | num_beams: 1 # default to greedy search 13 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 14 | batch_size: 1 # default to 1 15 | in_out_pairs: 16 | - '1024-512' 17 | - '2048-512' 18 | test_api: 19 | - "transformer_int4" 20 | # - "transformer_int4_gpu" # on Intel GPU 21 | cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api) 22 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-inference-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference 6 | 7 | set -e 8 | 9 | echo "# Start testing inference" 10 | start=$(date "+%s") 11 | 12 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_call_models.py -v 13 | 14 | if [ -z "$THREAD_NUM" ]; then 15 | THREAD_NUM=2 16 | fi 17 | export OMP_NUM_THREADS=$THREAD_NUM 18 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v 19 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v 20 | 21 | now=$(date "+%s") 22 | time=$((now-start)) 23 | 24 | echo "Bigdl-llm tests finished" 25 | echo "Time used:$time seconds" 26 | -------------------------------------------------------------------------------- /docs/mddocs/DockerGuides/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Docker Container User Guides 2 | 3 | 4 | In this section, you will find guides related to using IPEX-LLM with Docker, covering how to: 5 | 6 | - [Overview of IPEX-LLM Containers](./docker_windows_gpu.md) 7 | 8 | - Inference in Python/C++ 9 | - [GPU Inference in Python with IPEX-LLM](./docker_pytorch_inference_gpu.md) 10 | - [VSCode LLM Development with IPEX-LLM on Intel GPU](./docker_run_pytorch_inference_in_vscode.md) 11 | - [llama.cpp/Ollama/Open-WebUI with IPEX-LLM on Intel GPU](./docker_cpp_xpu_quickstart.md) 12 | 13 | - Serving 14 | - [FastChat with IPEX-LLM on Intel GPU](./fastchat_docker_quickstart.md) 15 | - [vLLM with IPEX-LLM on Intel GPU](./vllm_docker_quickstart.md) 16 | - [vLLM with IPEX-LLM on Intel CPU](./vllm_cpu_docker_quickstart.md) 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/🐛bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41BBug report" 3 | about: Report a bug or error 4 | title: '' 5 | labels: user issue 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug or error is. 12 | 13 | **How to reproduce** 14 | Steps to reproduce the error: 15 | 1. ... 16 | 2. ... 17 | 3. ... 18 | 4. ... 19 | 20 | **Screenshots** 21 | If applicable, add screenshots to help explain the problem 22 | 23 | **Environment information** 24 | If possible, please attach the output of the environment check script, using: 25 | - https://github.com/intel/ipex-llm/blob/main/python/llm/scripts/env-check.bat, or 26 | - https://github.com/intel/ipex-llm/blob/main/python/llm/scripts/env-check.sh 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 
30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compile products 2 | *.pyc 3 | *.class 4 | *.crc 5 | *.o 6 | 7 | # SBT, Maven specific 8 | .cache 9 | .history 10 | .lib/ 11 | lib_managed/ 12 | src_managed/ 13 | project/boot/ 14 | project/plugins/project/ 15 | dependency-reduced-pom.xml 16 | 17 | # IDE specific 18 | .scala_dependencies 19 | .worksheet 20 | *.iml 21 | .idea/ 22 | .vscode/ 23 | 24 | # macOS specific 25 | .DS_Store 26 | 27 | # data files 28 | model*.[0-9]* 29 | state*.[0-9]* 30 | 31 | # other 32 | nohup.out 33 | *.log 34 | *.lock 35 | *.un~ 36 | *.idx 37 | .ipynb_checkpoints/ 38 | .project 39 | .settings/ 40 | */.cache-main 41 | */.cache-tests 42 | */.classpath 43 | */.project 44 | */.settings/ 45 | *.so 46 | *.so.* 47 | *.dylib 48 | __pycache__ 49 | *.egg-info 50 | target 51 | build 52 | dist 53 | 54 | # For readthedocs 55 | docs/readthedocs/requirements-doc.txt 56 | docs/readthedocs/_build/* -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/utils/modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import sys 18 | from types import ModuleType 19 | 20 | 21 | def insert_fake_module(name, doc=None): 22 | m = ModuleType(name, doc) 23 | m.__file__ = __file__ 24 | sys.modules[name] = m 25 | return m 26 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt: -------------------------------------------------------------------------------- 1 | # This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/requirements.txt 2 | --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ 3 | packaging==23.2 4 | peft==0.10.0 5 | tokenizers 6 | bitsandbytes>=0.41.1 7 | accelerate==0.23.0 8 | deepspeed>=0.13.1 9 | addict 10 | fire 11 | PyYAML>=6.0 12 | datasets 13 | #flash-attn==2.3.3 14 | sentencepiece 15 | wandb 16 | einops 17 | #xformers==0.0.22 18 | optimum==1.13.2 19 | hf_transfer 20 | colorama 21 | numba 22 | numpy>=1.24.4 23 | mlflow 24 | # qlora things 25 | bert-score==0.3.13 26 | evaluate==0.4.0 27 | rouge-score==0.1.2 28 | scipy 29 | scikit-learn>=1.5.0 30 | pynvml 31 | art 32 | fschat 33 | gradio>=4.19.2 34 | tensorboard 35 | 36 | mamba-ssm==1.1.1 37 | 38 | # remote filesystems 39 | s3fs 40 | gcsfs 41 | # adlfs 42 | 43 | trl>=0.7.9, <=0.9.6 44 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/cpu/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine 17 | __all__ = [ 18 | "IPEXLLMAsyncLLMEngine", 19 | "IPEXLLMLLMEngine", 20 | "IPEXLLMClass", 21 | "run_mp_engine", 22 | ] 23 | -------------------------------------------------------------------------------- /python/llm/example/CPU/PyTorch-Models/Model/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM INT4 Optimization for Large Language Model 2 | You can use `optimize_model` API to accelerate general PyTorch models on Intel servers and PCs. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. 3 | 4 | ## Recommended Requirements 5 | To run the examples, we recommend using Intel® Xeon® processors (server), or >= 12th Gen Intel® Core™ processor (client). 6 | 7 | For OS, IPEX-LLM supports Ubuntu 20.04 or later, CentOS 7 or later, and Windows 10/11. 8 | 9 | ## Best Known Configuration on Linux 10 | For better performance, it is recommended to set environment variables on Linux with the help of IPEX-LLM: 11 | ```bash 12 | pip install ipex-llm 13 | source ipex-llm-init 14 | ``` 15 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py: -------------------------------------------------------------------------------- 1 | from vllm.logger import init_logger 2 | from vllm.v1.executor.ray_utils import RayWorkerWrapper 3 | 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | class IPEXLLMV1Wrapper(RayWorkerWrapper): 9 | def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None: 10 | super().__init__(*args, **kwargs) 11 | from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert 12 | _ipex_llm_convert(load_in_low_bit=load_in_low_bit) 13 | self.compiled_dag_cuda_device_set = False 14 | 15 | 16 | def get_ipex_llm_v1_wrapper(load_in_low_bit): 17 | # The reason why we not using functools.partial is that 18 | # ray seems not work well with it. 
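# Note: subclassing bakes `load_in_low_bit` into the worker class itself, so Ray still receives a plain class object that it can serialize and construct as a remote worker; a functools.partial object is not a class and does not appear to be handled reliably here.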
19 | class WrapperWithLoadBit(IPEXLLMV1Wrapper): 20 | def __init__(self, *args, **kwargs) -> None: 21 | super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs) 22 | 23 | return WrapperWithLoadBit 24 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py: -------------------------------------------------------------------------------- 1 | from vllm.logger import init_logger 2 | from vllm.v1.executor.ray_utils import RayWorkerWrapper 3 | 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | class IPEXLLMV1Wrapper(RayWorkerWrapper): 9 | def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None: 10 | super().__init__(*args, **kwargs) 11 | from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert 12 | _ipex_llm_convert(load_in_low_bit=load_in_low_bit) 13 | self.compiled_dag_cuda_device_set = False 14 | 15 | 16 | def get_ipex_llm_v1_wrapper(load_in_low_bit): 17 | # The reason why we not using functools.partial is that 18 | # ray seems not work well with it. 19 | class WrapperWithLoadBit(IPEXLLMV1Wrapper): 20 | def __init__(self, *args, **kwargs) -> None: 21 | super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs) 22 | 23 | return WrapperWithLoadBit 24 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/gptq/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/ggml/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 
19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/gptq/convert/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/langchain/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/llamaindex/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 
19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-check-function.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # wrapper for pytest command 4 | # add this before `pytest ...` or `python -m pytest ...` to avoid unexpected exit code 127 caused by ipex on Windows 5 | # ref: https://github.com/intel/intel-extension-for-pytorch/issues/634 6 | pytest_check_error() { 7 | result=$(eval "$@" || echo "FINISH PYTEST") 8 | echo $result > pytest_check_error.log 9 | cat pytest_check_error.log 10 | failed_lines=$(cat pytest_check_error.log | { grep failed || true; }) 11 | if [[ $failed_lines != "" ]]; then 12 | exit 1 13 | fi 14 | rm pytest_check_error.log 15 | } 16 | 17 | # wrapper for python command 18 | # add this before `python ...` to avoid unexpected exit code 127 caused by ipex on Windows 19 | # ref: https://github.com/intel/intel-extension-for-pytorch/issues/634 20 | ipex_workaround_wrapper() { 21 | eval "$@" || ( [[ $? == 127 && $RUNNER_OS == "Windows" ]] && echo "EXIT CODE 127 DETECTED ON WINDOWS, IGNORE." || exit 1) 22 | } 23 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/langchain/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/awq/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 
19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_alpaca_with_1_arc_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export BIGDL_CHECK_DUPLICATE_IMPORT=0 18 | 19 | # You can also set the remote model repository to a local model path 20 | python lora_finetune_chatglm.py \ 21 | yahma/alpaca-cleaned \ 22 | THUDM/chatglm3-6b \ 23 | ./lora_config.yaml 24 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-llamaindex-tests-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/llamaindex_gpu 6 | 7 | if [[ $RUNNER_OS == "Linux" ]]; then 8 | export USE_XETLA=OFF 9 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 10 | elif [[ $RUNNER_OS == "Windows" ]]; then 11 | export ANALYTICS_ZOO_ROOT=$(cygpath -m ${ANALYTICS_ZOO_ROOT}) 12 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/llamaindex_gpu 13 | export SYCL_CACHE_PERSISTENT=1 14 | fi 15 | 16 | set -e 17 | 18 | echo "# Start testing inference" 19 | start=$(date "+%s") 20 | 21 | source ${ANALYTICS_ZOO_ROOT}/python/llm/test/run-llm-check-function.sh 22 | 23 | pytest_check_error python -m pytest -s ${LLM_INFERENCE_TEST_DIR} 24 | 25 | now=$(date "+%s") 26 | time=$((now-start)) 27 | 28 | echo "Bigdl-llm llamaindex gpu tests finished" 29 | echo "Time used:$time seconds" -------------------------------------------------------------------------------- /python/llm/dev/benchmark/LongBench/config/model2path.json: -------------------------------------------------------------------------------- 1 | { 2 | "llama2-7b-chat-4k": "meta-llama/Llama-2-7b-chat-hf", 3 | "longchat-v1.5-7b-32k": "lmsys/longchat-7b-v1.5-32k", 4 | "xgen-7b-8k": "Salesforce/xgen-7b-8k-inst", 5 | "internlm-7b-8k": "internlm/internlm-chat-7b-8k", 6 | "chatglm2-6b": "THUDM/chatglm2-6b", 7 | "chatglm2-6b-32k": "THUDM/chatglm2-6b-32k", 8 | "chatglm3-6b-32k": "THUDM/chatglm3-6b-32k", 9 | "chatglm4-9b": "THUDM/glm-4-9b-chat", 10 | "vicuna-v1.5-7b-16k": "lmsys/vicuna-7b-v1.5-16k", 11 | "mistral-7B-instruct-v0.2": "mistralai/Mistral-7B-Instruct-v0.2", 12 | "mistral-7B-instruct-v0.1": "mistralai/Mistral-7B-Instruct-v0.1", 13 | "mixtral-8x7B-instruct-v0.1": "mistralai/Mixtral-8x7B-Instruct-v0.1", 14 | "llama-2-7B-32k-instruct": "togethercomputer/Llama-2-7B-32K-Instruct", 15 | "lwm-text-chat-1m": 
"LargeWorldModel/LWM-Text-Chat-1M", 16 | "lwm-text-1m": "LargeWorldModel/LWM-Text-1M", 17 | "qwen2-7b-instruct": "Qwen/Qwen2-7B-Instruct" 18 | } 19 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_advertise_gen_with_1_arc_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export BIGDL_CHECK_DUPLICATE_IMPORT=0 18 | 19 | # You can also set the remote model repository to a local model path 20 | python lora_finetune_chatglm.py \ 21 | ./AdvertiseGen_fix \ 22 | THUDM/chatglm3-6b \ 23 | ./lora_config.yaml 24 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine, IPEXLLMAsyncV1Engine, IPEXLLMLLMV1Engine 17 | __all__ = [ 18 | "IPEXLLMAsyncLLMEngine", 19 | "IPEXLLMLLMEngine", 20 | "IPEXLLMClass", 21 | "IPEXLLMAsyncV1Engine", 22 | "IPEXLLMLLMV1Engine", 23 | "run_mp_engine", 24 | ] 25 | -------------------------------------------------------------------------------- /python/llm/example/CPU/Applications/streaming-llm/streaming_llm/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 
19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | 23 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/ggml/model/bloom/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from .bloom import Bloom 23 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/cpu/ipex_llm_wrapper.py: -------------------------------------------------------------------------------- 1 | from vllm.logger import init_logger 2 | from vllm.executor.ray_utils import RayWorkerWrapper 3 | 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | class IPEXLLMWrapper(RayWorkerWrapper): 9 | def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None: 10 | super().__init__(*args, **kwargs) 11 | from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert 12 | _ipex_llm_convert(load_in_low_bit=load_in_low_bit) 13 | self.compiled_dag_cuda_device_set = False 14 | 15 | 16 | def get_ipex_llm_wrapper(load_in_low_bit): 17 | # The reason why we not using functools.partial is that 18 | # ray seems not work well with it. 19 | class WrapperWithLoadBit(IPEXLLMWrapper): 20 | def __init__(self, *args, **kwargs) -> None: 21 | super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs) 22 | 23 | # a = functools.partial(IPEXLLMWrapper, load_in_low_bit=load_in_low_bit) 24 | return WrapperWithLoadBit 25 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/vllm/xpu/ipex_llm_wrapper.py: -------------------------------------------------------------------------------- 1 | from vllm.logger import init_logger 2 | from vllm.executor.ray_utils import RayWorkerWrapper 3 | 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | class IPEXLLMWrapper(RayWorkerWrapper): 9 | def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None: 10 | super().__init__(*args, **kwargs) 11 | from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert 12 | _ipex_llm_convert(load_in_low_bit=load_in_low_bit) 13 | self.compiled_dag_cuda_device_set = False 14 | 15 | 16 | def get_ipex_llm_wrapper(load_in_low_bit): 17 | # The reason why we not using functools.partial is that 18 | # ray seems not work well with it. 
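# Same pattern as the CPU wrapper above: a small subclass keeps the low-bit setting attached while still handing Ray a real class object for remote worker construction.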
19 | class WrapperWithLoadBit(IPEXLLMWrapper): 20 | def __init__(self, *args, **kwargs) -> None: 21 | super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs) 22 | 23 | # a = functools.partial(IPEXLLMWrapper, load_in_low_bit=load_in_low_bit) 24 | return WrapperWithLoadBit 25 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-langchain-tests-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain_gpu 6 | 7 | if [[ $RUNNER_OS == "Linux" ]]; then 8 | export USE_XETLA=OFF 9 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 10 | elif [[ $RUNNER_OS == "Windows" ]]; then 11 | export ANALYTICS_ZOO_ROOT=$(cygpath -m ${ANALYTICS_ZOO_ROOT}) 12 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain_gpu 13 | export SYCL_CACHE_PERSISTENT=1 14 | fi 15 | 16 | export DEVICE='xpu' 17 | 18 | set -e 19 | 20 | echo "# Start testing inference" 21 | start=$(date "+%s") 22 | 23 | source ${ANALYTICS_ZOO_ROOT}/python/llm/test/run-llm-check-function.sh 24 | 25 | pytest_check_error python -m pytest -s ${LLM_INFERENCE_TEST_DIR} 26 | 27 | now=$(date "+%s") 28 | time=$((now-start)) 29 | 30 | echo "Bigdl-llm langchain gpu tests finished" 31 | echo "Time used:$time seconds" -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/ggml/model/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from .utils import GenerationMixin 23 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/ggml/model/starcoder/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from .starcoder import Starcoder 23 | -------------------------------------------------------------------------------- /docker/llm/serving/cpu/docker/model_adapter.py.patch: -------------------------------------------------------------------------------- 1 | --- model_adapter.py.old 2024-03-05 15:08:47.169275336 +0800 2 | +++ model_adapter.py 2024-03-05 15:10:13.434703674 +0800 3 | @@ -1690,15 +1690,17 @@ 4 | ) 5 | # NOTE: if you use the old version of model file, please remove the comments below 6 | # config.use_flash_attn = False 7 | - self.float_set(config, "fp16") 8 | + # self.float_set(config, "fp16") 9 | generation_config = GenerationConfig.from_pretrained( 10 | model_path, trust_remote_code=True 11 | ) 12 | + from ipex_llm.transformers import AutoModelForCausalLM 13 | model = AutoModelForCausalLM.from_pretrained( 14 | model_path, 15 | config=config, 16 | low_cpu_mem_usage=True, 17 | trust_remote_code=True, 18 | + load_in_4bit=True, 19 | **from_pretrained_kwargs, 20 | ).eval() 21 | if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk: 22 | -------------------------------------------------------------------------------- /python/llm/test/install/test_install.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | import ipex_llm 19 | import pytest 20 | from unittest import TestCase 21 | 22 | 23 | class Test_LLM_Basics(TestCase): 24 | 25 | def test_naive(self): 26 | from ipex_llm.ggml import quantize 27 | from ipex_llm.utils.common import invalidInputError 28 | pass 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main([__file__]) 33 | -------------------------------------------------------------------------------- /python/llm/test/run-llm-convert-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src 5 | export LLM_CONVERT_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/convert 6 | 7 | set -e 8 | 9 | echo "# Start testing convert" 10 | start=$(date "+%s") 11 | 12 | # separate convert process to save disk space 13 | if [[ $1 == "llama" ]]; then 14 | python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_llama" 15 | elif [[ $1 == "gptneox" ]]; then 16 | python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_gptneox" 17 | elif [[ $1 == "bloom" ]]; then 18 | python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_bloom" 19 | elif [[ $1 == "starcoder" ]]; then 20 | python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_starcoder" 21 | fi 22 | 23 | now=$(date "+%s") 24 | time=$((now-start)) 25 | 26 | echo "Bigdl-llm tests finished" 27 | echo "Time used:$time seconds" 28 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/ggml/model/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from .llama_cpp import * 23 | from .llama import * 24 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/ggml/model/gptneox/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from .gptneox_cpp import * 23 | from .gptneox import * 24 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_gemma_2b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "google/gemma-2b-it" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" 22 | -------------------------------------------------------------------------------- /docker/llm/serving/cpu/docker/start-notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2016 The IPEX-LLM Authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | #set -x 19 | port=${port:-12345} 20 | token=${token:-""} 21 | 22 | while [ $# -gt 0 ]; do 23 | 24 | if [[ $1 == *"--"* ]]; then 25 | param="${1/--/}" 26 | declare $param="$2" 27 | fi 28 | 29 | shift 30 | done 31 | 32 | jupyter-lab --notebook-dir=/llm/ipex-llm-tutorial --ip=0.0.0.0 --port=$port --no-browser --NotebookApp.token=$token --allow-root 33 | -------------------------------------------------------------------------------- /python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Transformers INT4 Optimization for Large Language Model 2 | You can use IPEX-LLM to run any Huggingface Transformer models with INT4 optimizations on either servers or laptops. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. 3 | 4 | ## Recommended Requirements 5 | To run the examples, we recommend using Intel® Xeon® processors (server), or >= 12th Gen Intel® Core™ processor (client). 6 | 7 | For OS, IPEX-LLM supports Ubuntu 20.04 or later (glibc>=2.17), CentOS 7 or later (glibc>=2.17), and Windows 10/11. 8 | 9 | ## Best Known Configuration on Linux 10 | For better performance, it is recommended to set environment variables on Linux with the help of IPEX-LLM: 11 | ```bash 12 | pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu 13 | source ipex-llm-init 14 | ``` 15 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "meta-llama/Llama-2-7b-hf" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" 22 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_qwen15_7b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "Qwen/Qwen1.5-7B-Chat" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" 22 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_baichuan2_7b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "path/to/Baichuan2-7B-Chat" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" 22 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama3_8b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "meta-llama/Meta-Llama-3-8B-Instruct" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" 22 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=6 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 2 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" > training.log 27 | -------------------------------------------------------------------------------- /python/llm/example/CPU/Speculative-Decoding/Self-Speculation/README.md: -------------------------------------------------------------------------------- 1 | # Self-Speculative Decoding for Large Language Model BF16 Inference using IPEX-LLM on Intel CPUs 2 | You can use IPEX-LLM to run BF16 inference for any Huggingface Transformer model with ***self-speculative decoding*** on Intel CPUs. This directory contains example scripts to help you quickly get started to run some popular open-source models using self-speculative decoding. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. 3 | 4 | ## Verified Hardware Platforms 5 | 6 | - Intel Xeon SPR server 7 | 8 | ## Recommended Requirements 9 | To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../../README.md#system-support) for more information. Make sure you have installed `ipex-llm` before: 10 | 11 | ```bash 12 | pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu 13 | ``` 14 | 15 | Moreover, install IPEX 2.1.0, which can be done through `pip install intel_extension_for_pytorch==2.1.0`. 16 | -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/utils/common/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This would makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be module not found error in non-pip's setting as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from .log4Error import invalidInputError, invalidOperationError, MuteHFLogger 23 | from .lazyimport import LazyImport 24 | -------------------------------------------------------------------------------- /python/llm/example/CPU/Deepspeed-AutoTP/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 1. install oneccl for intel mpi 3 | # can skip this step if oneccl/oneapi is already installed on your machine 4 | # report to https://github.com/oneapi-src/oneCCL if any issue 5 | git clone https://github.com/oneapi-src/oneCCL.git 6 | cd oneCCL 7 | mkdir build 8 | cd build 9 | cmake .. 10 | make -j install 11 | mkdir -p /opt/intel/oneccl 12 | mv ./_install/env /opt/intel/oneccl 13 | # 2. install torch and ipex 14 | pip install torch==2.1.0 15 | pip install intel_extension_for_pytorch==2.1.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ 16 | # install torchccl (oneccl binding for pytorch) 17 | pip install https://intel-extension-for-pytorch.s3.amazonaws.com/torch_ccl/cpu/oneccl_bind_pt-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl 18 | # 3. install deepspeed 19 | pip install deepspeed==0.11.1 20 | # 4. exclude intel deepspeed extension, which is only for XPU 21 | pip uninstall intel-extension-for-deepspeed 22 | # 5. install ipex-llm 23 | pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu 24 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/stable-version-cpu-perf-test.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-2-7b-chat-hf' 3 | - 'meta-llama/Llama-2-13b-chat-hf' 4 | - 'THUDM/chatglm2-6b' 5 | - 'THUDM/chatglm3-6b' 6 | - 'baichuan-inc/Baichuan2-7B-Chat' 7 | - 'baichuan-inc/Baichuan2-13B-Chat' 8 | - 'Qwen/Qwen-14B-Chat' 9 | local_model_hub: '/mnt/disk1/models' 10 | warm_up: 1 11 | num_trials: 3 12 | num_beams: 1 # default to greedy search 13 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 14 | batch_size: 1 # default to 1 15 | in_out_pairs: 16 | - '32-32' 17 | - '1024-128' 18 | - '2048-256' 19 | test_api: 20 | - "transformer_int4" 21 | # - "native_int4" 22 | # - "optimize_model" 23 | # - "pytorch_autocast_bf16" 24 | # - "ipex_fp16_gpu" # on Intel GPU 25 | # - "transformer_int4_gpu" # on Intel GPU 26 | # - "optimize_model_gpu" # on Intel GPU 27 | # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server 28 | # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 29 | cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api) 30 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_relora_finetuning.py \ 19 | --base_model "meta-llama/Llama-2-7b-hf" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-relora-alpaca" \ 22 | --relora_steps 300 \ 23 | --relora_warmup_steps 10 24 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-isatty.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Yasuhiro MATSUMOTO 2 | 3 | MIT License (Expat) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --micro_batch_size 2 \ 20 | --batch_size 128 \ 21 | --base_model "meta-llama/Llama-2-7b-hf" \ 22 | --data_path "yahma/alpaca-cleaned" \ 23 | --output_dir "./ipex-llm-qlora-alpaca" 24 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --micro_batch_size 8 \ 20 | --batch_size 128 \ 21 | --base_model "meta-llama/Llama-2-7b-hf" \ 22 | --data_path "yahma/alpaca-cleaned" \ 23 | --output_dir "./ipex-llm-qlora-alpaca" 24 | -------------------------------------------------------------------------------- /python/llm/portable-zip/README-ui.md: -------------------------------------------------------------------------------- 1 | # IPEX-LLM Portable Zip with Web-UI For Windows: User Guide 2 | 3 | ## Introduction 4 | 5 | This portable zip includes everything you need to run an LLM with IPEX-LLM optimizations and chat with it in the Web-UI. Please refer to the [How to use](#how-to-use) section to get started. 6 | 7 | ### 6B model running on an Intel 11th Gen Core PC (real-time screen capture) 8 | 9 | 10 | ### Verified Models 11 | 12 | - ChatGLM2-6b 13 | 14 | ## How to use 15 | 16 | 1. Download the zip from the link [here](). 17 | 2. (Optional) Alternatively, you can build the zip yourself: run `setup.bat --ui` and it will generate the zip file. 18 | 3. Unzip `ipex-llm.zip`. 19 | 4. Download the model to your computer. 20 | 5.
Go into the unzipped folder and double-click `chat-ui.bat`. Enter the path of the model (e.g. `path\to\model`; note that there is no trailing slash at the end of the path). Press Enter and wait until it shows `All service started. Visit 127.0.0.1:7860 in browser to chat.`. Do NOT close the terminal window! 21 | 6. Visit `127.0.0.1:7860` in your browser and enjoy chatting! 22 | 7. To stop the program, just close the terminal window. -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/transformers/patches.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # 17 | 18 | from typing import List 19 | from transformers.dynamic_module_utils import get_imports 20 | from ipex_llm.utils.ipex_importer import IPEXImporter 21 | 22 | 23 | def patch_flash_attn_import(filename: str) -> List[str]: 24 | """Workaround for https://huggingface.co/microsoft/phi-1_5/discussions/72.""" 25 | imports = get_imports(filename) 26 | if "flash_attn" in imports: 27 | imports.remove("flash_attn") 28 | return imports 29 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "meta-llama/Llama-2-13b-hf" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" \ 22 | --micro_batch_size 8 \ 23 | --batch_size 128 24 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/cpu-perf-test.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'meta-llama/Llama-2-7b-chat-hf' 3 | - 'meta-llama/Llama-2-13b-chat-hf' 4 | - 'THUDM/chatglm2-6b' 5 | - 'THUDM/chatglm3-6b' 6 | - 'baichuan-inc/Baichuan2-7B-Chat' 7 | - 'baichuan-inc/Baichuan2-13B-Chat' 8 | # - 'Qwen/Qwen-14B-Chat' # requires transformers < 4.37.0 9 | local_model_hub: '/mnt/disk1/models' 10 | warm_up: 1 11 | num_trials: 3 12 | num_beams: 1 # default to greedy search 13 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 14 | batch_size: 1 # default to 1 15 | in_out_pairs: 16 | - '32-32' 17 | - '1024-128' 18 | - '2048-256' 19 | test_api: 20 | - "transformer_int4" 21 | # - "native_int4" 22 | # - "optimize_model" 23 | # - "pytorch_autocast_bf16" 24 | # - "ipex_fp16_gpu" # on Intel GPU 25 | # - "transformer_int4_gpu" # on Intel GPU 26 | # - "optimize_model_gpu" # on Intel GPU 27 | # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server 28 | # - "transformer_int4_gpu_win" # on Intel GPU for Windows (captures GPU peak memory) 29 | cpu_embedding: False # whether to put the embedding layer on CPU (currently only available for the gpu win related test_api) 30 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-tablewriter.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 by Oleku Konko 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_chatglm3_6b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qlora_finetuning.py \ 19 | --base_model "THUDM/chatglm3-6b" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --lora_target_modules '[query_key_value,dense,dense_h_to_4h,dense_4h_to_h]' \ 22 | --output_dir "./ipex-llm-qlora-alpaca" 23 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=6 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 2 \ 23 | python -u ./alpaca_relora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-relora-alpaca" \ 27 | --relora_steps 300 \ 28 | --relora_warmup_steps 10 > training.log 29 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=28 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 4 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --batch_size 128 > training.log 29 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 2 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --batch_size 128 > training.log 29 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 8 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --batch_size 128 > training.log 29 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-hm.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Xuanyi Chew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 2 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-13b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --batch_size 128 > training.log 29 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 8 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-13b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --batch_size 128 > training.log 29 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/ceval/evaluators/evaluator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | class Evaluator: 18 | def __init__(self, choices, model_path, device, qtype): 19 | self.choices = choices 20 | self.model_path = model_path 21 | self.device = device 22 | self.qtype = qtype 23 | 24 | def format_example(self, line, **kwargs): 25 | pass 26 | 27 | def eval_subject(self, subject_name, test_df, eval_type, **kwargs): 28 | pass 29 | 30 | def extract_answer(self, response, row, **kwargs): 31 | pass 32 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-gin-contrib-cors.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Gin-Gonic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-urn.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Leonardo Di Donato 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-gorgonia.org-vecf32.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Chewxy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
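The `Evaluator` base class in `python/llm/dev/benchmark/ceval/evaluators/evaluator.py` above only fixes the interface (`format_example`, `eval_subject`, `extract_answer`); the concrete C-Eval evaluators are not included in this dump. Below is a minimal sketch of how a subclass might fill that interface in for multiple-choice data, assuming a pandas-style `test_df` with `question`, per-choice, and `answer` columns, and a hypothetical `self.generate` helper for model inference.

from evaluators.evaluator import Evaluator

class SimpleChoiceEvaluator(Evaluator):
    """Illustrative multiple-choice evaluator; not the repo's actual implementation."""

    def format_example(self, line, **kwargs):
        # Assumes `line` behaves like a row with 'question' plus one column per choice.
        options = "\n".join(f"{c}. {line[c]}" for c in self.choices)
        return f"{line['question']}\n{options}\nAnswer:"

    def eval_subject(self, subject_name, test_df, eval_type="validation", **kwargs):
        correct = 0
        for _, row in test_df.iterrows():
            prompt = self.format_example(row)
            response = self.generate(prompt)   # hypothetical model call, not in the base class
            if self.extract_answer(response, row) == row["answer"]:
                correct += 1
        return correct / len(test_df)

    def extract_answer(self, response, row, **kwargs):
        # Return the first choice letter (e.g. 'A'..'D') that appears in the response.
        for token in response.strip().upper():
            if token in self.choices:
                return token
        return None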
-------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-mimetype.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Gabriel Vasile 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-uniseg.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Oliver Kuederle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/src/ipex_llm/models.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # This makes sure Python is aware there is more than one sub-package within bigdl, 18 | # physically located elsewhere. 19 | # Otherwise there would be a module-not-found error in a non-pip setting, as Python would 20 | # only search the first bigdl package and end up finding only one sub-package. 21 | 22 | from ipex_llm.ggml.model.llama import Llama 23 | from ipex_llm.ggml.model.gptneox import Gptneox 24 | from ipex_llm.ggml.model.bloom import Bloom 25 | from ipex_llm.ggml.model.starcoder import Starcoder 26 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-d4l3k-go-bfloat16.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tristan Rice 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-gorgonia.org-vecf64.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xuanyi Chew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
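`patch_flash_attn_import` in `python/llm/src/ipex_llm/transformers/patches.py` (shown earlier in this dump) filters `flash_attn` out of the import list that transformers scans when loading remote code. One plausible way to apply it manually, sketched below, is to substitute it for `transformers.dynamic_module_utils.get_imports` while calling `from_pretrained`; whether ipex-llm already wires this patch in for you depends on the installed version, so treat this purely as an illustration.

from unittest.mock import patch
from transformers import AutoModelForCausalLM
from ipex_llm.transformers.patches import patch_flash_attn_import

# Route transformers' dynamic-module import scan through the patch so that a
# missing `flash_attn` package does not block models whose remote code imports it.
with patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import):
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-1_5",   # example model from the discussion linked in the docstring
        trust_remote_code=True,
    )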
-------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-validator.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Dean Karn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_alpaca_with_2_arc_cards.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=6 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | export BIGDL_CHECK_DUPLICATE_IMPORT=0 22 | 23 | # You can also set the remote model repository to a local model path 24 | mpirun -n 2 \ 25 | python lora_finetune_chatglm.py \ 26 | yahma/alpaca-cleaned \ 27 | THUDM/chatglm3-6b \ 28 | ./lora_config.yaml \ 29 | ./deepspeed_config.json 30 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/core-perf-test.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'THUDM/chatglm2-6b' 3 | - 'THUDM/chatglm3-6b' 4 | - 'baichuan-inc/Baichuan2-7B-Chat' 5 | - 'internlm/internlm-chat-7b' 6 | # - 'Qwen/Qwen-7B-Chat' # requires transformers < 4.37.0 7 | - 'BAAI/AquilaChat2-7B' 8 | - 'meta-llama/Llama-2-7b-chat-hf' 9 | - 'WisdomShell/CodeShell-7B' 10 | - 'tiiuae/falcon-7b-instruct-with-patch' 11 | local_model_hub: 'D:\llm-models' 12 | warm_up: 1 13 | num_trials: 3 14 | num_beams: 1 # default to greedy search 15 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 16 | batch_size: 1 # default to 1 17 | in_out_pairs: 18 | - '32-32' 19 | - '1024-128' 20 | test_api: 21 | - "transformer_int4" 22 | # - "native_int4" 23 | # - "optimize_model" 24 | # - "pytorch_autocast_bf16" 25 | # - "ipex_fp16_gpu" # on Intel GPU 26 | # - "transformer_int4_gpu" # on Intel GPU 27 | # - "optimize_model_gpu" # on Intel GPU 28 | # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server 29 | # - "transformer_int4_gpu_win" # on Intel GPU for Windows (captures GPU peak memory) 30 | cpu_embedding: False # whether to put the embedding layer on CPU (currently only available for the gpu win related test_api) 31 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-gin.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Manuel Martínez-Almeida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-runewidth.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Yasuhiro Matsumoto 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
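The `transformer_int4` entries under `test_api` in the perf-test YAMLs above exercise ipex-llm's low-bit transformers path. A minimal sketch of that kind of run is given below, assuming a current `ipex-llm[all]` install; check the repo's README for the exact API of your version, and note that the model id is simply one of the `repo_id` entries from the config.

import torch
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # one of the repo_id entries above
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,          # roughly the 'sym_int4' low_bit setting in the YAML
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids
    output = model.generate(input_ids, max_new_tokens=32, num_beams=1)
    print(tokenizer.decode(output[0], skip_special_tokens=True))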
-------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-sse.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Manuel Martínez-Almeida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_advertise_gen_with_2_arc_cards.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=6 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | export BIGDL_CHECK_DUPLICATE_IMPORT=0 22 | 23 | # You can also set the remote model repository to a local model path 24 | mpirun -n 2 \ 25 | python lora_finetune_chatglm.py \ 26 | ./AdvertiseGen_fix \ 27 | THUDM/chatglm3-6b \ 28 | ./lora_config.yaml \ 29 | ./deepspeed_config.json 30 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-playground-locales.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Go Playground 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-agnivade-levenshtein.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Agniva De Sarker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
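The ChatGLM3 LoRA scripts above hand `lora_finetune_chatglm.py` a `lora_config.yaml`, and the ChatGLM3 QLoRA script earlier in this dump passes `--lora_target_modules '[query_key_value,dense,dense_h_to_4h,dense_4h_to_h]'`. The actual `lora_config.yaml` is not included here; purely as an illustration of the kind of adapter settings such a config maps to, a PEFT `LoraConfig` with those target modules could look like the following, where the rank, alpha, and dropout values are placeholders rather than the repo's values.

from peft import LoraConfig

# Illustrative only: target modules mirror the ChatGLM3 scripts in this repo,
# while r / lora_alpha / lora_dropout are placeholder hyperparameters.
chatglm3_lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
)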
-------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-gin-contrib-sse.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Manuel Martínez-Almeida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-float16.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-present Montgomery Edwards⁴⁴⁸ and Faye Amacker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=12 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 3 \ 23 | python -u ./alpaca_qlora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-qlora-alpaca" \ 27 | --gradient_checkpointing False \ 28 | --micro_batch_size 2 \ 29 | --batch_size 128 > training.log 30 | -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-playground-universal-translator.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Go Playground 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-toml.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | go-toml v2 4 | Copyright (c) 2021 - 2023 Thomas Pelletier 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-go-codec.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2012-2020 Ugorji Nwoke. 4 | All rights reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. -------------------------------------------------------------------------------- /python/llm/tpp/licenses/LICENSE-zlib.txt: -------------------------------------------------------------------------------- 1 | /* zlib.h -- interface of the 'zlib' general purpose compression library 2 | version 1.3.1, January 22nd, 2024 3 | 4 | Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler 5 | 6 | This software is provided 'as-is', without any express or implied 7 | warranty. In no event will the authors be held liable for any damages 8 | arising from the use of this software. 9 | 10 | Permission is granted to anyone to use this software for any purpose, 11 | including commercial applications, and to alter it and redistribute it 12 | freely, subject to the following restrictions: 13 | 14 | 1. The origin of this software must not be misrepresented; you must not 15 | claim that you wrote the original software. If you use this software 16 | in a product, an acknowledgment in the product documentation would be 17 | appreciated but is not required. 18 | 2. Altered source versions must be plainly marked as such, and must not be 19 | misrepresented as being the original software. 20 | 3. This notice may not be removed or altered from any source distribution. 
21 | 22 | Jean-loup Gailly Mark Adler 23 | jloup@gzip.org madler@alumni.caltech.edu 24 | 25 | */ -------------------------------------------------------------------------------- /python/llm/example/GPU/HuggingFace/LLM/yuan2/yuan2-2B-instruct/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config":true, 3 | "architectures": [ 4 | "YuanForCausalLM" 5 | ], 6 | "auto_map":{ 7 | "AutoConfig":"configuration_yuan.YuanConfig", 8 | "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM" 9 | }, 10 | "tokenizer_class":"YuanTokenizer", 11 | "hidden_act": "silu", 12 | "hidden_size": 2048, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 8192, 15 | "max_position_embeddings": 8192, 16 | "model_type": "yuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 24, 19 | "rms_norm_eps": 1e-06, 20 | "dropout": 0.1, 21 | "tie_word_embeddings": true, 22 | "torch_dtype": "bfloat16", 23 | "transformers_version": "4.30.0.dev0", 24 | "use_cache": true, 25 | "causal_mask": true, 26 | "use_flash_attention": false, 27 | "reset_attention_mask": true, 28 | "reset_position_ids": true, 29 | "use_loss_mask": false, 30 | "eod_token": 77185, 31 | "sep_token": 77187, 32 | "eod_token_id": 77185, 33 | "sep_token_id": 77185, 34 | "pad_token_id": 77185, 35 | "bos_token_id": 77185, 36 | "eos_token_id": 77185, 37 | "mask_token_id": 77185, 38 | "vocab_size": 135040 39 | } -------------------------------------------------------------------------------- /python/llm/example/CPU/PyTorch-Models/Model/yuan2/yuan2-2B-instruct/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config":true, 3 | "architectures": [ 4 | "YuanForCausalLM" 5 | ], 6 | "auto_map":{ 7 | "AutoConfig":"configuration_yuan.YuanConfig", 8 | "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM" 9 | }, 10 | "tokenizer_class":"YuanTokenizer", 11 | "hidden_act": "silu", 12 | "hidden_size": 2048, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 8192, 15 | "max_position_embeddings": 8192, 16 | "model_type": "yuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 24, 19 | "rms_norm_eps": 1e-06, 20 | "dropout": 0.1, 21 | "tie_word_embeddings": true, 22 | "torch_dtype": "bfloat16", 23 | "transformers_version": "4.30.0.dev0", 24 | "use_cache": true, 25 | "causal_mask": true, 26 | "use_flash_attention": false, 27 | "reset_attention_mask": true, 28 | "reset_position_ids": true, 29 | "use_loss_mask": false, 30 | "eod_token": 77185, 31 | "sep_token": 77187, 32 | "eod_token_id": 77185, 33 | "sep_token_id": 77185, 34 | "pad_token_id": 77185, 35 | "bos_token_id": 77185, 36 | "eos_token_id": 77185, 37 | "mask_token_id": 77185, 38 | "vocab_size": 135040 39 | } -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_lora_finetuning.py \ 19 | --micro_batch_size 8 \ 20 | --batch_size 128 \ 21 | --base_model "meta-llama/Llama-2-7b-hf" \ 22 | --data_path "yahma/alpaca-cleaned" \ 23 | --output_dir "./ipex-llm-lora-alpaca" \ 24 | --gradient_checkpointing True \ 25 | --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj']" 26 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 2 \ 23 | python -u ./alpaca_relora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-relora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --relora_steps 300 \ 29 | --relora_warmup_steps 10 \ 30 | --batch_size 128 > relora_training.log 31 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 8 \ 23 | python -u ./alpaca_relora_finetuning.py \ 24 | --base_model "meta-llama/Llama-2-7b-hf" \ 25 | --data_path "yahma/alpaca-cleaned" \ 26 | --output_dir "./ipex-llm-relora-alpaca" \ 27 | --micro_batch_size 8 \ 28 | --relora_steps 300 \ 29 | --relora_warmup_steps 10 \ 30 | --batch_size 128 > relora_training.log 31 | -------------------------------------------------------------------------------- /python/llm/example/GPU/PyTorch-Models/Model/yuan2/yuan2-2B-instruct/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config":true, 3 | "architectures": [ 4 | "YuanForCausalLM" 5 | ], 6 | "auto_map":{ 7 | "AutoConfig":"configuration_yuan.YuanConfig", 8 | "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM" 9 | }, 10 | "tokenizer_class":"YuanTokenizer", 11 | "hidden_act": "silu", 12 | "hidden_size": 2048, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 8192, 15 | "max_position_embeddings": 8192, 16 | "model_type": "yuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 24, 19 | "rms_norm_eps": 1e-06, 20 | "dropout": 0.1, 21 | "tie_word_embeddings": true, 22 | "torch_dtype": "bfloat16", 23 | "transformers_version": "4.30.0.dev0", 24 | "use_cache": true, 25 | "causal_mask": true, 26 | "use_flash_attention": false, 27 | "reset_attention_mask": true, 28 | "reset_position_ids": true, 29 | "use_loss_mask": false, 30 | "eod_token": 77185, 31 | "sep_token": 77187, 32 | "eod_token_id": 77185, 33 | "sep_token_id": 77185, 34 | "pad_token_id": 77185, 35 | "bos_token_id": 77185, 36 | "eos_token_id": 77185, 37 | "mask_token_id": 77185, 38 | "vocab_size": 135040 39 | } -------------------------------------------------------------------------------- /python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/yuan2-2B-instruct/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config":true, 3 | "architectures": [ 4 | "YuanForCausalLM" 5 | ], 6 | "auto_map":{ 7 | "AutoConfig":"configuration_yuan.YuanConfig", 8 | "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM" 9 | }, 10 | "tokenizer_class":"YuanTokenizer", 11 | "hidden_act": "silu", 12 | "hidden_size": 2048, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 8192, 15 | "max_position_embeddings": 8192, 16 | "model_type": "yuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 24, 19 | "rms_norm_eps": 1e-06, 20 | "dropout": 0.1, 21 | "tie_word_embeddings": true, 22 | "torch_dtype": "bfloat16", 23 | "transformers_version": "4.30.0.dev0", 24 | "use_cache": true, 25 | "causal_mask": true, 26 | "use_flash_attention": false, 27 | "reset_attention_mask": true, 28 | "reset_position_ids": true, 29 | "use_loss_mask": false, 30 | "eod_token": 77185, 31 | "sep_token": 77187, 32 | "eod_token_id": 77185, 33 | "sep_token_id": 77185, 34 | "pad_token_id": 77185, 35 | "bos_token_id": 77185, 36 | "eos_token_id": 77185, 37 | "mask_token_id": 77185, 38 | "vocab_size": 135040 39 | } -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'THUDM/chatglm3-6b' 3 | - 'THUDM/glm-4-9b-chat' 4 | - 'baichuan-inc/Baichuan2-7B-Chat' 5 | - 'meta-llama/Llama-2-7b-chat-hf' 6 | - 
'meta-llama/Llama-2-13b-chat-hf' 7 | - 'meta-llama/Meta-Llama-3-8B-Instruct' 8 | - 'mistralai/Mistral-7B-Instruct-v0.2' 9 | - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' 10 | - '01-ai/Yi-6B-Chat' 11 | - 'openbmb/MiniCPM-1B-sft-bf16' 12 | - 'openbmb/MiniCPM-2B-sft-bf16' 13 | - 'Qwen/Qwen1.5-7B-Chat' 14 | - 'Qwen/Qwen2-1.5B-Instruct' 15 | - 'Qwen/Qwen2-7B-Instruct' 16 | - 'microsoft/Phi-3-mini-4k-instruct' 17 | - 'microsoft/Phi-3-mini-128k-instruct' 18 | - 'microsoft/phi-3-vision-128k-instruct' 19 | - 'openbmb/MiniCPM-V-2_6' 20 | local_model_hub: 'path to your local model hub' 21 | warm_up: 1 22 | num_trials: 3 23 | num_beams: 1 # default to greedy search 24 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 25 | batch_size: 1 # default to 1 26 | in_out_pairs: 27 | - '3072-384' 28 | test_api: 29 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 30 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 31 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'THUDM/chatglm3-6b' 3 | - 'THUDM/glm-4-9b-chat' 4 | - 'baichuan-inc/Baichuan2-7B-Chat' 5 | - 'meta-llama/Llama-2-7b-chat-hf' 6 | - 'meta-llama/Llama-2-13b-chat-hf' 7 | - 'meta-llama/Meta-Llama-3-8B-Instruct' 8 | - 'mistralai/Mistral-7B-Instruct-v0.2' 9 | - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' 10 | - '01-ai/Yi-6B-Chat' 11 | - 'openbmb/MiniCPM-1B-sft-bf16' 12 | - 'openbmb/MiniCPM-2B-sft-bf16' 13 | - 'Qwen/Qwen1.5-7B-Chat' 14 | - 'Qwen/Qwen2-1.5B-Instruct' 15 | - 'Qwen/Qwen2-7B-Instruct' 16 | - 'microsoft/Phi-3-mini-4k-instruct' 17 | - 'microsoft/Phi-3-mini-128k-instruct' 18 | - 'microsoft/phi-3-vision-128k-instruct' 19 | - 'openbmb/MiniCPM-V-2_6' 20 | local_model_hub: 'path to your local model hub' 21 | warm_up: 1 22 | num_trials: 3 23 | num_beams: 1 # default to greedy search 24 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 25 | batch_size: 1 # default to 1 26 | in_out_pairs: 27 | - '4096-512' 28 | test_api: 29 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer 30 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 31 | -------------------------------------------------------------------------------- /python/llm/dev/benchmark/all-in-one/prompt/continuation/256.txt: -------------------------------------------------------------------------------- 1 | Once upon a time, there was a young girl named Samantha who lived with her parents in a small town. Samantha had always dreamed of traveling the world and experiencing new cultures and adventures. But as much as she yearned for something more than what her town could offer, it felt out of reach for a girl like her. 2 | One day, while browsing through the pages of a travel magazine, Samantha came across an advertisement that seemed too good to be true. It was an invitation to travel to a faraway land and experience all the adventures she had ever dreamed of. The only catch was that she needed to attend a special briefing beforehand in order to ensure her safety during the trip. 3 | Samantha quickly scribbled down the information on the back of the advertisement and resolved to attend the briefing at the end of the week. 
As time passed, Samantha became more and more excited about the prospect of traveling abroad. She even began putting together a packing list, imagining all the things she would need for her adventure. 4 | Finally, the day of the briefing arrived. Samantha made sure to arrive early so that she could go through security clearance before anyone else did. As she waited in line, she couldn't help but 5 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_pvc_1110_4_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=14 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 4 \ 23 | python -u ./alpaca_lora_finetuning.py \ 24 | --micro_batch_size 8 \ 25 | --batch_size 128 \ 26 | --base_model "meta-llama/Llama-2-7b-hf" \ 27 | --data_path "yahma/alpaca-cleaned" \ 28 | --output_dir "./ipex-llm-lora-alpaca" \ 29 | --gradient_checkpointing True \ 30 | --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" 31 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_pvc_1550_4_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | export MASTER_ADDR=127.0.0.1 18 | export OMP_NUM_THREADS=56 19 | export FI_PROVIDER=tcp 20 | export CCL_ATL_TRANSPORT=ofi 21 | 22 | mpirun -n 8 \ 23 | python -u ./alpaca_lora_finetuning.py \ 24 | --micro_batch_size 8 \ 25 | --batch_size 128 \ 26 | --base_model "meta-llama/Llama-2-7b-hf" \ 27 | --data_path "yahma/alpaca-cleaned" \ 28 | --output_dir "./ipex-llm-lora-alpaca" \ 29 | --gradient_checkpointing False \ 30 | --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" 31 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/QA-LoRA/qalora_finetune_llama2_7b_arc_1_card.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_qalora_finetuning.py \ 19 | --base_model "meta-llama/Llama-2-7b-hf" \ 20 | --data_path "yahma/alpaca-cleaned" \ 21 | --output_dir "./ipex-llm-qlora-alpaca" \ 22 | --learning_rate 9e-5 \ 23 | --micro_batch_size 2 \ 24 | --batch_size 128 \ 25 | --lora_r 8 \ 26 | --lora_alpha 16 \ 27 | --lora_dropout 0.05 \ 28 | --val_set_size 2000 29 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'THUDM/chatglm3-6b' 3 | - 'THUDM/glm-4-9b-chat' 4 | - 'baichuan-inc/Baichuan2-7B-Chat' 5 | - 'baichuan-inc/Baichuan2-13B-Chat' 6 | - 'meta-llama/Llama-2-7b-chat-hf' 7 | - 'meta-llama/Llama-2-13b-chat-hf' 8 | - 'meta-llama/Meta-Llama-3-8B-Instruct' 9 | - 'mistralai/Mistral-7B-Instruct-v0.2' 10 | - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' 11 | - '01-ai/Yi-6B-Chat' 12 | - 'openbmb/MiniCPM-1B-sft-bf16' 13 | - 'openbmb/MiniCPM-2B-sft-bf16' 14 | - 'Qwen/Qwen1.5-7B-Chat' 15 | - 'Qwen/Qwen2-1.5B-Instruct' 16 | - 'Qwen/Qwen2-7B-Instruct' 17 | - 'microsoft/Phi-3-mini-4k-instruct' 18 | - 'microsoft/Phi-3-mini-128k-instruct' 19 | - 'microsoft/phi-3-vision-128k-instruct' 20 | local_model_hub: 'path to your local model hub' 21 | warm_up: 1 22 | num_trials: 3 23 | num_beams: 1 # default to greedy search 24 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) 25 | batch_size: 1 # default to 1 26 | in_out_pairs: 27 | - '1024-128' 28 | test_api: 29 | - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 30 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 31 | -------------------------------------------------------------------------------- /python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml: -------------------------------------------------------------------------------- 1 | repo_id: 2 | - 'THUDM/chatglm3-6b' 3 | - 'THUDM/glm-4-9b-chat' 4 | - 'baichuan-inc/Baichuan2-7B-Chat' 5 | - 'baichuan-inc/Baichuan2-13B-Chat' 6 | - 'meta-llama/Llama-2-7b-chat-hf' 7 | - 'meta-llama/Llama-2-13b-chat-hf' 8 | - 'meta-llama/Meta-Llama-3-8B-Instruct' 9 | - 'mistralai/Mistral-7B-Instruct-v0.2' 10 | - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' 11 | - '01-ai/Yi-6B-Chat' 12 | - 'openbmb/MiniCPM-1B-sft-bf16' 13 | - 'openbmb/MiniCPM-2B-sft-bf16' 14 | - 'Qwen/Qwen1.5-7B-Chat' 15 | - 'Qwen/Qwen2-1.5B-Instruct' 16 | - 'Qwen/Qwen2-7B-Instruct' 17 | - 'microsoft/Phi-3-mini-4k-instruct' 18 | - 'microsoft/Phi-3-mini-128k-instruct' 19 | - 'microsoft/phi-3-vision-128k-instruct' 20 | - 'openbmb/MiniCPM-V-2_6' 21 | local_model_hub: 'path to your local model hub' 22 | warm_up: 3 23 | num_trials: 5 24 | num_beams: 1 # default to greedy search 25 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) 26 | batch_size: 1 # default to 1 27 | in_out_pairs: 28 | - '32-32' 29 | test_api: 30 | - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) 31 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) 32 | -------------------------------------------------------------------------------- /python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_pvc_1550_1_tile.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file 18 | python ./alpaca_lora_finetuning.py \ 19 | --micro_batch_size 8 \ 20 | --batch_size 128 \ 21 | --base_model "meta-llama/Llama-2-7b-hf" \ 22 | --data_path "yahma/alpaca-cleaned" \ 23 | --output_dir "./ipex-llm-lora-alpaca" \ 24 | --gradient_checkpointing True \ 25 | --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" 26 | --------------------------------------------------------------------------------
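Editor's note (not a file from the repository): the igpu-perf YAML files above each describe a sweep of models, prompt/generation lengths, and a test API. A minimal sketch of how such a file could be read and expanded into individual benchmark runs is shown below; the file name "32-32_int4_fp16.yaml" and the printed run description are illustrative, and only the keys visible in the YAML contents above (repo_id, in_out_pairs, test_api, low_bit, batch_size) are assumed.

    # Sketch: expand one igpu-perf YAML config into its individual benchmark runs.
    import itertools
    import yaml  # PyYAML

    with open("32-32_int4_fp16.yaml") as f:          # hypothetical local copy of the config
        conf = yaml.safe_load(f)

    # One run per (model, input-output pair, test API) combination.
    for repo_id, in_out, api in itertools.product(conf["repo_id"],
                                                  conf["in_out_pairs"],
                                                  conf["test_api"]):
        in_len, out_len = (int(x) for x in in_out.split("-"))
        print(f"{api}: {repo_id}, prompt={in_len} tokens, generate={out_len} tokens, "
              f"low_bit={conf['low_bit']}, batch_size={conf['batch_size']}")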
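Editor's note (not a file from the repository): the yuan2-2B-instruct config.json copies above are placeholders for a full Yuan2 checkpoint. The sketch below shows one plausible way such a checkpoint could be loaded through the ipex_llm.transformers package visible in the source tree; it assumes that package exposes a drop-in AutoModelForCausalLM with a load_in_4bit option, that the checkpoint folder also contains the configuration_yuan.py / yuan_hf_model.py modules referenced in "auto_map", and that AutoTokenizer can resolve the Yuan tokenizer with trust_remote_code. Treat the exact arguments as illustrative, not authoritative.

    # Sketch: low-bit loading of a Yuan2-2B-instruct checkpoint (assumed API, see note above).
    from ipex_llm.transformers import AutoModelForCausalLM
    from transformers import AutoTokenizer

    model_path = "./yuan2-2B-instruct"  # hypothetical local checkpoint folder

    # trust_remote_code is needed because auto_map points at custom Yuan model code.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_4bit=True,
                                                 trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    inputs = tokenizer("What is AI?", return_tensors="pt")
    output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))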