├── python
    └── llm
    │   ├── .gitignore
    │   ├── version.txt
    │   ├── portable-zip
    │       ├── .gitignore
    │       ├── chat.bat
    │       ├── setup.md
    │       └── README-ui.md
    │   ├── example
    │       ├── CPU
    │       │   ├── Speculative-Decoding
    │       │   │   ├── EAGLE
    │       │   │   │   └── requirements.txt
    │       │   │   ├── README.md
    │       │   │   └── Self-Speculation
    │       │   │   │   └── README.md
    │       │   ├── QLoRA-FineTuning
    │       │   │   └── alpaca-qlora
    │       │   │   │   ├── templates
    │       │   │   │       ├── alpaca_short.json
    │       │   │   │       ├── alpaca.json
    │       │   │   │       ├── alpaca_legacy.json
    │       │   │   │       └── vigogne.json
    │       │   │   │   └── finetune_one_node_two_sockets.sh
    │       │   ├── PyTorch-Models
    │       │   │   ├── README.md
    │       │   │   └── Model
    │       │   │   │   ├── README.md
    │       │   │   │   └── yuan2
    │       │   │   │       └── yuan2-2B-instruct
    │       │   │   │           └── config.json
    │       │   ├── HF-Transformers-AutoModels
    │       │   │   ├── README.md
    │       │   │   └── Model
    │       │   │   │   ├── README.md
    │       │   │   │   └── yuan2
    │       │   │   │       └── yuan2-2B-instruct
    │       │   │   │           └── config.json
    │       │   ├── Deepspeed-AutoTP
    │       │   │   ├── run.sh
    │       │   │   └── install.sh
    │       │   └── Applications
    │       │   │   └── streaming-llm
    │       │   │       └── streaming_llm
    │       │   │           └── __init__.py
    │       └── GPU
    │       │   ├── Speculative-Decoding
    │       │       ├── EAGLE
    │       │       │   └── requirements.txt
    │       │       └── README.md
    │       │   ├── vLLM-Serving
    │       │       ├── fp8_kv.png
    │       │       └── max_length.png
    │       │   ├── Deepspeed-AutoTP-FastAPI
    │       │       └── prompt
    │       │       │   └── 32.txt
    │       │   ├── Pipeline-Parallel-Serving
    │       │       └── prompt
    │       │       │   ├── 32.txt
    │       │       │   └── 128.txt
    │       │   ├── HuggingFace
    │       │       ├── Multimodal
    │       │       │   └── README.md
    │       │       ├── LLM
    │       │       │   ├── README.md
    │       │       │   └── yuan2
    │       │       │   │   └── yuan2-2B-instruct
    │       │       │   │       └── config.json
    │       │       └── README.md
    │       │   ├── LLM-Finetuning
    │       │       ├── LoRA
    │       │       │   ├── chatglm_finetune
    │       │       │   │   ├── deepspeed_config.json
    │       │       │   │   ├── lora_finetuning_chatglm3_6b_on_alpaca_with_1_arc_card.sh
    │       │       │   │   ├── lora_finetuning_chatglm3_6b_on_advertise_gen_with_1_arc_card.sh
    │       │       │   │   ├── lora_finetuning_chatglm3_6b_on_alpaca_with_2_arc_cards.sh
    │       │       │   │   └── lora_finetuning_chatglm3_6b_on_advertise_gen_with_2_arc_cards.sh
    │       │       │   ├── deepspeed_zero3_config.json
    │       │       │   ├── lora_finetune_llama2_7b_arc_1_card.sh
    │       │       │   ├── lora_finetune_llama2_7b_pvc_1110_4_card.sh
    │       │       │   ├── lora_finetune_llama2_7b_pvc_1550_4_card.sh
    │       │       │   └── lora_finetune_llama2_7b_pvc_1550_1_tile.sh
    │       │       ├── QLoRA
    │       │       │   ├── alpaca-qlora
    │       │       │   │   ├── deepspeed_zero2.json
    │       │       │   │   ├── deepspeed_zero3.json
    │       │       │   │   ├── qlora_finetune_gemma_2b_arc_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_arc_1_card.sh
    │       │       │   │   ├── qlora_finetune_qwen15_7b_arc_1_card.sh
    │       │       │   │   ├── qlora_finetune_baichuan2_7b_arc_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama3_8b_arc_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_arc_2_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_flex_170_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_pvc_1100_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_13b_pvc_1550_1_tile.sh
    │       │       │   │   ├── qlora_finetune_chatglm3_6b_arc_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_pvc_1100_4_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_pvc_1550_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_7b_pvc_1550_4_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_13b_pvc_1550_1_card.sh
    │       │       │   │   ├── qlora_finetune_llama2_13b_pvc_1550_4_card.sh
    │       │       │   │   └── qlora_finetune_llama2_7b_flex_170_3_card.sh
    │       │       │   └── README.md
    │       │       ├── common
    │       │       │   ├── templates
    │       │       │   │   ├── alpaca_short.json
    │       │       │   │   ├── alpaca.json
    │       │       │   │   ├── alpaca_legacy.json
    │       │       │   │   └── vigogne.json
    │       │       │   └── utils
    │       │       │   │   └── __init__.py
    │       │       ├── axolotl
    │       │       │   ├── default_config.yaml
    │       │       │   └── requirements-xpu.txt
    │       │       ├── HF-PEFT
    │       │       │   └── alpaca-lora
    │       │       │   │   └── templates
    │       │       │   │       ├── alpaca_short.json
    │       │       │   │       ├── alpaca.json
    │       │       │   │       ├── alpaca_legacy.json
    │       │       │   │       └── vigogne.json
    │       │       ├── ReLora
    │       │       │   ├── relora_finetune_llama2_7b_arc_1_card.sh
    │       │       │   ├── relora_finetune_llama2_7b_arc_2_card.sh
    │       │       │   ├── relora_finetune_llama2_7b_pvc_1550_1_card.sh
    │       │       │   └── relora_finetune_llama2_7b_pvc_1550_4_card.sh
    │       │       └── QA-LoRA
    │       │       │   └── qalora_finetune_llama2_7b_arc_1_card.sh
    │       │   └── PyTorch-Models
    │       │       ├── Model
    │       │           ├── README.md
    │       │           └── yuan2
    │       │           │   └── yuan2-2B-instruct
    │       │           │       └── config.json
    │       │       └── README.md
    │   ├── dev
    │       └── benchmark
    │       │   ├── all-in-one
    │       │       ├── prompt
    │       │       │   └── continuation
    │       │       │   │   ├── 32.txt
    │       │       │   │   └── 256.txt
    │       │       ├── run-igpu.sh
    │       │       ├── run-spr.sh
    │       │       ├── run-arc.sh
    │       │       ├── run-max-gpu.sh
    │       │       ├── run-pipeline-parallel-arc.sh
    │       │       ├── run-deepspeed-arc.sh
    │       │       ├── run-deepspeed-pvc.sh
    │       │       └── run-deepspeed-spr.sh
    │       │   ├── LongBench
    │       │       ├── test_and_eval.sh
    │       │       ├── config
    │       │       │   ├── ablation_c512_w32_k7_maxpool.json
    │       │       │   ├── ablation_c1024_w32_k7_maxpool.json
    │       │       │   ├── ablation_c2048_w32_k7_maxpool.json
    │       │       │   ├── ablation_c4096_w32_k7_maxpool.json
    │       │       │   ├── dataset2maxlen.json
    │       │       │   ├── model2maxlen.json
    │       │       │   └── model2path.json
    │       │       └── config.yaml
    │       │   └── ceval
    │       │       ├── run.sh
    │       │       └── evaluators
    │       │           └── evaluator.py
    │   ├── src
    │       └── ipex_llm
    │       │   ├── cli
    │       │       └── prompts
    │       │       │   └── chat-with-llm.txt
    │       │   ├── transformers
    │       │       ├── load_config.yaml
    │       │       ├── models
    │       │       │   └── __init__.py
    │       │       ├── gguf
    │       │       │   ├── models
    │       │       │   │   ├── __init__.py
    │       │       │   │   └── model_implement
    │       │       │   │   │   ├── baichuan
    │       │       │   │   │       └── __init__.py
    │       │       │   │   │   └── yuan2
    │       │       │   │   │       └── __init__.py
    │       │       │   └── __init__.py
    │       │       ├── npu_models
    │       │       │   └── __init__.py
    │       │       ├── npu_pipeline_model
    │       │       │   └── __init__.py
    │       │       ├── awq
    │       │       │   └── __init__.py
    │       │       └── patches.py
    │       │   ├── vllm
    │       │       ├── __init__.py
    │       │       ├── cpu
    │       │       │   ├── __init__.py
    │       │       │   ├── engine
    │       │       │   │   └── __init__.py
    │       │       │   ├── ipex_llm_v1_wrapper.py
    │       │       │   └── ipex_llm_wrapper.py
    │       │       └── xpu
    │       │       │   ├── __init__.py
    │       │       │   ├── ipex_llm_v1_wrapper.py
    │       │       │   ├── engine
    │       │       │       └── __init__.py
    │       │       │   └── ipex_llm_wrapper.py
    │       │   ├── serving
    │       │       ├── __init__.py
    │       │       ├── fastchat
    │       │       │   └── __init__.py
    │       │       └── fastapi
    │       │       │   └── __init__.py
    │       │   ├── utils
    │       │       ├── modules.py
    │       │       └── common
    │       │       │   └── __init__.py
    │       │   ├── gptq
    │       │       ├── __init__.py
    │       │       └── convert
    │       │       │   └── __init__.py
    │       │   ├── ggml
    │       │       └── model
    │       │       │   ├── __init__.py
    │       │       │   ├── bloom
    │       │       │       └── __init__.py
    │       │       │   ├── generation
    │       │       │       └── __init__.py
    │       │       │   ├── starcoder
    │       │       │       └── __init__.py
    │       │       │   ├── llama
    │       │       │       └── __init__.py
    │       │       │   └── gptneox
    │       │       │       └── __init__.py
    │       │   ├── langchain
    │       │       ├── __init__.py
    │       │       └── vllm
    │       │       │   └── __init__.py
    │       │   ├── llamaindex
    │       │       └── __init__.py
    │       │   └── models.py
    │   ├── test
    │       ├── benchmark
    │       │   ├── perplexity
    │       │   │   └── fp16.csv
    │       │   ├── harness
    │       │   │   └── fp16.csv
    │       │   ├── igpu-perf
    │       │   │   ├── 1024-128_436.yaml
    │       │   │   ├── 32-32_int4_fp16_436.yaml
    │       │   │   ├── 2048-256_int4_fp16_436.yaml
    │       │   │   ├── 1024-128_int4_fp16_436.yaml
    │       │   │   ├── 3072-384_int4_fp16_436.yaml
    │       │   │   ├── 1024-128_int4_fp16_loadlowbit_436.yaml
    │       │   │   ├── 1024-128_438.yaml
    │       │   │   ├── 32-32_int4_fp16_438.yaml
    │       │   │   ├── 1024-128_445.yaml
    │       │   │   ├── 2048-256_int4_fp16_438.yaml
    │       │   │   ├── 3072-384_int4_fp16_438.yaml
    │       │   │   ├── 4096-512_int4_fp16_438.yaml
    │       │   │   ├── 1024-128_int4_fp16_438.yaml
    │       │   │   ├── 32-32_int4_fp16_445.yaml
    │       │   │   ├── 2048-256_int4_fp16_445.yaml
    │       │   │   ├── 3072-384_int4_fp16_445.yaml
    │       │   │   ├── 4096-512_int4_fp16_445.yaml
    │       │   │   ├── 1024-128_443.yaml
    │       │   │   ├── 1024-128_int4_fp16_445.yaml
    │       │   │   ├── 1024-128_int4_fp16_loadlowbit_438.yaml
    │       │   │   ├── 32-32_int4_fp16_443.yaml
    │       │   │   ├── 1024-128_int4_fp16_loadlowbit_445.yaml
    │       │   │   ├── 2048-256_int4_fp16_443.yaml
    │       │   │   ├── 3072-384_int4_fp16_443.yaml
    │       │   │   ├── 4096-512_int4_fp16_443.yaml
    │       │   │   ├── 1024-128_int4_fp16_443.yaml
    │       │   │   ├── 1024-128_int4_fp16_loadlowbit_443.yaml
    │       │   │   ├── 3072-384_int4_fp16.yaml
    │       │   │   ├── 4096-512_int4_fp16.yaml
    │       │   │   ├── 1024-128_int4_fp16_loadlowbit.yaml
    │       │   │   └── 32-32_int4_fp16.yaml
    │       │   ├── arc-perf-transformers-436.yaml
    │       │   ├── arc-perf-transformers-443.yaml
    │       │   ├── arc-perf-transformers-436-batch2.yaml
    │       │   ├── arc-perf-transformers-443-batch2.yaml
    │       │   ├── arc-perf-transformers-443-batch4.yaml
    │       │   ├── arc-perf-transformers-440.yaml
    │       │   ├── arc-perf-transformers-445.yaml
    │       │   ├── arc-perf-transformers-445-batch2.yaml
    │       │   ├── arc-perf-transformers-445-batch4.yaml
    │       │   ├── arc-perf-transformers-436-batch4.yaml
    │       │   ├── stable-version-arc-stress-test-sym_int4.yaml
    │       │   ├── stable-version-arc-stress-test-fp8.yaml
    │       │   ├── stable-version-cpu-stress-test.yaml
    │       │   ├── stable-version-cpu-perf-test.yaml
    │       │   ├── cpu-perf-test.yaml
    │       │   └── core-perf-test.yaml
    │       ├── run-llm-install-tests.sh
    │       ├── run-llm-langchain-tests.sh
    │       ├── run-llm-llamaindex-tests.sh
    │       ├── __init__.py
    │       ├── run-llm-inference-tests.sh
    │       ├── run-llm-check-function.sh
    │       ├── run-llm-llamaindex-tests-gpu.sh
    │       ├── run-llm-langchain-tests-gpu.sh
    │       ├── install
    │       │   └── test_install.py
    │       └── run-llm-convert-tests.sh
    │   ├── scripts
    │       └── env-check.bat
    │   └── tpp
    │       ├── README.md
    │       └── licenses
    │           ├── LICENSE-go-isatty.txt
    │           ├── LICENSE-tablewriter.txt
    │           ├── LICENSE-hm.txt
    │           ├── LICENSE-gin-contrib-cors.txt
    │           ├── LICENSE-go-urn.txt
    │           ├── LICENSE-gorgonia.org-vecf32.txt
    │           ├── LICENSE-mimetype.txt
    │           ├── LICENSE-uniseg.txt
    │           ├── LICENSE-d4l3k-go-bfloat16.txt
    │           ├── LICENSE-gorgonia.org-vecf64.txt
    │           ├── LICENSE-validator.txt
    │           ├── LICENSE-gin.txt
    │           ├── LICENSE-go-runewidth.txt
    │           ├── LICENSE-sse.txt
    │           ├── LICENSE-go-playground-locales.txt
    │           ├── LICENSE-agnivade-levenshtein.txt
    │           ├── LICENSE-gin-contrib-sse.txt
    │           ├── LICENSE-float16.txt
    │           ├── LICENSE-go-playground-universal-translator.txt
    │           ├── LICENSE-go-toml.txt
    │           ├── LICENSE-go-codec.txt
    │           └── LICENSE-zlib.txt
├── docker
    └── llm
    │   ├── serving
    │       ├── cpu
    │       │   ├── kubernetes
    │       │   │   ├── clean.sh
    │       │   │   └── models-pv.yaml
    │       │   └── docker
    │       │   │   ├── start-vllm-service.sh
    │       │   │   ├── model_adapter.py.patch
    │       │   │   └── start-notebook.sh
    │       └── xpu
    │       │   └── docker
    │       │       ├── setvars.sh
    │       │       ├── README.md
    │       │       └── start-lightweight_serving-service.sh
    │   ├── inference-cpp
    │       ├── start-open-webui.sh
    │       ├── start-ollama.sh
    │       └── start-llama-cpp.sh
    │   ├── finetune
    │       ├── qlora
    │       │   └── cpu
    │       │   │   └── kubernetes
    │       │   │       ├── templates
    │       │   │           ├── ipex-llm-finetuning-namespace.yaml
    │       │   │           ├── nfs-pvc.yaml
    │       │   │           └── nfs-pv.yaml
    │       │   │       ├── Chart.yaml
    │       │   │       └── values.yaml
    │       ├── lora
    │       │   └── cpu
    │       │   │   ├── kubernetes
    │       │   │       ├── templates
    │       │   │       │   ├── ipex-llm-lora-finetuning-namespace.yaml
    │       │   │       │   ├── nfs-pvc.yaml
    │       │   │       │   └── nfs-pv.yaml
    │       │   │       ├── Chart.yaml
    │       │   │       └── values.yaml
    │       │   │   └── docker
    │       │   │       └── requirements.txt
    │       └── xpu
    │       │   └── start-qlora-finetuning-on-xpu.sh
    │   └── sources
    │       ├── README.md
    │       └── Dockerfile
├── .github
    ├── CODEOWNERS
    └── ISSUE_TEMPLATE
    │   └── 🐛bug-report.md
├── docs
    └── mddocs
    │   ├── Overview
    │       ├── install.md
    │       └── KeyFeatures
    │       │   ├── gpu_supports.md
    │       │   └── README.md
    │   ├── PythonAPI
    │       └── README.md
    │   └── DockerGuides
    │       └── README.md
├── pyproject.toml
├── SECURITY.md
├── .readthedocs.yml
├── apps
    └── ipynb2py.sh
└── .gitignore


/python/llm/.gitignore:
--------------------------------------------------------------------------------
1 | libs/
2 | 


--------------------------------------------------------------------------------
/python/llm/version.txt:
--------------------------------------------------------------------------------
1 | 2.3.0.dev0
2 | 


--------------------------------------------------------------------------------
/docker/llm/serving/cpu/kubernetes/clean.sh:
--------------------------------------------------------------------------------
1 | kubectl delete -f deployment.yaml


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | /.github/workflows/* @glorysdj @liu-shaojun @intel-analytics/CICD
2 | 


--------------------------------------------------------------------------------
/python/llm/portable-zip/.gitignore:
--------------------------------------------------------------------------------
1 | python-embed
2 | bigdl-llm.zip
3 | *.log
4 | *.json


--------------------------------------------------------------------------------
/docker/llm/inference-cpp/start-open-webui.sh:
--------------------------------------------------------------------------------
1 | cd /llm/open-webui/backend
2 | bash start.sh > open-webui.log
3 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/Speculative-Decoding/EAGLE/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==0.28.0 
2 | anthropic==0.5.0 
3 | wandb
4 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/Speculative-Decoding/EAGLE/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==0.28.0 
2 | anthropic==0.5.0 
3 | wandb
4 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/vLLM-Serving/fp8_kv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm/HEAD/python/llm/example/GPU/vLLM-Serving/fp8_kv.png


--------------------------------------------------------------------------------
/python/llm/example/GPU/vLLM-Serving/max_length.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm/HEAD/python/llm/example/GPU/vLLM-Serving/max_length.png


--------------------------------------------------------------------------------
/docker/llm/serving/xpu/docker/setvars.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | export CCL_DG2_ALLREDUCE=1
4 | export LD_LIBRARY_PATH=/opt/intel/1ccl-wks/lib:$LD_LIBRARY_PATH
5 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/qlora/cpu/kubernetes/templates/ipex-llm-finetuning-namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 |   name: ipex-llm-qlora-finetuning
5 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/lora/cpu/kubernetes/templates/ipex-llm-lora-finetuning-namespace.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 |   name: ipex-llm-lora-finetuning
5 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/prompt/32.txt:
--------------------------------------------------------------------------------
1 | Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
2 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/Pipeline-Parallel-Serving/prompt/32.txt:
--------------------------------------------------------------------------------
1 | Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
2 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/prompt/continuation/32.txt:
--------------------------------------------------------------------------------
1 | Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. 
2 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-igpu.sh:
--------------------------------------------------------------------------------
1 | source /opt/intel/oneapi/setvars.sh
2 |  
3 | export SYCL_CACHE_PERSISTENT=1
4 | export BIGDL_LLM_XMX_DISABLED=1
5 |  
6 | python run.py # make sure config YAML file
7 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/test_and_eval.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | 
3 | export HF_ENDPOINT=https://hf-mirror.com
4 | 
5 | SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
6 | python ${SHELL_FOLDER}/pred.py
7 | python ${SHELL_FOLDER}/eval.py


--------------------------------------------------------------------------------
/docker/llm/finetune/lora/cpu/kubernetes/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: trusted-fintune-service
3 | description: A Helm chart for IPEX-LLM Finetuning Service on Kubernetes
4 | type: application
5 | version: 1.1.27
6 | appVersion: "1.16.0"
7 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: ipex_llm-fintune-service
3 | description: A Helm chart for IPEX-LLM Finetune Service on Kubernetes
4 | type: application
5 | version: 1.1.27
6 | appVersion: "1.16.0"
7 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/ceval/run.sh:
--------------------------------------------------------------------------------
1 | export IPEX_LLM_LAST_LM_HEAD=0
2 | 
3 | python eval.py \
4 |     --model_path "path to model" \
5 |     --eval_type validation \
6 |     --device xpu \
7 |     --eval_data_path data \
8 |     --qtype sym_int4


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/ablation_c512_w32_k7_maxpool.json:
--------------------------------------------------------------------------------
1 | {
2 |     "window_sizes": 32, 
3 |     "default_max_capacity_prompts": 512,
4 |     "specific_max_capcity_prompts": {},
5 |     "kernel_sizes": 7, 
6 |     "pooling": "maxpool"
7 | }


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-spr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ipex-llm-init -t
3 | export OMP_NUM_THREADS=48
4 | 
5 | # set following parameters according to the actual specs of the test machine
6 | numactl -C 0-47 -m 0 python $(dirname "$0")/run.py


--------------------------------------------------------------------------------
/python/llm/portable-zip/chat.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | 
 4 | :: execute chat script
 5 | set PYTHONUNBUFFERED=1
 6 | 
 7 | set /p modelpath="Please enter the model path: "
 8 | .\python-embed\python.exe .\chat.py --model-path="%modelpath%"
 9 | 
10 | pause


--------------------------------------------------------------------------------
/docker/llm/inference-cpp/start-ollama.sh:
--------------------------------------------------------------------------------
 1 | # init ollama first
 2 | mkdir -p /llm/ollama
 3 | cd /llm/ollama
 4 | init-ollama
 5 | export OLLAMA_NUM_GPU=999
 6 | export ZES_ENABLE_SYSMAN=1
 7 | 
 8 | # start ollama service
 9 | (./ollama serve > ollama.log) &
10 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/ablation_c1024_w32_k7_maxpool.json:
--------------------------------------------------------------------------------
1 | {
2 |     "window_sizes": 32, 
3 |     "default_max_capacity_prompts": 1024, 
4 |     "specific_max_capcity_prompts": {},
5 |     "kernel_sizes": 7, 
6 |     "pooling": "maxpool"
7 | }


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/ablation_c2048_w32_k7_maxpool.json:
--------------------------------------------------------------------------------
1 | {
2 |     "window_sizes": 32, 
3 |     "default_max_capacity_prompts": 2048, 
4 |     "specific_max_capcity_prompts": {},
5 |     "kernel_sizes": 7, 
6 |     "pooling": "maxpool"
7 | }


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/ablation_c4096_w32_k7_maxpool.json:
--------------------------------------------------------------------------------
1 | {
2 |     "window_sizes": 32, 
3 |     "default_max_capacity_prompts": 4096, 
4 |     "specific_max_capcity_prompts": {},
5 |     "kernel_sizes": 7, 
6 |     "pooling": "maxpool"
7 | }


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/cli/prompts/chat-with-llm.txt:
--------------------------------------------------------------------------------
1 | A chat between a curious human USER and an artificial intelligence assistant ChatLLM. The assistant gives helpful, detailed, and polite answers to the human's questions.
2 | 
3 | USER: Hello, ChatLLM.
4 | ChatLLM: Hello. 
5 | USER:
6 | 


--------------------------------------------------------------------------------
/docker/llm/inference-cpp/start-llama-cpp.sh:
--------------------------------------------------------------------------------
1 | # init llama-cpp first
2 | mkdir -p /llm/llama-cpp
3 | cd /llm/llama-cpp
4 | init-llama-cpp
5 | 
6 | # change the model_path to run
7 | model="/models/mistral-7b-v0.1.Q4_0.gguf"
8 | ./llama-cli -m $model -n 32 --prompt "What is AI?" -t 8 -e -ngl 999 --color
9 | 


--------------------------------------------------------------------------------
/docs/mddocs/Overview/install.md:
--------------------------------------------------------------------------------
1 | # IPEX-LLM Installation
2 | 
3 | Here, we provide instructions on how to install `ipex-llm` and best practices for setting up your environment. Please refer to the appropriate guide based on your device:
4 | 
5 | - [CPU](./install_cpu.md)
6 | - [GPU](./install_gpu.md)
7 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/lora/cpu/docker/requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate
 2 | appdirs
 3 | bitsandbytes
 4 | black
 5 | black[jupyter]
 6 | datasets
 7 | fire
 8 | peft==0.2.0
 9 | #git+https://github.com/huggingface/peft.git
10 | #git+https://github.com/huggingface/transformers.git
11 | gradio
12 | sentencepiece
13 | scipy
14 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/HuggingFace/Multimodal/README.md:
--------------------------------------------------------------------------------
1 | # Running HuggingFace multimodal model using IPEX-LLM on Intel GPU
2 | 
3 | This folder contains examples of running multimodal models model on IPEX-LLM. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it.
4 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/perplexity/fp16.csv:
--------------------------------------------------------------------------------
1 | Index,Model,Precision,ppl_result
2 | 0,Llama-2-7b-chat-hf,fp16,4.7019
3 | 1,chatglm2-6b,fp16,22.321
4 | 2,chatglm3-6b,fp16,30.1281
5 | 3,Baichuan2-7B-Chat,fp16,10.7676
6 | 4,mpt-7b-chat,fp16,5.7882
7 | 5,falcon-7b-instruct-with-patch,fp16,5.2532
8 | 6,Mistral-7B-v0.1,fp16,3.6597
9 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pvc.yaml:
--------------------------------------------------------------------------------
 1 | kind: PersistentVolumeClaim
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: nfs-pvc
 5 |   namespace: ipex-llm-lora-finetuning
 6 | spec:
 7 |   accessModes:
 8 |   - ReadWriteOnce
 9 |   resources:
10 |     requests:
11 |       storage: 10Gi
12 |   storageClassName: nfs
13 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/qlora/cpu/kubernetes/templates/nfs-pvc.yaml:
--------------------------------------------------------------------------------
 1 | kind: PersistentVolumeClaim
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: nfs-pvc
 5 |   namespace: ipex-llm-qlora-finetuning
 6 | spec:
 7 |   accessModes:
 8 |   - ReadWriteOnce
 9 |   resources:
10 |     requests:
11 |       storage: 10Gi
12 |   storageClassName: nfs
13 | 


--------------------------------------------------------------------------------
/docs/mddocs/Overview/KeyFeatures/gpu_supports.md:
--------------------------------------------------------------------------------
1 | # GPU Supports
2 | 
3 | IPEX-LLM not only supports running large language models for inference, but also supports QLoRA finetuning on Intel GPUs.
4 | 
5 | * [Inference on GPU](./inference_on_gpu.md)
6 | * [Finetune (QLoRA)](./finetune.md)
7 | * [Multi GPUs selection](./multi_gpus_selection.md)


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | name = "BigDL"
2 | version = "2.0"
3 | description = "An open source Big Data AI platform (for distributed TensorFlow, PyTorch and Keras on Apache Spark & Ray)"
4 | license = "Apache-2.0"
5 | repository = "https://github.com/intel-analytics/BigDL"
6 | documentation = "https://bigdl.readthedocs.io/en/branch-2.0/"
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/docker/llm/serving/xpu/docker/README.md:
--------------------------------------------------------------------------------
1 | > 💡 **Tip**: For a detailed and up-to-date guide on running `vLLM` serving with `IPEX-LLM` on Intel GPUs via Docker, please refer to our official documentation:  
2 | > [vllm_docker_quickstart](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md)
3 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/harness/fp16.csv:
--------------------------------------------------------------------------------
1 | Index,Model,Precision,Arc,TruthfulQA,Winogrande
2 | 0,falcon-7b-instruct-with-patch,fp16,46.16,44.08,67.96
3 | 1,Llama2-7b-guanaco-dolphin-500,fp16,56.74,46.96,74.27
4 | 2,Baichuan2-7B-Chat-LLaMAfied,fp16,52.47,48.04,69.14
5 | 3,Mistral-7B-v0.1,fp16,59.98,42.15,78.37
6 | 4,mpt-7b-chat,fp16,46.50,40.16,68.43


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-arc.sh:
--------------------------------------------------------------------------------
 1 | source /opt/intel/oneapi/setvars.sh
 2 |  
 3 | export USE_XETLA=OFF
 4 | export SYCL_CACHE_PERSISTENT=1
 5 | KERNEL_VERSION=$(uname -r)
 6 | if [[ $KERNEL_VERSION != *"6.5"* ]]; then
 7 |     export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 8 | fi
 9 |  
10 | python run.py # make sure config YAML file


--------------------------------------------------------------------------------
/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh:
--------------------------------------------------------------------------------
1 | # need to update transformers version first
2 | # pip install transformers==4.37.0
3 | cd /llm/lightweight_serving
4 | export IPEX_LLM_NOT_USE_VLLM=True
5 | model_path="/llm/models/Llama-2-7b-chat-hf"
6 | low_bit="sym_int4"
7 | python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit


--------------------------------------------------------------------------------
/docker/llm/serving/cpu/kubernetes/models-pv.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: models-pv
 5 |   labels:
 6 |     app: models
 7 | spec:
 8 |   capacity:
 9 |     storage: 10Gi #Modify according to model size
10 |   accessModes:
11 |     - ReadWriteMany
12 |   storageClassName: models
13 |   nfs:
14 |     path: YOUR_NFS_PATH
15 |     server: YOUR_NFS_SERVER
16 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-max-gpu.sh:
--------------------------------------------------------------------------------
 1 | source /opt/intel/oneapi/setvars.sh
 2 |  
 3 | export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 4 | export ENABLE_SDP_FUSION=1
 5 | export SYCL_CACHE_PERSISTENT=1
 6 | KERNEL_VERSION=$(uname -r)
 7 | if [[ $KERNEL_VERSION != *"6.5"* ]]; then
 8 |     export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 9 | fi
10 |  
11 | python run.py # make sure config YAML file


--------------------------------------------------------------------------------
/docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pv.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: nfs-pv-ipex-llm-lora-finetuning
 5 |   namespace: ipex-llm-lora-finetuning
 6 | spec:
 7 |   capacity:
 8 |     storage: 15Gi
 9 |   accessModes:
10 |     - ReadWriteOnce
11 |   persistentVolumeReclaimPolicy: Retain
12 |   storageClassName: nfs
13 |   nfs:
14 |    path: {{ .Values.nfsPath }}
15 |    server: {{ .Values.nfsServerIp }}
16 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/deepspeed_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "zero_optimization": {
 3 |       "stage": 2,
 4 |       "offload_optimizer": {
 5 |           "device": "cpu"
 6 |       },
 7 |       "contiguous_gradients": true,
 8 |       "overlap_comm": true
 9 |     },  
10 |     "bf16": {
11 |       "enabled": true
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "gradient_accumulation_steps": "auto"
15 | }
16 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "zero_optimization": {
 3 |       "stage": 2,
 4 |       "offload_optimizer": {
 5 |           "device": "cpu"
 6 |       },
 7 |       "contiguous_gradients": true,
 8 |       "overlap_comm": true
 9 |     },  
10 |     "bf16": {
11 |       "enabled": true
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "gradient_accumulation_steps": "auto"
15 | }
16 |   


--------------------------------------------------------------------------------
/docker/llm/finetune/qlora/cpu/kubernetes/templates/nfs-pv.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: nfs-pv-ipex-llm-qlora-finetuning
 5 |   namespace: ipex-llm-qlora-finetuning
 6 | spec:
 7 |   capacity:
 8 |     storage: 15Gi
 9 |   accessModes:
10 |     - ReadWriteOnce
11 |   persistentVolumeReclaimPolicy: Retain
12 |   storageClassName: nfs
13 |   nfs:
14 |    path: {{ .Values.nfsPath }}
15 |    server: {{ .Values.nfsServerIp }}
16 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "zero_optimization": {
 3 |       "stage": 3,
 4 |       "contiguous_gradients": true,
 5 |       "overlap_comm": true,
 6 |       "offload_optimizer": {"device": "cpu"}
 7 |     },
 8 |     "bf16": {
 9 |       "enabled": true
10 |     },
11 |     "world_size": 2,
12 |     "train_batch_size": 32,
13 |     "train_micro_batch_size_per_gpu": 2,
14 |     "gradient_accumulation_steps": 8
15 | }
16 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/lora/cpu/kubernetes/values.yaml:
--------------------------------------------------------------------------------
 1 | imageName: intelanalytics/ipex-llm-finetune-lora-cpu:2.2.0-SNAPSHOT
 2 | trainerNum: 8
 3 | microBatchSize: 8
 4 | nfsServerIp: your_nfs_server_ip
 5 | nfsPath: a_nfs_shared_folder_path_on_the_server
 6 | dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory
 7 | modelSubPath: Llama-2-7b-chat-hf # a subpath of the model file (dir) under nfs directory
 8 | ompNumThreads: 14
 9 | cpuPerPod: 42
10 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/PyTorch-Models/Model/README.md:
--------------------------------------------------------------------------------
1 | # IPEX-LLM INT4 Optimization for Large Language Model on Intel GPUs
2 | You can use `optimize_model` API to accelerate general PyTorch models on Intel GPUs. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it.
3 | 
4 | 
5 | 
6 | 
7 | 


--------------------------------------------------------------------------------
/docker/llm/sources/README.md:
--------------------------------------------------------------------------------
 1 | This is used for OSPDT review.
 2 | 
 3 | A separate Docker container layer tagged as: <YOUR_DOCKER_IMAGE>:<xPL_VERSION>-sources tag for sources of 3d party packages with MPL 1.x, MPL 2.x, GPL 1.x, GPL 2.x and GPL 3.x variants.
 4 | 
 5 | ### Build Image
 6 | ```bash
 7 | docker build \
 8 |   --build-arg http_proxy=.. \
 9 |   --build-arg https_proxy=.. \
10 |   --build-arg no_proxy=.. \
11 |   --rm --no-cache -t intelanalytics/ipex-llm:sources .
12 | ```
13 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/common/templates/alpaca_short.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_short.json",
3 |     "description": "A shorter template to experiment with.",
4 |     "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
5 |     "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Report a Vulnerability
 4 | 
 5 | Please report security issues or vulnerabilities to the [Intel® Security Center].
 6 | 
 7 | For more information on how Intel® works to resolve security issues, see
 8 | [Vulnerability Handling Guidelines].
 9 | 
10 | [Intel® Security Center]:https://www.intel.com/security
11 | 
12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html
13 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/alpaca_short.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_short.json",
3 |     "description": "A shorter template to experiment with.",
4 |     "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
5 |     "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/axolotl/default_config.yaml:
--------------------------------------------------------------------------------
 1 | compute_environment: LOCAL_MACHINE
 2 | debug: false
 3 | distributed_type: 'NO'
 4 | downcast_bf16: 'no'
 5 | gpu_ids: all
 6 | ipex_config:
 7 |   use_xpu: true
 8 | machine_rank: 0
 9 | main_training_function: main
10 | mixed_precision: 'no'
11 | num_machines: 1
12 | num_processes: 1
13 | rdzv_backend: static
14 | same_network: true
15 | tpu_env: []
16 | tpu_use_cluster: false
17 | tpu_use_sudo: false
18 | use_cpu: false
19 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/alpaca_short.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_short.json",
3 |     "description": "A shorter template to experiment with.",
4 |     "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
5 |     "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/HuggingFace/LLM/README.md:
--------------------------------------------------------------------------------
1 | # IPEX-LLM Transformers INT4 Optimization for Large Language Model on Intel GPUs
2 | You can use IPEX-LLM to run almost every Huggingface Transformer models with INT4 optimizations on your laptops with Intel GPUs. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it.
3 | 


--------------------------------------------------------------------------------
/python/llm/portable-zip/setup.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM Portable Zip Setup Script For Windows
 2 | 
 3 | # How to use
 4 | 
 5 | ## Build Portable Zip without Web-UI
 6 | 
 7 | Run `setup.bat` to generate portable zip without Web-UI. It will download and install all dependency and generate `ipex-llm.zip` for user to use.
 8 | 
 9 | ## Build Portable Zip with Web-UI
10 | 
11 | Run `setup.bat --ui` to generate portable zip with Web-UI. It will download and install all dependency and generate `ipex-llm.zip` for user to use.
12 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-install-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_INSTALL_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/install
 6 | 
 7 | set -e
 8 | 
 9 | echo "# Start testing install"
10 | start=$(date "+%s")
11 | 
12 | python -m pytest -s ${LLM_INSTALL_TEST_DIR}
13 | 
14 | now=$(date "+%s")
15 | time=$((now-start))
16 | 
17 | echo "Bigdl-llm tests finished"
18 | echo "Time used:$time seconds"
19 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/deepspeed_zero3_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "zero_optimization": {
 3 |       "stage": 3,
 4 |       "contiguous_gradients": true,
 5 |       "overlap_comm": true,
 6 |       "offload_optimizer": {"device": "cpu"}
 7 |     },
 8 |     "bf16": {
 9 |       "enabled": true
10 |     },
11 |     "world_size":2,
12 |     "train_batch_size": 2,
13 |     "train_micro_batch_size_per_gpu": 1,
14 |     "gradient_accumulation_steps": 1,
15 |     "stage3_gather_16bit_weights_on_model_save":true
16 | }
17 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-langchain-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain
 6 | set -e
 7 | 
 8 | echo "# Start testing inference"
 9 | start=$(date "+%s")
10 | 
11 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}
12 | 
13 | now=$(date "+%s")
14 | time=$((now-start))
15 | 
16 | echo "Bigdl-llm langchain tests finished"
17 | echo "Time used:$time seconds"


--------------------------------------------------------------------------------
/python/llm/example/CPU/Speculative-Decoding/README.md:
--------------------------------------------------------------------------------
1 | # Speculative-Decoding Examples on Intel CPU
2 | 
3 | This folder contains examples of running Speculative-Decoding Examples with IPEX-LLM on Intel CPU:
4 | 
5 | - [Self-Speculation](Self-Speculation): running BF16 inference for Huggingface Transformer model with ***self-speculative decoding*** with IPEX-LLM on Intel CPUs
6 | - [EAGLE](EAGLE): running speculative sampling using ***EAGLE*** (Extrapolation Algorithm for Greater Language-model Efficiency) with IPEX-LLM on Intel CPUs
7 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/Speculative-Decoding/README.md:
--------------------------------------------------------------------------------
1 | # Speculative-Decoding Examples on Intel GPU
2 | 
3 | This folder contains examples of running Speculative-Decoding Examples with IPEX-LLM on Intel GPU:
4 | 
5 | - [Self-Speculation](Self-Speculation): running BF16 inference for Huggingface Transformer model with ***self-speculative decoding*** with IPEX-LLM on Intel GPUs
6 | - [EAGLE](EAGLE): running speculative sampling using ***EAGLE*** (Extrapolation Algorithm for Greater Language-model Efficiency) with IPEX-LLM on Intel GPUs
7 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-llamaindex-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/llamaindex
 6 | 
 7 | set -e
 8 | 
 9 | echo "# Start testing inference"
10 | start=$(date "+%s")
11 | 
12 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}
13 | 
14 | now=$(date "+%s")
15 | time=$((now-start))
16 | 
17 | echo "Bigdl-llm llamaindex tests finished"
18 | echo "Time used:$time seconds"


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-pipeline-parallel-arc.sh:
--------------------------------------------------------------------------------
 1 | source /opt/intel/oneapi/setvars.sh
 2 | export MASTER_ADDR=127.0.0.1
 3 | export MASTER_PORT=8080
 4 | export FI_PROVIDER=tcp
 5 | export USE_XETLA=OFF
 6 | export OMP_NUM_THREADS=6
 7 | if [[ $KERNEL_VERSION != *"6.5"* ]]; then
 8 |     export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 9 | fi
10 | export TORCH_LLM_ALLREDUCE=0
11 | 
12 | NUM_GPUS=2 # number of used GPU
13 | CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS run.py
14 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/PyTorch-Models/README.md:
--------------------------------------------------------------------------------
1 | # Running PyTorch model using IPEX-LLM on Intel CPU
2 | 
3 | This folder contains examples of running any PyTorch model on IPEX-LLM (with "one-line code change"):
4 | 
5 | - [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations
6 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.)
7 | - [Save-Load](Save-Load): examples of saving and loading low-bit models
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/PyTorch-Models/README.md:
--------------------------------------------------------------------------------
1 | # Running PyTorch model using IPEX-LLM on Intel GPU
2 | 
3 | This folder contains examples of running any PyTorch model on IPEX-LLM (with "one-line code change"):
4 | 
5 | - [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations
6 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.)
7 | - [Save-Load](Save-Load): examples of saving and loading low-bit models
8 | 


--------------------------------------------------------------------------------
/docker/llm/serving/cpu/docker/start-vllm-service.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | model="YOUR_MODEL_PATH"
 3 | served_model_name="YOUR_MODEL_NAME"
 4 |  
 5 |  
 6 | python -m ipex_llm.vllm.cpu.entrypoints.openai.api_server \
 7 |   --served-model-name $served_model_name \
 8 |   --port 8000 \
 9 |   --model $model \
10 |   --trust-remote-code \
11 |   --device cpu \
12 |   --dtype bfloat16 \
13 |   --enforce-eager \
14 |   --load-in-low-bit bf16 \
15 |   --max-model-len 4096 \
16 |   --max-num-batched-tokens 10240 \
17 |   --max-num-seqs 12 \
18 |   --tensor-parallel-size 1


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/README.md:
--------------------------------------------------------------------------------
1 | # QLoRA Finetuning with IPEX-LLM
2 | 
3 | We provide [Alpaca-QLoRA example](./alpaca-qlora/), which ports [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/tree/main) to IPEX-LLM (using [QLoRA](https://arxiv.org/abs/2305.14314) algorithm) on [Intel GPU](../../README.md).
4 | 
5 | Meanwhile, we also provide a [simple example](./simple-example/) to help you get started with QLoRA Finetuning using IPEX-LLM, and [TRL example](./trl-example/) to help you get started with QLoRA Finetuning using IPEX-LLM and TRL library.
6 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: 'path to your local model hub'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '1024-128'
11 | test_api:
12 |   - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
14 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: 'path to your local model hub'
 4 | warm_up: 3
 5 | num_trials: 5
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 | test_api:
12 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
14 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: 'path to your local model hub'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '2048-256'
11 | test_api:
12 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
14 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: 'path to your local model hub'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '1024-128'
11 | test_api:
12 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
14 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: 'path to your local model hub'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '3072-384'
11 | test_api:
12 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
14 | 


--------------------------------------------------------------------------------
/python/llm/scripts/env-check.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | REM Check Python version
 4 | python -V
 5 | if ERRORLEVEL 1 ( 
 6 |     echo No Python found! Instructions on how to create an environment can be found in the README.md.
 7 |     goto:end
 8 | )
 9 | python check.py
10 | 
11 | echo -----------------------------------------------------------------
12 | echo System Information
13 | systeminfo
14 | echo -----------------------------------------------------------------
15 | xpu-smi discovery
16 | if ERRORLEVEL 1 ( 
17 |     echo xpu-smi is not installed properly. 
18 |     goto:end
19 | )
20 | 
21 | :end
22 | 


--------------------------------------------------------------------------------
/docs/mddocs/Overview/KeyFeatures/README.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM Key Features
 2 | 
 3 | You may run the LLMs using `ipex-llm` through one of the following APIs:
 4 | 
 5 | * [PyTorch API](./optimize_model.md)
 6 | * [`transformers`-style API](./transformers_style_api.md)
 7 |   * [Hugging Face `transformers` Format](./hugging_face_format.md)
 8 |   * [Native Format](./native_format.md)
 9 | * [LangChain API](./langchain_api.md)
10 | * [GPU Supports](./gpu_supports.md)
11 |   * [Inference on GPU](./inference_on_gpu.md)
12 |   * [Finetune (QLoRA)](./finetune.md)
13 |   * [Multi GPUs selection](./multi_gpus_selection.md)
14 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: 'path to your local model hub'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '1024-128'
11 | test_api:
12 |   - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
13 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
14 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/dataset2maxlen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "narrativeqa": 128,
 3 |     "qasper": 128,
 4 |     "multifieldqa_en": 64,
 5 |     "multifieldqa_zh": 64,
 6 |     "hotpotqa": 32,
 7 |     "2wikimqa": 32,
 8 |     "musique": 32,
 9 |     "dureader": 128,
10 |     "gov_report": 512,
11 |     "qmsum": 512,
12 |     "multi_news": 512,
13 |     "vcsum": 512,
14 |     "trec": 64,
15 |     "triviaqa": 32,
16 |     "samsum": 128,
17 |     "lsht": 64,
18 |     "passage_count": 32,
19 |     "passage_retrieval_en": 32,
20 |     "passage_retrieval_zh": 32,
21 |     "lcc": 64,
22 |     "repobench-p": 64
23 | }


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '1024-128'
12 | test_api:
13 |   - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | build:
 9 |   os: ubuntu-20.04
10 |   tools:
11 |     python: "3.7"
12 |   apt_packages:
13 |     - graphviz
14 |   jobs:
15 |     pre_install:
16 |       - wget https://raw.githubusercontent.com/analytics-zoo/gha-cicd-env/main/python-requirements/requirements-doc.txt
17 | 
18 | sphinx:
19 |   configuration: docs/readthedocs/source/conf.py
20 | 
21 | python:
22 |   install:
23 |     - requirements: ./requirements-doc.txt
24 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 3
 6 | num_trials: 5
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '32-32'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/Pipeline-Parallel-Serving/prompt/128.txt:
--------------------------------------------------------------------------------
1 | In a distant future, humanity has expanded across the galaxy, establishing colonies on numerous planets. The interstellar community thrives under the guidance of the United Galactic Federation, which ensures peace and prosperity. However, a new threat emerges from the unknown regions of space, challenging the stability and security of the galaxy. Brave explorers and seasoned warriors must unite to uncover the secrets of this mysterious force and protect the future of all sentient beings.  Please continue the above story as long as possible, preferably more than 1000 tokens.


--------------------------------------------------------------------------------
/python/llm/tpp/README.md:
--------------------------------------------------------------------------------
1 | Third Party Software notices and information
2 | -------------------------------------------------------------
3 | “Third Party Software” mean the files (if any) listed in the “third-party-programs.txt” or other similarly-named text file that may be included with the software. Third Party Software, even if included with the distribution of the software, may be governed by separate license terms, including without limitation, third party license terms, open source software notices and terms, and/or other Intel software license terms. These separate license terms solely govern your use of the Third Party Software.


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '1024-128'
12 | test_api:
13 |   - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '2048-256'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '3072-384'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '4096-512'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/model2maxlen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "llama2-7b-chat-4k": 4096,
 3 |     "longchat-v1.5-7b-32k": 4096,
 4 |     "xgen-7b-8k": 4096,
 5 |     "internlm-7b-8k": 4096,
 6 |     "chatglm2-6b": 4096,
 7 |     "chatglm2-6b-32k": 4096,
 8 |     "chatglm3-6b-32k": 4096,
 9 |     "chatglm4-9b": 4096,
10 |     "vicuna-v1.5-7b-16k": 4096,
11 |     "mistral-7B-instruct-v0.2": 4096,
12 |     "mistral-7B-instruct-v0.1": 4096,
13 |     "mixtral-8x7B-instruct-v0.1": 4096,
14 |     "llama-2-7B-32k-instruct": 4096,
15 |     "lwm-text-chat-1m": 4096,
16 |     "lwm-text-1m": 4096,
17 |     "qwen2-7b-instruct": 4096
18 | }
19 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '1024-128'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 3
 6 | num_trials: 5
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '32-32'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/finetune_one_node_two_sockets.sh:
--------------------------------------------------------------------------------
 1 | export MASTER_ADDR=127.0.0.1
 2 | export SOCKET_CORES=48
 3 | 
 4 | source ipex-llm-init -t
 5 | mpirun -n 2 \
 6 |  --bind-to socket \
 7 |  -genv OMP_NUM_THREADS=$SOCKET_CORES \
 8 |  -genv KMP_AFFINITY="granularity=fine,none" \
 9 |  -genv KMP_BLOCKTIME=1 \
10 |  python alpaca_qlora_finetuning_cpu.py \
11 |  --gradient_checkpointing False \
12 |  --batch_size 128 \
13 |  --micro_batch_size 8 \
14 |  --max_steps -1 \
15 |  --base_model "meta-llama/Llama-2-7b-hf" \
16 |  --data_path "yahma/alpaca-cleaned" \
17 |  --output_dir "./ipex-llm-qlora-alpaca"
18 | 
19 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/load_config.yaml:
--------------------------------------------------------------------------------
 1 | # TODO: move this to a different repo
 2 | repo_id:
 3 |   # - 'THUDM/chatglm-6b'
 4 |   # - 'THUDM/chatglm2-6b'
 5 |   - 'meta-llama/Llama-2-7b-chat-hf'
 6 |   # - 'baichuan-inc/Baichuan2-7B-Chat'
 7 |   # - 'Qwen/Qwen-7B-Chat'
 8 |   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 9 | local_model_hub: '/mnt/disk1/models'
10 | low_bit:
11 |   - 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
12 |   - 'bf16'
13 | device:
14 |   #- 'cpu'
15 |   - 'xpu'
16 | load_low_bit_model: False
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-436.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '2048-256'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '3072-384'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '4096-512'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/xpu/start-qlora-finetuning-on-xpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -x
 3 | export USE_XETLA=OFF
 4 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 5 | source /opt/intel/oneapi/setvars.sh
 6 | 
 7 | if [ -d "./model" ];
 8 | then
 9 |   MODEL_PARAM="--repo-id-or-model-path ./model"  # otherwise, default to download from HF repo
10 | fi
11 | 
12 | if [ -d "./data/alpaca-cleaned" ];
13 | then
14 |   DATA_PARAM="--dataset ./data/alpaca-cleaned" # otherwise, default to download from HF dataset
15 | fi
16 | 
17 | # QLoRA example dir
18 | cd /LLM-Finetuning/QLoRA/simple-example/
19 | 
20 | python qlora_finetuning.py $MODEL_PARAM $DATA_PARAM
21 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config.yaml:
--------------------------------------------------------------------------------
 1 | model_name:
 2 |   # - "mistral-7B-instruct-v0.2"
 3 |   - "llama2-7b-chat-4k"
 4 |   # - "chatglm4-9b"
 5 |   # - "qwen2-7b-instruct"
 6 | 
 7 | full_kv: True
 8 | optimize_model: True
 9 | dtype: 'fp16'
10 | low_bit: 'sym_int4'
11 | 
12 | e: False
13 | 
14 | compress_kv:
15 |   - "ablation_c512_w32_k7_maxpool"
16 |   - "ablation_c1024_w32_k7_maxpool"
17 | 
18 | datasets:
19 |   - "multi_news"
20 |   - "qasper"
21 |   - "hotpotqa"
22 |   - "trec"
23 |   - "passage_count"
24 |   - "lcc"
25 |   # - "multifieldqa_zh"
26 |   # - "dureader"
27 |   # - "vcsum"
28 |   # - "lsht"
29 |   # - "passage_retrieval_zh"
30 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 1
 7 | num_trials: 3
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '1024-128'
13 | test_api:
14 |   - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '1024-128'
12 | test_api:
13 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_438.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'stabilityai/stablelm-zephyr-3b'
 3 |   #- 'google/gemma-7b-it'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '1024-128'
12 | test_api:
13 |   - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 3
 7 | num_trials: 5
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '32-32'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/apps/ipynb2py.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ## Usage ################################
 4 | # ./ipynb2py <file-name without extension>
 5 | # Example:
 6 | # ipynb2py notebooks/neural_networks/rnn
 7 | #########################################
 8 | if [ $# -ne "1" ]; then
 9 |     echo "Usage: ./nb2script <file-name without extension>"
10 | else
11 |     cp $1.ipynb $1.tmp.ipynb
12 |     sed -i 's/%%/#/' $1.tmp.ipynb
13 |     sed -i 's/%pylab/#/' $1.tmp.ipynb
14 | 
15 |     jupyter nbconvert $1.tmp.ipynb --to python
16 | 
17 |     mv $1.tmp.py $1.py
18 |     sed -i '1i# -*- coding: utf-8 -*-' $1.py
19 |     sed -i '#!/usr/bin/python' $1.py
20 |     rm $1.tmp.ipynb
21 | fi
22 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-436-batch2.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 2 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-443-batch2.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 2 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-443-batch4.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 4 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: 'path to your local model hub'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '1024-128'
12 | test_api:
13 |   - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
14 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
15 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 1
 7 | num_trials: 3
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   # - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 1
 7 | num_trials: 3
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '3072-384'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   # - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 1
 7 | num_trials: 3
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '4096-512'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-440.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen1.5-MoE-A2.7B-Chat'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 1 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
17 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 1
 7 | num_trials: 3
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '1024-128'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/python/llm/test/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/cpu/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/xpu/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_443.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'google/gemma-2-2b-it'
 3 |   - 'google/gemma-2-9b-it'
 4 |   - 'meta-llama/Llama-3.1-8B-Instruct'
 5 | local_model_hub: 'path to your local model hub'
 6 | warm_up: 1
 7 | num_trials: 3
 8 | num_beams: 1 # default to greedy search
 9 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
10 | batch_size: 1 # default to 1
11 | in_out_pairs:
12 |   - '1024-128'
13 | test_api:
14 |   - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
15 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/serving/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/models/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/serving/fastchat/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-445.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: '/mnt/disk1/models'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 1 # default to 1
10 | in_out_pairs:
11 |   - '32-32'
12 |   - '1024-128'
13 |   - '2048-256'
14 | test_api:
15 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
16 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
17 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
18 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/gguf/models/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/npu_models/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 


--------------------------------------------------------------------------------
/docker/llm/sources/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | # Add deb-src entries to the sources.list
 4 | RUN echo "deb-src http://archive.ubuntu.com/ubuntu/ focal main restricted universe multiverse" >> /etc/apt/sources.list
 5 | 
 6 | # Update package lists and install dpkg-dev
 7 | RUN apt-get update && apt-get install -y dpkg-dev
 8 | 
 9 | # Create a temporary directory, adjust permissions, and download source code for the specified packages
10 | RUN mkdir -p /usr/local/src/git-source && \
11 |     chown _apt:root /usr/local/src/git-source && \
12 |     cd /usr/local/src/git-source && \
13 |     apt-get source \
14 |     git \
15 |     gnupg \
16 |     numactl \
17 |     wget \
18 |     software-properties-common


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-445-batch2.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: '/mnt/disk1/models'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 2 # default to 1
10 | in_out_pairs:
11 |   - '32-32'
12 |   - '1024-128'
13 |   - '2048-256'
14 | test_api:
15 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
16 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
17 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
18 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-445-batch4.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-3.2-1B-Instruct'
 3 |   - 'meta-llama/Llama-3.2-3B-Instruct'
 4 | local_model_hub: '/mnt/disk1/models'
 5 | warm_up: 1
 6 | num_trials: 3
 7 | num_beams: 1 # default to greedy search
 8 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 9 | batch_size: 4 # default to 1
10 | in_out_pairs:
11 |   - '32-32'
12 |   - '1024-128'
13 |   - '2048-256'
14 | test_api:
15 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
16 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
17 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
18 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/npu_pipeline_model/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/arc-perf-transformers-436-batch4.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'Qwen/Qwen-VL-Chat'
 3 | local_model_hub: '/mnt/disk1/models'
 4 | warm_up: 1
 5 | num_trials: 3
 6 | num_beams: 1 # default to greedy search
 7 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 8 | batch_size: 4 # default to 1
 9 | in_out_pairs:
10 |   - '32-32'
11 |   - '1024-128'
12 |   - '2048-256'
13 | test_api:
14 |   - "transformer_int4_fp16_gpu"  # on Intel GPU
15 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
16 | exclude:
17 |   - 'Qwen/Qwen-VL-Chat:2048'
18 | task: 'continuation' # task can be 'continuation', 'QA' and 'summarize'
19 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/HF-Transformers-AutoModels/README.md:
--------------------------------------------------------------------------------
1 | # Running Hugging Face Transformers model using IPEX-LLM on Intel CPU
2 | 
3 | This folder contains examples of running any HuggingFace `transformers` model on IPEX-LLM (using the standard AutoModel APIs):
4 | 
5 | - [Model](Model): examples of running HuggingFace `transformers` models (e.g., LLaMA, Mistral, ChatGLM, Qwen, Baichuan, Mixtral, Gemma, etc.) using INT4 optimizations
6 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (INT8/INT5, etc.) on Intel CPU
7 | - [Save-Load](Save-Load): examples of saving and loading low-bit models
8 | - [Advanced-Quantizations](Advanced-Quantizations): examples of loading GGUF/AWQ/GPTQ models
9 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/common/templates/alpaca.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json",
3 |     "description": "Template used by Alpaca-LoRA.",
4 |     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
5 |     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-deepspeed-arc.sh:
--------------------------------------------------------------------------------
 1 | export MASTER_ADDR=127.0.0.1
 2 | export FI_PROVIDER=tcp
 3 | export CCL_ATL_TRANSPORT=ofi
 4 | export CCL_ZE_IPC_EXCHANGE=sockets
 5 | 
 6 | export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 7 | basekit_root=/opt/intel/oneapi
 8 | source $basekit_root/setvars.sh --force
 9 | source $basekit_root/ccl/latest/env/vars.sh --force
10 | 
11 | NUM_GPUS=2 # number of used GPU
12 | export USE_XETLA=OFF
13 | if grep -q "Core" /proc/cpuinfo; then
14 |     export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
15 | fi
16 | export TORCH_LLM_ALLREDUCE=0 # Different from PVC
17 | export BIGDL_IMPORT_IPEX=0
18 | mpirun -np $NUM_GPUS --prepend-rank python run.py
19 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-deepspeed-pvc.sh:
--------------------------------------------------------------------------------
 1 | export ZE_AFFINITY_MASK="0,1" # specify the used GPU
 2 | NUM_GPUS=2 # number of used GPU
 3 | export MASTER_ADDR=127.0.0.1
 4 | export FI_PROVIDER=tcp
 5 | export CCL_ATL_TRANSPORT=ofi
 6 | export CCL_ZE_IPC_EXCHANGE=sockets
 7 | 
 8 | export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 9 | basekit_root=/opt/intel/oneapi
10 | source $basekit_root/setvars.sh --force
11 | source $basekit_root/ccl/latest/env/vars.sh --force
12 | 
13 | export OMP_NUM_THREADS=$((56/$NUM_GPUS))
14 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
15 | export TORCH_LLM_ALLREDUCE=1
16 | export BIGDL_IMPORT_IPEX=0
17 | mpirun -np $NUM_GPUS --prepend-rank python run.py
18 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/alpaca.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json",
3 |     "description": "Template used by Alpaca-LoRA.",
4 |     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
5 |     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/alpaca.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json",
3 |     "description": "Template used by Alpaca-LoRA.",
4 |     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
5 |     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/gguf/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from .api import load_gguf_model
18 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-2-7b-chat-hf'
 3 |   - 'THUDM/chatglm2-6b'
 4 |   - 'THUDM/chatglm3-6b'
 5 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 6 |   - 'Qwen/Qwen-7B-Chat'
 7 | local_model_hub: '/mnt/disk1/models'
 8 | warm_up: 10
 9 | num_trials: 100
10 | num_beams: 1 # default to greedy search
11 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
12 | batch_size: 1 # default to 1
13 | in_out_pairs:
14 |   - '1024-512'
15 |   - '2048-512'
16 | test_api:
17 |   - "transformer_int4_gpu"  # on Intel GPU
18 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
19 | exclude:
20 |   - 'Qwen/Qwen-7B-Chat:2048'
21 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/common/templates/alpaca_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_legacy.json",
3 |     "description": "Legacy template, used by Original Alpaca repository.",
4 |     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:",
5 |     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml:
--------------------------------------------------------------------------------
 1 | imageName: intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT
 2 | trainerNum: 2
 3 | microBatchSize: 8
 4 | enableGradientCheckpoint: false # true will save more memory but increase latency
 5 | nfsServerIp: your_nfs_server_ip
 6 | nfsPath: a_nfs_shared_folder_path_on_the_server
 7 | dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory
 8 | modelSubPath: Llama-2-7b-chat-hf # a subpath of the model file (dir) under nfs directory
 9 | omp_num_threads: 48 # configure this value based on the number of CPU cores
10 | httpProxy: "your_http_proxy_like_http://xxx:xxxx_if_needed_else_empty"
11 | httpsProxy: "your_https_proxy_like_http://xxx:xxxx_if_needed_else_empty"
12 | 


--------------------------------------------------------------------------------
/docs/mddocs/PythonAPI/README.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM API
 2 | 
 3 | - [IPEX-LLM `transformers`-style API](./transformers.md)
 4 | 
 5 |   - [Hugging Face `transformers` AutoModel](./transformers.md#hugging-face-transformers-automodel)
 6 | 
 7 |     - AutoModelForCausalLM
 8 |     - AutoModel
 9 |     - AutoModelForSpeechSeq2Seq
10 |     - AutoModelForSeq2SeqLM
11 |     - AutoModelForSequenceClassification
12 |     - AutoModelForMaskedLM
13 |     - AutoModelForQuestionAnswering
14 |     - AutoModelForNextSentencePrediction
15 |     - AutoModelForMultipleChoice
16 |     - AutoModelForTokenClassification
17 | 
18 | - [IPEX-LLM PyTorch API](./optimize.md)
19 |   
20 |   - [Optimize Model](./optimize.md#optimize-model)
21 | 
22 |   - [Load Optimized Model](./optimize.md#load-optimized-model)


--------------------------------------------------------------------------------
/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/alpaca_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_legacy.json",
3 |     "description": "Legacy template, used by Original Alpaca repository.",
4 |     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:",
5 |     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/alpaca_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca_legacy.json",
3 |     "description": "Legacy template, used by Original Alpaca repository.",
4 |     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:",
5 |     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:",
6 |     "response_split": "### Response:"    
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/common/templates/vigogne.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/vigogne.json",
3 |     "description": "French template, used by Vigogne for finetuning.",
4 |     "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n",
5 |     "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n",
6 |     "response_split": "### Réponse:"
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/common/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from .prompter import Prompter
18 | from .util import *
19 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/serving/fastapi/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from .api_server import FastApp
18 | from .model_worker import ModelWorker


--------------------------------------------------------------------------------
/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/templates/vigogne.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/vigogne.json",
3 |     "description": "French template, used by Vigogne for finetuning.",
4 |     "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n",
5 |     "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n",
6 |     "response_split": "### Réponse:"
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-2-7b-chat-hf'
 3 |   - 'THUDM/chatglm2-6b'
 4 |   - 'THUDM/chatglm3-6b'
 5 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 6 |   - 'Qwen/Qwen-7B-Chat'
 7 | local_model_hub: '/mnt/disk1/models'
 8 | warm_up: 10
 9 | num_trials: 100
10 | num_beams: 1 # default to greedy search
11 | low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
12 | batch_size: 1 # default to 1
13 | in_out_pairs:
14 |   - '1024-512'
15 |   - '2048-512'
16 | test_api:
17 |   - "transformer_int4_gpu"  # on Intel GPU
18 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
19 | exclude:
20 |   - 'baichuan-inc/Baichuan2-7B-Chat:2048'
21 |   - 'Qwen/Qwen-7B-Chat:2048'


--------------------------------------------------------------------------------
/python/llm/example/GPU/HuggingFace/README.md:
--------------------------------------------------------------------------------
 1 | # Running HuggingFace models using IPEX-LLM on Intel GPU
 2 | 
 3 | This folder contains examples of running any HuggingFace model on IPEX-LLM:
 4 | 
 5 | - [LLM](LLM): examples of running large language models (LLaMA, Mistral, ChatGLM, Qwen, Baichuan, Mixtral, Gemma, etc.) using IPEX-LLM optimizations
 6 | - [Multimodal](Multimodal): examples of running large multimodal models (StableDiffusion models, Qwen-VL-Chat, glm-4v, etc.) using IPEX-LLM optimizations
 7 | - [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (FP8/INT8/FP4, etc.)
 8 | - [Save-Load](Save-Load): examples of saving and loading low-bit models
 9 | - [Advanced-Quantizations](Advanced-Quantizations): examples of loading GGUF/AWQ/GPTQ models
10 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/templates/vigogne.json:
--------------------------------------------------------------------------------
1 | {
2 |     "//": "This file is copied from https://github.com/tloen/alpaca-lora/blob/main/templates/vigogne.json",
3 |     "description": "French template, used by Vigogne for finetuning.",
4 |     "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n",
5 |     "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n",
6 |     "response_split": "### Réponse:"
7 | }
8 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/Deepspeed-AutoTP/run.sh:
--------------------------------------------------------------------------------
 1 | #/bin/bash
 2 | source ipex-llm-init
 3 | unset OMP_NUM_THREADS # deepspeed will set it for each instance automatically
 4 | source /opt/intel/oneccl/env/setvars.sh
 5 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
 6 | export WORLD_SIZE=2 # run 1 instance per SPR socket, thus 2 instances on 2 sockets, 96 cores
 7 | export MASTER_ADDR=127.0.0.1
 8 | export CCL_ZE_IPC_EXCHANGE=sockets
 9 | export DS_ACCELERATOR="cpu"
10 | export CCL_WORKER_AFFINITY=auto
11 | unset KMP_AFFINITY # deepspeed will set it for each instance automatically
12 | export FI_PROVIDER=tcp
13 | export CCL_ATL_TRANSPORT=ofi
14 | export CCL_PROCESS_LAUNCHER=none
15 | 
16 | deepspeed \
17 |   --bind_cores_to_rank \
18 |   --bind_core_list 0-95 \
19 |   deepspeed_autotp.py
20 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/run-deepspeed-spr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | source ipex-llm-init -t
 3 | unset OMP_NUM_THREADS # deepspeed will set it for each instance automatically
 4 | source /opt/intel/oneccl/env/setvars.sh
 5 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
 6 | export WORLD_SIZE=2 # run 1 instance per SPR socket, thus 2 instances on 2 sockets, 96 cores
 7 | export MASTER_ADDR=127.0.0.1
 8 | export CCL_ZE_IPC_EXCHANGE=sockets
 9 | export DS_ACCELERATOR="cpu"
10 | export CCL_WORKER_AFFINITY=auto
11 | unset KMP_AFFINITY # deepspeed will set it for each instance automatically
12 | export FI_PROVIDER=tcp
13 | export CCL_ATL_TRANSPORT=ofi
14 | export CCL_PROCESS_LAUNCHER=none
15 | 
16 | deepspeed \
17 |   --bind_cores_to_rank \
18 |   --bind_core_list 0-95 \
19 |   run.py
20 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-2-7b-chat-hf'
 3 |   - 'meta-llama/Llama-2-13b-chat-hf'
 4 |   - 'THUDM/chatglm2-6b'
 5 |   - 'THUDM/chatglm3-6b'
 6 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 7 |   - 'baichuan-inc/Baichuan2-13B-Chat'
 8 |   - 'Qwen/Qwen-14B-Chat'
 9 | local_model_hub: '/mnt/disk1/models'
10 | warm_up: 3
11 | num_trials: 50
12 | num_beams: 1 # default to greedy search
13 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
14 | batch_size: 1 # default to 1
15 | in_out_pairs:
16 |   - '1024-512'
17 |   - '2048-512'
18 | test_api:
19 |   - "transformer_int4"
20 |   # - "transformer_int4_gpu"  # on Intel GPU
21 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
22 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-inference-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference
 6 | 
 7 | set -e
 8 | 
 9 | echo "# Start testing inference"
10 | start=$(date "+%s")
11 | 
12 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_call_models.py -v
13 | 
14 | if [ -z "$THREAD_NUM" ]; then
15 |   THREAD_NUM=2
16 | fi
17 | export OMP_NUM_THREADS=$THREAD_NUM
18 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v
19 | python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v
20 | 
21 | now=$(date "+%s")
22 | time=$((now-start))
23 | 
24 | echo "Bigdl-llm tests finished"
25 | echo "Time used:$time seconds"
26 | 


--------------------------------------------------------------------------------
/docs/mddocs/DockerGuides/README.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM Docker Container User Guides
 2 | 
 3 | 
 4 | In this section, you will find guides related to using IPEX-LLM with Docker, covering how to:
 5 | 
 6 | - [Overview of IPEX-LLM Containers](./docker_windows_gpu.md)
 7 | 
 8 | - Inference in Python/C++  
 9 |   - [GPU Inference in Python with IPEX-LLM](./docker_pytorch_inference_gpu.md)
10 |   - [VSCode LLM Development with IPEX-LLM on Intel GPU](./docker_run_pytorch_inference_in_vscode.md)
11 |   - [llama.cpp/Ollama/Open-WebUI with IPEX-LLM on Intel GPU](./docker_cpp_xpu_quickstart.md)
12 | 
13 | - Serving
14 |   - [FastChat with IPEX-LLM on Intel GPU](./fastchat_docker_quickstart.md)
15 |   - [vLLM with IPEX-LLM on Intel GPU](./vllm_docker_quickstart.md)
16 |   - [vLLM with IPEX-LLM on Intel CPU](./vllm_cpu_docker_quickstart.md)
17 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/🐛bug-report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F41BBug report"
 3 | about: Report a bug or error
 4 | title: ''
 5 | labels: user issue
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug or error is.
12 | 
13 | **How to reproduce**
14 | Steps to reproduce the error:
15 | 1. ...
16 | 2. ...
17 | 3. ...
18 | 4. ...
19 | 
20 | **Screenshots**
21 | If applicable, add screenshots to help explain the problem
22 | 
23 | **Environment information**
24 | If possible, please attach the output of the environment check script, using:
25 | - https://github.com/intel/ipex-llm/blob/main/python/llm/scripts/env-check.bat, or
26 | - https://github.com/intel/ipex-llm/blob/main/python/llm/scripts/env-check.sh
27 | 
28 | **Additional context**
29 | Add any other context about the problem here.
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compile products
 2 | *.pyc
 3 | *.class
 4 | *.crc
 5 | *.o
 6 | 
 7 | # SBT, Maven specific
 8 | .cache
 9 | .history
10 | .lib/
11 | lib_managed/
12 | src_managed/
13 | project/boot/
14 | project/plugins/project/
15 | dependency-reduced-pom.xml
16 | 
17 | # IDE specific
18 | .scala_dependencies
19 | .worksheet
20 | *.iml
21 | .idea/
22 | .vscode/
23 | 
24 | # macOS specific
25 | .DS_Store
26 | 
27 | # data files
28 | model*.[0-9]*
29 | state*.[0-9]*
30 | 
31 | # other
32 | nohup.out
33 | *.log
34 | *.lock
35 | *.un~
36 | *.idx
37 | .ipynb_checkpoints/
38 | .project
39 | .settings/
40 | */.cache-main
41 | */.cache-tests
42 | */.classpath
43 | */.project
44 | */.settings/
45 | *.so
46 | *.so.*
47 | *.dylib
48 | __pycache__
49 | *.egg-info
50 | target
51 | build
52 | dist
53 | 
54 | # For readthedocs
55 | docs/readthedocs/requirements-doc.txt
56 | docs/readthedocs/_build/*


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/utils/modules.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import sys
18 | from types import ModuleType
19 | 
20 | 
21 | def insert_fake_module(name, doc=None):
22 |     m = ModuleType(name, doc)
23 |     m.__file__ = __file__
24 |     sys.modules[name] = m
25 |     return m
26 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt:
--------------------------------------------------------------------------------
 1 | # This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/requirements.txt
 2 | --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 3 | packaging==23.2
 4 | peft==0.10.0
 5 | tokenizers
 6 | bitsandbytes>=0.41.1
 7 | accelerate==0.23.0
 8 | deepspeed>=0.13.1
 9 | addict
10 | fire
11 | PyYAML>=6.0
12 | datasets
13 | #flash-attn==2.3.3
14 | sentencepiece
15 | wandb
16 | einops
17 | #xformers==0.0.22
18 | optimum==1.13.2
19 | hf_transfer
20 | colorama
21 | numba
22 | numpy>=1.24.4
23 | mlflow
24 | # qlora things
25 | bert-score==0.3.13
26 | evaluate==0.4.0
27 | rouge-score==0.1.2
28 | scipy
29 | scikit-learn>=1.5.0
30 | pynvml
31 | art
32 | fschat
33 | gradio>=4.19.2
34 | tensorboard
35 | 
36 | mamba-ssm==1.1.1
37 | 
38 | # remote filesystems
39 | s3fs
40 | gcsfs
41 | # adlfs
42 | 
43 | trl>=0.7.9, <=0.9.6
44 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/cpu/engine/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
17 | __all__ = [
18 |     "IPEXLLMAsyncLLMEngine",
19 |     "IPEXLLMLLMEngine",
20 |     "IPEXLLMClass",
21 |     "run_mp_engine",
22 | ]
23 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/PyTorch-Models/Model/README.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM INT4 Optimization for Large Language Model
 2 | You can use `optimize_model` API to accelerate general PyTorch models on Intel servers and PCs. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it.
 3 | 
 4 | ## Recommended Requirements
 5 | To run the examples, we recommend using Intel® Xeon® processors (server), or >= 12th Gen Intel® Core™ processor (client).
 6 | 
 7 | For OS, IPEX-LLM supports Ubuntu 20.04 or later, CentOS 7 or later, and Windows 10/11.
 8 | 
 9 | ## Best Known Configuration on Linux
10 | For better performance, it is recommended to set environment variables on Linux with the help of IPEX-LLM:
11 | ```bash
12 | pip install ipex-llm
13 | source ipex-llm-init
14 | ```
15 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py:
--------------------------------------------------------------------------------
 1 | from vllm.logger import init_logger
 2 | from vllm.v1.executor.ray_utils import RayWorkerWrapper
 3 | 
 4 | 
 5 | logger = init_logger(__name__)
 6 | 
 7 | 
 8 | class IPEXLLMV1Wrapper(RayWorkerWrapper):
 9 |     def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
10 |         super().__init__(*args, **kwargs)
11 |         from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
12 |         _ipex_llm_convert(load_in_low_bit=load_in_low_bit)
13 |         self.compiled_dag_cuda_device_set = False
14 | 
15 | 
16 | def get_ipex_llm_v1_wrapper(load_in_low_bit):
17 |     # The reason why we not using functools.partial is that
18 |     # ray seems not work well with it.
19 |     class WrapperWithLoadBit(IPEXLLMV1Wrapper):
20 |         def __init__(self, *args, **kwargs) -> None:
21 |             super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
22 | 
23 |     return WrapperWithLoadBit
24 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py:
--------------------------------------------------------------------------------
 1 | from vllm.logger import init_logger
 2 | from vllm.v1.executor.ray_utils import RayWorkerWrapper
 3 | 
 4 | 
 5 | logger = init_logger(__name__)
 6 | 
 7 | 
 8 | class IPEXLLMV1Wrapper(RayWorkerWrapper):
 9 |     def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
10 |         super().__init__(*args, **kwargs)
11 |         from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
12 |         _ipex_llm_convert(load_in_low_bit=load_in_low_bit)
13 |         self.compiled_dag_cuda_device_set = False
14 | 
15 | 
16 | def get_ipex_llm_v1_wrapper(load_in_low_bit):
17 |     # The reason why we not using functools.partial is that
18 |     # ray seems not work well with it.
19 |     class WrapperWithLoadBit(IPEXLLMV1Wrapper):
20 |         def __init__(self, *args, **kwargs) -> None:
21 |             super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
22 | 
23 |     return WrapperWithLoadBit
24 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/gptq/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/ggml/model/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/gptq/convert/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/langchain/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/llamaindex/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-check-function.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # wrapper for pytest command
 4 | # add this before `pytest ...` or `python -m pytest ...` to avoid unexpected exit code 127 caused by ipex on Windows
 5 | # ref: https://github.com/intel/intel-extension-for-pytorch/issues/634
 6 | pytest_check_error() {
 7 |   result=$(eval "$@" || echo "FINISH PYTEST")
 8 |   echo $result > pytest_check_error.log
 9 |   cat pytest_check_error.log
10 |   failed_lines=$(cat pytest_check_error.log | { grep failed || true; })
11 |   if [[ $failed_lines != "" ]]; then
12 |     exit 1
13 |   fi
14 |   rm pytest_check_error.log
15 | }
16 | 
17 | # wrapper for python command
18 | # add this before `python ...` to avoid unexpected exit code 127 caused by ipex on Windows
19 | # ref: https://github.com/intel/intel-extension-for-pytorch/issues/634
20 | ipex_workaround_wrapper() {
21 |     eval "$@" || ( [[ $? == 127 && $RUNNER_OS == "Windows" ]] && echo "EXIT CODE 127 DETECTED ON WINDOWS, IGNORE." || exit 1)
22 | }
23 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/langchain/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/awq/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_alpaca_with_1_arc_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export BIGDL_CHECK_DUPLICATE_IMPORT=0
18 | 
19 | # You can also set the remote model repository to a local model path
20 | python lora_finetune_chatglm.py \
21 |        yahma/alpaca-cleaned \
22 |        THUDM/chatglm3-6b  \
23 |        ./lora_config.yaml
24 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-llamaindex-tests-gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/llamaindex_gpu
 6 | 
 7 | if [[ $RUNNER_OS == "Linux" ]]; then
 8 |   export USE_XETLA=OFF
 9 |   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
10 | elif [[ $RUNNER_OS == "Windows" ]]; then
11 |   export ANALYTICS_ZOO_ROOT=$(cygpath -m ${ANALYTICS_ZOO_ROOT})
12 |   export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/llamaindex_gpu
13 |   export SYCL_CACHE_PERSISTENT=1
14 | fi
15 | 
16 | set -e
17 | 
18 | echo "# Start testing inference"
19 | start=$(date "+%s")
20 | 
21 | source ${ANALYTICS_ZOO_ROOT}/python/llm/test/run-llm-check-function.sh
22 | 
23 | pytest_check_error python -m pytest -s ${LLM_INFERENCE_TEST_DIR}
24 | 
25 | now=$(date "+%s")
26 | time=$((now-start))
27 | 
28 | echo "Bigdl-llm llamaindex gpu tests finished"
29 | echo "Time used:$time seconds"


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/LongBench/config/model2path.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "llama2-7b-chat-4k": "meta-llama/Llama-2-7b-chat-hf",
 3 |     "longchat-v1.5-7b-32k": "lmsys/longchat-7b-v1.5-32k",
 4 |     "xgen-7b-8k": "Salesforce/xgen-7b-8k-inst",
 5 |     "internlm-7b-8k": "internlm/internlm-chat-7b-8k",
 6 |     "chatglm2-6b": "THUDM/chatglm2-6b",
 7 |     "chatglm2-6b-32k": "THUDM/chatglm2-6b-32k",
 8 |     "chatglm3-6b-32k": "THUDM/chatglm3-6b-32k",
 9 |     "chatglm4-9b": "THUDM/glm-4-9b-chat",
10 |     "vicuna-v1.5-7b-16k": "lmsys/vicuna-7b-v1.5-16k",
11 |     "mistral-7B-instruct-v0.2": "mistralai/Mistral-7B-Instruct-v0.2",
12 |     "mistral-7B-instruct-v0.1": "mistralai/Mistral-7B-Instruct-v0.1",
13 |     "mixtral-8x7B-instruct-v0.1": "mistralai/Mixtral-8x7B-Instruct-v0.1",
14 |     "llama-2-7B-32k-instruct": "togethercomputer/Llama-2-7B-32K-Instruct",
15 |     "lwm-text-chat-1m": "LargeWorldModel/LWM-Text-Chat-1M",
16 |     "lwm-text-1m": "LargeWorldModel/LWM-Text-1M",
17 |     "qwen2-7b-instruct": "Qwen/Qwen2-7B-Instruct"
18 | }
19 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_advertise_gen_with_1_arc_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export BIGDL_CHECK_DUPLICATE_IMPORT=0
18 | 
19 | # You can also set the remote model repository to a local model path
20 | python lora_finetune_chatglm.py \
21 |        ./AdvertiseGen_fix  \
22 |        THUDM/chatglm3-6b  \
23 |        ./lora_config.yaml
24 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/xpu/engine/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine, IPEXLLMAsyncV1Engine, IPEXLLMLLMV1Engine
17 | __all__ = [
18 |     "IPEXLLMAsyncLLMEngine",
19 |     "IPEXLLMLLMEngine",
20 |     "IPEXLLMClass",
21 |     "IPEXLLMAsyncV1Engine",
22 |     "IPEXLLMLLMV1Engine",
23 |     "run_mp_engine",
24 | ]
25 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/ggml/model/bloom/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from .bloom import Bloom
23 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/cpu/ipex_llm_wrapper.py:
--------------------------------------------------------------------------------
 1 | from vllm.logger import init_logger
 2 | from vllm.executor.ray_utils import RayWorkerWrapper
 3 | 
 4 | 
 5 | logger = init_logger(__name__)
 6 | 
 7 | 
 8 | class IPEXLLMWrapper(RayWorkerWrapper):
 9 |     def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
10 |         super().__init__(*args, **kwargs)
11 |         from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
12 |         _ipex_llm_convert(load_in_low_bit=load_in_low_bit)
13 |         self.compiled_dag_cuda_device_set = False
14 | 
15 | 
16 | def get_ipex_llm_wrapper(load_in_low_bit):
17 |     # The reason why we not using functools.partial is that
18 |     # ray seems not work well with it.
19 |     class WrapperWithLoadBit(IPEXLLMWrapper):
20 |         def __init__(self, *args, **kwargs) -> None:
21 |             super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
22 | 
23 |     # a = functools.partial(IPEXLLMWrapper, load_in_low_bit=load_in_low_bit)
24 |     return WrapperWithLoadBit
25 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/vllm/xpu/ipex_llm_wrapper.py:
--------------------------------------------------------------------------------
 1 | from vllm.logger import init_logger
 2 | from vllm.executor.ray_utils import RayWorkerWrapper
 3 | 
 4 | 
 5 | logger = init_logger(__name__)
 6 | 
 7 | 
 8 | class IPEXLLMWrapper(RayWorkerWrapper):
 9 |     def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
10 |         super().__init__(*args, **kwargs)
11 |         from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
12 |         _ipex_llm_convert(load_in_low_bit=load_in_low_bit)
13 |         self.compiled_dag_cuda_device_set = False
14 | 
15 | 
16 | def get_ipex_llm_wrapper(load_in_low_bit):
17 |     # The reason why we not using functools.partial is that
18 |     # ray seems not work well with it.
19 |     class WrapperWithLoadBit(IPEXLLMWrapper):
20 |         def __init__(self, *args, **kwargs) -> None:
21 |             super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
22 | 
23 |     # a = functools.partial(IPEXLLMWrapper, load_in_low_bit=load_in_low_bit)
24 |     return WrapperWithLoadBit
25 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-langchain-tests-gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain_gpu
 6 | 
 7 | if [[ $RUNNER_OS == "Linux" ]]; then
 8 |   export USE_XETLA=OFF
 9 |   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
10 | elif [[ $RUNNER_OS == "Windows" ]]; then
11 |   export ANALYTICS_ZOO_ROOT=$(cygpath -m ${ANALYTICS_ZOO_ROOT})
12 |   export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/langchain_gpu
13 |   export SYCL_CACHE_PERSISTENT=1
14 | fi
15 | 
16 | export DEVICE='xpu'
17 | 
18 | set -e
19 | 
20 | echo "# Start testing inference"
21 | start=$(date "+%s")
22 | 
23 | source ${ANALYTICS_ZOO_ROOT}/python/llm/test/run-llm-check-function.sh
24 | 
25 | pytest_check_error python -m pytest -s ${LLM_INFERENCE_TEST_DIR}
26 | 
27 | now=$(date "+%s")
28 | time=$((now-start))
29 | 
30 | echo "Bigdl-llm langchain gpu tests finished"
31 | echo "Time used:$time seconds"


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/ggml/model/generation/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from .utils import GenerationMixin
23 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/ggml/model/starcoder/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from .starcoder import Starcoder
23 | 


--------------------------------------------------------------------------------
/docker/llm/serving/cpu/docker/model_adapter.py.patch:
--------------------------------------------------------------------------------
 1 | --- model_adapter.py.old        2024-03-05 15:08:47.169275336 +0800
 2 | +++ model_adapter.py    2024-03-05 15:10:13.434703674 +0800
 3 | @@ -1690,15 +1690,17 @@
 4 |          )
 5 |          # NOTE: if you use the old version of model file, please remove the comments below
 6 |          # config.use_flash_attn = False
 7 | -        self.float_set(config, "fp16")
 8 | +        # self.float_set(config, "fp16")
 9 |          generation_config = GenerationConfig.from_pretrained(
10 |              model_path, trust_remote_code=True
11 |          )
12 | +        from ipex_llm.transformers import AutoModelForCausalLM
13 |          model = AutoModelForCausalLM.from_pretrained(
14 |              model_path,
15 |              config=config,
16 |              low_cpu_mem_usage=True,
17 |              trust_remote_code=True,
18 | +            load_in_4bit=True,
19 |              **from_pretrained_kwargs,
20 |          ).eval()
21 |          if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
22 | 


--------------------------------------------------------------------------------
/python/llm/test/install/test_install.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | 
18 | import ipex_llm
19 | import pytest
20 | from unittest import TestCase
21 | 
22 | 
23 | class Test_LLM_Basics(TestCase):
24 | 
25 |     def test_naive(self):
26 |         from ipex_llm.ggml import quantize
27 |         from ipex_llm.utils.common import invalidInputError
28 |         pass
29 | 
30 | 
31 | if __name__ == '__main__':
32 |     pytest.main([__file__])
33 | 


--------------------------------------------------------------------------------
/python/llm/test/run-llm-convert-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 4 | export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 5 | export LLM_CONVERT_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/convert
 6 | 
 7 | set -e
 8 | 
 9 | echo "# Start testing convert"
10 | start=$(date "+%s")
11 | 
12 | # separate convert process to save disk space
13 | if [[ $1 == "llama" ]]; then
14 |   python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_llama"
15 | elif [[ $1 == "gptneox" ]]; then
16 |   python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_gptneox"
17 | elif [[ $1 == "bloom" ]]; then
18 |   python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_bloom"
19 | elif [[ $1 == "starcoder" ]]; then
20 |   python -m pytest -s ${LLM_CONVERT_TEST_DIR}/test_convert_model.py -k "test_convert_starcoder"
21 | fi
22 | 
23 | now=$(date "+%s")
24 | time=$((now-start))
25 | 
26 | echo "Bigdl-llm tests finished"
27 | echo "Time used:$time seconds"
28 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/ggml/model/llama/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from .llama_cpp import *
23 | from .llama import *
24 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/ggml/model/gptneox/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from .gptneox_cpp import *
23 | from .gptneox import *
24 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_gemma_2b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "google/gemma-2b-it" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca"
22 | 


--------------------------------------------------------------------------------
/docker/llm/serving/cpu/docker/start-notebook.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #
 4 | # Copyright 2016 The IPEX-LLM Authors.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #set -x
19 | port=${port:-12345}
20 | token=${token:-""}
21 | 
22 | while [ $# -gt 0 ]; do
23 | 
24 |    if [[ $1 == *"--"* ]]; then
25 |         param="${1/--/}"
26 |         declare $param="$2"
27 |    fi
28 | 
29 |   shift
30 | done
31 | 
32 | jupyter-lab --notebook-dir=/llm/ipex-llm-tutorial --ip=0.0.0.0 --port=$port --no-browser --NotebookApp.token=$token --allow-root
33 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM Transformers INT4 Optimization for Large Language Model
 2 | You can use IPEX-LLM to run any Huggingface Transformer models with INT4 optimizations on either servers or laptops. This directory contains example scripts to help you quickly get started using IPEX-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it.
 3 | 
 4 | ## Recommended Requirements
 5 | To run the examples, we recommend using Intel® Xeon® processors (server), or >= 12th Gen Intel® Core™ processor (client).
 6 | 
 7 | For OS, IPEX-LLM supports Ubuntu 20.04 or later (glibc>=2.17), CentOS 7 or later (glibc>=2.17), and Windows 10/11.
 8 | 
 9 | ## Best Known Configuration on Linux
10 | For better performance, it is recommended to set environment variables on Linux with the help of IPEX-LLM:
11 | ```bash
12 | pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu
13 | source ipex-llm-init
14 | ```
15 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "meta-llama/Llama-2-7b-hf" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca"
22 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_qwen15_7b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "Qwen/Qwen1.5-7B-Chat" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca"    
22 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_baichuan2_7b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "path/to/Baichuan2-7B-Chat" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca"
22 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama3_8b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "meta-llama/Meta-Llama-3-8B-Instruct" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca"
22 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=6
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 2 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" > training.log
27 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/Speculative-Decoding/Self-Speculation/README.md:
--------------------------------------------------------------------------------
 1 | # Self-Speculative Decoding for Large Language Model BF16 Inference using IPEX-LLM on Intel CPUs
 2 | You can use IPEX-LLM to run BF16 inference for any Huggingface Transformer model with ***self-speculative decoding*** on Intel CPUs. This directory contains example scripts to help you quickly get started to run some popular open-source models using self-speculative decoding. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it.
 3 | 
 4 | ## Verified Hardware Platforms
 5 | 
 6 | - Intel Xeon SPR server
 7 | 
 8 | ## Recommended Requirements
 9 | To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../../README.md#system-support) for more information. Make sure you have installed `ipex-llm` before:
10 | 
11 | ```bash
12 | pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu
13 | ```
14 | 
15 | Moreover, install IPEX 2.1.0, which can be done through `pip install intel_extension_for_pytorch==2.1.0`.
16 | 


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/utils/common/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from .log4Error import invalidInputError, invalidOperationError, MuteHFLogger
23 | from .lazyimport import LazyImport
24 | 


--------------------------------------------------------------------------------
/python/llm/example/CPU/Deepspeed-AutoTP/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # 1. install oneccl for intel mpi
 3 | # can skip this step if oneccl/oneapi is already installed on your machine
 4 | # report to https://github.com/oneapi-src/oneCCL if any issue
 5 | git clone https://github.com/oneapi-src/oneCCL.git
 6 | cd oneCCL
 7 | mkdir build
 8 | cd build
 9 | cmake ..
10 | make -j install
11 | mkdir -p /opt/intel/oneccl
12 | mv ./_install/env /opt/intel/oneccl
13 | # 2. install torch and ipex
14 | pip install torch==2.1.0
15 | pip install intel_extension_for_pytorch==2.1.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
16 | # install torchccl (oneccl binding for pytorch)
17 | pip install https://intel-extension-for-pytorch.s3.amazonaws.com/torch_ccl/cpu/oneccl_bind_pt-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl
18 | # 3. install deepspeed
19 | pip install deepspeed==0.11.1
20 | # 4. exclude intel deepspeed extension, which is only for XPU
21 | pip uninstall intel-extension-for-deepspeed
22 | # 5. install ipex-llm
23 | pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu
24 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-2-7b-chat-hf'
 3 |   - 'meta-llama/Llama-2-13b-chat-hf'
 4 |   - 'THUDM/chatglm2-6b'
 5 |   - 'THUDM/chatglm3-6b'
 6 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 7 |   - 'baichuan-inc/Baichuan2-13B-Chat'
 8 |   - 'Qwen/Qwen-14B-Chat'
 9 | local_model_hub: '/mnt/disk1/models'
10 | warm_up: 1
11 | num_trials: 3
12 | num_beams: 1 # default to greedy search
13 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
14 | batch_size: 1 # default to 1
15 | in_out_pairs:
16 |   - '32-32'
17 |   - '1024-128'
18 |   - '2048-256'
19 | test_api:
20 |   - "transformer_int4"
21 |   # - "native_int4"
22 |   # - "optimize_model"
23 |   # - "pytorch_autocast_bf16"
24 |   # - "ipex_fp16_gpu" # on Intel GPU
25 |   # - "transformer_int4_gpu"  # on Intel GPU
26 |   # - "optimize_model_gpu"  # on Intel GPU
27 |   # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
28 |   # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
29 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
30 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_relora_finetuning.py \
19 |     --base_model "meta-llama/Llama-2-7b-hf" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-relora-alpaca" \
22 |     --relora_steps 300 \
23 |     --relora_warmup_steps 10
24 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-isatty.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) Yasuhiro MATSUMOTO <mattn.jp@gmail.com>
2 | 
3 | MIT License (Expat)
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --micro_batch_size 2 \
20 |     --batch_size 128 \
21 |     --base_model "meta-llama/Llama-2-7b-hf" \
22 |     --data_path "yahma/alpaca-cleaned" \
23 |     --output_dir "./ipex-llm-qlora-alpaca"
24 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --micro_batch_size 8 \
20 |     --batch_size 128 \
21 |     --base_model "meta-llama/Llama-2-7b-hf" \
22 |     --data_path "yahma/alpaca-cleaned" \
23 |     --output_dir "./ipex-llm-qlora-alpaca"
24 | 


--------------------------------------------------------------------------------
/python/llm/portable-zip/README-ui.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM Portable Zip with Web-UI For Windows: User Guide
 2 | 
 3 | ## Introduction
 4 | 
 5 | This portable zip includes everything you need to run an LLM with IPEX-LLM optimizations and chat with it in Web-UI. Please refer to [How to use](#how-to-use) section to get started.
 6 | 
 7 | ### 6B model running on an Intel 11-Gen Core PC (real-time screen capture)
 8 | 
 9 | 
10 | ### Verified Models
11 | 
12 | - ChatGLM2-6b
13 | 
14 | ## How to use
15 | 
16 | 1. Download the zip from link [here]().
17 | 2. (Optional) You could also build the zip on your own. Run `setup.bat --ui` and it will generate the zip file.
18 | 3. Unzip `ipex-llm.zip`.
19 | 4. Download the model to your computer.
20 | 5. Go into the unzipped folder and double click `chat-ui.bat`. Input the path of the model (e.g. `path\to\model`, note that there's no slash at the end of the path). Press Enter and wait until it shows `All service started. Visit 127.0.0.1:7860 in browser to chat.`. Do NOT close the terminal window!
21 | 6. Visit `127.0.0.1:7860` in your browser and enjoy chatting!
22 | 7. If you want to stop the program, just close the terminal window.


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/transformers/patches.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | #
17 | 
18 | from typing import List
19 | from transformers.dynamic_module_utils import get_imports
20 | from ipex_llm.utils.ipex_importer import IPEXImporter
21 | 
22 | 
23 | def patch_flash_attn_import(filename: str) -> List[str]:
24 |     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
25 |     imports = get_imports(filename)
26 |     if "flash_attn" in imports:
27 |         imports.remove("flash_attn")
28 |     return imports
29 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "meta-llama/Llama-2-13b-hf" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca" \
22 |     --micro_batch_size 8 \
23 |     --batch_size 128
24 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/cpu-perf-test.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'meta-llama/Llama-2-7b-chat-hf'
 3 |   - 'meta-llama/Llama-2-13b-chat-hf'
 4 |   - 'THUDM/chatglm2-6b'
 5 |   - 'THUDM/chatglm3-6b'
 6 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 7 |   - 'baichuan-inc/Baichuan2-13B-Chat'
 8 |   # - 'Qwen/Qwen-14B-Chat' # requires transformers < 4.37.0
 9 | local_model_hub: '/mnt/disk1/models'
10 | warm_up: 1
11 | num_trials: 3
12 | num_beams: 1 # default to greedy search
13 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
14 | batch_size: 1 # default to 1
15 | in_out_pairs:
16 |   - '32-32'
17 |   - '1024-128'
18 |   - '2048-256'
19 | test_api:
20 |   - "transformer_int4"
21 |   # - "native_int4"
22 |   # - "optimize_model"
23 |   # - "pytorch_autocast_bf16"
24 |   # - "ipex_fp16_gpu" # on Intel GPU
25 |   # - "transformer_int4_gpu"  # on Intel GPU
26 |   # - "optimize_model_gpu"  # on Intel GPU
27 |   # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
28 |   # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
29 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
30 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-tablewriter.txt:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2014 by Oleku Konko
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_chatglm3_6b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qlora_finetuning.py \
19 |     --base_model "THUDM/chatglm3-6b" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --lora_target_modules '[query_key_value,dense,dense_h_to_4h,dense_4h_to_h]' \
22 |     --output_dir "./ipex-llm-qlora-alpaca"
23 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=6
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 2 \
23 |        python -u ./alpaca_relora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-relora-alpaca" \
27 |        --relora_steps 300 \
28 |        --relora_warmup_steps 10 > training.log
29 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=28
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 4 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --batch_size 128 > training.log
29 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 2 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --batch_size 128 > training.log
29 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 8 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --batch_size 128 > training.log
29 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-hm.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Xuanyi Chew
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 2 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-13b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --batch_size 128 > training.log
29 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 8 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-13b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --batch_size 128 > training.log
29 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/ceval/evaluators/evaluator.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | class Evaluator:
18 |     def __init__(self, choices, model_path, device, qtype):
19 |         self.choices = choices
20 |         self.model_path = model_path
21 |         self.device = device
22 |         self.qtype = qtype
23 | 
24 |     def format_example(self, line, **kwargs):
25 |         pass
26 |     
27 |     def eval_subject(self, subject_name, test_df, eval_type, **kwargs):
28 |         pass
29 | 
30 |     def extract_answer(self, response, row, **kwargs):
31 |         pass
32 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-gin-contrib-cors.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Gin-Gonic
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-urn.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Leonardo Di Donato
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-gorgonia.org-vecf32.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Chewxy
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-mimetype.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Gabriel Vasile
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-uniseg.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Oliver Kuederle
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/src/ipex_llm/models.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # This would makes sure Python is aware there is more than one sub-package within bigdl,
18 | # physically located elsewhere.
19 | # Otherwise there would be module not found error in non-pip's setting as Python would
20 | # only search the first bigdl package and end up finding only one sub-package.
21 | 
22 | from ipex_llm.ggml.model.llama import Llama
23 | from ipex_llm.ggml.model.gptneox import Gptneox
24 | from ipex_llm.ggml.model.bloom import Bloom
25 | from ipex_llm.ggml.model.starcoder import Starcoder
26 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-d4l3k-go-bfloat16.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Tristan Rice
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-gorgonia.org-vecf64.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Xuanyi Chew
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-validator.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Dean Karn
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_alpaca_with_2_arc_cards.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=6
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | export BIGDL_CHECK_DUPLICATE_IMPORT=0
22 | 
23 | # You can also set the remote model repository to a local model path
24 | mpirun -n 2 \
25 |     python lora_finetune_chatglm.py \
26 |         yahma/alpaca-cleaned  \
27 |         THUDM/chatglm3-6b  \
28 |         ./lora_config.yaml \
29 | 	./deepspeed_config.json
30 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/core-perf-test.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'THUDM/chatglm2-6b'
 3 |   - 'THUDM/chatglm3-6b'
 4 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 5 |   - 'internlm/internlm-chat-7b'
 6 |  # - 'Qwen/Qwen-7B-Chat'  # requires transformers < 4.37.0
 7 |   - 'BAAI/AquilaChat2-7B'
 8 |   - 'meta-llama/Llama-2-7b-chat-hf'
 9 |   - 'WisdomShell/CodeShell-7B'
10 |   - 'tiiuae/falcon-7b-instruct-with-patch'
11 | local_model_hub: 'D:\llm-models'
12 | warm_up: 1
13 | num_trials: 3
14 | num_beams: 1 # default to greedy search
15 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
16 | batch_size: 1 # default to 1
17 | in_out_pairs:
18 |   - '32-32'
19 |   - '1024-128'
20 | test_api:
21 |   - "transformer_int4"
22 |   # - "native_int4"
23 |   # - "optimize_model"
24 |   # - "pytorch_autocast_bf16"
25 |   # - "ipex_fp16_gpu" # on Intel GPU
26 |   # - "transformer_int4_gpu"  # on Intel GPU
27 |   # - "optimize_model_gpu"  # on Intel GPU
28 |   # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
29 |   # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
30 | cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
31 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-gin.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Manuel Martínez-Almeida
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-runewidth.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Yasuhiro Matsumoto
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-sse.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Manuel Martínez-Almeida
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/chatglm_finetune/lora_finetuning_chatglm3_6b_on_advertise_gen_with_2_arc_cards.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=6
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | export BIGDL_CHECK_DUPLICATE_IMPORT=0
22 | 
23 | # You can also set the remote model repository to a local model path
24 | mpirun -n 2 \
25 |     python lora_finetune_chatglm.py \
26 |         ./AdvertiseGen_fix  \
27 |         THUDM/chatglm3-6b  \
28 |         ./lora_config.yaml \
29 | 	./deepspeed_config.json
30 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-playground-locales.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Go Playground
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-agnivade-levenshtein.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Agniva De Sarker
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-gin-contrib-sse.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Manuel Martínez-Almeida
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-float16.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019-present Montgomery Edwards⁴⁴⁸ and Faye Amacker
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=12
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 3 \
23 |        python -u ./alpaca_qlora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-qlora-alpaca" \
27 |        --gradient_checkpointing False \
28 |        --micro_batch_size 2 \
29 |        --batch_size 128 > training.log
30 | 


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-playground-universal-translator.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Go Playground
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-toml.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | go-toml v2
 4 | Copyright (c) 2021 - 2023 Thomas Pelletier
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-go-codec.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2012-2020 Ugorji Nwoke.
 4 | All rights reserved.
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.


--------------------------------------------------------------------------------
/python/llm/tpp/licenses/LICENSE-zlib.txt:
--------------------------------------------------------------------------------
 1 | /* zlib.h -- interface of the 'zlib' general purpose compression library
 2 |   version 1.3.1, January 22nd, 2024
 3 | 
 4 |   Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 5 | 
 6 |   This software is provided 'as-is', without any express or implied
 7 |   warranty.  In no event will the authors be held liable for any damages
 8 |   arising from the use of this software.
 9 | 
10 |   Permission is granted to anyone to use this software for any purpose,
11 |   including commercial applications, and to alter it and redistribute it
12 |   freely, subject to the following restrictions:
13 | 
14 |   1. The origin of this software must not be misrepresented; you must not
15 |      claim that you wrote the original software. If you use this software
16 |      in a product, an acknowledgment in the product documentation would be
17 |      appreciated but is not required.
18 |   2. Altered source versions must be plainly marked as such, and must not be
19 |      misrepresented as being the original software.
20 |   3. This notice may not be removed or altered from any source distribution.
21 | 
22 |   Jean-loup Gailly        Mark Adler
23 |   jloup@gzip.org          madler@alumni.caltech.edu
24 | 
25 | */


--------------------------------------------------------------------------------
/python/llm/example/GPU/HuggingFace/LLM/yuan2/yuan2-2B-instruct/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_from_model_config":true,
 3 |   "architectures": [
 4 |     "YuanForCausalLM"
 5 |   ],
 6 |   "auto_map":{
 7 |           "AutoConfig":"configuration_yuan.YuanConfig",
 8 |           "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM"
 9 |   },
10 |   "tokenizer_class":"YuanTokenizer",
11 |   "hidden_act": "silu",
12 |   "hidden_size": 2048,
13 |   "initializer_range": 0.02,
14 |   "intermediate_size": 8192,
15 |   "max_position_embeddings": 8192,
16 |   "model_type": "yuan",
17 |   "num_attention_heads": 32,
18 |   "num_hidden_layers": 24,
19 |   "rms_norm_eps": 1e-06,
20 |   "dropout": 0.1,
21 |   "tie_word_embeddings": true,
22 |   "torch_dtype": "bfloat16",
23 |   "transformers_version": "4.30.0.dev0",
24 |   "use_cache": true,
25 |   "causal_mask": true,
26 |   "use_flash_attention": false,
27 |   "reset_attention_mask": true,
28 |   "reset_position_ids": true,
29 |   "use_loss_mask": false,
30 |   "eod_token": 77185,
31 |   "sep_token": 77187,
32 |   "eod_token_id": 77185,
33 |   "sep_token_id": 77185,
34 |   "pad_token_id": 77185,
35 |   "bos_token_id": 77185,
36 |   "eos_token_id": 77185,
37 |   "mask_token_id": 77185,
38 |   "vocab_size": 135040
39 | }


--------------------------------------------------------------------------------
/python/llm/example/CPU/PyTorch-Models/Model/yuan2/yuan2-2B-instruct/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_from_model_config":true,
 3 |   "architectures": [
 4 |     "YuanForCausalLM"
 5 |   ],
 6 |   "auto_map":{
 7 |           "AutoConfig":"configuration_yuan.YuanConfig",
 8 |           "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM"
 9 |   },
10 |   "tokenizer_class":"YuanTokenizer",
11 |   "hidden_act": "silu",
12 |   "hidden_size": 2048,
13 |   "initializer_range": 0.02,
14 |   "intermediate_size": 8192,
15 |   "max_position_embeddings": 8192,
16 |   "model_type": "yuan",
17 |   "num_attention_heads": 32,
18 |   "num_hidden_layers": 24,
19 |   "rms_norm_eps": 1e-06,
20 |   "dropout": 0.1,
21 |   "tie_word_embeddings": true,
22 |   "torch_dtype": "bfloat16",
23 |   "transformers_version": "4.30.0.dev0",
24 |   "use_cache": true,
25 |   "causal_mask": true,
26 |   "use_flash_attention": false,
27 |   "reset_attention_mask": true,
28 |   "reset_position_ids": true,
29 |   "use_loss_mask": false,
30 |   "eod_token": 77185,
31 |   "sep_token": 77187,
32 |   "eod_token_id": 77185,
33 |   "sep_token_id": 77185,
34 |   "pad_token_id": 77185,
35 |   "bos_token_id": 77185,
36 |   "eos_token_id": 77185,
37 |   "mask_token_id": 77185,
38 |   "vocab_size": 135040
39 | }


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_lora_finetuning.py \
19 |     --micro_batch_size 8 \
20 |     --batch_size 128 \
21 |     --base_model "meta-llama/Llama-2-7b-hf" \
22 |     --data_path "yahma/alpaca-cleaned" \
23 |     --output_dir "./ipex-llm-lora-alpaca" \
24 |     --gradient_checkpointing True \
25 |     --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj']"
26 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 2 \
23 |        python -u ./alpaca_relora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-relora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --relora_steps 300 \
29 |        --relora_warmup_steps 10 \
30 |        --batch_size 128 > relora_training.log
31 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 8 \
23 |        python -u ./alpaca_relora_finetuning.py \
24 |        --base_model "meta-llama/Llama-2-7b-hf" \
25 |        --data_path "yahma/alpaca-cleaned" \
26 |        --output_dir "./ipex-llm-relora-alpaca" \
27 |        --micro_batch_size 8 \
28 |        --relora_steps 300 \
29 |        --relora_warmup_steps 10 \
30 |        --batch_size 128 > relora_training.log
31 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/PyTorch-Models/Model/yuan2/yuan2-2B-instruct/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_from_model_config":true,
 3 |   "architectures": [
 4 |     "YuanForCausalLM"
 5 |   ],
 6 |   "auto_map":{
 7 |           "AutoConfig":"configuration_yuan.YuanConfig",
 8 |           "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM"
 9 |   },
10 |   "tokenizer_class":"YuanTokenizer",
11 |   "hidden_act": "silu",
12 |   "hidden_size": 2048,
13 |   "initializer_range": 0.02,
14 |   "intermediate_size": 8192,
15 |   "max_position_embeddings": 8192,
16 |   "model_type": "yuan",
17 |   "num_attention_heads": 32,
18 |   "num_hidden_layers": 24,
19 |   "rms_norm_eps": 1e-06,
20 |   "dropout": 0.1,
21 |   "tie_word_embeddings": true,
22 |   "torch_dtype": "bfloat16",
23 |   "transformers_version": "4.30.0.dev0",
24 |   "use_cache": true,
25 |   "causal_mask": true,
26 |   "use_flash_attention": false,
27 |   "reset_attention_mask": true,
28 |   "reset_position_ids": true,
29 |   "use_loss_mask": false,
30 |   "eod_token": 77185,
31 |   "sep_token": 77187,
32 |   "eod_token_id": 77185,
33 |   "sep_token_id": 77185,
34 |   "pad_token_id": 77185,
35 |   "bos_token_id": 77185,
36 |   "eos_token_id": 77185,
37 |   "mask_token_id": 77185,
38 |   "vocab_size": 135040
39 | }


--------------------------------------------------------------------------------
/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/yuan2-2B-instruct/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_from_model_config":true,
 3 |   "architectures": [
 4 |     "YuanForCausalLM"
 5 |   ],
 6 |   "auto_map":{
 7 |           "AutoConfig":"configuration_yuan.YuanConfig",
 8 |           "AutoModelForCausalLM":"yuan_hf_model.YuanForCausalLM"
 9 |   },
10 |   "tokenizer_class":"YuanTokenizer",
11 |   "hidden_act": "silu",
12 |   "hidden_size": 2048,
13 |   "initializer_range": 0.02,
14 |   "intermediate_size": 8192,
15 |   "max_position_embeddings": 8192,
16 |   "model_type": "yuan",
17 |   "num_attention_heads": 32,
18 |   "num_hidden_layers": 24,
19 |   "rms_norm_eps": 1e-06,
20 |   "dropout": 0.1,
21 |   "tie_word_embeddings": true,
22 |   "torch_dtype": "bfloat16",
23 |   "transformers_version": "4.30.0.dev0",
24 |   "use_cache": true,
25 |   "causal_mask": true,
26 |   "use_flash_attention": false,
27 |   "reset_attention_mask": true,
28 |   "reset_position_ids": true,
29 |   "use_loss_mask": false,
30 |   "eod_token": 77185,
31 |   "sep_token": 77187,
32 |   "eod_token_id": 77185,
33 |   "sep_token_id": 77185,
34 |   "pad_token_id": 77185,
35 |   "bos_token_id": 77185,
36 |   "eos_token_id": 77185,
37 |   "mask_token_id": 77185,
38 |   "vocab_size": 135040
39 | }


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'THUDM/chatglm3-6b'
 3 |   - 'THUDM/glm-4-9b-chat'
 4 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 5 |   - 'meta-llama/Llama-2-7b-chat-hf'
 6 |   - 'meta-llama/Llama-2-13b-chat-hf'
 7 |   - 'meta-llama/Meta-Llama-3-8B-Instruct'
 8 |   - 'mistralai/Mistral-7B-Instruct-v0.2'
 9 |   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
10 |   - '01-ai/Yi-6B-Chat'
11 |   - 'openbmb/MiniCPM-1B-sft-bf16'
12 |   - 'openbmb/MiniCPM-2B-sft-bf16'
13 |   - 'Qwen/Qwen1.5-7B-Chat'
14 |   - 'Qwen/Qwen2-1.5B-Instruct'
15 |   - 'Qwen/Qwen2-7B-Instruct'
16 |   - 'microsoft/Phi-3-mini-4k-instruct'
17 |   - 'microsoft/Phi-3-mini-128k-instruct'
18 |   - 'microsoft/phi-3-vision-128k-instruct'
19 |   - 'openbmb/MiniCPM-V-2_6'
20 | local_model_hub: 'path to your local model hub'
21 | warm_up: 1
22 | num_trials: 3
23 | num_beams: 1 # default to greedy search
24 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
25 | batch_size: 1 # default to 1
26 | in_out_pairs:
27 |   - '3072-384'
28 | test_api:
29 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
30 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
31 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'THUDM/chatglm3-6b'
 3 |   - 'THUDM/glm-4-9b-chat'
 4 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 5 |   - 'meta-llama/Llama-2-7b-chat-hf'
 6 |   - 'meta-llama/Llama-2-13b-chat-hf'
 7 |   - 'meta-llama/Meta-Llama-3-8B-Instruct'
 8 |   - 'mistralai/Mistral-7B-Instruct-v0.2'
 9 |   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
10 |   - '01-ai/Yi-6B-Chat'
11 |   - 'openbmb/MiniCPM-1B-sft-bf16'
12 |   - 'openbmb/MiniCPM-2B-sft-bf16'
13 |   - 'Qwen/Qwen1.5-7B-Chat'
14 |   - 'Qwen/Qwen2-1.5B-Instruct'
15 |   - 'Qwen/Qwen2-7B-Instruct'
16 |   - 'microsoft/Phi-3-mini-4k-instruct'
17 |   - 'microsoft/Phi-3-mini-128k-instruct'
18 |   - 'microsoft/phi-3-vision-128k-instruct'
19 |   - 'openbmb/MiniCPM-V-2_6'
20 | local_model_hub: 'path to your local model hub'
21 | warm_up: 1
22 | num_trials: 3
23 | num_beams: 1 # default to greedy search
24 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
25 | batch_size: 1 # default to 1
26 | in_out_pairs:
27 |   - '4096-512'
28 | test_api:
29 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
30 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
31 | 


--------------------------------------------------------------------------------
/python/llm/dev/benchmark/all-in-one/prompt/continuation/256.txt:
--------------------------------------------------------------------------------
1 | Once upon a time, there was a young girl named Samantha who lived with her parents in a small town. Samantha had always dreamed of traveling the world and experiencing new cultures and adventures. But as much as she yearned for something more than what her town could offer, it felt out of reach for a girl like her.
2 | One day, while browsing through the pages of a travel magazine, Samantha came across an advertisement that seemed too good to be true. It was an invitation to travel to a faraway land and experience all the adventures she had ever dreamed of. The only catch was that she needed to attend a special briefing beforehand in order to ensure her safety during the trip.
3 | Samantha quickly scribbled down the information on the back of the advertisement and resolved to attend the briefing at the end of the week. As time passed, Samantha became more and more excited about the prospect of traveling abroad. She even began putting together a packing list, imagining all the things she would need for her adventure.
4 | Finally, the day of the briefing arrived. Samantha made sure to arrive early so that she could go through security clearance before anyone else did. As she waited in line, she couldn't help but
5 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_pvc_1110_4_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=14
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 4 \
23 |     python -u ./alpaca_lora_finetuning.py \
24 |     --micro_batch_size 8 \
25 |     --batch_size 128 \
26 |     --base_model "meta-llama/Llama-2-7b-hf" \
27 |     --data_path "yahma/alpaca-cleaned" \
28 |     --output_dir "./ipex-llm-lora-alpaca" \
29 |     --gradient_checkpointing True \
30 |     --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']"
31 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_pvc_1550_4_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | export MASTER_ADDR=127.0.0.1
18 | export OMP_NUM_THREADS=56
19 | export FI_PROVIDER=tcp
20 | export CCL_ATL_TRANSPORT=ofi
21 | 
22 | mpirun -n 8 \
23 |     python -u ./alpaca_lora_finetuning.py \
24 |     --micro_batch_size 8 \
25 |     --batch_size 128 \
26 |     --base_model "meta-llama/Llama-2-7b-hf" \
27 |     --data_path "yahma/alpaca-cleaned" \
28 |     --output_dir "./ipex-llm-lora-alpaca" \
29 |     --gradient_checkpointing False \
30 |     --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']"
31 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/qalora_finetune_llama2_7b_arc_1_card.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_qalora_finetuning.py \
19 |     --base_model "meta-llama/Llama-2-7b-hf" \
20 |     --data_path "yahma/alpaca-cleaned" \
21 |     --output_dir "./ipex-llm-qlora-alpaca" \
22 |     --learning_rate 9e-5 \
23 |     --micro_batch_size 2 \
24 |     --batch_size 128 \
25 |     --lora_r 8 \
26 |     --lora_alpha 16 \
27 |     --lora_dropout 0.05 \
28 |     --val_set_size 2000
29 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'THUDM/chatglm3-6b'
 3 |   - 'THUDM/glm-4-9b-chat'
 4 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 5 |   - 'baichuan-inc/Baichuan2-13B-Chat'
 6 |   - 'meta-llama/Llama-2-7b-chat-hf'
 7 |   - 'meta-llama/Llama-2-13b-chat-hf'
 8 |   - 'meta-llama/Meta-Llama-3-8B-Instruct'
 9 |   - 'mistralai/Mistral-7B-Instruct-v0.2'
10 |   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
11 |   - '01-ai/Yi-6B-Chat'
12 |   - 'openbmb/MiniCPM-1B-sft-bf16'
13 |   - 'openbmb/MiniCPM-2B-sft-bf16'
14 |   - 'Qwen/Qwen1.5-7B-Chat'
15 |   - 'Qwen/Qwen2-1.5B-Instruct'
16 |   - 'Qwen/Qwen2-7B-Instruct'
17 |   - 'microsoft/Phi-3-mini-4k-instruct'
18 |   - 'microsoft/Phi-3-mini-128k-instruct'
19 |   - 'microsoft/phi-3-vision-128k-instruct'
20 | local_model_hub: 'path to your local model hub'
21 | warm_up: 1
22 | num_trials: 3
23 | num_beams: 1 # default to greedy search
24 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
25 | batch_size: 1 # default to 1
26 | in_out_pairs:
27 |   - '1024-128'
28 | test_api:
29 |   - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
30 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
31 | 


--------------------------------------------------------------------------------
/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml:
--------------------------------------------------------------------------------
 1 | repo_id:
 2 |   - 'THUDM/chatglm3-6b'
 3 |   - 'THUDM/glm-4-9b-chat'
 4 |   - 'baichuan-inc/Baichuan2-7B-Chat'
 5 |   - 'baichuan-inc/Baichuan2-13B-Chat'
 6 |   - 'meta-llama/Llama-2-7b-chat-hf'
 7 |   - 'meta-llama/Llama-2-13b-chat-hf'
 8 |   - 'meta-llama/Meta-Llama-3-8B-Instruct'
 9 |   - 'mistralai/Mistral-7B-Instruct-v0.2'
10 |   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
11 |   - '01-ai/Yi-6B-Chat'
12 |   - 'openbmb/MiniCPM-1B-sft-bf16'
13 |   - 'openbmb/MiniCPM-2B-sft-bf16'
14 |   - 'Qwen/Qwen1.5-7B-Chat'
15 |   - 'Qwen/Qwen2-1.5B-Instruct'
16 |   - 'Qwen/Qwen2-7B-Instruct'
17 |   - 'microsoft/Phi-3-mini-4k-instruct'
18 |   - 'microsoft/Phi-3-mini-128k-instruct'
19 |   - 'microsoft/phi-3-vision-128k-instruct'
20 |   - 'openbmb/MiniCPM-V-2_6'
21 | local_model_hub: 'path to your local model hub'
22 | warm_up: 3
23 | num_trials: 5
24 | num_beams: 1 # default to greedy search
25 | low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
26 | batch_size: 1 # default to 1
27 | in_out_pairs:
28 |   - '32-32'
29 | test_api:
30 |   - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
31 | cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
32 | 


--------------------------------------------------------------------------------
/python/llm/example/GPU/LLM-Finetuning/LoRA/lora_finetune_llama2_7b_pvc_1550_1_tile.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2016 The BigDL Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | # You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
18 | python ./alpaca_lora_finetuning.py \
19 |     --micro_batch_size 8 \
20 |     --batch_size 128 \
21 |     --base_model "meta-llama/Llama-2-7b-hf" \
22 |     --data_path "yahma/alpaca-cleaned" \
23 |     --output_dir "./ipex-llm-lora-alpaca" \
24 |     --gradient_checkpointing True \
25 |     --lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']"
26 | 


--------------------------------------------------------------------------------