├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── ai-compiler ├── .DS_Store ├── README.md ├── Treebeard │ ├── .DS_Store │ ├── 10-TableI-1.png │ ├── 10-TableII-1.png │ ├── 9-Figure6-1.png │ ├── README.md │ ├── TREEBEARD IR lowering 和 optimization 细节.png │ └── TREEBEARD编译器结构.png ├── cuda │ └── README.md ├── treelit │ ├── README.md │ └── xgb.md └── triton-lang │ └── README.md ├── ai-framework ├── .DS_Store ├── README.md ├── TensorRT-Model-Optimizer.md ├── cuda │ └── README.md ├── deepspeed │ ├── 1.DeepSpeed入门.md │ ├── 2.安装DeepSpeed.md │ ├── 3.基于CIFAR-10使用DeepSpeed进行分布式训练 .md │ ├── DeepSpeed配置JSON文件.md │ ├── README.md │ ├── config-json │ │ ├── README.md │ │ └── deepspeed-nvme.md │ ├── deepspeed-slurm.md │ ├── hello_bert │ │ ├── README.md │ │ ├── train_bert.py │ │ └── train_bert_ds.py │ └── training │ │ └── pipeline_parallelism │ │ └── README.md ├── dlrover.md ├── huggingface-accelerate │ └── README.md ├── huggingface-peft │ └── README.md ├── huggingface-transformers │ ├── API.md │ ├── FSDP.md │ └── README.md ├── huggingface-trl │ └── README.md ├── jax │ ├── README.md │ └── reference.md ├── llama-cpp │ └── README.md ├── megatron-deepspeed │ └── README.md ├── megatron-lm │ └── README.md ├── mxnet │ ├── README.md │ ├── mnist.py │ ├── mxnet_cnn_mnist.py │ ├── mxnet_mlp_mnist.py │ ├── oneflow_cnn_mnist.py │ ├── oneflow_mlp_mnist.py │ └── reference.md ├── oneflow │ ├── README.md │ ├── oneflow_mlp_mnist.py │ └── reference.md ├── openai-triton │ └── README.md ├── paddlepaddle │ ├── README.md │ └── reference.md ├── pai-megatron-patch │ ├── .DS_Store │ └── README.md ├── pai-torchacc.md ├── pytorch │ ├── README.md │ ├── install.md │ └── reference.md ├── tensorflow │ ├── README.md │ └── reference.md ├── transformer-engine │ └── mnist │ │ ├── README.md │ │ ├── main.py │ │ └── main_stat.py └── unsloth-微调.md ├── ai-infra ├── .DS_Store ├── ai-cluster │ └── README.md ├── ai-hardware │ ├── .DS_Store │ ├── AI芯片软件生态.md │ ├── CUDA.md │ ├── GPU-network.md │ ├── GPU相关环节变量.md │ ├── OEM-DGX.md │ ├── README.md │ ├── TSMC-台积电.md │ ├── cuda镜像.md │ ├── gpudirect.md │ └── 硬件对比.md ├── communication.md ├── 存储 │ ├── README.md │ ├── REF.md │ ├── nvme-ssd.md │ ├── 固态硬盘.md │ └── 存储.md ├── 算力 │ ├── AI芯片.md │ ├── GPU工作原理.md │ ├── NVIDIA-GPU型号.md │ ├── 推理芯片.md │ └── 昇腾NPU.md └── 网络 │ ├── .DS_Store │ ├── HPC性能测试.md │ ├── IB-docker.md │ ├── IB流量监控.md │ ├── IB软件.md │ ├── InfiniBand.md │ ├── NCCL.md │ ├── README.md │ ├── REF.md │ ├── Spine-Leaf和InfiniBand网络架构区别简述.md │ ├── nccl-test-集合通讯的性能测试.md │ ├── nvbandwidth.md │ ├── pic │ ├── .DS_Store │ ├── 8卡V100的混合网络拓扑.png │ ├── A800-H100-H800.jpeg │ ├── NVLink-generations比较.png │ ├── PCIe-Generation对比.png │ ├── nvidia-dgx-1-v100-nvlink-gpu-xeon-config.webp │ ├── nvidia-dgx-1-with-volta.webp │ ├── nvlink.png │ ├── nvlink性能.png │ ├── nvswitch.png │ ├── 支持 NVLink GPU 之间连接的 NVIDIA H100-1.png │ ├── 支持 NVLink GPU 之间连接的 NVIDIA H100-2.png │ ├── 服务器之间的nvlink与nvswitch.png │ ├── 网络之间的连接.png │ └── 英伟达A100-A800-H100-H800.jpeg │ ├── ringallreduce │ ├── All Gather 流程图.gif │ └── Scatter Reduce 流程图.gif │ ├── roce.md │ ├── 网络硬件.md │ └── 通信软件.md ├── blog ├── .DS_Store ├── TODO.md ├── ai-infra │ ├── AI 集群基础设施 InfiniBand 详解.md │ └── AI 集群基础设施 NVMe SSD 详解.md ├── distribution-parallelism │ ├── 大模型分布式训练并行技术(一)-概述.md │ ├── 大模型分布式训练并行技术(九)-总结.md │ └── 大模型分布式训练并行技术(六)-多维混合并行.md ├── llm-algo │ ├── moe.md │ └── 大白话Transformer架构.md ├── llm-compression │ ├── 大模型量化技术原理-ZeroQuant系列.md │ └── 大模型量化技术原理:QoQ量化及QServe推理服务系统.md ├── llm-inference │ └── 大模型推理框架概述.md ├── llm-localization │ ├── 大模型国产化适配1-华为昇腾AI全栈软硬件平台总结.md │ 
└── 大模型国产化适配4-基于昇腾910使用LLaMA-13B进行多机多卡训练.md ├── llm-peft │ ├── 大模型参数高效微调技术原理综述(一)-背景、参数高效微调简介.md │ └── 大模型参数高效微调技术原理综述(五)-LoRA、AdaLoRA、QLoRA.md └── reference │ └── 高性能 LLM 推理框架的设计与实现.md ├── docs ├── .DS_Store ├── README.md ├── conda.md ├── flash-attention │ └── FlashAttention.md ├── llm-base │ ├── .DS_Store │ ├── FLOPS.md │ ├── NVIDIA-Nsight-Systems性能分析.md │ ├── README.md │ ├── a800-env-install.md │ ├── ai-algo.md │ ├── autoregressive-lm-decoding-methods.md │ ├── dcgmi.md │ ├── distribution-parallelism │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── auto-parallel │ │ │ ├── Alpa.md │ │ │ ├── Flexflow.md │ │ │ ├── Galvatron.md │ │ │ ├── Mesh-Tensorflow.md │ │ │ ├── README.md │ │ │ ├── Unity.md │ │ │ ├── auto-parallel.md │ │ │ ├── gspmd.md │ │ │ ├── 分布式训练自动并行概述.md │ │ │ └── 飞桨面向异构场景下的自动并行设计与实践.md │ │ ├── data-parallelism │ │ │ ├── README.md │ │ │ └── ddp │ │ │ │ ├── Gradient Bucketing 示意图.webp │ │ │ │ └── 代码架构.webp │ │ ├── moe-parallel │ │ │ ├── README.md │ │ │ ├── moe-framework.md │ │ │ ├── moe-parallel.md │ │ │ └── paddle_moe.py │ │ ├── multidimensional-hybrid-parallel │ │ │ ├── BloombergGPT模型超参数.png │ │ │ └── README.md │ │ ├── pipeline-parallelism │ │ │ └── README.md │ │ ├── tensor-parallel │ │ │ ├── README.md │ │ │ └── tensor-parallel.md │ │ └── 并行技术.drawio │ ├── distribution-training │ │ ├── .DS_Store │ │ ├── Bloom-176B训练经验.md │ │ ├── FP16-BF16.md │ │ ├── GLM-130B训练经验.md │ │ ├── OPT-175B训练经验.md │ │ ├── README.md │ │ └── 自动混合精度.md │ ├── gpu-env-var.md │ ├── h800-env-install.md │ ├── images │ │ └── slurm │ │ │ └── slurm.gif │ ├── monitor.md │ ├── multimodal │ │ └── sora.md │ ├── nvidia-smi-dmon.md │ ├── nvidia-smi.md │ ├── rlhf │ │ └── README.md │ ├── scenes │ │ ├── README.md │ │ ├── cv │ │ │ ├── README.md │ │ │ ├── paddle │ │ │ │ └── README.md │ │ │ ├── pytorch │ │ │ │ └── README.md │ │ │ └── reference.md │ │ └── multi-modal │ │ │ ├── README.md │ │ │ └── reference.md │ ├── singularity命令.md │ ├── slurm.md │ ├── 分布式训练加速技术.md │ ├── 多机RDMA性能测试.txt │ └── 机器学习中常用的数据类型.md ├── llm-experience.md ├── llm-inference │ ├── DeepSpeed-Inference.md │ ├── KV-Cache.md │ ├── LLM服务框架对比.md │ ├── README.md │ ├── blog.md │ ├── flexflow │ │ ├── spec_infer_demo.gif │ │ └── 投机采样.md │ ├── llm推理优化技术.md │ ├── llm推理框架.md │ └── vllm.md ├── llm-peft │ ├── LoRA-FA.md │ ├── MAM_Adapter.md │ ├── README.md │ └── ReLoRA.md ├── llm-summarize │ ├── README.md │ ├── distribution_dl_roadmap.md │ ├── pic │ │ ├── A800.jpeg │ │ ├── H800.jpeg │ │ ├── transformer架构.jpg │ │ └── why_RLHF.jpg │ ├── 大模型实践总结-20230930.md │ ├── 大模型实践总结.md │ ├── 文档大模型.md │ ├── 金融大模型.md │ └── 领域大模型.md └── transformer内存估算.md ├── faq └── FAQ.md ├── git-pull-push.sh ├── llm-algo ├── .DS_Store ├── FLOPs.md ├── InternLM-20B.md ├── README.md ├── baichuan2 │ └── baichuan.md ├── bert.md ├── bert │ └── 模型架构.md ├── bloom.md ├── bloom │ └── README.md ├── chatglm │ ├── GLM说明.png │ ├── GLM预训练.png │ ├── README.md │ └── 模型架构.md ├── chatglm2 │ ├── README.md │ └── 模型架构.md ├── chatglm3 │ ├── README.md │ └── reference.md ├── chatgpt │ └── README.md ├── deepseek │ ├── DeepSeek-R1.md │ ├── DeepSeek-V2.md │ └── README.md ├── glm-130b │ ├── README.md │ └── 模型架构.gif ├── glm4.md ├── gpt │ ├── README.md │ └── 模型结构.png ├── gpt2 │ ├── README.md │ ├── hf_modeling_gpt2.py │ └── 模型架构.md ├── gpt3 │ └── README.md ├── llama.md ├── llama │ ├── README.md │ └── 模型架构.md ├── mixtral │ └── README.md ├── mlp.md ├── moe │ └── README.md ├── qwen │ ├── README.md │ └── 参数说明及函数说明.md ├── qwen2.md ├── t5 │ └── README.md ├── transformer.md ├── transformer │ ├── README.md │ ├── Transformer中FFN的记忆功能.md │ ├── 
multi-head-attention.webp │ ├── transformer-building-blocks.webp │ ├── w-qkv.png │ └── 模型架构.md ├── 基本概念.md ├── 旋转编码RoPE.md ├── 模型架构类图.drawio └── 训练范式.md ├── llm-alignment ├── DPO.md ├── README.md ├── RLHF.md └── 基本概念.md ├── llm-application ├── .DS_Store ├── Higress.md ├── README.md ├── embbedding-model.md ├── gradio │ └── README.md ├── langchain │ ├── .DS_Store │ ├── README.md │ ├── serve.py │ └── tutorials │ │ ├── client.py │ │ └── serve.py ├── one-api.md ├── pre-post-handle │ └── README.md ├── rag │ ├── README.md │ ├── embedding.md │ ├── 存在的一些问题.md │ └── 方案.md ├── vector-db │ ├── README.md │ └── reference.md └── 应用场景.md ├── llm-compression ├── PaddleSlim │ ├── quantization.md │ └── README.md ├── README.md ├── distillation │ ├── GKD.md │ ├── MINILLM.md │ ├── README.md │ ├── SCOTT.md │ └── 大模型蒸馏概述.md ├── gptqmodel │ └── README.md ├── llm-compressor │ ├── README.md │ ├── source-code.md │ ├── 剪枝.md │ └── 量化方案.md ├── pruning │ └── README.md ├── quantization │ ├── FP6-LLM.md │ ├── GPTQ.md │ ├── LLM-int8.md │ ├── PEQA.md │ ├── QQQ-W4A8.md │ ├── README.md │ ├── SmoothQuant.md │ ├── SpinQuant.md │ ├── ZeroQuant(4+2).md │ ├── ZeroQuant.md │ ├── fp4.md │ ├── fp6.md │ ├── fp8.md │ ├── kv-cache-quant.md │ ├── llm-qat │ │ ├── LLM-QAT.md │ │ ├── README.md │ │ ├── cfd70ff │ │ │ ├── README.md │ │ │ ├── generate_data.py │ │ │ ├── inference.py │ │ │ ├── merge_gen_data.py │ │ │ ├── pip.conf │ │ │ ├── run_train.sh │ │ │ ├── train.py │ │ │ └── utils.py │ │ ├── f4d873a │ │ │ ├── datautils.py │ │ │ ├── run_train.sh │ │ │ └── train.py │ │ └── log.md │ ├── tools.md │ └── 大模型量化概述.md ├── sparsity │ └── README.md ├── tools.md ├── 大模型压缩综述.md └── 经验.md ├── llm-data-engineering ├── README.md ├── dataset │ ├── README.md │ ├── baichuan2.md │ ├── chinese-corpus-all.md │ └── english-corpus-all.md ├── reference.md └── sft-dataset │ ├── baichuan2_test.py │ ├── evol-instruct.md │ ├── firefly-template.py │ ├── jinja-demo.py │ ├── jinja-llm-baichuan.py │ ├── jinja-llm-baichuan2.py │ ├── jinja-llm-bloom.py │ ├── jinja-llm-chatglm3.py │ ├── jinja-llm.py │ ├── jinja.md │ ├── 数据格式设计.md │ └── 数据集格式.md ├── llm-eval ├── .DS_Store ├── EvalScope.md ├── README.md ├── eval-data │ ├── longtext_L115433-question.txt │ ├── longtext_L115433.txt │ ├── longtext_L32503_answer.txt │ ├── longtext_L32503_question.txt │ ├── longtext_L64031.txt │ └── longtext_L64031_question.txt ├── evalscope_framework.png ├── llm-performance │ ├── .DS_Store │ ├── AI芯片性能.md │ ├── README.md │ ├── hardware-performance │ │ ├── gpu-monitor-ui.py │ │ └── pynvml-stat-memory.py │ ├── llmperf.md │ ├── mindie │ │ ├── .DS_Store │ │ ├── lantency │ │ │ ├── README.md │ │ │ ├── perfermance-stat.py │ │ │ ├── performance-stream-baichuan2.py │ │ │ ├── performance-stream-chatglm3.py │ │ │ ├── performance-stream-qwen1.5.py │ │ │ ├── performance-stream-qwen1.py │ │ │ ├── performance-stream.py │ │ │ └── stat_input_token.py │ │ └── locust-lantency-throughput │ │ │ ├── .DS_Store │ │ │ ├── README.md │ │ │ ├── hello.py │ │ │ ├── llm-910b4-baichuan2-7b-2tp.py │ │ │ ├── llm-910b4-chatglm3-6b-2tp.py │ │ │ ├── llm-910b4-qwen-72b-8tp.py │ │ │ ├── llm-910b4-qwen1.5-4tp.py │ │ │ ├── qwen-14b-chart.jpg │ │ │ ├── qwen-14b-stat.jpg │ │ │ ├── qwen1.5-72b-8tp.html │ │ │ ├── qwen1.5-7b-4tp-chart.png │ │ │ ├── qwen1.5-7b-4tp-stat.png │ │ │ └── 示例.py │ ├── perfetto.md │ ├── stat_gpu_memory.py │ ├── tgi-benchmark.md │ ├── tgi-benchmark.png │ ├── vllm-benchmark.md │ ├── vllm │ │ ├── README.md │ │ ├── vllm-locust-qwen1.5-7b-long.py │ │ └── vllm-performance-stream-qwen1.5-long.py │ ├── wrk-性能测试工具.md │ ├── 
大模型场景下训练和推理性能指标名词解释.md │ ├── 推理性能测试.md │ └── 训练性能测试.md ├── llm-precision │ ├── .DS_Store │ ├── README.md │ └── 模型质量评估.md ├── opencompass.md └── 大模型测评集.md ├── llm-inference ├── .DS_Store ├── DeepSpeed-Inference.md ├── Flash-Decoding.md ├── FlashInfer.md ├── FlexFlow-Serve.md ├── GuidedGeneration.md ├── KV-Cache优化.md ├── Mooncake.md ├── NanoFlow.md ├── PD分离.md ├── README.md ├── RTP-LLM.md ├── ascend │ ├── .DS_Store │ └── mindformers │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── baichuan2 │ │ ├── README.md │ │ ├── baichuan-inference.py │ │ └── baichuan-stat.py │ │ ├── chatglm3 │ │ ├── README.md │ │ ├── chatglm-gen.py │ │ ├── chatglm-inference.py │ │ └── chatglm-stat.py │ │ ├── mindsporelite-inference.py │ │ ├── mindsporelite-stat.py │ │ └── text_generator_infer.py ├── chatgpt.md ├── deepspeed-mii │ └── README.md ├── faster-transformer │ ├── README.md │ ├── bloom │ │ ├── README.md │ │ └── firefly_lambada_1w_stat_token.py │ ├── gpt │ │ └── README.md │ ├── llama │ │ └── README.md │ └── megatron-gpt2 │ │ ├── gpt_summarization.py │ │ ├── gpt_summarization_stat.py │ │ └── megatron-gpt2-fp8.md ├── flexflow-serve │ └── benchmark-batch1.py ├── huggingface-tgi │ └── README.md ├── huggingface-transformer │ └── README.md ├── lightllm │ └── README.md ├── lmdeploy │ ├── README.md │ ├── 功能.md │ └── 服务启动参数.md ├── native-model │ └── chatglm3-6b │ │ └── cli_demo.py ├── offload.md ├── sglang │ ├── README.md │ ├── source-code.md │ ├── 服务器启动参数.md │ └── 项目代码结构.md ├── tensorrt-llm │ ├── FP8.md │ ├── Memory Usage of TensorRT-LLM.md │ ├── README.md │ ├── TRT-LLM引擎构建参数.md │ ├── Triton服务启动参数.md │ └── 安装.md ├── tensorrt │ ├── README.md │ └── install.md ├── triton │ ├── REAEME.md │ ├── onnx │ │ └── README.md │ └── resnet50 │ │ ├── client.py │ │ ├── config.pbtxt │ │ ├── labels.txt │ │ └── resnet50_convert_torchscript.py ├── vllm │ ├── FAQ.md │ ├── FP8.md │ ├── README.md │ ├── REF.md │ ├── api_client.py │ ├── vllm.md │ ├── 服务启动参数.md │ ├── 源码.md │ ├── 请求处理流程.md │ └── 长文本推理.md ├── web │ ├── fastapi │ │ ├── README.md │ │ └── llm-qwen-mindspore-lite.py │ ├── flask │ │ ├── README.md │ │ └── llm-qwen-mindspore-lite.py │ └── sanic │ │ └── README.md ├── xinference │ └── README.md ├── 分离式推理架构.md └── 解码策略.md ├── llm-interview ├── README.md ├── base.md ├── comprehensive.md ├── llm-algo.md ├── llm-app.md ├── llm-compress.md ├── llm-eval.md ├── llm-ft.md ├── llm-inference.md ├── llm-rlhf.md └── llm-train.md ├── llm-localization ├── .DS_Store ├── README.md ├── ascend │ ├── .DS_Store │ ├── FAQ.md │ ├── HCCL.md │ ├── MacOS环境.md │ ├── MindSpore-note.md │ ├── README.md │ ├── ascend-c │ │ └── README.md │ ├── ascend-dmi.md │ ├── ascend-docker-runtime.md │ ├── ascend-docker.md │ ├── ascend-npu-smi.md │ ├── ascend910-env-install.md │ ├── docker环境升级cann.md │ ├── fabric-insight │ │ └── README.md │ ├── firefly.md │ ├── kylin-linux.md │ ├── llm下载.md │ ├── log.md │ ├── mindformers │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── baichuan2 │ │ │ ├── baichuan2训练.md │ │ │ ├── run_baichuan2_7b.yaml │ │ │ ├── run_baichuan2_7b_910b.yaml │ │ │ └── run_baichuan2_7b_lora_910b.yaml │ │ ├── chatglm │ │ │ ├── README.md │ │ │ ├── chat_glm.py │ │ │ ├── glm_6b.yaml │ │ │ ├── glm_6b_chat.yaml │ │ │ ├── merge_ckpt.py │ │ │ ├── merge_ckpt_lora.py │ │ │ ├── pt2ms.py │ │ │ ├── run_glm_6b_finetune.yaml │ │ │ ├── run_glm_6b_infer.yaml │ │ │ ├── run_glm_6b_lora.yaml │ │ │ └── run_glm_6b_lora_infer.yaml │ │ ├── env.md │ │ ├── llama │ │ │ └── README.md │ │ ├── qwen │ │ │ ├── qwen1训练.md │ │ │ ├── run_qwen_7b.yaml │ │ │ └── run_qwen_7b_910b.yaml │ │ ├── qwen1.5 │ │ │ ├── 
qwen1.5训练.md │ │ │ ├── run_qwen1_5_7b_finetune.yaml │ │ │ └── run_qwen1_5_7b_infer.yaml │ │ ├── trick.md │ │ └── 权重格式转换.md │ ├── mindie │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── config-1.0.RC1.json │ │ ├── config │ │ │ ├── .DS_Store │ │ │ ├── chatglm3-6b.json │ │ │ ├── qwen-72b.json │ │ │ └── run.sh │ │ ├── docker │ │ │ ├── README.md │ │ │ ├── TEST.md │ │ │ ├── baichuan2-13b.json │ │ │ ├── baichuan2-7b.json │ │ │ ├── deploy.sh │ │ │ ├── install_and_enable_cann.sh │ │ │ ├── llm-server.sh │ │ │ ├── mindie-1.0.Dockerfile │ │ │ ├── mindie-all-1.0.Dockerfile │ │ │ ├── mindie-env-1.0.Dockerfile │ │ │ ├── qwen-72b.json │ │ │ ├── qwen1.5-14b.json │ │ │ ├── qwen1.5-72b.json │ │ │ └── qwen1.5-7b.json │ │ ├── llm-server.sh │ │ ├── mindid-1.0-offical.md │ │ ├── mindid-performance.md │ │ ├── mindie-1.0.Dockerfile │ │ ├── mindie-1.0.RC2.md │ │ ├── mindie-1.0.md │ │ ├── mindie-1.0.rc2-config.json │ │ ├── mindie-1.0.rc2-llm-server.sh │ │ ├── mindie-20240411.md │ │ ├── mindie-api.md │ │ ├── model-test.md │ │ ├── script │ │ │ ├── model-test.py │ │ │ └── run.sh │ │ ├── 性能调优.md │ │ └── 日志分析.txt │ ├── mindspore │ │ ├── README.md │ │ ├── bert.md │ │ ├── reference.md │ │ └── 镜像.md │ ├── modellink │ │ ├── README.md │ │ ├── dataset.md │ │ ├── llm.md │ │ ├── qwen.md │ │ ├── 环境-20240521.md │ │ └── 环境安装.md │ ├── msmodelslim │ │ ├── README.md │ │ └── llm_quant │ │ │ ├── baichuan2-w8a8.py │ │ │ ├── calib_set.json │ │ │ └── qwen1.5-72b-w8a16.py │ ├── network.md │ ├── npu监控.md │ ├── peft │ │ ├── README.md │ │ └── finetune-lora.py │ ├── pytorch │ │ ├── .DS_Store │ │ ├── README.md │ │ └── llm-lora.py │ ├── standford-alpaca │ │ ├── README.md │ │ ├── ds_config_zero2.json │ │ ├── ds_config_zero3.json │ │ ├── requirements.txt │ │ ├── train.py │ │ └── utils.py │ ├── transformers │ │ └── README.md │ ├── ubuntu操作系统.md │ ├── vllm-ascend-npu.md │ ├── 昇腾LLM支持概览.md │ ├── 昇腾卡-soc版本.md │ ├── 昇腾卡注意事项.md │ ├── 昇腾镜像.md │ ├── 服务器配置.md │ ├── 环境安装.md │ └── 达芬奇架构.md ├── modelscope │ └── README.md ├── paddle │ └── PaddleNLP.md └── tianshuzhixin │ ├── README.md │ └── ixsmi.md ├── llm-maas ├── OpenAI-ChatGPT.md └── README.md ├── llm-optimizer ├── FlashAttention.md ├── README.md ├── kv-cache.md └── xformers.md ├── llm-pipeline └── REAEMD.md ├── llm-tools ├── Pytorch-Profiler.md ├── README.md ├── base-profiler.py ├── nvtx.md ├── profiler-recipe.py └── tensorboard-profiler.py ├── llm-train ├── .DS_Store ├── README.md ├── alpa │ └── train │ │ ├── pipeshard_parallelism.ipynb │ │ └── pipeshard_parallelism.py ├── alpaca-lora │ ├── README.md │ ├── export_hf_checkpoint.py │ ├── export_state_dict_checkpoint.py │ ├── finetune.py │ ├── finetune_metrics_epoch.py │ ├── generate.py │ └── inference.py ├── alpaca │ ├── README.md │ ├── ds_config.json │ ├── ds_config_zero2.json │ ├── ds_config_zero2_ddp.json │ ├── inference.py │ ├── train.py │ └── train_ddp.py ├── ascend │ └── .DS_Store ├── chatglm-lora │ ├── README.md │ ├── finetune.py │ ├── finetune_ddp.py │ └── inference.py ├── chatglm │ ├── README.md │ ├── deepspeed.json │ ├── ds_train_finetune.sh │ ├── evaluate.sh │ ├── evaluate_finetune.sh │ ├── inference.py │ ├── main.py │ ├── train.sh │ └── train_ptuningv2_dp.sh ├── chinese-llama-alpaca │ ├── README.md │ ├── inference_hf.py │ ├── merge_llama_with_chinese_lora.py │ ├── merge_tokenizers.py │ ├── run_clm_pt_with_peft.py │ ├── run_clm_sft_with_peft.py │ ├── run_pt.sh │ └── run_sft.sh ├── deepspeedchat │ ├── README.md │ ├── llama │ │ └── README.md │ └── training │ │ ├── step1_supervised_finetuning │ │ └── training_scripts │ │ │ └── single_node │ │ │ └── 
run_13b.sh │ │ ├── step2_reward_model_finetuning │ │ └── training_scripts │ │ │ └── single_node │ │ │ └── run_350m.sh │ │ ├── step3_rlhf_finetuning │ │ └── training_scripts │ │ │ └── single_node │ │ │ └── run_13b.sh │ │ └── utils │ │ └── data │ │ └── raw_datasets.py ├── firefly │ ├── README.md │ ├── bootstrap-s3.sh │ ├── bootstrap.sh │ ├── dockerfile.md │ └── test_bash_getopts.sh ├── fp8.md ├── galore │ └── torchrun_main.py ├── megatron-deepspeed │ ├── README.md │ ├── bigscience │ │ └── bloom-note.md │ ├── bloom-megatron-deepspeed.md │ ├── microsoft │ │ ├── H800多机多卡训练坑点.md │ │ ├── README.md │ │ ├── llama-note.md │ │ ├── pip.conf │ │ ├── pretrain_llama2_13b_distributed_fp16.sh │ │ ├── pretrain_llama2_distributed.sh │ │ ├── pretrain_llama_13b_distributed_fp16.sh │ │ ├── pretrain_llama_7b_distributed_fp16.sh │ │ ├── pretrain_llama_distributed_fp16.sh │ │ ├── slurm │ │ │ ├── README.md │ │ │ ├── llama-multinode-ib.sh │ │ │ ├── megatron-deepspeed-multinode-ib-part2-30b-fp16.slurm │ │ │ └── megatron-deepspeed-multinode-ib-part2-65b-fp16.slurm │ │ ├── 代码.md │ │ ├── 环境准备.md │ │ ├── 训练日志分析.md │ │ └── 项目结构-202312228.md │ └── source-code.md ├── megatron │ ├── README.md │ ├── codegeex │ │ ├── README.md │ │ └── pic │ │ │ ├── CodeGeeX模型架构.png │ │ │ └── CodeGeeX训练配置.png │ ├── gpt2 │ │ ├── README.md │ │ ├── data │ │ │ ├── cMinhash.cpp │ │ │ ├── download.py │ │ │ ├── file_utils.py │ │ │ └── merge_data.py │ │ ├── gpt-data-preprocess.md │ │ ├── merge_ck_and_inference │ │ │ ├── README.md │ │ │ ├── checkpoint_loader_megatron.py │ │ │ ├── checkpoint_saver_megatron.py │ │ │ ├── checkpoint_util.py │ │ │ ├── eval_gpt2_lambada.sh │ │ │ ├── run_text_generation_server.py │ │ │ ├── run_text_generation_server_345M.sh │ │ │ ├── run_text_generation_server_345M_2tp_2dp.sh │ │ │ ├── run_text_generation_server_345M_4_tensor_parallel.sh │ │ │ └── text_generation_cli.py │ │ ├── model_merge_eval_inference.md │ │ ├── model_train.md │ │ ├── requirements.txt │ │ └── train │ │ │ ├── pretrain_gpt.sh │ │ │ ├── pretrain_gpt_distributed.sh │ │ │ ├── pretrain_gpt_distributed_with_4pp.sh │ │ │ ├── pretrain_gpt_distributed_with_4tp.sh │ │ │ └── pretrain_gpt_distributed_with_mp.sh │ ├── kernel_fusion.png │ ├── megatron.drawio │ ├── pretrain.xmind │ ├── project.md │ └── source-code.md ├── paddle │ ├── README.md │ └── paddlenlp │ │ ├── README.md │ │ ├── baichuan2 │ │ └── README.md │ │ └── bloom │ │ ├── README.md │ │ └── sft_argument.json ├── peft │ ├── .DS_Store │ ├── LoRA-QLoRA.md │ ├── PEFT-API.md │ ├── Prefix-Tuning.md │ ├── Prompt-Tuning.md │ ├── README.md │ ├── clm │ │ ├── accelerate_ds_zero3_cpu_offload_config.yaml │ │ ├── peft_ia3_clm.ipynb │ │ ├── peft_lora_clm.ipynb │ │ ├── peft_lora_clm_accelerate_ds_zero3_offload.py │ │ ├── peft_p_tuning_clm.ipynb │ │ ├── peft_p_tuning_lstm_clm.ipynb │ │ ├── peft_p_tuning_v2_clm.ipynb │ │ ├── peft_prefix_tuning_clm.ipynb │ │ └── peft_prompt_tuning_clm.ipynb │ ├── conditional_generation │ │ └── README.md │ └── multimodal │ │ ├── blip2_lora_inference.py │ │ ├── blip2_lora_int8_fine_tune.py │ │ └── finetune_bloom_bnb_peft.ipynb ├── pytorch │ ├── Pytorch源码解读.md │ ├── README.md │ ├── api.md │ ├── distribution │ │ ├── README.md │ │ ├── api.md │ │ ├── data-parallel │ │ │ ├── README.md │ │ │ ├── ddp_launch.py │ │ │ ├── ddp_main.py │ │ │ ├── elastic_ddp.py │ │ │ ├── minGPT-ddp │ │ │ │ ├── README.md │ │ │ │ ├── multinode.sh │ │ │ │ ├── sbatch_run.sh │ │ │ │ ├── sbatch_run_sig.sh │ │ │ │ └── sbatch_run_sig_opt.sh │ │ │ ├── sbatch_run.sh │ │ │ └── 使用DDP训练真实世界的模型.md │ │ ├── pipeline-parallel │ │ │ ├── 1-流水线.md 
│ │ │ ├── 2-使用torchtext训练transformer模型.md │ │ │ ├── 3-使用流水线并行训练Transformer模型.md │ │ │ ├── 4-使用DDP与流水线并行训练Transformer模型.md │ │ │ ├── README.md │ │ │ ├── ddp_pipeline.py │ │ │ ├── pipeline_tutorial.ipynb │ │ │ └── transformer_tutorial.ipynb │ │ ├── rpc │ │ │ └── README.md │ │ ├── sequence-parallelism │ │ │ └── README.md │ │ ├── tensor-parallel │ │ │ ├── 2d_parallel_example.py │ │ │ ├── README.md │ │ │ ├── sequence_parallel_example.py │ │ │ ├── tensor_parallel_example.py │ │ │ └── utils.py │ │ ├── torchrun.md │ │ ├── 分布式通信包.md │ │ ├── 多机多卡.md │ │ └── 多机训练.md │ ├── resource.md │ └── torchrun.md ├── qlora │ ├── README.md │ ├── accuracy.py │ ├── export_hf_checkpoint.py │ ├── inference.py │ ├── inference_merge.py │ ├── inference_qlora.py │ └── qlora.py ├── slurm │ ├── README.md │ ├── deepspeed │ │ ├── pp-multinode-machine.slurm │ │ ├── pp-multinode-singularity.slurm │ │ ├── pp-mutinode-singularity-pmix.slurm │ │ ├── pp-standalone-singularity-v2.slurm │ │ └── pp-standalone-singularity.slurm │ ├── megatron-deepspeed │ │ └── megatron-deepspeed-multinode-ib-part2-65b-fp16.slurm │ └── pytorch │ │ ├── alpaca-docker.slurm │ │ ├── alpaca-machine.slurm │ │ ├── alpaca-singularity.slurm │ │ ├── mingpt-singularity-multinode-2.slurm │ │ └── mingpt-singularity-multinode.slurm └── vicuna │ └── README.md ├── llmops ├── FAQ.md ├── README.md ├── kubernetes.md ├── tq-llm │ └── train │ │ ├── FAQ.md │ │ ├── README.md │ │ ├── bootstrap-llm-zero3-offload.sh │ │ ├── bootstrap-llm.sh │ │ ├── bootstrap-llm2.sh │ │ ├── zero2-offload.json │ │ └── zero3-offload.json ├── 使用docker进行多机多卡训练.md ├── 千帆大模型平台.md └── 模型推理平台方案.md ├── mkdir-dir-file.sh ├── paper ├── A Survey on Efficient Training of Transformers.md ├── LESS-选择有影响力的数据进行目标指令精调.md ├── LLM增强LLMS.md ├── PagedAttention.md ├── README.md ├── data │ ├── LESS 实践:仅用少量的数据完成目标指令微调.md │ ├── LESS-选择有影响力的数据进行目标指令精调.md │ └── LESS.md ├── inference │ ├── llm-in-a-flash.md │ ├── orca.md │ └── 迈向高效的生成式大语言模型服务综述-从算法到系统.md ├── llm对齐综述.md ├── moe │ └── README.md ├── parameter-pruning │ ├── LLM-Pruner.md │ ├── SparseGPT.md │ ├── Wanda.md │ └── 公式.md └── training │ ├── A Survey on Efficient Training of Transformers.md │ ├── GaLore.md │ └── Reducing Activation Recomputation in Large Transformer Models.md └── pic ├── .DS_Store ├── llm-action-v2.png ├── llm-action-v3.png ├── llm ├── .DS_Store ├── model │ ├── llm-famliy.jpg │ └── llm-timeline-v2.png └── train │ ├── .DS_Store │ ├── pretrain │ └── llm-pretrain-pipeline-v2.png │ └── sft │ └── peft方法.jpg ├── wechat.jpeg ├── wx-gzh.png ├── wx.jpg └── 公众号.jpeg /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/.DS_Store -------------------------------------------------------------------------------- /ai-compiler/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/.DS_Store -------------------------------------------------------------------------------- /ai-compiler/Treebeard/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/Treebeard/.DS_Store -------------------------------------------------------------------------------- /ai-compiler/Treebeard/10-TableI-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/Treebeard/10-TableI-1.png -------------------------------------------------------------------------------- /ai-compiler/Treebeard/10-TableII-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/Treebeard/10-TableII-1.png -------------------------------------------------------------------------------- /ai-compiler/Treebeard/9-Figure6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/Treebeard/9-Figure6-1.png -------------------------------------------------------------------------------- /ai-compiler/Treebeard/TREEBEARD IR lowering 和 optimization 细节.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/Treebeard/TREEBEARD IR lowering 和 optimization 细节.png -------------------------------------------------------------------------------- /ai-compiler/Treebeard/TREEBEARD编译器结构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-compiler/Treebeard/TREEBEARD编译器结构.png -------------------------------------------------------------------------------- /ai-compiler/cuda/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /ai-compiler/treelit/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | conda create -n model-inference-venv python=3.9 -y 6 | 7 | 8 | conda activate model-inference-venv 9 | ``` 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | - 机器学习:软件工程方法与实现:https://github.com/chansonZ/book-ml-sem/ 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ai-compiler/treelit/xgb.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | conda create -n model-server-venv python=3.9 -y 7 | ``` -------------------------------------------------------------------------------- /ai-compiler/triton-lang/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /ai-framework/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/.DS_Store -------------------------------------------------------------------------------- /ai-framework/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## 国外 7 | 8 | 9 | ### PyTorch 10 | 11 | 12 | 13 | 14 | 15 | ## 国内 16 | 17 | 18 | ### Oneflow 19 | 20 | 21 | 22 | 23 | ### PaddlePaddle 24 | 25 | 26 | 27 | 28 | ### MindSpore 29 | 30 | 31 | 32 | 33 | 34 | 35 | 自动混合精度 36 | 37 | - 
https://github.com/Azure/MS-AMP 38 | - FP8-LM 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /ai-framework/TensorRT-Model-Optimizer.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - 代码:https://github.com/NVIDIA/TensorRT-Model-Optimizer 6 | - 文档:https://nvidia.github.io/TensorRT-Model-Optimizer/ 7 | 8 | - 量化方法最佳实践:https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_choosing_quant_methods.html -------------------------------------------------------------------------------- /ai-framework/cuda/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /ai-framework/deepspeed/DeepSpeed配置JSON文件.md: -------------------------------------------------------------------------------- 1 | ## DeepSpeed Configuration JSON 2 | 3 | 地址:https://www.deepspeed.ai/docs/config-json/ 4 | 5 | 6 | 7 | ### FP16 训练的 ZeRO 优化 8 | 9 | 启用和配置 ZeRO 内存优化 10 | 11 | 12 | 13 | - stage3_gather_16bit_weights_on_model_save: [boolean] 14 | 15 | > 在通过 save_16bit_model() 保存模型之前合并权重。 由于权重在 GPU 之间进行分区,因此它们不是 state_dict 的一部分,因此启用此选项时该函数会自动收集权重,然后保存 fp16 模型权重。 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ai-framework/deepspeed/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/microsoft/DeepSpeedExamples 5 | - https://github.com/microsoft/DeepSpeedExamples.git 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /ai-framework/deepspeed/config-json/deepspeed-nvme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | - ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning 8 | 9 | ``` 10 | 11 | ``` 12 | 13 | 14 | -------------------------------------------------------------------------------- /ai-framework/deepspeed/deepspeed-slurm.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | ## 支持的发布 8 | 9 | PDSH_LAUNCHER = 'pdsh' 10 | PDSH_MAX_FAN_OUT = 1024 11 | 12 | OPENMPI_LAUNCHER = 'openmpi' 13 | MPICH_LAUNCHER = 'mpich' 14 | IMPI_LAUNCHER = 'impi' 15 | SLURM_LAUNCHER = 'slurm' 16 | MVAPICH_LAUNCHER = 'mvapich' 17 | 18 | 19 | 20 | 21 | 22 | 23 | ## Slurm 24 | 25 | - https://hpclib.com/Scheduler/Slurm/mpi_guide.html 26 | - https://slurm.schedmd.com/mpi_guide.html 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /ai-framework/deepspeed/training/pipeline_parallelism/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ``` 9 | deepspeed --include localhost:3,4,5,6 train.py --deepspeed_config=ds_config.json -p 2 --steps=200 10 | ``` 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ai-framework/dlrover.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://github.com/intelligent-machine-learning/dlrover 5 | 6 | DLRover: An Automatic Distributed Deep Learning System 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- 
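A minimal sketch to accompany the ZeRO notes in `ai-framework/deepspeed/DeepSpeed配置JSON文件.md` above: it wires `stage3_gather_16bit_weights_on_model_save` into a ZeRO-3 config and saves consolidated fp16 weights via `save_16bit_model()`. The model, batch size, optimizer and learning rate below are illustrative placeholders, not values taken from this repo.

```python
# Launch (illustrative): deepspeed train.py
import deepspeed
import torch.nn as nn

# Illustrative ZeRO-3 config; see https://www.deepspeed.ai/docs/config-json/ for all fields.
ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        # Gather the partitioned weights onto one rank when save_16bit_model() is called.
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-5}},
}

model = nn.Linear(1024, 1024)  # placeholder model
engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config
)

# ... training loop: engine.backward(loss); engine.step() ...

# Under ZeRO-3 the weights are partitioned across GPUs and are not part of a plain
# state_dict; with the flag enabled, this writes a full consolidated fp16 checkpoint.
engine.save_16bit_model("./checkpoint-fp16")
```

--------------------------------------------------------------------------------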
/ai-framework/huggingface-accelerate/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://huggingface.co/docs/accelerate/package_reference/cli 4 | 5 | ``` 6 | accelerate env 7 | 8 | # 9 | accelerate config default [arguments] 10 | 11 | 12 | 13 | accelerate config update --config_file 14 | 15 | 16 | 17 | 18 | ``` 19 | 20 | 21 | 22 | ## huggingface 加载大模型 23 | 24 | - 使用HuggingFace的Accelerate库加载和运行超大模型: https://zhuanlan.zhihu.com/p/605640431 25 | 26 | 27 | ``` 28 | import torch 29 | from transformers import AutoModelForCausalLM 30 | ​ 31 | checkpoint = "facebook/opt-13b" 32 | model = AutoModelForCausalLM.from_pretrained( 33 | checkpoint, device_map="auto", offload_folder="offload", offload_state_dict = True, torch_dtype=torch.float16 34 | ) 35 | 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /ai-framework/huggingface-peft/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /ai-framework/huggingface-transformers/FSDP.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://pytorch.org/docs/stable/fsdp.html 6 | - https://huggingface.co/docs/accelerate/usage_guides/fsdp 7 | 8 | 9 | transformers 10 | - https://zhuanlan.zhihu.com/p/648094197 11 | - https://github.com/ifromeast/LLMTrainer/blob/main/02_fsdp/fsdp.json 12 | 13 | 14 | accelerate 15 | - 使用 PyTorch FSDP 微调 Llama 2 70B:https://zhuanlan.zhihu.com/p/671742753 16 | - https://huggingface.co/docs/transformers/v4.41.0/en/fsdp#fsdp-configuration 17 | - https://huggingface.co/docs/transformers/v4.41.0/en/main_classes/trainer#transformers.TrainingArguments 18 | - https://github.com/pacman100/LLM-Workshop/blob/main/chat_assistant/sft/training/configs/fsdp_config.yaml 19 | 20 | 21 | -------------------------------------------------------------------------------- /ai-framework/huggingface-trl/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /ai-framework/jax/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Jax 是我看过那么多项目中,唯一一个让我看了之后觉得「哇,软件还可以这么写,一切都很有道理」的项目。我觉得 Google 还是吸取了很多 Tensorflow 的经验,把它们都用到了 Jax 里面。 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /ai-framework/jax/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://jax.readthedocs.io/en/latest/notebooks/neural_network_with_tfds_data.html 6 | - https://github.com/google/jax 7 | 8 | 9 | -------------------------------------------------------------------------------- /ai-framework/megatron-deepspeed/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/megatron-deepspeed/README.md -------------------------------------------------------------------------------- /ai-framework/megatron-lm/README.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/megatron-lm/README.md -------------------------------------------------------------------------------- /ai-framework/mxnet/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 安装 5 | 6 | ``` 7 | pip install --upgrade mxnet gluonnlp 8 | 9 | pip install mxnet==1.9.1 gluonnlp==0.10.0 10 | ``` 11 | 12 | ## docker 13 | 14 | ``` 15 | # GPU Instance 16 | docker pull gluonai/gluon-nlp:gpu-latest 17 | docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:gpu-latest 18 | 19 | # CPU Instance 20 | docker pull gluonai/gluon-nlp:cpu-latest 21 | docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest 22 | ``` 23 | 24 | 25 | 26 | ``` 27 | docker run --gpus all -itd \ 28 | --ipc=host \ 29 | --network host \ 30 | --shm-size=4g \ 31 | -v /home/guodong.li/workspace/:/workspace/ \ 32 | --name mxnet_dev \ 33 | gluonai/gluon-nlp:gpu-latest \ 34 | /bin/bash 35 | 36 | 37 | docker exec -it mxnet_dev bash 38 | 39 | pip uninstall mxnet-cu102 40 | pip install mxnet==1.9.1 gluonnlp==0.10.0 -i http://nexus3.xxx.com/repository/pypi/simple --trusted-host nexus3.xxx.com 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /ai-framework/mxnet/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://github.com/apache/mxnet 6 | - https://github.com/dmlc/gluon-nlp/ 7 | - https://nlp.gluon.ai/model_zoo/index.html 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /ai-framework/oneflow/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/Oneflow-Inc/oneflow 5 | - https://docs.oneflow.org/master/basics/04_build_network.html 6 | - https://docs.oneflow.org/master/basics/01_quickstart.html 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /ai-framework/openai-triton/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://github.com/openai/triton 4 | 5 | 6 | 7 | - OpenAI Triton 入门教程: https://zhuanlan.zhihu.com/p/684473453 -------------------------------------------------------------------------------- /ai-framework/paddlepaddle/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://www.paddlepaddle.org.cn/install/quick 4 | - https://github.com/PaddlePaddle/PaddleNLP 5 | 6 | 7 | 8 | ``` 9 | python -m pip install paddlepaddle==2.5.2 -i https://pypi.tuna.tsinghua.edu.cn/simple 10 | 11 | 12 | pip install --upgrade paddlenlp 13 | ``` 14 | 15 | 16 | ``` 17 | docker pull paddlepaddle/paddle:2.5.2-gpu-cuda11.7-cudnn8.4-trt8.4 18 | 19 | docker pull paddlepaddle/paddle:2.5.2 20 | ``` 21 | 22 | 23 | 24 | 25 | 26 | 27 | ## 可视化 28 | 29 | 30 | ``` 31 | visualdl --logdir ./log 32 | ``` 33 | -------------------------------------------------------------------------------- /ai-framework/paddlepaddle/reference.md: -------------------------------------------------------------------------------- 1 | 2 | - paddle支持的硬件:https://www.paddlepaddle.org.cn/install/other 3 | 4 | - 【推荐】手写数字识别模型:https://www.paddlepaddle.org.cn/tutorials/projectdetail/3473063 5 | - 
mnist数据下载:https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/datasets/mnist.py 6 | 7 | - VisualDL可视化:https://www.paddlepaddle.org.cn/documentation/docs/zh/2.2/guides/03_VisualDL/visualdl_cn.html#qidongmianban 8 | - Paddle2.0 高层api调用VisualDL可视化:https://aistudio.baidu.com/projectdetail/1321602 9 | - CV任务示例:https://www.paddlepaddle.org.cn/documentation/docs/zh/practices/cv/image_classification.html 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /ai-framework/pai-megatron-patch/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/pai-megatron-patch/.DS_Store -------------------------------------------------------------------------------- /ai-framework/pai-torchacc.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://help.aliyun.com/zh/pai/user-guide/torchacc-overview 5 | 6 | 7 | PAI-TorchAcc(Torch Accelerator)是基于PyTorch的训练加速框架,通过GraphCapture技术将PyTorch动态图转换为静态执行图,然后进一步基于计算图完成分布式优化、计算优化,从而提高PyTorch模型训练的效率,使其更加易于使用。 8 | 9 | -------------------------------------------------------------------------------- /ai-framework/pytorch/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/pytorch/README.md -------------------------------------------------------------------------------- /ai-framework/pytorch/reference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/pytorch/reference.md -------------------------------------------------------------------------------- /ai-framework/tensorflow/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/tensorflow/README.md -------------------------------------------------------------------------------- /ai-framework/tensorflow/reference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-framework/tensorflow/reference.md -------------------------------------------------------------------------------- /ai-framework/unsloth-微调.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/unslothai/unsloth 5 | 6 | 7 | Unsloth 8 | 模型的微调,全部用 Triton Kernel 重写。从技术角度来看,这个项目非常有意思,它推到了 PyTorch 目前无法达到的优化极致。 -------------------------------------------------------------------------------- /ai-infra/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/.DS_Store -------------------------------------------------------------------------------- /ai-infra/ai-cluster/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## AI硬件 3 | 4 | 5 | A800只是在A100的基础上,将NVLink高速互连总线的带宽从600GB/s降低到400GB/s,仅此而已。 6 | 7 | 8 | 9 | 10 | 
-------------------------------------------------------------------------------- /ai-infra/ai-hardware/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/ai-hardware/.DS_Store -------------------------------------------------------------------------------- /ai-infra/ai-hardware/AI芯片软件生态.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## cuda 6 | 7 | 8 | ``` 9 | wget https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run 10 | sudo sh cuda_12.1.1_530.30.02_linux.run 11 | ``` 12 | 13 | 14 | 15 | 16 | ## 瑞芯微 RockX RKNN 17 | 18 | ## 昇腾 CANN 19 | 20 | ## 海光 ROCm 21 | 22 | 23 | 24 | ## 寒武纪 Neuware 25 | 26 | 27 | 28 | 29 | 30 | 自建生态是长久之道,国产头部算力生态厂商受益。全球来看主要的追赶者AMD和Intel均是走兼容模式为主,是以移植方式兼容CUDA生态,以此次报道的要求来看或短期不受影响。国内核心国产AI算力芯片技术路线中,华为昇腾自建CANN生态、寒武纪自建生态Neuware、海光使用AMD的ROCm平台,其中自建生态的华为昇腾和寒武纪在生态自主上走的更稳更远,而海光当前短期或不受影响且可借力AMD,报道中提到的登临科技、摩尔线程、壁仞科技等AI芯片创业或面临较大的AI算力生态压力。 31 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/CUDA.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | CUDA 库主要包括以下几个部分: 5 | 6 | CUDA Runtime API:这是CUDA的核心库,提供了运行时的设备初始化、内存管理、内核执行等功能。 7 | 8 | CUDA Driver API:这是CUDA的底层驱动库,提供了与设备和操作系统底层交互的功能。 9 | 10 | CUDA CUDART库:这是CUDA运行时库,提供了C语言的标准数学函数和其他功能的接口。 11 | 12 | CUDA CUBLAS库:这是CUDA的线性代数库,提供了高效的矩阵和向量运算。 13 | 14 | CUDA CUFFT库:这是CUDA的快速傅立叶变换库,用于进行傅立叶变换。 15 | 16 | CUDA CURAND库:这是CUDA的随机数库,用于生成各种分布的随机数。 17 | 18 | 19 | 20 | - https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html 21 | - https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id4 22 | CUDA Toolkit and Corresponding Driver Versions 23 | 24 | 25 | 26 | 27 | - CUDA 编程手册: https://github.com/HeKun-NVIDIA/CUDA-Programming-Guide-in-Chinese 28 | 29 | 30 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/GPU-network.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | network 6 | - https://docs.nvidia.com/networking/display/mlnxofedv583070101/introduction 7 | 8 | 9 | 10 | nvme-of 11 | - https://docs.nvidia.com/networking/display/mlnxofedv583070101/nvme-of+-+nvm+express+over+fabrics 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | NVIDIA Ada Lovelace Architecture 22 | - https://www.nvidia.cn/geforce/ada-lovelace-architecture/ 23 | - https://www.nvidia.com/en-us/geforce/ada-lovelace-architecture/ 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/GPU相关环节变量.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## CUDA 5 | 6 | CUDA_VISIBLE_DEVICES=1 7 | export CUDA_LAUNCH_BLOCKING=1 8 | export CUDA_DEVICE_MAX_CONNECTIONS=1 9 | 10 | 11 | ## NCCL 12 | 13 | 14 | export NCCL_IB_DISABLE=1 15 | export NCCL_SOCKET_IFNAME=bond0 16 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/OEM-DGX.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## H3C 7 | 8 | H3C UniServer R5500LC G5服务器---全新A800 GPU的人工智能液冷服务器,支持HGX A800 8-GPU模组,8块A800 GPU通过6个NVSWITCH实现400GB/s的全互联,AI算力较上一代可提升多达20倍。 9 | 10 | - 
https://www.h3c.com/cn/Products_And_Solution/Server/H3C/Products/GPU_Server/Products_Series/Application_Optimization_GPU/R5500LC_G5/ 11 | 12 | 13 | H3C UniServer R5500 G6 服务器---NVIDIA HGX H800 8-GPU模组的人工智能服务器 14 | 15 | - https://www.h3c.com/cn/Products_And_Solution/Server/H3C/Products/GPU_Server/Products_Series/Application_Optimization_GPU/R5500_G6/ 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Nvidia GPU 4 | 5 | 6 | NVIDIA A100 80GB PCIe GPU: https://www.edomtech.com.cn/product-detail/nvidia-a100-80gb-pcie-gpu/ 7 | 8 | - RTX : https://images.nvidia.com/aem-dam/Solutions/geforce/blackwell/nvidia-rtx-blackwell-gpu-architecture.pdf 9 | 10 | 11 | 12 | ## Google TPU 13 | 14 | 15 | 16 | ## Ascend NPU 17 | 18 | 19 | ## 特斯拉 DOJO 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | - NVIDIA GPUDirect: https://developer.nvidia.com/gpudirect 29 | 30 | 31 | 32 | 33 | 34 | ## 国产AI芯片 35 | 36 | 37 | 38 | - 百度昆仑芯:https://www.kunlunxin.com/ 39 | 40 | 41 | 42 | - 阿里含光https://zhuanlan.zhihu.com/p/593171726 43 | - 平头哥:https://www.t-head.cn/ 44 | 45 | 46 | - 昇腾:https://zhuanlan.zhihu.com/p/593202259 -------------------------------------------------------------------------------- /ai-infra/ai-hardware/TSMC-台积电.md: -------------------------------------------------------------------------------- 1 | 2 | TSMC N7(7纳米工艺)-DUV(深紫外线)光刻技术 3 | 4 | 5 | TSMC N4(4纳米工艺)-EUV(极紫外线)光刻技术 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/cuda镜像.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda 6 | https://gitlab.com/nvidia/container-images/cuda/-/tree/master/dist 7 | https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.1.1/centos7/devel/Dockerfile 8 | 9 | nvcr.io/nvidia/cuda:12.1.0-cudnn8-runtime-centos7 10 | 11 | ``` 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /ai-infra/ai-hardware/硬件对比.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | FPGA 相比同等面积和工艺 ASIC 的算力差着数量级 5 | 6 | 7 | 大多数大模型 16-bit 权重真的可以量化到 8-bit 而不太损失精度。但要压缩到 4-bit,精度一般就会有比较大的损失了。 8 | 9 | 10 | 11 | NVIDIA 的 Tensor Core 也可以很高效地执行 16-bit 和 8-bit 的计算,8-bit 的算力基本上是 16-bit 的两倍。 12 | 13 | 14 | 15 | FPGA 还是适合用来做智能网卡,也适合做存储编码、压缩、加密等特定算法的加速。AI 大模型还是 ASIC 更在行。 16 | -------------------------------------------------------------------------------- /ai-infra/communication.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - MoE 通信优化技术 COMET 开源: https://zhuanlan.zhihu.com/p/29264560896 5 | - https://github.com/bytedance/flux 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /ai-infra/存储/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - [GDDR6 vs DDR4 vs HBM2?为什么CPU还不用GDDR?异构内存的未来在哪里?](https://www.zhihu.com/tardis/zm/art/83935084?source_id=1003) 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /ai-infra/存储/REF.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 
| 5 | 6 | -------------------------------------------------------------------------------- /ai-infra/存储/nvme-ssd.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - ChatGPT一路狂飙,NVMe SSD能否应对性能挑战?:https://blog.csdn.net/Memblaze_2011/article/details/129040963 5 | - NVMe 2.0 简介:https://blog.csdn.net/Memblaze_2011/article/details/134402001 6 | - NVMe over Fabric网络技术介绍:https://zhuanlan.zhihu.com/p/596081274 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /ai-infra/存储/存储.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | - [硬盘科普,M.2,PCI-E,NVMe 傻傻分不清](https://zhuanlan.zhihu.com/p/396745362) 13 | 14 | 物理接口,通道,协议 15 | 16 | 17 | 18 | - [NVMe、AHCI、PCIe、SATA、NGFF接口、协议小结](https://blog.csdn.net/wujinglin7/article/details/122826608) 19 | 20 | 通讯协议、物理接口标准(规范)、传输通道 21 | 22 | 23 | 24 | 25 | - 详解:什么是NVMe?:https://zhuanlan.zhihu.com/p/363589126 26 | 27 | 28 | 29 | 30 | 机械硬盘 ————> SATA M.2 固态硬盘 ----> NVMe M.2 固态硬盘 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /ai-infra/算力/GPU工作原理.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - [GPU 工作原理解析](https://zhuanlan.zhihu.com/p/697694330) 5 | - [GPU 架构与 CUDA 关系](https://zhuanlan.zhihu.com/p/697746975) 6 | 7 | -------------------------------------------------------------------------------- /ai-infra/算力/推理芯片.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 如果说大模型「上半场」是技术的较量,那么「下半场」则是商业化的比拼。一旦大模型成熟,与之而来的便是落地应用,滋生对推理芯片的庞大需求。 4 | 5 | -------------------------------------------------------------------------------- /ai-infra/算力/昇腾NPU.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Atlas 800-9000A2 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /ai-infra/网络/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/.DS_Store -------------------------------------------------------------------------------- /ai-infra/网络/IB-docker.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ``` 9 | yum install libibverbs 10 | ``` -------------------------------------------------------------------------------- /ai-infra/网络/IB流量监控.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ifstat,nload 这些工具都只能监控 TCP/IP 的流量,因此虽然其上面能显示出 IB 卡,但其实并不能监控到出入 IB 的流量数据,结果中对应部分一直都是 0。 5 | 6 | 7 | -------------------------------------------------------------------------------- /ai-infra/网络/InfiniBand.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | InfiniBand网络接口的一种分类方式,按照数据传输速率的的不同进行区分。具体如下: 4 | 5 | SDR(Single Data Rate):单倍数据率,即8Gb/s 。 6 | DDR(Double Data Rate):双倍数据率,即16Gb/s。 7 | QDR(Quad Data Rate):四倍数据率,即32Gb/s。 8 | FDR(Fourteen Data Rate):十四倍数据率,即56Gb/s。 9 | EDR(Enhanced Data Rate):100 Gb/s。 10 | HDR(High Data Rate):200 Gb/s。 11 | NDR(Next Data Rate):400 Gb/s+。 12 | 13 | 这些技术指标主要是根据实际的数据传输速率来区分的,而且每种技术都有自己的编码方式以提高数据传输的效率和稳定性。 14 | 15 | 例如,对于SDR、DDR、QDR、FDR、EDR、HDR和NDR,它们采用的数据编码方式分别为8b/10b、8b/10b、8b/10b、64b/66b(仅FDR和EDR)、64b/66b(仅HDR)和自定义编码方式。 16 | 17 | 18 | 19 
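A quick back-of-the-envelope check of the per-link figures quoted above: a standard 4x InfiniBand link multiplies the per-lane signaling rate by four lanes and by the encoding efficiency (8b/10b for SDR/DDR/QDR, 64b/66b from FDR onwards). The per-lane rates in this sketch are the commonly quoted values, not numbers taken from this repo.

```python
# Effective data rate of a 4x InfiniBand link = lanes * per-lane signaling rate * encoding efficiency
rates = {
    # name: (per-lane signaling Gb/s, encoding efficiency)
    "SDR": (2.5,      8 / 10),   # 4 * 2.5  * 0.8 =  8 Gb/s
    "DDR": (5.0,      8 / 10),   # 4 * 5.0  * 0.8 = 16 Gb/s
    "QDR": (10.0,     8 / 10),   # 4 * 10   * 0.8 = 32 Gb/s
    "FDR": (14.0625, 64 / 66),   # ~54.5 Gb/s (marketed as 56 Gb/s)
    "EDR": (25.78125, 64 / 66),  # = 100 Gb/s
}
for name, (lane_gbps, eff) in rates.items():
    print(f"{name}: 4x link ~= {4 * lane_gbps * eff:.1f} Gb/s")
```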
| 20 | - 态路小课堂丨关于InfiniBand网络相关内容简介!:https://baijiahao.baidu.com/s?id=1760941961023057651&wfr=spider&for=pc 21 | 22 | 23 | 24 | - Infiniband Verbs 性能测试:https://github.com/linux-rdma/perftest 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /ai-infra/网络/NCCL.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | NCCL 通信库仅针对 Nvidia Spectrum-X 和 Nvidia InfiniBand 进行了优化。 4 | 5 | 博通 Tomahawk 5 以太网方案,客户需要有足够的工程能力来为 Tomahawk 5 适配及优化英伟达的 NCCL 通信库。 6 | 7 | 8 | - 环境变量:https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html 9 | 10 | 11 | 12 | 13 | NCCL_DEBUG=WARN 14 | 15 | 16 | NCCL_SOCKET_IFNAME==ens1f0 17 | 18 | 19 | 20 | ldconfig -p | grep libnccl 21 | 22 | export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH 23 | 24 | 25 | ``` 26 | yum install libnccl libnccl-devel libnccl-static 27 | ``` 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /ai-infra/网络/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - 聊透 GPU 通信技术——GPU Direct、NVLink、RDMA: https://zhuanlan.zhihu.com/p/654417967 6 | - 腾讯机智团队分享--GPU数据传输概览: https://zhuanlan.zhihu.com/p/74217534 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | ## PCIe 15 | 16 | - https://blogs.synopsys.com/expressyourself/2017/08/15/1-2-3-4-5-its-official-pcie-5-0-is-announced/ 17 | 18 | - https://www.zhihu.com/question/618932114/answer/3192465335 19 | 20 | 21 | 22 | ## NVLink 23 | 24 | 25 | 26 | 27 | 28 | ## NVSwitch 29 | 30 | 31 | 32 | 第三代 NVIDIA NVSwitch™ 基于 NVLink 的高级通信能力构建,可为计算密集型工作负载提供更高带宽和更低延迟。为了支持高速集合运算,每个 NVSwitch 都有 64 个 NVLink 端口,并配有 NVIDIA SHARP™ 引擎,可用于网络内归约和组播加速。 33 | 34 | 35 | 36 | NVLink 是一种 GPU 之间的直接互连,可扩展服务器内的多 GPU 输入/输出 (IO)。NVSwitch 可连接多个 NVLink,在单节点内和节点间实现以 NVLink 能够达到的最高速度进行多对多 GPU 通信。 37 | 38 | -------------------------------------------------------------------------------- /ai-infra/网络/REF.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - [RoCE、IB和TCP等网络的基本知识及差异对比](https://support.huawei.com/enterprise/zh/doc/EDOC1100203347) 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /ai-infra/网络/pic/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/.DS_Store -------------------------------------------------------------------------------- /ai-infra/网络/pic/8卡V100的混合网络拓扑.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/8卡V100的混合网络拓扑.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/A800-H100-H800.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/A800-H100-H800.jpeg -------------------------------------------------------------------------------- /ai-infra/网络/pic/NVLink-generations比较.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/NVLink-generations比较.png 
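To go with the NCCL environment-variable notes in `ai-infra/网络/NCCL.md` above, a minimal sketch of a multi-GPU all-reduce over the NCCL backend; the interface name `bond0`, the 8-GPU launch and the script name are illustrative assumptions, not settings taken from this repo.

```python
# Launch (illustrative): NCCL_DEBUG=WARN NCCL_SOCKET_IFNAME=bond0 torchrun --nproc_per_node=8 allreduce_demo.py
import os
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")      # NCCL backend for GPU collectives
local_rank = int(os.environ["LOCAL_RANK"])   # set by torchrun
torch.cuda.set_device(local_rank)

x = torch.ones(1024, device="cuda") * dist.get_rank()
dist.all_reduce(x, op=dist.ReduceOp.SUM)     # sum the tensor across all ranks
if dist.get_rank() == 0:
    print(x[0].item())                       # == sum of rank ids, e.g. 28 for 8 ranks (0+1+...+7)

dist.destroy_process_group()
```

--------------------------------------------------------------------------------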
-------------------------------------------------------------------------------- /ai-infra/网络/pic/PCIe-Generation对比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/PCIe-Generation对比.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/nvidia-dgx-1-v100-nvlink-gpu-xeon-config.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/nvidia-dgx-1-v100-nvlink-gpu-xeon-config.webp -------------------------------------------------------------------------------- /ai-infra/网络/pic/nvidia-dgx-1-with-volta.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/nvidia-dgx-1-with-volta.webp -------------------------------------------------------------------------------- /ai-infra/网络/pic/nvlink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/nvlink.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/nvlink性能.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/nvlink性能.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/nvswitch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/nvswitch.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/支持 NVLink GPU 之间连接的 NVIDIA H100-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/支持 NVLink GPU 之间连接的 NVIDIA H100-1.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/支持 NVLink GPU 之间连接的 NVIDIA H100-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/支持 NVLink GPU 之间连接的 NVIDIA H100-2.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/服务器之间的nvlink与nvswitch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/服务器之间的nvlink与nvswitch.png -------------------------------------------------------------------------------- /ai-infra/网络/pic/网络之间的连接.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/网络之间的连接.png -------------------------------------------------------------------------------- 
/ai-infra/网络/pic/英伟达A100-A800-H100-H800.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/pic/英伟达A100-A800-H100-H800.jpeg -------------------------------------------------------------------------------- /ai-infra/网络/ringallreduce/All Gather 流程图.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/ringallreduce/All Gather 流程图.gif -------------------------------------------------------------------------------- /ai-infra/网络/ringallreduce/Scatter Reduce 流程图.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/ai-infra/网络/ringallreduce/Scatter Reduce 流程图.gif -------------------------------------------------------------------------------- /ai-infra/网络/roce.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | AI场景下高性能网络技术RoCE v2介绍: https://mp.weixin.qq.com/s/XyMFst3w-d65u4fU7cgLPA 5 | 6 | 7 | RoCE是基于 Ethernet的RDMA,RoCEv1版本基于网络链路层,无法跨网段,基本无应用。 8 | RoCEv2基于UDP,可以跨网段具有良好的扩展性,而且吞吐,时延性能相对较好,所以是被大规模采用的方案。 9 | 10 | 11 | 12 | ## ofed驱动 13 | 14 | 使用RoCE v2之前需要安装相关驱动,也就是ofed软件栈。OFED (OpenFabrics Enterprise Distribution) 是一个开源的软件栈,用于在高性能计算 (HPC) 和数据中心环境中实现高性能网络通信。它是一组用于 InfiniBand 和以太网 RDMA (Remote Direct Memory Access) 技术的软件包和驱动程序的集合。 15 | 16 | 17 | 18 | ## 性能测试-perftest 19 | 20 | perftest是ofed性能测试工具集。专门用于测试RDMA的性能 21 | 22 | ``` 23 | 带宽测试 24 | ib_send_bw -d mlx5_0 25 | 26 | 客户端 27 | ib_send_bw -d mlx5_1 10.251.30.207 28 | ``` 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ai-infra/网络/网络硬件.md: -------------------------------------------------------------------------------- 1 | 2 | LOC PIX PXB PHB SYS 3 | 4 | 5 | GPU间的通讯速度: 6 | 7 | NV# > PIX > PXB > PHB > NODE > SYS 8 | 9 | 10 | - SYS : 穿越 PCIe 的连接以及 NUMA 节点之间的 SMP 互连(例如 QPI/UPI) 11 | - NODE :穿越 PCIe 的连接以及 NUMA 节点内 PCIe 主机桥(PCIe Host Bridges)之间的互连 12 | - PHB :穿越 PCIe 以及 PCIe 主机桥(通常是 CPU)的连接 13 | - PXB :穿过多个 PCIe 交换机的连接(不穿过 PCIe 主机桥) 14 | - PIX :最多穿越单个 PCIe 交换机的连接 15 | - NV# :穿越一组 # NVLinks 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ai-infra/网络/通信软件.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Open MPI / MPICH 6 | 7 | - https://github.com/pmodels/mpich 8 | - https://github.com/open-mpi/ompi 9 | 10 | 11 | 12 | 13 | 14 | 15 | MPI有多种实现方式,例如OpenMPI,MPICH。 16 | 17 | MPI 全名叫 Message Passing Interface,即信息传递接口,作用是可以通过 MPI 可以在不同进程间传递消息,从而可以并行地处理任务,即进行并行计算。NCCL中利用MPI来处理多机通讯的部分。 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | OpenMPI 28 | 29 | OpenMPI是一个开源的 Message Passing Interface 实现,是一种高性能消息传递库,能够结合整个高性能计算社区的专业知识、技术和资源,建立现有的最佳MPI库。OpenMPI在系统和软件供应商、应用开发者和计算机科学研究人员中有广泛应用。 30 | 31 | NCCL 32 | 33 | NCCL(Nvidia Collective multi-GPU Communication Library,读作 "Nickel")是一个提供GPU间通信基元的库,它具有拓扑感知能力,可以轻松集成到应用程序中。NCCL做了很多优化,以在PCIe、Nvlink、InfiniBand上实现较高的通信速度。NCCL支持安装在单个节点或多个节点上的大量GPU卡上,并可用于单进程或多进程(如MPI)应用。 34 | 35 | NCCL Tests 36 | 37 | NCCL Tests是一个测试工具集,可以用来评估NCCL的运行性能和正确性。 -------------------------------------------------------------------------------- /blog/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/blog/.DS_Store -------------------------------------------------------------------------------- /blog/TODO.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | llm推理优化技术: 5 | - https://github.com/liguodongiot/llm-action/blob/main/docs/llm-inference/llm%E6%8E%A8%E7%90%86%E4%BC%98%E5%8C%96%E6%8A%80%E6%9C%AF.md?plain=1 6 | 7 | 8 | 分布式训练加速技术: 9 | - https://github.com/liguodongiot/llm-action/blob/main/docs/llm-base/distribution-training/%E5%88%86%E5%B8%83%E5%BC%8F%E8%AE%AD%E7%BB%83%E5%8A%A0%E9%80%9F%E6%8A%80%E6%9C%AF.md?plain=1 10 | 11 | 12 | 13 | 大模型优化技术 14 | 15 | - 大模型显存优化技术-KV Cache 16 | - 大模型推理优化技术-Flash-Decoding 17 | - 大模型显存优化技术-PagedAttention 18 | - 大模型显存I/O优化技术-FlashAttention V1 19 | - 大模型显存优化技术-ZeRO系列 20 | - 大模型解码优化-Speculative Decoding及其变体 21 | - 大模型推理服务化调度优化技术-Dynamic batching/Continuous batching 22 | - 大模型显存优化技术-混合精度训练 23 | 24 | - 大模型显存I/O优化技术-FlashAttention V2 25 | -------------------------------------------------------------------------------- /blog/llm-algo/moe.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 将输入路由到不止一个专家,以便门控学会如何进行有效的路由选择,因此至少需要选择两个专家。Switch Transformers 就这点进行了更多的研究。 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /blog/llm-algo/大白话Transformer架构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Attention(注意力机制): Attention机制允许模型为输入序列中的每个位置分配不同的权重,用以关注输入序列中不同位置的信息。它通过计算每个位置与其他所有位置之间的相似度(通过点积、缩放点积等方法),然后将这些相似度转换成权重,最后将输入序列中的所有位置按照这些权重进行加权求和。这种机制使得模型能够处理长距离的依赖关系,同时能够并行计算,提高了模型的效率。 4 | 5 | Feed-Forward Neural Network (FFN)(前馈神经网络): 每个Transformer层都包含两个线性变换,之间由非线性激活函数(通常是ReLU)连接。FFN对每个位置的表示进行独立的变换,从而捕捉到位置特定的模式和特征。这个步骤有助于提高模型的非线性建模能力。 6 | 7 | Layer Normalization(层归一化): 在每个Transformer层的子层(Attention和FFN)之后都会应用LayerNorm。LayerNorm的作用是对每个位置的特征进行归一化处理,使得每个特征的均值接近0,标准差接近1。这样做有助于缓解训练时的梯度消失问题,并且可以加速训练过程。 8 | 9 | Add & Normalize(加和与归一化): 在每个子层(Attention和FFN)的输入和输出之间应用残差连接(或者称为skip connection),然后对输出进行LayerNorm操作。这个步骤的目的是引入残差连接,使得模型可以学习到输入和输出之间的差异,有助于减缓梯度消失问题,同时也使得模型更容易学习到恒等映射。在LayerNorm之后应用残差连接有助于稳定训练。 10 | 11 | 12 | Attention机制用于捕捉输入序列中的关联关系, 13 | FFN用于捕捉每个位置的非线性特征,从而增加模型的表示能力和拟合复杂模式的能力, 14 | LayerNorm用于归一化特征并缓解梯度消失问题,而Add & Normalize结构引入残差连接,使得模型更容易训练。 -------------------------------------------------------------------------------- /blog/reference/高性能 LLM 推理框架的设计与实现.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 高性能 LLM 推理框架的设计与实现:https://zhuanlan.zhihu.com/p/682872971 5 | 6 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/.DS_Store -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## [LLM 基础](https://github.com/liguodongiot/llm-action/tree/main/docs/llm-base) 5 | 6 | 7 | 8 | ## [LLM 面试题](https://github.com/liguodongiot/llm-action/tree/main/docs/llm_interview) 9 | 10 | 11 | 12 | ## [LLM 国产化](https://github.com/liguodongiot/llm-action/tree/main/docs/llm_localization) 13 | 14 
| 15 | ## [LLM 总结](https://github.com/liguodongiot/llm-action/tree/main/docs/llm_summarize) 16 | -------------------------------------------------------------------------------- /docs/conda.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 安装 4 | - https://docs.anaconda.com/free/miniconda/ 5 | 6 | - https://repo.anaconda.com/miniconda/ 7 | -------------------------------------------------------------------------------- /docs/flash-attention/FlashAttention.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://github.com/Dao-AILab/flash-attention 7 | 8 | - FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness 9 | - FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning 10 | 11 | 12 | Flash Attention的主要目的是加速和节省内存,主要贡献包括: 13 | 14 | 计算softmax时候不需要全量input数据,可以分段计算; 15 | 16 | 反向传播的时候,不存储attention matrix (N^2的矩阵),而是只存储softmax归一化的系数。 17 | 18 | 19 | 20 | 21 | Online Softmax+Tiling+Recompute 22 | 23 | 24 | 25 | Online Softmax 26 | 27 | Online normalizer calculation for softmax(https://arxiv.org/abs/1805.02867) 28 | 29 | 30 | FlashAttention-2 31 | 32 | 33 | 34 | 35 | Flash Decoding 36 | 37 | https://crfm.stanford.edu/2023/10/12/flashdecoding.html 38 | 39 | Flash-Decoding 可以显著加快推理过程中的注意力,使长序列的生成速度提高 8 倍。 40 | 41 | 主要思想是尽可能快地并行加载Key和Value,然后分别重新缩放并组合结果以维持正确的注意力输出。 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/llm-base/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/.DS_Store -------------------------------------------------------------------------------- /docs/llm-base/FLOPS.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## FLOPS 5 | 6 | FLOPS(Floating-point operations per second),每秒浮点运算次数 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/llm-base/NVIDIA-Nsight-Systems性能分析.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # NVIDIA Nsight Systems 5 | 6 | 7 | NVIDIA Nsight Systems是一款低开销性能分析工具,旨在为开发人员提供优化软件所需的洞察力。无偏差的活动数据可在工具中可视化,可帮助用户调查瓶颈,避免推断误报,并以更高的性能提升概率实现优化。用户将能够识别问题,例如 GPU 闲置、不必要的 GPU 同步、CPU 并行化不足,甚至其目标平台的 CPU 和 GPU 中意外昂贵的算法。它旨在跨各种 NVIDIA 平台进行扩展,例如:大型 Tesla 多 GPU x86 服务器、Quadro 工作站、支持 Optimus 的笔记本电脑、配备 Tegra+dGPU 多操作系统的 DRIVE 设备,以及 Jetson。NVIDIA Nsight Systems 甚至可以为 PyTorch 和 TensorFlow 等深度学习框架行为和负载提供宝贵见解;允许用户调整模型和参数,以提高单个或多个 GPU 的整体利用率。 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/llm-base/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## [AI 算法](https://github.com/liguodongiot/llm-action/blob/main/docs/llm-base/ai-algo.md) 5 | 6 | ## [AI 集群](https://github.com/liguodongiot/llm-action/tree/main/docs/llm-base/ai-cluster) 7 | ## [AI 集群通信](https://github.com/liguodongiot/llm-action/tree/main/docs/llm-base/network-communication) 8 | 9 | 10 | ## 其他 11 | 12 | - [机器学习中常用的数据类型](https://github.com/liguodongiot/llm-action/blob/main/docs/llm-base/机器学习中常用的数据类型.md) 13 | 14 | 15 | 16 | 17 | 18 | ## 工具 19 | 20 | 21 | ### nvidia-visual-profiler 22 | 23 | - nvprof 24 | - 
nvidia-visual-profiler 25 | - https://developer.nvidia.com/nvidia-visual-profiler 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/llm-base/ai-algo.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## CodeGeeX 6 | 7 | - CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Evaluations on HumanEval-X 8 | - https://github.com/THUDM/CodeGeeX 9 | 10 | ## CodeX 11 | 12 | - Evaluating Large Language Models Trained on Code 13 | - https://github.com/openai/human-eval 14 | 15 | 16 | ## Transformer 17 | 18 | - 哈佛的NLP团队实现:https://github.com/harvardnlp/annotated-transformer 19 | 20 | 21 | ## Bert 22 | 23 | 24 | ## GPT2 25 | - [GPT2](https://github.com/openai/gpt-2) 26 | - Language Models are Unsupervised Multitask Learners 27 | 28 | 29 | 30 | ## Bloom 31 | 32 | 33 | 34 | 35 | ## LLaMA / LLaMA2 36 | 37 | 38 | 39 | ## ChatGLM 40 | 41 | ## ChatGLM2 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/distribution-parallelism/.DS_Store -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/auto-parallel/Mesh-Tensorflow.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - Mesh-Tensorflow: 广义分布式: https://zhuanlan.zhihu.com/p/342223356 7 | 8 | 9 | 在深度学习中,由于数据量和计算量的浩大,往往会使用到分布式计算。而最常用的分布式模式是SPMD(Single-Program-Multiple-Data),即数据并行,这种模式相当于在数据的batch维去做拆分;然后,进行并行。 10 | 11 | Mesh-Tensorflow对这种模式做了泛化,即除了batch维外的其他维度也可做并行。 12 | 13 | 14 | 15 | 16 | 17 | 18 | --- 19 | 20 | 21 | Mesh-Tensorflow的灵感来自于目前广泛使用的数据并行, 数据并行可以看做是把 tensors 和 operations 在 batch 这个维度上进行分割。 Mesh-Tensorflow则顺势把这个点子推广到所有维度。 22 | 23 | 24 | Mesh-Tensorflow 看定义了一套DSL语法,用于描述模型的维度和布局,你用它重写你的整个Model后,它自动帮你把模型和数据分割到多个TPU上。 25 | 26 | Mesh-Tensorflow看起来很复杂和精巧,比 recomputation, pipeline parallelism 等技巧要更复杂更自动化,那它是否就能解决问题呢? 27 | 28 | 我觉得它侵入性比普通的库(例如GPipe)更强,你需要用Mesh-Tensorflow的语法重写你的整个模型,仔细思考维度,说实话,这个精神负担挺重的(想起了C++)。况且,它目前还没有实现并行的卷积操作,因此对于CNN网络并没有卵用,暂时只适合 Language Model 这个领域. 29 | 30 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/auto-parallel/Unity.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - Unity:通过代数变换和并行化的联合优化加速 DNN 训练:https://www.victorlamp.com/article/7387511088 5 | - 【论文赏读】Unity: Accelerating DNN Training Through Joint Opt of Algebraic Transform and Parallelization: https://zhuanlan.zhihu.com/p/560247608 6 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/auto-parallel/auto-parallel.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - Colossal-Auto 5 | - MindSpore 6 | - [Tofu ](https://arxiv.org/abs/1807.08887), 7 | - [Flexflow ](https://arxiv.org/abs/1807.05358), 8 | - [Alpa ](https://arxiv.org/abs/2201.12023) 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## Alpa 16 | 17 | 18 | 19 | 模型量级很大的时候,搜索并行策略的时间非常长. 
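
下面用一个极简的 Python 示意说明这一点(仅为示意:策略集合与代价函数均为虚构,并非 Alpa 或任何真实系统的实现)。假设每个算子可以在 3 种切分方式中独立选择,暴力搜索的候选方案数会按 3^N 指数增长,这也是自动并行系统需要借助 ILP、动态规划等方法裁剪搜索空间的原因:

```python
from itertools import product

# 每个算子可选的切分策略(虚构示例)
STRATEGIES = ["data_parallel", "tensor_parallel", "replicated"]

# 虚构的单算子代价;真实系统需结合通信量、显存与算力建模
FAKE_COST = {"data_parallel": 2, "tensor_parallel": 3, "replicated": 5}


def estimate_cost(plan):
    """对一个切分方案求和得到总代价(示意用)。"""
    return sum(FAKE_COST[s] for s in plan)


def brute_force_search(num_ops):
    """穷举所有组合,复杂度为 3^num_ops,算子数一多即不可行。"""
    best_plan, best_cost = None, float("inf")
    for plan in product(STRATEGIES, repeat=num_ops):
        cost = estimate_cost(plan)
        if cost < best_cost:
            best_plan, best_cost = plan, cost
    return best_plan, best_cost


if __name__ == "__main__":
    for n in (4, 8, 12):
        print(f"{n} 个算子 -> {len(STRATEGIES) ** n} 种候选切分方案")
    print("暴力搜索(4 个算子):", brute_force_search(4))
```

据 Alpa 论文所述,其做法是把搜索分为算子间(动态规划)与算子内(ILP)两层来控制规模,但在超大模型上策略搜索的时间开销仍然可观。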
20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/auto-parallel/gspmd.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - GSPMD 6 | 7 | - GSPMD:General and Scalable Parallelization for ML Computation Graphs: https://zhuanlan.zhihu.com/p/506026413 8 | - GSPMD: ML计算图的通用可扩展并行化: https://zhuanlan.zhihu.com/p/504670919 9 | 10 | 11 | 12 | 13 | ## 原paper 14 | 15 | 16 | 17 | 18 | GSPMD 是一个用于机器学习计算的高度自动化的并行化系统。 19 | 20 | 它提供了一个简单但功能强大的 API,该 API 足够通用,可以组合不同的典型并行模式。 GSPMD 提供直观的自动完成功能,使用户只需注解几个张量即可有效地划分整个模型。 21 | 22 | 我们已经证明,GSPMD 能够在多达数千个 Cloud TPUv3 核心上对多个图像、语音和语言模型进行分区,并具有良好且可预测的性能和内存扩展。 -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/auto-parallel/分布式训练自动并行概述.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | A Survey on Auto-Parallelism of Neural Networks Training 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | ## 2. 问题定义 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/data-parallelism/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/distribution-parallelism/data-parallelism/README.md -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/data-parallelism/ddp/Gradient Bucketing 示意图.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/distribution-parallelism/data-parallelism/ddp/Gradient Bucketing 示意图.webp -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/data-parallelism/ddp/代码架构.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/distribution-parallelism/data-parallelism/ddp/代码架构.webp -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/moe-parallel/moe-framework.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## colossalai 6 | 7 | - https://colossalai.org/zh-Hans/docs/advanced_tutorials/integrate_mixture_of_experts_into_your_model/ 8 | 9 | ## paddle 10 | 11 | - https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/06_distributed_training/moe_cn.html 12 | 13 | 14 | ``` 15 | python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --log_dir logs train_moe.py 16 | ``` 17 | 18 | ## deepspeed 19 | 20 | - https://www.deepspeed.ai/tutorials/mixture-of-experts/ 21 | 22 | 23 | 24 | 25 | - https://www.deepspeed.ai/tutorials/mixture-of-experts/ 26 | - https://github.com/microsoft/DeepSpeedExamples/blob/master/training/cifar/run_ds_moe.sh 27 | - https://colossalai.org/zh-Hans/docs/advanced_tutorials/integrate_mixture_of_experts_into_your_model/ 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/moe-parallel/moe-parallel.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/multidimensional-hybrid-parallel/BloombergGPT模型超参数.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/distribution-parallelism/multidimensional-hybrid-parallel/BloombergGPT模型超参数.png -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/pipeline-parallelism/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | DP 将批次(global batch size)拆分为小批次(mini-batch)。PP 将一个小批次切分为多个块 (chunks),因此,PP 引入了微批次(micro-batch,MBS) 的概念。 6 | 7 | 计算 DP + PP 设置的全局批量大小的公式为: `mbs*chunks*dp_degree` , 比如:DP并行度为4,微批次大小为8,块为32,则全局批次大小为:`8*32*4=1024`。 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-parallelism/tensor-parallel/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Megatron-LM 的张量并行,通信量很大,同时,计算和通信没办法同时进行。 6 | 7 | 8 | 9 | 需要特别考虑的是:由于前向和后向传播中每层都有两个 all reduce,因此 TP 需要设备间有非常快速的互联。因此,除非你有一个非常快的网络,否则不建议跨多个节点进行 TP。 10 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-training/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/distribution-training/.DS_Store -------------------------------------------------------------------------------- /docs/llm-base/distribution-training/FP16-BF16.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## FP16 7 | 8 | 9 | 数值上溢和数值下溢的问题 10 | 11 | 数值上溢:大量级的数被近似为正无穷或负无穷时发生上溢,进一步运算导致无限值变为非数字。 12 | 13 | 数值下溢:接近零的数被四舍五入为0时发生下溢。被零除,取零的对数,进一步运算会变为非数字。 14 | 15 | 16 | 17 | 求梯度的时候可能下溢,求激活的时候可能上溢出。 18 | 19 | 20 | ### 存在的问题 21 | 22 | 使用FP16同样会带来一些问题,其中最重要的是1)精度溢出和2)舍入误差。 23 | 24 | 25 | 数据溢出:可见FP16相比FP32的有效范围要窄很多,使用FP16替换FP32会出现上溢(Overflow)和下溢(Underflow)的情况。而在深度学习中,需要计算网络模型中权重的梯度(一阶导数),因此梯度会比权重值更加小,往往容易出现下溢情况。 26 | 27 | 28 | 舍入误差:Rounding Error指示是当网络模型的反向梯度很小,一般FP32能够表示,但是转换到FP16会小于当前区间内的最小间隔,会导致数据溢出。 29 | 30 | 如0.00006666666在FP32中能正常表示,转换到FP16后会表示成为0.000067,不满足FP16最小间隔的数会强制舍入。 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-training/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 用 FP16 训练巨型 LLM 模型是一个禁忌,它将面临更多的稳定性挑战。 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/llm-base/distribution-training/自动混合精度.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Torch.cuda.amp vs Nvidia apex 5 | 6 | 7 | 8 | 9 | 10 | pytorch从1.6版本开始,已经内置了torch.cuda.amp,采用自动混合精度训练就不需要加载第三方NVIDIA的apex库了。 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/llm-base/gpu-env-var.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
CUDA_VISIBLE_DEVICES=1 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/llm-base/images/slurm/slurm.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-base/images/slurm/slurm.gif -------------------------------------------------------------------------------- /docs/llm-base/multimodal/sora.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 方案:VAE Encoder(视频压缩) -> Transform Diffusion (从视频数据中学习分布,并根据条件生成新视频) -> VAE Decoder (视频解压缩) 5 | 6 | 从博客出发,经过学术Survey,可以推断出全貌。一句话结论: 7 | 8 | Sora是采用了Meta的 DiT (2022.12) 框架, 9 | 融合了Google的 MAGViT (2022.12) 的Video Tokenize方案, 10 | 借用Google DeepMind的NaViT (2023.07) 支持了原始比例和分辨率, 11 | 使用OpenAI DALL-E 3 (2023.09) 里的图像描述方案生成了高质量Video Caption(视频描述),即文本-视频对,实现了准确的条件生成。 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/llm-base/rlhf/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | ## 百川2 8 | 9 | 10 | Reward Model: 11 | 12 | 13 | Prompt多样性:构造了一个200+细分类目的数据体系,尽可能覆盖用户需求,同时提升每类prompt多样性,从而提升泛化能力 14 | Response多样性:用不同尺寸和阶段的百川模型生成答案,不使用其他开源模型(经验证无法提升RM准确率) 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | RM: 23 | 24 | 设计了一个三层分类系统全面覆盖所有类型的用户需求,包括6个主要类别、30个二级类别、200多个三级类别。 25 | 26 | 27 | 在奖励模型训练时,需要保证每个类别内的数据应该有足够的多样性,以确保奖励模型能够有更好地泛化性。 28 | 29 | 并且奖励数据中结果需要由Baichuan2模型生成,以确保数据分布的统一。 -------------------------------------------------------------------------------- /docs/llm-base/scenes/cv/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## CV算法 5 | 6 | 7 | 8 | - 图像分类 9 | - 图像语义分割(Semantic Segmentation) 10 | - 目标检测(Object Detection) 11 | - 视频分类(video classification) 12 | 13 | 14 | 15 | 16 | - 人脸关键点检测 17 | - 图像超分辨率(Image Super Resolution) 18 | 19 | 20 | 21 | 22 | 23 | - 图像生成 24 | - 目标跟踪 25 | - 图像重构 26 | - 超分 27 | - 风格迁移 28 | - 姿态估计 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | ## 算法 37 | 38 | 39 | 40 | - CNN 41 | - https://deeplearning-doc.readthedocs.io/en/latest/deeplearning/CNN/CNN.html 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/llm-base/scenes/cv/paddle/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://www.paddlepaddle.org.cn/documentation/docs/zh/practices/cv/landmark_detection.html 7 | 8 | 9 | 10 | wget --no-check-certificate --no-cookies --header "Cookie: oraclelicense=accept-securebackup-cookie" http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/BSR/BSR_bsds500.tgz 11 | 12 | 13 | 14 | tar -zxvf BSR_bsds500.tgz 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/llm-base/scenes/cv/pytorch/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## pyav 5 | 6 | 7 | ``` 8 | pip install av 9 | ``` 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | - Pytorch搭建训练简单的图像分割模型:https://blog.csdn.net/qq_42032507/article/details/103052193 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/llm-base/scenes/cv/reference.md: -------------------------------------------------------------------------------- 1 | 2 
| 3 | 4 | - https://pytorch.org/vision/stable/models.html 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/llm-base/scenes/multi-modal/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - [多模态大模型 CLIP, BLIP, BLIP2, LLaVA, miniGPT4, InstructBLIP 系列解读](https://zhuanlan.zhihu.com/p/653902791) 6 | - [AIGC爆火的背后——对抗生成网络GAN浅析](https://zhuanlan.zhihu.com/p/580137376) 7 | - [AIGC爆火的背后——扩散模型DDPM浅析](https://zhuanlan.zhihu.com/p/590840909) 8 | - [十分钟读懂Diffusion:图解Diffusion扩散模型](https://zhuanlan.zhihu.com/p/599887666) 9 | - [十分钟读懂Stable Diffusion](https://zhuanlan.zhihu.com/p/600251419) 10 | 11 | 12 | 13 | 14 | - [CogVLM](https://github.com/THUDM/CogVLM/blob/main/README_zh.md) 15 | - [Qwen-VL](https://github.com/QwenLM/Qwen-VL/blob/master/README_CN.md) 16 | 17 | 18 | 19 | 20 | 21 | - https://github.com/facebookresearch/multimodal 22 | 23 | LateFusion, FLAVA and CLIP 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/llm-base/singularity命令.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | ``` 12 | # -cleanenv选项来禁用所有环境变量,确保容器的环境是独立的 13 | singularity run --cleanenv my_container.sif 14 | ``` -------------------------------------------------------------------------------- /docs/llm-experience.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 微调: 8 | 9 | 10 | PEFT总是有局限性,基于低秩的微调可能并不always work,比如:finetune与pretrain的gap过大的时候,比如中英差异。 11 | 12 | 13 | 微调的过程不是让模型适应另外的数据分布,而是让模型更好的激发出本身的表征能力。 14 | 15 | 16 | 17 | 18 | 19 | 20 | 量化: 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/llm-inference/DeepSpeed-Inference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 目前业界基本都针对 Transformer layer 结构特点,手工实现了算子融合。以 DeepSpeed Inference 为例,算子融合主要分为如下四类: 6 | 7 | 归一化层和 QKV 横向融合:将三次计算 Query/Key/Value 的操作合并为一个算子,并与前面的归一化算子融合。 8 | 自注意力计算融合:将自注意力计算涉及到的多个算子融合为一个,业界熟知的 FlashAttention 即是一个成熟的自注意力融合方案。 9 | 残差连接、归一化层、全连接层和激活层融合:将 MLP 中第一个全连接层上下相关的算子合并为一个。 10 | 偏置加法和残差连接融合。 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/llm-inference/KV-Cache.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 最后需要注意当sequence特别长的时候,KV Cache其实还是个Memory刺客。 5 | 6 | 比如batch_size=32, head=32, layer=32, dim_size=4096, seq_length=2048, float32类型,则需要占用的显存为(感谢网友指正) 2 * 32 * 4096 * 2048 * 32 * 4 / 1024/1024/1024 /1024 = 64G。 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/llm-inference/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 吞吐量 6 | 7 | 8 | 延迟 9 | 10 | 11 | 12 | 13 | 14 | 投机采样: 15 | - https://github.com/feifeibear/LLMSpeculativeSampling 16 | 17 | 美杜莎: 18 | - https://github.com/FasterDecoding/Medusa 19 | - Medusa: Simple Framework for Accelerating LLM Generation with Multiple Decoding Heads 20 | - https://sites.google.com/view/medusa-llm 21 | 22 | 23 | 24 | 25 | - OpenLLM: https://github.com/bentoml/OpenLLM 26 | 27 | ## Triton 28 | 29 | 30 | 31 | 32 | 33 | ## 博客 34 | 35 | - https://huggingface.co/blog/optimize-llm 36 | - 
加速大模型推理的7种方法:https://betterprogramming.pub/speed-up-llm-inference-83653aa24c47 37 | - 7个大模型推理服务化框架:https://betterprogramming.pub/frameworks-for-serving-llms-60b7f7b23407 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/llm-inference/flexflow/spec_infer_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-inference/flexflow/spec_infer_demo.gif -------------------------------------------------------------------------------- /docs/llm-inference/llm推理框架.md: -------------------------------------------------------------------------------- 1 | 2 | ## vLLM 3 | 4 | 适用于大批量Prompt输入,并对推理速度要求高的场景; 5 | 6 | 7 | 8 | 9 | 10 | ## Huggingface TGI 11 | 12 | 13 | 依赖HuggingFace模型,并且不需要为核心模型增加多个adapter的场景; 14 | 15 | 16 | 17 | 18 | 19 | ## DeepSpeed-MII 20 | 21 | 使用DeepSpeed库来部署LLM; 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/llm-inference/vllm.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - VLLM推理流程梳理(一): https://zhuanlan.zhihu.com/p/649974825 5 | - VLLM推理流程梳理(二): https://zhuanlan.zhihu.com/p/649977422 6 | - 大模型推理服务框架vLLM要点简析 (上): https://zhuanlan.zhihu.com/p/654259045 7 | - PagedAttention--大模型推理服务框架vLLM要点简析 (中): https://zhuanlan.zhihu.com/p/655561941 8 | 9 | 10 | 11 | 12 | vLLM是一个大模型推理服务框架,声称 13 | 14 | 最牛的serving 吞吐量PagedAttention 15 | 对kv cache的有效管理 16 | 传入请求的continus batching,而不是static batching 17 | 高性能CUDA kernel 18 | 流行的HuggingFace模型无缝集成 19 | 有各种decoder算法的高吞吐量服务,包括parallel sampling和beam search等 20 | tensor parallel 21 | 兼容OpenAI的API服务 22 | 23 | 24 | continus batching和PagedAttention 25 | 26 | 27 | ### PagedAttention 28 | 29 | 30 | 31 | ### continus batching 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/llm-peft/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://github.com/OpenAccess-AI-Collective/axolotl 7 | 8 | 9 | 10 | - 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /docs/llm-peft/ReLoRA.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/llm-summarize/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## LLM选择标准 6 | 7 | 在选本地化的LLM之前,我们先根据实际情况定义一些选择标准: 8 | 9 | - 归纳优先:我们不需要LLM在各个方面都很优秀,不需要它们会很强的coding和复杂逻辑推理能力,RAG最重要的还是出色的归纳能力; 10 | - 体量考虑:不要太大,最好在13B及以下,因为再大就需要一张A100等专业显卡,或者要多张消费级显卡。我们的目标是一张RTX 4090可以解决问题,对很多客户来说,A卡很难买,而且价格太高了; 11 | - 中文能力:我们主要面对的还是中文业务,所以Llama对我们还是成本太高,如果自己做大量训练的话。 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | ## 实践经验 23 | 24 | ### 坑点 25 | 26 | - 使用 Hugingface Transformers 进行多机多卡训练时,反复加载模型,设置卡等,不然可能导致代码错乱,比如:保存模型不成功,保存模型不完整等等。 27 | - 使用Megatron-Deepspeed、Hugingface Transformers (Deepspeed)进行分布式训练时,使用一些共享存储可能会遇到问题,比如一个进程写入文件之后,另一个进程无法马上读取到文件。 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /docs/llm-summarize/distribution_dl_roadmap.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 分布式并行技术: 6 | 7 
| - 数据并行 8 | - 流水线并行 9 | - 张量并行 10 | - 序列并行 11 | - 多维混合并行 12 | - 自动并行 13 | - MOE 并行 14 | 15 | 大模型算法结构: 16 | 17 | - Transformer 18 | - GPT2 (345M) 19 | - Bloom 20 | - LLaMA / LLaMA2 21 | - ChatGLM 22 | - ChatGLM2 23 | 24 | 25 | 分布式训练框架: 26 | 27 | - DeepSpeed 28 | - Megatron-LM 29 | - Alpa 30 | -------------------------------------------------------------------------------- /docs/llm-summarize/pic/A800.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-summarize/pic/A800.jpeg -------------------------------------------------------------------------------- /docs/llm-summarize/pic/H800.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-summarize/pic/H800.jpeg -------------------------------------------------------------------------------- /docs/llm-summarize/pic/transformer架构.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-summarize/pic/transformer架构.jpg -------------------------------------------------------------------------------- /docs/llm-summarize/pic/why_RLHF.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/docs/llm-summarize/pic/why_RLHF.jpg -------------------------------------------------------------------------------- /docs/llm-summarize/文档大模型.md: -------------------------------------------------------------------------------- 1 | 2 | 处理流程: 3 | 4 | 1. 对表格或者文章文档切分成chunk,将其存入DB 5 | 2. 根据chunk文档内容,通过prompt生成问题(qwen) 6 | 3. 通过sentencetransformer生成embbedding(Text embedding 模型 7 | stella_large 模型,长文本编码), 第二步 抽取的问题 和 文档 进行相似度匹配,过滤掉阈值小于0.5的问题和文档对 8 | 4. 选择正负样本,一个正样本,15个负样本 9 | 5. 扩展bert长度从512到1024,使用层次分解的位置编码进行初始化(bge-large) 10 | 6. 模型训练(bge) 11 | 7. 校验模型获取到 query 与 文档chunk 前三的文档及其分数 ,以及query对应的标准答案 12 | 8. 计算rouge指标 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/transformer内存估算.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | https://blog.eleuther.ai/transformer-math/ 8 | 9 | https://kipp.ly/transformer-inference-arithmetic/ 10 | 11 | -------------------------------------------------------------------------------- /git-pull-push.sh: -------------------------------------------------------------------------------- 1 | git pull origin main 2 | git add . 
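# 生成带时间戳的提交信息,随后提交并推送到 main 分支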
3 | 4 | #time=`date -Iminutes` 5 | time=`date +"%Y-%m-%d_%H:%M:%S"` 6 | 7 | echo $time 8 | 9 | commit_info="update-""$time" 10 | 11 | git commit -m $commit_info 12 | 13 | git push origin main 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /llm-algo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/.DS_Store -------------------------------------------------------------------------------- /llm-algo/FLOPs.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | https://epochai.org/blog/backward-forward-FLOP-ratio 8 | 9 | 10 | 如何计算FLOPs 11 | 12 | 有两种方式: 13 | 14 | 根据计算公式和模型结构手动推算 15 | 16 | 借助第三方工具:calflops、ptflops、thop、torchstat、torchsumary、fvcore 17 | 手动推导FLOPs原则: 18 | 手动推导模型的FLOPs时只推导前向传播,大部分情况默认模型后向传播的计算量是前向传播的2倍, 总共FLOPs是前向的3倍。(结论出自——https://epochai.org/blog/backward-forward-FLOP-ratio) 19 | 由于LLM模型参数过大,占用显存过多,有时候为了降低显存在训练采用将中间参数保留在内存里——激活重计算。因此推导LLM训练时FLOPs如果考虑到中间参数的激活重计算的过程,需要计算整体FLOPs需要再加一份前向计算量,即1(前向) + 2(反向)+ 1(激活重计算)= 4 倍 计算量。 (结论出自——https://arxiv.org/pdf/2205.05198.pdf) 20 | 手动推导模型的FLOPs时,优先推导整个过程计算量占大头部分,通常忽略激活函数、layer normalize,softmax等等部分计算量。 21 | 22 | 23 | 参考最简单的计算模型(LLM)FLOPs的方法: https://zhuanlan.zhihu.com/p/652697200 -------------------------------------------------------------------------------- /llm-algo/InternLM-20B.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | InternLM预训练框架 4 | 5 | 大模型微调工具箱XTuner 6 | 7 | 8 | 9 | LMDeploy推理工具链 10 | 11 | 12 | OpenCompas大模型评测平台 13 | 14 | 15 | 16 | Lagent智能体框架 17 | 18 | -------------------------------------------------------------------------------- /llm-algo/baichuan2/baichuan.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/modeling_baichuan.py 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /llm-algo/bert.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | BertEmbeddings 7 | 8 | BertSelfAttention 9 | BertSelfOutput 10 | BertAttention 11 | 12 | 13 | BertIntermediate 14 | BertOutput 15 | BertLayer 16 | 17 | BertEncoder 18 | BertPooler 19 | BertPredictionHeadTransform 20 | BertLMPredictionHead: 21 | BertOnlyMLMHead: 22 | BertOnlyNSPHead: 23 | BertPreTrainingHeads: 24 | BertPreTrainedModel: 25 | BertForPreTrainingOutput: 26 | 27 | BertModel(BertPreTrainedModel): 28 | BertForPreTraining(BertPreTrainedModel): 29 | BertLMHeadModel(BertPreTrainedModel): 30 | BertForMaskedLM(BertPreTrainedModel): 31 | BertForNextSentencePrediction(BertPreTrainedModel): 32 | BertForSequenceClassification(BertPreTrainedModel): 33 | BertForMultipleChoice(BertPreTrainedModel): 34 | BertForTokenClassification(BertPreTrainedModel): 35 | BertForQuestionAnswering(BertPreTrainedModel): 36 | 37 | ``` 38 | -------------------------------------------------------------------------------- /llm-algo/bloom.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ``` 4 | BloomGelu 5 | BloomAttention 6 | BloomMLP 7 | BloomBlock 8 | BloomPreTrainedModel 9 | BloomModel(BloomPreTrainedModel) 10 | BloomForCausalLM(BloomPreTrainedModel) 11 | ``` 12 | 13 | 14 | 15 | 
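
下面给出一个最小示例(基于 transformers 库;权重以 bigscience/bloom-560m 为例,首次运行需联网下载,模型规格仅作示意),可以直观地验证上述模块层次:

```python
from transformers import AutoTokenizer, BloomForCausalLM

# 加载一个小规格 BLOOM 模型,便于观察 BloomModel / BloomBlock / BloomAttention / BloomMLP 的层次结构
model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

# 顶层子模块:transformer(即 BloomModel)与 lm_head
for name, module in model.named_children():
    print(name, "->", module.__class__.__name__)

# 打印第一个 BloomBlock 的内部结构,可看到 self_attention(BloomAttention)与 mlp(BloomMLP)
print(model.transformer.h[0])

# 简单生成,确认模型可以正常前向
inputs = tokenizer("Hello, BLOOM!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```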
-------------------------------------------------------------------------------- /llm-algo/bloom/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - [BLOOM模型结构详解](https://juejin.cn/post/7223305855923044409) 7 | - 8 | -------------------------------------------------------------------------------- /llm-algo/chatglm/GLM说明.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/chatglm/GLM说明.png -------------------------------------------------------------------------------- /llm-algo/chatglm/GLM预训练.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/chatglm/GLM预训练.png -------------------------------------------------------------------------------- /llm-algo/chatglm/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # chatglm-6b 4 | 5 | - https://huggingface.co/THUDM/chatglm-6b/tree/main 6 | - https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py 7 | 8 | 9 | 自回归填空 10 | 11 | 12 | 13 | 14 | 15 | ChatGLM借助编码器-解码器架构思想,前半部分采用类似于Bert的双向注意力进行掩码,后半部分采用类似于GPT的自回归架构进行预测。 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 说明: 24 | 25 | - gelu 26 | - LayerNorm 27 | 28 | 29 | 30 | - 重新排列了LN和残差连接的顺序,具体来讲就是将Post-LN改成Pre-LN。 31 | - 使用一个线性层来预测输出词; 32 | - 将ReLU激活函数替换为GeLU激活函数。 -------------------------------------------------------------------------------- /llm-algo/chatglm2/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py 5 | 6 | 7 | -更强大的性能:ChatGLM2-6B 使用了 GLM 的混合目标函数,经过了 1.4T 中英标识符的预训练与人类偏好对齐训练。 8 | - 更长的上下文:基于 FlashAttention 技术,我们将基座模型的上下文长度(Context Length)由 ChatGLM-6B 的 2K 扩展到了 32K,并在对话阶段使用 8K 的上下文长度训练。对于更长的上下文,我们发布了 ChatGLM2-6B-32K 模型。 9 | - 更高效的推理:基于 Multi-Query Attention 技术,ChatGLM2-6B 有更高效的推理速度和更低的显存占用。 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | ## 说明 19 | 20 | 21 | - F.silu: 22 | - RMSNorm 23 | 24 | 25 | 26 | 27 | 28 | ## chatglm 与 chatglm2 不同支持 29 | 30 | 31 | - 激活函数不同 32 | - RotaryEmbedding 位置不同。 -------------------------------------------------------------------------------- /llm-algo/chatglm3/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /llm-algo/chatglm3/reference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/chatglm3/reference.md -------------------------------------------------------------------------------- /llm-algo/chatgpt/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - ChatGPT技术原理解析:从RL之PPO算法、RLHF到GPT4、instructGPT: https://blog.csdn.net/v_JULY_v/article/details/128579457 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /llm-algo/deepseek/DeepSeek-R1.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## 复现 6 | 7 | - https://github.com/huggingface/open-r1 8 | 9 | 10 | 11 | ## 量化 12 | 13 | - 
AWQ:https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-awq 14 | - AWQ:https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ 15 | - FP4:https://modelscope.cn/models/nvidia/DeepSeek-R1-FP4 16 | - https://github.com/nbasyl/LLM-FP4 17 | 18 | 19 | 20 | 21 | 22 | https://huggingface.co/dwetzel/DeepSeek-R1-Distill-Qwen-32B-GPTQ-INT4 23 | 24 | 25 | ### sglang 26 | https://github.com/sgl-project/sglang/issues/2706 27 | 28 | 29 | - sglang 对 deepseek r1 性能测试: https://github.com/sgl-project/sglang/issues/3956 30 | 31 | 32 | ### vllm 33 | 34 | https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ/discussions 35 | 36 | 37 | 38 | 39 | ## hf transformers 40 | 41 | - https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/finegrained_fp8.md 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /llm-algo/deepseek/DeepSeek-V2.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # DeepSeek-V2 6 | 7 | 8 | 9 | 10 | Deepseek-V2技术详解: https://zhuanlan.zhihu.com/p/697524307 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-algo/deepseek/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - DeepSeek-V3:https://arxiv.org/pdf/2412.19437v1 6 | - https://github.com/deepseek-ai/DeepSeek-V3 7 | 8 | 9 | - DeepSeekMath(GRPO):https://arxiv.org/pdf/2402.03300 10 | 11 | - DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning: https://arxiv.org/pdf/2501.12948 12 | 13 | 14 | 15 | - DeepSeek-R1训练数据准备方案:https://zhuanlan.zhihu.com/p/20763180629 16 | 17 | 18 | - 详细介绍:https://planetbanatt.net/articles/deepseek.html 19 | - DeepSeek-R1: Model Architecture: https://pub.towardsai.net/deepseek-r1-model-architecture-853fefac7050 20 | - Run DeepSeek R1 Dynamic 1.58-bit: https://unsloth.ai/blog/deepseekr1-dynamic -------------------------------------------------------------------------------- /llm-algo/glm-130b/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 预训练语言模型:GLM: https://zhuanlan.zhihu.com/p/641499380 5 | 6 | 7 | 8 | 9 | GLM-130B 对超过 4000 亿个双语标记(2000 亿英文和 2000 亿中文标记)进行了预训练。 10 | 11 | 它的预训练目标由两部分组成: 12 | 13 | 第一部分(95%)是自监督的预训练,即在公开的大规模语料库以及其他一些较小的中文语料库上的自回归空白填充。 14 | 15 | 第二部分(5%)是在 T0++18 和 DeepStruct19 中 70 个不同数据集的抽样子集上进行多任务指令预训练,格式为基于指令的多任务多提示序列到序列的生成。 16 | 17 | 这种设计使 GLM-130B 可以在其他数据集上进行了零样本学习,以及从英文到中文的零样本迁移。 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /llm-algo/glm-130b/模型架构.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/glm-130b/模型架构.gif -------------------------------------------------------------------------------- /llm-algo/glm4.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /llm-algo/gpt/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - OpenAI ChatGPT(二):十分钟读懂 GPT-1:https://zhuanlan.zhihu.com/p/604625917 5 | - GPT(二)GPT1原理和代码详解: https://zhuanlan.zhihu.com/p/636915538 6 | -------------------------------------------------------------------------------- 
/llm-algo/gpt/模型结构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/gpt/模型结构.png -------------------------------------------------------------------------------- /llm-algo/gpt2/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - GPT2 源码:https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py 4 | - GPT2 源码解析:https://zhuanlan.zhihu.com/p/630970209 5 | - nanoGPT:https://github.com/karpathy/nanoGPT/blob/master/model.py 6 | 7 | 8 | - 7.3 GPT2模型深度解析:http://121.199.45.168:13013/7_3.html 9 | - GPT(三)GPT2原理和代码详解: https://zhuanlan.zhihu.com/p/637782385 10 | - OpenAI ChatGPT(三):十分钟读懂 GPT-2: https://zhuanlan.zhihu.com/p/613895056 11 | - OpenAI ChatGPT(三):Tensorflow实现GPT-2: https://zhuanlan.zhihu.com/p/614003576 12 | - GPT2参数量剖析: https://zhuanlan.zhihu.com/p/640501114 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-algo/gpt2/模型架构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## GPT2LMHeadModel 6 | 7 | 8 | 9 | 10 | ## GPT2PreTrainedModel 11 | 12 | 13 | 14 | 15 | 16 | ## GPT2Model 17 | 18 | 19 | 20 | 21 | ## GPT2Attention 22 | 23 | 24 | 25 | 26 | ## GPT2Block 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /llm-algo/gpt3/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | GPT(五)GPT3原理讲解:https://zhuanlan.zhihu.com/p/642745932 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /llm-algo/llama.md: -------------------------------------------------------------------------------- 1 | 2 | - https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py 3 | 4 | 5 | - http://lihuaxi.xjx100.cn/news/1377523.html 6 | 7 | 8 | ``` 9 | LlamaRotaryEmbedding 10 | 11 | 12 | LlamaMLP 13 | LlamaAttention:多头注意力 14 | 15 | LlamaDecoderLayer 16 | 17 | LlamaPreTrainedModel 18 | 19 | LlamaModel(LlamaPreTrainedModel) 20 | LlamaForCausalLM(LlamaPreTrainedModel) 21 | 22 | 23 | 24 | ``` 25 | 26 | 27 | -------------------------------------------------------------------------------- /llm-algo/llama/模型架构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## LlamaAttention(nn.Module) 6 | 7 | 8 | 9 | 10 | 11 | 12 | ## LlamaMLP(nn.Module) 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-algo/mixtral/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - 8 x 7b: https://www.promptingguide.ai/models/mixtral 6 | 7 | - https://www.promptingguide.ai/models/mixtral-8x22b 8 | 9 | 10 | -------------------------------------------------------------------------------- /llm-algo/mlp.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1. MLP的基本原理 5 | MLP是一种由多个神经元组成的前馈神经网络模型。每个神经元接收来自上一层神经元的输出,并通过激活函数进行非线性变换,然后将结果传递给下一层神经元。 6 | 这种前馈传播的方式使得MLP能够处理复杂的非线性关系。 7 | 8 | 3. MLP的结构 9 | MLP由输入层、隐藏层和输出层组成。输入层接收外部输入数据,隐藏层对输入数据进行中间表示的学习,输出层生成最终的预测结果。每个神经元与上一层的所有神经元相连,权重和偏置用于调整信号传递过程中的参数。 10 | 11 | 4. 
MLP的训练 12 | MLP的训练目标是通过优化算法,使得模型的输出尽可能地接近真实值。为了达到这个目标,需要定义一个损失函数来度量预测结果和真实值之间的差距。常用的损失函数包括均方误差(Mean Squared Error)和交叉熵(Cross Entropy)。反向传播算法是一种常用的优化方法,通过计算梯度并更新权重和偏置来最小化损失函数。 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /llm-algo/moe/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | https://www.promptingguide.ai/models/mixtral 6 | 7 | -------------------------------------------------------------------------------- /llm-algo/qwen/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## qwen 4 | 5 | ``` 6 | git clone https://www.modelscope.cn/qwen/Qwen-7B-Chat.git 7 | ``` 8 | 9 | 10 | ## Qwen1.5 11 | 12 | Qwen1.5版本本次开源了包括0.5B、1.8B、4B、7B、14B和72B在内的六种大小的基础和聊天模型,同时,也开源了量化模型。不仅提供了Int4和Int8的GPTQ模型,还有AWQ模型,以及GGUF量化模型。 13 | 14 | 为了提升开发者体验,Qwen1.5的代码合并到Hugging Face Transformers中,开发者现在可以直接使用transformers>=4.37.0而无需trust_remote_code。 15 | 16 | 此外,Qwen1.5支持了例如vLLM、SGLang、AutoGPTQ等框架对Qwen1.5的支持。 17 | 18 | Qwen1.5显著提升了聊天模型与人类偏好的一致性,并且改善了它们的多语言能力。所有模型提供了统一的上下文长度支持,支持32K上下文, 基础语言模型的质量也有所改进。 19 | 20 | 21 | 22 | ## 模型 23 | 24 | ``` 25 | git clone https://www.modelscope.cn/qwen/Qwen1.5-0.5B.git 26 | git clone https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct 27 | ``` 28 | 29 | 30 | 31 | ## 代码 32 | 33 | - https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2/modeling_qwen2.py 34 | 35 | 36 | 37 | ## 模型结构 38 | 39 | 40 | https://blog.csdn.net/fan_fan_feng/article/details/138978901 41 | -------------------------------------------------------------------------------- /llm-algo/qwen/参数说明及函数说明.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - 详细解释:https://kimi.moonshot.cn/chat/cvll5lp3om1s14tgq990 4 | 5 | 6 | 7 | initializer_range: 8 | 9 | ``` 10 | initializer_range是一个参数,用于指定初始化权重矩阵时使用的 truncated_normal_initializer 的标准差。它的默认值是 0.02,并且是一个可选的浮点数(float)。 11 | 12 | 默认值为 0.02,这意味着初始化权重时,权重值会从一个均值为 0、标准差为 0.02 的截断正态分布中采样。 13 | 14 | 15 | 截断正态分布可以避免权重值过大或过小,从而帮助模型更快地收敛。它是一种常见的初始化方法,尤其是在深度学习中。 16 | 17 | ``` 18 | 19 | rms_norm_eps: 20 | 21 | ``` 22 | rms_norm_eps 是一个参数,用于 RMSNorm(Root Mean Square Normalization)层中的数值稳定化。它的默认值是 1e-06,并且是一个可选的浮点数(float)。 23 | 24 | RMSNorm 是一种归一化方法,用于稳定神经网络的训练过程。 25 | 它通过计算输入的均方根(Root Mean Square, RMS)值来调整输入的尺度。 26 | 27 | 28 | 29 | 30 | ``` 31 | -------------------------------------------------------------------------------- /llm-algo/qwen2.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://qwenlm.github.io/zh/blog/qwen2/ -------------------------------------------------------------------------------- /llm-algo/t5/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/ZhuiyiTechnology/t5-pegasus 5 | - https://github.com/renmada/t5-pegasus-pytorch 6 | - T5 PEGASUS:开源一个中文生成式预训练模型: https://zhuanlan.zhihu.com/p/359509608 7 | 8 | 9 | ## LCSTS_new 10 | 11 | - LCSTS_new中文短摘要生成数据集:https://www.luge.ai/#/luge/dataDetail?id=10 12 | 13 | 生成式短摘要数据集,以微博原文为输入,1~2句话的短摘要为输出 14 | 15 | 16 | LCSTS_new是中文短摘要最常用的LCSTS短摘要数据集的升级版本,在数据量、质量方面均有显著提升,在信息摘要与提炼的过程中,与原文的事实一致性需要得到重点关注。 17 | 18 | 19 | ``` 20 | { 21 | "id": 6, 22 | "summary": "中国游客大增多国放宽签证", 23 | "content": "①北京和上海户籍的游客可获得韩国多次签证;②“整容客”可以不经由韩国使领馆、直接在网上申请签证;③中泰免签的实施日期尚未敲定;④越南已向中国持通行证旅游的公民全面开放。" 24 | } 25 | ``` 26 | 27 | 28 | ## AdvertiseGen 29 | 30 | - AdvertiseGen广告文案生成数据集: 
https://www.luge.ai/#/luge/dataDetail?id=9 31 | 32 | ``` 33 | { 34 | "content": "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞", 35 | "summary": "简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。" 36 | } 37 | ``` -------------------------------------------------------------------------------- /llm-algo/transformer/README.md : -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 阐述了FFN层的几个问题, 5 | 6 | 第一,是FFN层参数量巨大,大概占了整个模型参数量的三分之二左右。 7 | 8 | 第二,FFN层的激活存在较大的稀疏性,也就是针对某些问题的输入,FFN层只有部分的参数是有用的。 -------------------------------------------------------------------------------- /llm-algo/transformer/multi-head-attention.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/transformer/multi-head-attention.webp -------------------------------------------------------------------------------- /llm-algo/transformer/transformer-building-blocks.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/transformer/transformer-building-blocks.webp -------------------------------------------------------------------------------- /llm-algo/transformer/w-qkv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-algo/transformer/w-qkv.png -------------------------------------------------------------------------------- /llm-algo/基本概念.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## 激活函数 6 | 7 | 8 | 9 | ReLU 10 | 11 | 12 | GeLU 13 | 14 | SwiGLU(Yi) 15 | 16 | 17 | 18 | ## 位置编码 19 | 20 | 21 | RoPE 22 | 23 | 24 | 25 | ## 优化器 26 | 27 | 28 | Adam 29 | 30 | 31 | 32 | AdamW 33 | 34 | OneBitAdam 35 | 36 | Lamb 37 | 38 | OneBitLamb 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /llm-algo/训练范式.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [NLP的四个范式](https://zhuanlan.zhihu.com/p/456951972) 4 | 5 | 6 | 7 | 训练范式 8 | 9 | ## 非神经网络时代的完全监督学习(特征工程) 10 | 11 | 提取出自然语言语料库中的一些特征,利用特定的规则或数学、统计学的模型来对特征进行匹配和利用,进而完成特定的NLP任务。 12 | 13 | 14 | ## 第二范式:基于神经网络的完全监督学习(架构工程) 15 | 16 | 不用手动设置特征和规则,节省了大量的人力资源,但仍然需要人工设计合适的神经网路架构来对数据集进行训练。 17 | 18 | 19 | ## 第三范式:预训练,精调范式(目标工程) 20 | 21 | 先在大的无监督数据集上进行预训练,学习到一些通用的语法和语义特征,然后利用预训练好的模型在下游任务的特定数据集上进行fine-tuning,使模型更适应下游任务。 22 | 23 | 24 | 特点是不需要大量的有监督下游任务数据,模型主要在大型无监督数据上训练,只需要少量下游任务数据来微调少量网络层即可。 25 | 26 | 27 | 28 | ## 预训练,提示,预测范式(Prompt工程) 29 | 30 | 将下游任务的建模方式重新定义,通过合适的prompt来实现直接在预训练模型上解决下游任务,这种模式需要极少量(甚至不需要)下游任务数据,使得小样本、零样本学习成为可能。 31 | 32 | 33 | -------------------------------------------------------------------------------- /llm-alignment/DPO.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 给 NLP 引入了一些 RL 相关的复杂性: 既要构建一个好的奖励函数,并训练一个模型用以估计每个状态的价值 (value) ; 又要注意最终生成的 LLM 不能与原始模型相差太远,如果太远的话会使得模型容易产生乱码而非有意义的文本。该过程非常复杂,涉及到许多复杂的组件,而这些组件本身在训练过程中又是动态变化的,因此把它们料理好并不容易。 5 | 6 | 7 | 8 | Direct Preference Optimization,论文提出将现有方法使用的基于强化学习的目标转换为可以通过简单的二元交叉熵损失直接优化的目标,这一做法大大简化了 LLM 的提纯过程。 9 | 10 | 11 | 12 | 13 | DPO 与 PPO 14 | 15 | 在通过 RL 优化人类衍生偏好时,一直以来的传统做法是使用一个辅助奖励模型来微调目标模型,以通过 RL 机制最大化目标模型所能获得的奖励。 16 | 
直观上,我们使用奖励模型向待优化模型提供反馈,以促使它多生成高奖励输出,少生成低奖励输出。 17 | 同时,我们使用冻结的参考模型来确保输出偏差不会太大,且继续保持输出的多样性。 18 | 这通常需要在目标函数设计时,除了奖励最大化目标外再添加一个相对于参考模型的 KL 惩罚项,这样做有助于防止模型学习作弊或钻营奖励模型。 19 | 20 | 21 | DPO 绕过了建模奖励函数这一步,这源于一个关键洞见: 22 | 从奖励函数到最优 RL 策略的解析映射。这个映射直观地度量了给定奖励函数与给定偏好数据的匹配程度。 23 | 有了它,作者就可与将基于奖励和参考模型的 RL 损失直接转换为仅基于参考模型的损失,从而直接在偏好数据上优化语言模型! 24 | 因此,DPO 从寻找最小化 RLHF 损失的最佳方案开始,通过改变参量的方式推导出一个仅需参考模型的损失! 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /llm-alignment/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - [关于Instruct GPT复现的一些细节与想法](https://zhuanlan.zhihu.com/p/609078527) 6 | - [人人都能看懂的PPO原理与源码解读](https://zhuanlan.zhihu.com/p/677607581) 7 | - [MOSS-RLHF](https://github.com/OpenLMLab/MOSS-RLHF) 8 | - [模型调优(RLHF/DPO/ORPO)- 终极指南](https://zhuanlan.zhihu.com/p/692594519) 9 | - [DPO: Direct Preference Optimization 论文解读及代码实践](https://zhuanlan.zhihu.com/p/642569664) 10 | - [强化学习入门:基本思想和经典算法](https://imzhanghao.com/2022/02/10/reinforcement-learning/) 11 | - [动手学强化学习](https://hrl.boyuai.com/chapter/intro) 12 | 13 | 14 | ORPO: 15 | - ORPO: Monolithic Preference Optimization without Reference Model 16 | - https://github.com/xfactlab/orpo 17 | - https://arxiv.org/pdf/2307.12966.pdf 18 | 19 | 20 | -------------------------------------------------------------------------------- /llm-alignment/RLHF.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | RLHF 11 | 12 | 13 | 14 | 15 | 16 | 收集人类偏好数据的质量和数量决定了 RLHF 系统性能的上限。RLHF 系统需要两种人类偏好数据:人工生成的文本和对模型输出的偏好标签。生成高质量回答需要雇佣兼职人员 (而不能依赖产品用户和众包) 。 17 | 另一方面,训练 RM 需要的奖励标签规模大概是 50k 左右,所以并不那么昂贵 (当然远超了学术实验室的预算) 。 18 | 另一个挑战来自标注者的偏见。几个人类标注者可能有不同意见,导致了训练数据存在一些潜在差异。 -------------------------------------------------------------------------------- /llm-application/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-application/.DS_Store -------------------------------------------------------------------------------- /llm-application/Higress.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /llm-application/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [@liguodongiot](https://github.com/liguodongiot) 5 | 6 | 7 | 8 | 9 | 10 | - GPT-4+Midjourney零代码做「愤怒的南瓜」 11 | 12 | 13 | 14 | 15 | LLM Powered Autonomous Agents:https://lilianweng.github.io/posts/2023-06-23-agent/ 16 | 17 | 18 | 19 | 20 | ## Vector DB 21 | 22 | ``` 23 | simCSE相似度的模型 24 | SentenceTransformers 25 | ``` 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /llm-application/embbedding-model.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 智源的工作bge,llm-embedder 6 | 7 | 段落文本用sentence bert类 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-application/gradio/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 启动参数: 5 | - https://github.com/gradio-app/gradio/blob/main/gradio/blocks.py 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- 
/llm-application/langchain/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-application/langchain/.DS_Store -------------------------------------------------------------------------------- /llm-application/langchain/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://python.langchain.com/docs/tutorials/llm_chain/ 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /llm-application/langchain/tutorials/client.py: -------------------------------------------------------------------------------- 1 | from langserve import RemoteRunnable 2 | 3 | 4 | remote_chain = RemoteRunnable("http://localhost:8000/chain/") 5 | 6 | remote_chain.invoke({"language": "italian", "text": "hi"}) 7 | -------------------------------------------------------------------------------- /llm-application/one-api.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | https://github.com/songquanpeng/one-api 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /llm-application/pre-post-handle/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## COLDataset 7 | 8 | 中文冒犯语言检测数据集 9 | 10 | - https://github.com/thu-coai/COLDataset 11 | 12 | 13 | ## 敏感词检测 14 | 15 | 16 | 17 | ### sensitive-word 18 | 19 | 基于 DFA 算法实现的高性能 java 敏感词过滤工具框架。 20 | 21 | https://github.com/houbb/sensitive-word 22 | 23 | 24 | 25 | 26 | 27 | ### 敏感词及违禁词检测 28 | 29 | https://www.volcengine.com/theme/1563016-M-7-1 30 | 31 | 32 | 一、基于关键词匹配的敏感词检测算法 33 | 二、基于正则表达式的敏感词检测算法 34 | 35 | 36 | 37 | 38 | ### DFA敏感词过滤算法详解 39 | 40 | https://blog.csdn.net/java_eiji/article/details/127354207 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /llm-application/rag/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 大模型主流应用RAG的介绍——从架构到技术细节:https://luxiangdong.com/2023/09/25/ragone/#/%E5%86%99%E5%9C%A8%E5%89%8D%E9%9D%A2 5 | - 从 RAG 到 Self-RAG —— LLM 的知识增强:https://zhuanlan.zhihu.com/p/661465330 6 | - 大模型检索增强生成(RAG)有哪些好用的技巧?:https://www.zhihu.com/question/625481187 7 | - The limits of LLMs and how RAG remedies them:https://konfuzio.com/en/limits-llms-retrieval-augmented-generation/ 8 | 9 | 10 | 11 | 12 | 13 | 14 | ----- 15 | 16 | 现在越来越多的实践者开始意识到一套向量数据库打天下的方案已经不够了,于是有了各种花式疗法,从构建索引到回复生成,可谓百花齐放,眼花缭乱: 17 | 18 | 内容切片不够好,容易切碎,于是有了段落智能划分; 19 | 20 | 向量生成的质量不可控,于是有了可根据不同QA场景动态生成向量的Instructor; 21 | 22 | 隐式的动态向量不够过瘾,再用HyDE做个中间层:先生成一些虚拟文档/假设文档再做召回,提升召回率; 23 | 24 | 如果向量这一路召回不够,再上关键词召回,传统BM25+向量HNSW融合各召回通路; 25 | 26 | 召回的太多容易干扰答案生成,探究一下Lost in the Middle,搞一搞trick,或者用LLMLingua压缩; 27 | 28 | 嫌召回太麻烦?直接扩到100k窗口全量怼进大模型,LongLoRA横空出世; 29 | 30 | 刚才提到的各个环节需要改进的点太多,懒得手工做,直接交给大模型,用Self-RAG替你完成每个步骤…… 31 | 32 | 33 | 34 | 35 | 36 | 37 | ## 密集检索和检索增强LLM 38 | 39 | - https://github.com/FlagOpen/FlagEmbedding/tree/master -------------------------------------------------------------------------------- /llm-application/rag/embedding.md: -------------------------------------------------------------------------------- 1 | 2 | Text embedding就是将文本转成一组固定维度的向量表示。我们所熟知的word embedding是以token为基本单位,而text embedding则是以文本为基本单位的。 3 | 4 | **理想的text 
embedding应该尽可能保留文本的语义信息,相同语义但不同表述方式的文本可以被映射到同一个位置,而不同语义的文本在向量空间应该保持对应的距离。** 5 | 6 | 7 | Text embedding能用来做些什么?它可以用在Classification, Clustering, Pair Classification, Reranking, Retrieval, STS, Summarization, Bitext Mining等任务中。如何评价它的好坏,也可以通过在这些任务的表现来评估。 8 | 9 | 10 | 如何从语言模型中获得Text embedding呢?以目前主流的Transformer-Encoder为例,有以下几种不同方式。 11 | 12 | 13 | a) 将模型最后一层[CLS]位置的向量表征直接作为句向量。 14 | b) 将模型最后一层[CLS]位置的向量表征,再经过MLP层得到的向量。 15 | c) 将模型最后一层所有位置的向量表征,再经过一个Pooling层得到的向量。(大部分情况下采用的是mean pooling,在有些情况下也会使用max pooling等其他方式) 16 | d) 将模型最后一层所有位置的向量表征,再经过一个Pooling层跟MLP层得到的向量。 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /llm-application/rag/存在的一些问题.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 主要原因还是: 4 | 5 | 1)哪怕是GPT3.5,幻觉依然严重,商业场景(如客服,研究)没法支持,用户满意度可怜。何况客户还要求私有化部署,还没有服务器. 6 | 7 | 2)客户的问题很多是无法用RAG解决的: 8 | 9 | - 预测类:xxxx2024年会如何? 10 | 11 | - 多跳逻辑类:Elon musk的兄弟是谁?(知识库里只有Elon musk的妈妈,以及妈妈的儿子有谁,导致召回严重失败) 12 | 13 | - 统计类:昨天有多少销售反馈了折扣力度不足的问题? 14 | 15 | - 逻辑推理 16 | 17 | - “太有”良知问题:很多模型都很有良知,无法匹配商业中的无良场景(别笑)。比如GPT会非常忠诚地指出,化妆品里的xx因子是没有经过科学验证的,要谨慎对待。 -------------------------------------------------------------------------------- /llm-application/rag/方案.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | rerank: 6 | - https://huggingface.co/BAAI/bge-m3 7 | - https://huggingface.co/BAAI/bge-reranker-v2-m3 8 | 9 | 10 | 11 | 12 | embedding: 13 | 14 | - https://huggingface.co/intfloat/multilingual-e5-large 15 | 16 | 17 | 18 | ``` 19 | import torch 20 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 21 | 22 | tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3') 23 | model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3') 24 | model.eval() 25 | 26 | pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']] 27 | with torch.no_grad(): 28 | inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512) 29 | scores = model(**inputs, return_dict=True).logits.view(-1, ).float() 30 | print(scores) 31 | ``` 32 | -------------------------------------------------------------------------------- /llm-application/vector-db/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 词搜索 ---> 语义搜索 6 | 7 | 8 | 语义相似度 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-application/vector-db/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 向量数据库(第 1 部分):每个数据库有何不同?:https://www.modb.pro/db/1694527960317513728 5 | 6 | Vespa是最早(2017年)在当时主导的基于BM25关键词搜索算法旁边引入向量相似性搜索的供应商之一(有趣的事实:Vespa的GitHub仓库现在已经有近7.5万次提交)。Weaviate紧随其后,在2018年底推出了一个开源的专用向量搜索数据库,到2019年,我们开始看到更多的竞争者进入这个领域,其中包括开源的Milvus。需要注意的是,时间线中还显示了Zilliz,但它没有单独列出,因为它是Milvus的(商业)母公司,并提供基于Milvus构建的完全托管的云解决方案。在2021年,又有三家新的供应商进入了这个领域:Vald、Qdrant和Pinecone。而Elasticsearch、Redis和PostgreSQL等现有供应商在此之前明显缺席,并且直到2022年及以后才开始提供向量搜索,比人们预期的要晚得多。 7 | 8 | - 7个向量数据库对比:Milvus、Pinecone、Vespa、Weaviate、Vald、GSI 和 Qdrant:https://www.modb.pro/db/516016 9 | -------------------------------------------------------------------------------- /llm-application/应用场景.md: -------------------------------------------------------------------------------- 1 | 2 
| 3 | 4 | 5 | 文生图: 6 | - Stable Diffusion 7 | - 文心一格:https://yige.baidu.com/creation?mode=0 8 | 9 | 图生文: 10 | - Blip2 11 | 12 | 13 | 14 | 15 | 16 | 17 | 数字人 18 | - 百度智能云曦灵数字人:https://xiling.cloud.baidu.com/main/plaza/portrait 19 | 20 | 21 | 22 | AI教研平台 23 | 24 | 25 | 音乐生成模型: Suno V3 Alpha 26 | 27 | 弊端就是Suno最多只能生成2分钟的音乐,所以可以听到最后,会戛然而止直接截断,但是已经比V2好很多了。 28 | 29 | 但是这个音质、咬字、节奏编排啥的,也都好太多太多了。 30 | 31 | https://app.suno.ai/ 32 | 33 | 要生成音乐的话,第一步肯定是写prompt,第二步(纯音乐没有)就是写歌词。 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /llm-compression/PaddleSlim/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | - paddle.fluid: https://github.com/PaddlePaddle/Paddle/issues/55108 8 | 9 | 10 | ``` 11 | from paddle.base.framework import IrGraph 12 | from paddle.base.layer_helper import LayerHelper 13 | ``` 14 | 15 | 16 | -------------------------------------------------------------------------------- /llm-compression/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/ 6 | - The Transformer Family Version 2.0 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-compression/distillation/GKD.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 图 1:在 XSum summarization 上比较 GKD 与常见的蒸馏方法。 我们使用经过监督微调(FT)训练的 T5 模型作为蒸馏的学生模型。 5 | 6 | 有监督的 KD 和有监督的 FT 使用带有ground-truth摘要的 XSum 训练数据集,但 KD 可以查询教师以获得概率,而 FT 则不能。 7 | 8 | 此外,on-policy 方法来自学生的样本摘要,而'Mixed'是指从真实情况和学生生成的摘要中进行统一采样。 9 | 10 | ImitKD 对应于使用 Forward KL 的'Mixed'采样。 正如第 2 节中所讨论的,具有反向 KL 和广义 JS 散度 (JSD) 的 GKD 优于其他方法。 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | --- 20 | 21 | 22 | 23 | 用于自回归序列模型的广义知识蒸馏 24 | 25 | 26 | 27 | 动机: 28 | 29 | 当前的自回归模型知识蒸馏方法存在两个关键问题: 30 | 1. 训练期间的输出序列与学生在部署期间生成的序列之间的分布不匹配; 31 | 2. 
模型规格不足,即学生模型可能无法表达教师的分布。 32 | 33 | 为了解决这些问题,本文提出广义知识蒸馏(GKD)。 34 | 35 | 方法: 36 | 37 | 通过在训练期间从学生中采样输出序列来缓解分布不匹配。此外,GKD通过优化替代差异,如反向KL,这些差异专注于生成来自学生的样本,这些样本 38 | 在教师的分布下可能。 39 | 40 | 本文证明GKD在摘要、机器翻译和算术推理任务上超越了常用的LLM蒸馏方法。 41 | 42 | 优势:所提出方法在自然语言生成任务上的表现一直优于更常用的知识蒸馏基线。进一步展示了该方法可以与强化学习结合,以优化序列级奖励,除了蒸馏大型教师模型的知识。 43 | 44 | 45 | 46 | 提出一种名为广义知识蒸馏(GKD)的方法,可以解决自回归模型在知识蒸馏过程中面临的分布不匹配和模型规格不足的问题,在自然语言生成任务上的表现一直优于更常用的知识蒸馏基线。 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /llm-compression/distillation/MINILLM.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## MINILLM 4 | 5 | 6 | 论文:Knowledge Distillation of Large Language Models 7 | 8 | https://github.com/microsoft/LMOps/tree/main/minillm 9 | 10 | 11 | 12 | 13 | 14 | 通过 Rouge-L 进行挑选。 15 | 16 | 17 | 18 | 19 | 20 | 动机: 21 | 22 | 解决大型语言模型(LLM)的高计算需求问题,通过知识蒸馏(KD)技术减小模型规模。现有的KD方法主要适用于白盒分类模型或训练小模型模仿黑盒模型API(如ChatGPT),如何有效地从白盒生成LLM中蒸馏知识仍然不够充分。 23 | 24 | 方法: 25 | 26 | 提出名为MINILLM的新方法,能从生成式大型语言模型中蒸馏出较小的语言模型。首先将标准KD方法中的前向Kullback-Leibler散度(KLD)目标替换为更适合在生成语言模型上进行KD的反向KLD,以防止学生模型高估教师分布的低概率区域。然后,推导出一种有效的优化方法来学习此目标。 27 | 28 | 优势: 29 | 30 | 提出一种从生成式LLM中蒸馏较小语言模型的方法,通过改进KD方法,实现了更高的生成质量、更低的暴露偏差、更好的校准性和更高的长文本生成性能。该方法具有可扩展性,适用于不同规模的模型。 31 | 32 | 33 | 34 | 提出了一种MINILLM方法,通过改进知识蒸馏方法,实现了从大型语言模型到小型模型的知识传递,提升了生成质量和性能。 35 | 36 | 37 | -------------------------------------------------------------------------------- /llm-compression/distillation/SCOTT.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | --- 7 | 8 | 为了保证论据的准确性,SCOTT(论文:**SCOTT: Self-Consistent Chain-of-Thought Distillation**)采用对比解码,将每个论据与答案联系起来。它激励来自老师模型正确的逻辑依据。此外,引导学生进行反事实推理,并根据导致不同答案的逻辑依据进行预测。 9 | 10 | 11 | 12 | --- 13 | 14 | 图 2:我们用于可信推理(faithful reasoning)的知识蒸馏框架概述。 15 | 16 | (a) 教师:大型 LM 提示通过对比解码在给定问题和训练集中的黄金答案的情况下生成一致的基本原理。 17 | 18 | (b) 学生:一个小型 LM,经过微调以生成基本原理,然后通过反事实推理进行回答。 19 | 20 | 21 | 一种忠实的知识蒸馏方法,用于从大几个数量级的教师模型中学习小型、自洽的 CoT 模型。 22 | 23 | 为了形成更好的监督,我们通过对比解码从大型 LM(老师)那里引出支持黄金答案的理由,这鼓励老师生成只有在考虑答案时才变得更加可信的Token。 24 | 25 | 为了确保可信的蒸馏,我们使用教师生成的基本原理来学习具有反事实推理目标的学生 LM,这可以防止学生忽略基本原理而做出不一致的预测。 26 | 27 | 28 | 实验表明,在产生可比较的最终任务性能的同时,我们的方法可以生成比基线更可靠的 CoT 基本原理。 进一步分析表明,这样的模型在决策时更尊重理性; 因此,我们可以通过完善其原理来进一步提高其性能。 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /llm-compression/gptqmodel/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | git clone git@github.com:liguodongiot/GPTQModel.git 5 | 6 | 7 | git remote add upstream git@github.com:ModelCloud/GPTQModel.git 8 | 9 | 10 | # 拉取原始仓库数据 11 | git fetch upstream --tags 12 | 13 | # 如果你的主分支不是叫master,就把前面的master换成你的名字,比如main之类 14 | git rebase upstream/main 15 | 16 | # 推送 17 | git push 18 | 19 | # 推送tags 20 | git push --tags 21 | 22 | 23 | 24 | 25 | 26 | 27 | git checkout -b dev-code-v2.0.0 v2.0.0 28 | 29 | # 将新分支推送到远程仓库 30 | git push -u origin dev-code-v2.0.0 -------------------------------------------------------------------------------- /llm-compression/llm-compressor/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | git clone git@github.com:liguodongiot/llm-compressor.git 6 | 7 | 8 | git remote add upstream git@github.com:vllm-project/llm-compressor.git 9 | 10 | 11 | # 拉取原始仓库数据 12 | git fetch upstream --tags 13 | 14 | # 如果你的主分支不是叫master,就把前面的master换成你的名字,比如main之类 15 | git rebase upstream/main 16 | 17 | # 推送 18 | git 
push 19 | 20 | # 推送tags 21 | git push --tags 22 | 23 | ``` 24 | 25 | 26 | ## llm-compressor 27 | 28 | 支持的量化类型: 29 | - https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py 30 | 31 | 32 | 33 | int8: 34 | https://github.com/vllm-project/llm-compressor/tree/main/examples/quantization_w8a8_int8 35 | 36 | fp8 dynamic: 37 | https://github.com/vllm-project/llm-compressor/tree/main/examples/quantization_w8a8_fp8 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /llm-compression/llm-compressor/source-code.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-compression/pruning/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## 结构化剪枝 7 | 8 | 9 | ### LLM-Shearing 10 | 11 | - https://github.com/princeton-nlp/LLM-Shearing 12 | - https://arxiv.org/pdf/2310.06694.pdf 13 | - https://xiamengzhou.github.io/sheared-llama/ -------------------------------------------------------------------------------- /llm-compression/quantization/GPTQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | LLM.int8() 和 SmoothQuant 都属于 round-to-nearest (RTN) 量化:舍入到最近的定点数。GPT-Q 则是把量化问题视作优化问题,逐层寻找最优的量化权重。 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /llm-compression/quantization/PEQA.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | PEQA 5 | 6 | 参数高效的微调(PEFT)方法已经出现,以减轻全面微调大型语言模型(LLM)的高昂成本。 尽管如此,法学硕士的庞大规模阻碍了常规部署。 为了解决这个问题,我们提出了参数高效和量化感知适应(PEQA),这是一种新颖的量化感知 PEFT 技术,可以促进模型压缩并加速推理。 PEQA 通过双阶段过程运行:最初, 7 | 每个全连接层的参数矩阵经过量化为低位整数矩阵和标量向量; 随后,对每个下游任务的标量向量进行微调。 这种策略大大压缩了模型的大小,从而降低了部署时的推理延迟并减少了所需的总体内存。 同时,快速微调和高效的任务切换成为可能。 通过这种方式,PEQA 提供了量化的好处,同时继承了 PEFT 的优点。 我们比较 8 | PEQA 在从自然语言理解到生成基准的综合实验中具有竞争性基准。 9 | 10 | 这是使用多达 650 亿个参数的大型语言模型完成的,展示了 PEQA 的可扩展性、特定于任务的适应性能以及遵循指令的能力,即使在极低位设置下也是如此。 -------------------------------------------------------------------------------- /llm-compression/quantization/QQQ-W4A8.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | git clone git@github.com:liguodongiot/QQQ.git 6 | 7 | ``` 8 | 9 | 10 | -------------------------------------------------------------------------------- /llm-compression/quantization/SpinQuant.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /llm-compression/quantization/fp4.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | https://github.com/nbasyl/LLM-FP4 6 | 7 | 8 | LLM-FP4: 4-Bit Floating-Point Quantized Transformers 9 | 10 | 11 | ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats -------------------------------------------------------------------------------- /llm-compression/quantization/fp8.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://arxiv.org/pdf/2209.05433 5 | 6 | FP8 FORMATS FOR DEEP LEARNING 7 | 8 | 9 | 
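结合这些 FP8 参考资料,可以先用几行 PyTorch 直观感受 E4M3 的量化/反量化误差(假设所用 PyTorch 版本已提供 torch.float8_e4m3fn 类型;scale 采用最简单的按张量 max-abs 方案,仅作示意):

```
import torch

def fp8_quant_dequant(x: torch.Tensor) -> torch.Tensor:
    # 按张量取 max-abs 计算 scale,把动态范围映射到 E4M3 的最大可表示值(约 448)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = x.abs().max().clamp(min=1e-12) / fp8_max
    x_fp8 = (x / scale).to(torch.float8_e4m3fn)   # 量化:cast 时完成舍入
    return x_fp8.to(torch.float32) * scale        # 反量化,便于对比误差

x = torch.randn(4, 1024)
print((x - fp8_quant_dequant(x)).abs().max())     # 观察最大量化误差
```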
10 | FP8 Quantization: The Power of the Exponent 11 | 12 | https://arxiv.org/pdf/2208.09225 13 | 14 | 15 | https://zhuanlan.zhihu.com/p/574825662 16 | 17 | FP8 量化-原理、实现与误差分析 18 | 19 | 20 | https://zhuanlan.zhihu.com/p/619431625 21 | 22 | FP8 量化基础 23 | 24 | 25 | 26 | https://developer.nvidia.com/zh-cn/blog/nvidia-gpu-fp8-training-inference/ 27 | 28 | 29 | -------------------------------------------------------------------------------- /llm-compression/quantization/llm-qat/cfd70ff/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | bash run_train.sh 8 8 8 6 | ``` 7 | -------------------------------------------------------------------------------- /llm-compression/quantization/llm-qat/cfd70ff/merge_gen_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | import json 9 | import os 10 | import random 11 | 12 | all_text = [] 13 | 14 | for i_start in range(8): 15 | if i_start % 10 == 0: 16 | print(i_start) 17 | for line in open("gen_data/gen.chunk."+str(i_start).zfill(2)+".jsonl", 'r'): 18 | all_text.append(json.loads(line)) 19 | 20 | 21 | with open("gen_data/all_gen.jsonl", "a") as f: 22 | for i in range(len(all_text)): 23 | f.write(json.dumps(all_text[i])) 24 | f.write('\n') 25 | -------------------------------------------------------------------------------- /llm-compression/quantization/llm-qat/cfd70ff/pip.conf: -------------------------------------------------------------------------------- 1 | # This file has been autogenerated or modified by NVIDIA PyIndex. 2 | # In case you need to modify your PIP configuration, please be aware that 3 | # some configuration files may have a priority order. Here are the following 4 | # files that may exists in your machine by order of priority: 5 | # 6 | # [Priority 1] Site level configuration files 7 | # 1. `/usr/pip.conf` 8 | # 9 | # [Priority 2] User level configuration files 10 | # 1. `/root/.config/pip/pip.conf` 11 | # 2. `/root/.pip/pip.conf` 12 | # 13 | # [Priority 3] Global level configuration files 14 | # 1. `/etc/pip.conf` 15 | # 2. 
`/etc/xdg/pip/pip.conf` 16 | 17 | [global] 18 | no-cache-dir = true 19 | index-url = https://pypi.org/simple 20 | extra-index-url = 21 | https://pypi.ngc.nvidia.com 22 | trusted-host = 23 | pypi.ngc.nvidia.com 24 | -------------------------------------------------------------------------------- /llm-compression/quantization/llm-qat/log.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 训练数据来源: 5 | 6 | 方案一:使用开源数据集。 7 | 8 | 方案二:使用LLM合成数据。 9 | 10 | 11 | 20230803: 12 | 13 | 目前,代码中支持使用开源的公开数据集。 14 | 15 | 20230814:(commit-id: f4d873a) 16 | 17 | 目前,代码中支持使用LLM合成数据。 18 | 19 | 20 | 21 | 22 | 23 | 目前存在的问题: 24 | - 未提供模型评估和模型推理的代码。仅有在模型训练过程中,使用Huggingface的evaluate方法进行评估,相当于仅能评估模型效果,不能评估推理速度。 25 | - 代码没办法直接运行,需要自行修改源码才能启动。 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /llm-compression/quantization/tools.md: -------------------------------------------------------------------------------- 1 | tools.md -------------------------------------------------------------------------------- /llm-compression/sparsity/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wanda 6 | 7 | 8 | https://arxiv.org/abs/2306.11695 9 | 10 | 11 | 12 | SparseGPT 13 | https://arxiv.org/pdf/2301.00774 14 | 15 | 16 | 17 | Outlier Weighed Layerwise Sparsity (OWL) 18 | 19 | https://arxiv.org/pdf/2310.05175 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /llm-compression/tools.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/vllm-project/llm-compressor/ 5 | - https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_choosing_quant_methods.html 6 | 7 | 8 | - https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/deepseek/README.md 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-compression/经验.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | MoE gate 层对量化敏感,通常不进行量化以便保持完全精度。 6 | 7 | 8 | -------------------------------------------------------------------------------- /llm-data-engineering/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | - [Best Practices and Lessons Learned on Synthetic Data for Language Models](https://arxiv.org/pdf/2404.07503.pdf) 8 | - 语言模型合成数据的最佳实践和经验教训 9 | - Google DeepMind -------------------------------------------------------------------------------- /llm-data-engineering/dataset/baichuan2.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - 4 | 5 | 在数据采集过程中,为了数据的全面性和代表性,从多个来源进行数据收集,包括但不限于网页、书籍、研究论文、代码等,各类别数据分布如下所示。 6 | 7 | 8 | 9 | 并且对数据进行清洗,如下图所示,主要关注数据的频率和质量:数据频率:借助LSH-like和Embedding特征对数据进行聚类和去重,主要是对每个聚类的簇给文档、段落、句子进行去重和打分,分值用于用于最终的数据采样。数据质量:句子级别质量过滤,但未说明明确过滤规则。不过从下面模型安全部分可以得知,对数据进行了暴力、色情、种族歧视、仇恨言论等有害内容过滤,但应该还包含其他内容。 10 | 11 | PS:报告中没有给出过滤后数据采样比例&数据分布情况,比较遗憾。但从垂域效果来看,医疗和法律数据应该不会少,并且从数据本身质量来看,书籍&论文数据的采样率应该也会比较高。 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-data-engineering/dataset/chinese-corpus-all.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-data-engineering/dataset/chinese-corpus-all.md -------------------------------------------------------------------------------- /llm-data-engineering/dataset/english-corpus-all.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 开源数据集: 4 | 5 | 6 | - TigerBot:https://github.com/TigerResearch/TigerBot 7 | - BELLE:https://github.com/LianjiaTech/BELLE 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-data-engineering/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /llm-data-engineering/sft-dataset/evol-instruct.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## Evol-Instruct 7 | 8 | 使用 LLM 而不是人类,自动大规模生产各种难度级别的开放域指令的新方法,以提高 LLM 的性能。 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-data-engineering/sft-dataset/jinja.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-data-engineering/sft-dataset/jinja.md -------------------------------------------------------------------------------- /llm-eval/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/.DS_Store -------------------------------------------------------------------------------- /llm-eval/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://evalscope.readthedocs.io/zh-cn/latest/user_guides/backend/opencompass_backend.html 7 | - https://github.com/open-compass/opencompass 8 | 9 | 10 | 11 | 12 | ## 测评集 13 | 14 | - Humaneval:代码生成 15 | - CommonSenseQA:知识问答 16 | - MBPP:代码 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /llm-eval/eval-data/longtext_L115433-question.txt: -------------------------------------------------------------------------------- 1 | 第58届全球帅哥竞选活动有什么特别之处? 
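eval-data 目录下是成对的「问题 / 参考答案」长文本样本,可以配合一个简单的字符级 Rouge-L 打分脚本做粗粒度的回归对比(纯手写 LCS 实现,仅作示意;开放性任务的最终评估仍建议人工打分或 Elo 对比):

```
def rouge_l(candidate: str, reference: str) -> float:
    # 字符级 LCS,对中文做粗粒度对比即可
    m, n = len(candidate), len(reference)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            dp[i + 1][j + 1] = dp[i][j] + 1 if candidate[i] == reference[j] \
                else max(dp[i][j + 1], dp[i + 1][j])
    lcs = dp[m][n]
    if lcs == 0:
        return 0.0
    precision, recall = lcs / m, lcs / n
    return 2 * precision * recall / (precision + recall)

# 示例:把模型输出与参考答案逐条对比
print(round(rouge_l("第58届全球帅哥竞选于2008年在南非约翰尼斯堡举行",
                    "第58届全球帅哥竞选于2008年12月13日在南非约翰尼斯堡山敦会议中心举行"), 3))
```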
-------------------------------------------------------------------------------- /llm-eval/eval-data/longtext_L32503_answer.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 豆包 4 | 5 | 第58届全球帅哥竞选于2008年12月13日在南非约翰尼斯堡山敦会议中心举行。其特别之处如下: 6 | - **举办地点变更**:原计划在乌克兰基辅举办,但因乌克兰有受2008年南奥塞梯战争影响的可能,比赛地点移至约翰尼斯堡。 7 | - **参赛人数最多**:有109位来自世界各地的选手参赛,是全球帅哥竞选有记录以来参加人数最多的一届。 8 | - **比赛项目多样**:包括沙滩比赛(11月29日在南非德班比佛利山酒店举行,前25名由评判于11月27日选拔)、模特儿比赛(12月3日在南非索韦托西苏卢贡献广场举行,32名选手入围)、体能比赛(12月5日在南非豪登省举行)、才艺比赛(12月7日在南非约翰尼斯堡山敦会议中心举行)和心智比赛(12月13日在南非约翰尼斯堡山敦会议中心现场公布赛果)。 9 | - **冠军事迹感人**:冠军在成为特立尼达和多巴哥小姐后,加入“无私付出”非牟利组织。该组织由一对孩子因罕见癌症离世的父母创立,旨在帮助有孩子需要支援的家庭。冠军还为改善本国儿童癌症治疗环境做出诸多贡献,如募集10万美元捐赠给英语系哥伦比亚的小儿科,改善当地医院被长期忽视的单位。 10 | - **评委阵容强大**:评委团由8位知名人士组成。 11 | 12 | 13 | ## GLM-4-Plus 14 | 15 | 根据文章38,第58届全球帅哥竞选活动有以下特别之处: 16 | 17 | 比赛地点原定在乌克兰首都基辅举行,但由于乌克兰可能受到2008年南奥塞梯战争影响,比赛地点最终改为南非约翰尼斯堡。 18 | 19 | 参加人数创下全球帅哥有记录以来最多,共有109位来自世界各地的选手参加。 20 | 21 | 比赛包含多个项目,包括沙滩比赛、模特儿比赛、体能比赛、才艺比赛和心智比赛等。 22 | 23 | 冠军由2007年度全球帅哥冠军张梓琳为俄罗斯的肯笙妮雅·苏恩诺娃加冕。 24 | 25 | 比赛评委由8位知名人士组成。 26 | 27 | 冠军得主在获奖后加入了新成立的非牟利组织,名为“无私付出”,致力于帮助有需要的家庭。 28 | 29 | 这是全球帅哥比赛有史以来参加人数最多的一届,具有里程碑意义。 -------------------------------------------------------------------------------- /llm-eval/eval-data/longtext_L64031_question.txt: -------------------------------------------------------------------------------- 1 | 第58届全球帅哥竞选活动有什么特别之处? -------------------------------------------------------------------------------- /llm-eval/evalscope_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/evalscope_framework.png -------------------------------------------------------------------------------- /llm-eval/llm-performance/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/.DS_Store -------------------------------------------------------------------------------- /llm-eval/llm-performance/AI芯片性能.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 模型算力利用率(MFU)+硬件算力利用率(HFU) 7 | 8 | 模型算力利用率(Model FLOPs Utilization, MFU)和硬件算力利用率(Hardware FLOPs Utilization, HFU)是评估某一模型实现对芯片计算性能利用情况的常用指标 9 | 10 | 简单来说: 11 | 12 | 模型算力利用率是指模型一次前反向计算消耗的矩阵算力与机器算力的比值 13 | 硬件算力利用率是指考虑重计算后,模型一次前反向计算消耗的矩阵算力与机器算力的比值 14 | 注:FLOPs指浮点运算次数,FLOPS指每秒的浮点运算次数 15 | 16 | 17 | 18 | 19 | - 模型算力利用率(MFU)+硬件算力利用率(HFU):https://zhuanlan.zhihu.com/p/671537309 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /llm-eval/llm-performance/hardware-performance/pynvml-stat-memory.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import pynvml 3 | import time 4 | 5 | 6 | UNIT = 1024 * 1024 7 | 8 | pynvml.nvmlInit() #初始化 9 | 10 | ids = [3, 7] 11 | 12 | max_mem_dict = { 13 | 14 | } 15 | 16 | while True: 17 | for i in ids: 18 | max_mem = max_mem_dict.get(str(i), 0) 19 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 20 | memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle) 21 | 22 | used_mem = memoryInfo.used/UNIT 23 | if used_mem > max_mem: 24 | max_mem_dict[str(i)] = used_mem 25 | print(max_mem_dict) 26 | time.sleep(5) 27 | 28 | pynvml.nvmlShutdown() -------------------------------------------------------------------------------- 
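上面的脚本只统计各卡的显存峰值;如果还想同时观察 GPU 利用率,可以在同一个句柄上追加查询(nvmlDeviceGetUtilizationRates 为 NVML 的标准接口),示意如下:

```
import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)   # 以 0 号卡为例

for _ in range(3):
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("GPU利用率:", util.gpu, "%, 已用显存:", mem.used // (1024 * 1024), "MB")
    time.sleep(5)

pynvml.nvmlShutdown()
```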
/llm-eval/llm-performance/mindie/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/mindie/.DS_Store -------------------------------------------------------------------------------- /llm-eval/llm-performance/mindie/locust-lantency-throughput/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/mindie/locust-lantency-throughput/.DS_Store -------------------------------------------------------------------------------- /llm-eval/llm-performance/mindie/locust-lantency-throughput/hello.py: -------------------------------------------------------------------------------- 1 | 2 | # Locust用户脚本就是Python模块 3 | import time 4 | from locust import HttpUser, task, between 5 | 6 | # 定义用户行为 7 | # 类继承自HttpUser 8 | class QuickstartUser(HttpUser): 9 | 10 | # 被@task装饰的才会并发执行 11 | @task 12 | def hello_world(self): 13 | # client属性是HttpSession实例,用来发送HTTP请求 14 | self.client.get("/v1/models") 15 | -------------------------------------------------------------------------------- /llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen-14b-chart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen-14b-chart.jpg -------------------------------------------------------------------------------- /llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen-14b-stat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen-14b-stat.jpg -------------------------------------------------------------------------------- /llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-4tp-chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-4tp-chart.png -------------------------------------------------------------------------------- /llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-4tp-stat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-4tp-stat.png -------------------------------------------------------------------------------- /llm-eval/llm-performance/perfetto.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://perfetto.dev/ 5 | 6 | 7 | -------------------------------------------------------------------------------- /llm-eval/llm-performance/stat_gpu_memory.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import pynvml 3 | import time 4 | 5 | UNIT = 1024 * 1024 6 | 7 | pynvml.nvmlInit() #初始化 8 | 9 | ids = [3, 7] 10 | 11 | max_mem_dict = { 12 | 13 | } 14 | 15 | num = 
0 16 | while True: 17 | for i in ids: 18 | max_mem = max_mem_dict.get(str(i), 0) 19 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 20 | memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle) 21 | used_mem = memoryInfo.used/UNIT 22 | if num % 12 == 0: 23 | num = 0 24 | print("使用容量:", memoryInfo.used/UNIT, "MB, ", "剩余容量:", memoryInfo.free/UNIT, "MB") 25 | if used_mem > max_mem: 26 | max_mem_dict[str(i)] = used_mem 27 | print(max_mem_dict) 28 | time.sleep(5) 29 | num += 1 30 | 31 | pynvml.nvmlShutdown() 32 | 33 | 34 | -------------------------------------------------------------------------------- /llm-eval/llm-performance/tgi-benchmark.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/huggingface/text-generation-inference/tree/main/benchmark 5 | 6 | 7 | - https://github.com/huggingface/text-generation-inference/blob/v1.4.3/benchmark/src/generation.rs#L63 8 | 9 | 三步: 预热(预填充+解码)、预填充、解码 10 | 11 | 12 | 13 | - 预填充时延 14 | - 预填充吞吐量(token/s) 15 | 16 | 17 | 18 | - 解码端到端时延 19 | - 解码token时延 20 | - 解码吞吐量(token/s) 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /llm-eval/llm-performance/tgi-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-performance/tgi-benchmark.png -------------------------------------------------------------------------------- /llm-eval/llm-performance/wrk-性能测试工具.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://github.com/wg/wrk 5 | 6 | 教程: 7 | 8 | https://www.cnblogs.com/quanxiaoha/p/10661650.html 9 | 10 | -------------------------------------------------------------------------------- /llm-eval/llm-performance/推理性能测试.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://bentoml.com/blog/benchmarking-llm-inference-backends 5 | 6 | 7 | 8 | ## 首词元时延和词元间时延 9 | 10 | 11 | 12 | | 模型 | GPU型号 | 优化方法 | 首词元时延(TTFT) |词元间时延(ITL) | 13 | | --- | --- | --- | --- | --- | 14 | | chatglm3-6b(chat) | A800 | 原始 | 125ms| 25ms | 15 | 16 | 17 | 18 | 19 | - 每分钟完成的请求数 20 | 21 | 首词元时间(TTFT) 22 | 23 | 在流式应用中,TTFT指的是LLM返回第一个词元前所需的时间。 24 | 25 | 26 | 27 | - 词元间时延(ITL) 28 | 词元间时延指的是连续输出词元之间的平均时间。 将TTFT纳入词元间时延的计算。 29 | 30 | 31 | - 端到端时延 32 | 33 | 端到端时延应该大致等于词元的平均输出长度乘以词元间时延。 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /llm-eval/llm-precision/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-eval/llm-precision/.DS_Store -------------------------------------------------------------------------------- /llm-inference/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-inference/.DS_Store -------------------------------------------------------------------------------- /llm-inference/DeepSpeed-Inference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Automatic Tensor Parallelism for HuggingFace Models 5 | https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/ 6 | 7 | 8 | ``` 9 | # 
--------------------------------------- 10 | # New automatic tensor parallelism method 11 | # --------------------------------------- 12 | import os 13 | import torch 14 | import transformers 15 | import deepspeed 16 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 17 | world_size = int(os.getenv("WORLD_SIZE", "1")) 18 | # create the model pipeline 19 | pipe = transformers.pipeline(task="text2text-generation", model="google/t5-v1_1-small", device=local_rank) 20 | # Initialize the DeepSpeed-Inference engine 21 | pipe.model = deepspeed.init_inference( 22 | pipe.model, 23 | mp_size=world_size, 24 | dtype=torch.float 25 | ) 26 | output = pipe('Input String') 27 | ``` 28 | -------------------------------------------------------------------------------- /llm-inference/Flash-Decoding.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Flash-Decoding for long-context inference 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /llm-inference/FlashInfer.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | https://github.com/flashinfer-ai/flashinfer 6 | https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python/ 7 | 8 | 9 | 10 | 11 | 用FlashInfer加速大语言模型推理中的自注意力操作:https://zhuanlan.zhihu.com/p/681506469 12 | 13 | 14 | 15 | 16 | FlashInfer优化了分组自注意力,融合旋转位置编码的自注意力 和 量化自注意力 操作。 17 | 18 | 19 | - 使用CUDA Cores的传统GQA实现会被算力所限制。FlashInfer提出使用预填充阶段的自注意力内核(使用Tensor Cores来实现)用于GQA的解码自注意力操作 20 | - 融合旋转位置编码的自注意力 21 | - 量化自注意力 KV-Cache 4bit 22 | - FlashInfer中PageAttention实现预取(prefetch)了页表结构的索引,最小化page大小对于算子性能的影响。 23 | 24 | 25 | 26 | RoPE 需要 sin/cos 等计算,不能使用 Tensor Cores加速。 27 | 28 | 29 | 30 | 31 | 32 | FlashInfer中DeepSeek MLA的内核设计:https://zhuanlan.zhihu.com/p/25920092499 -------------------------------------------------------------------------------- /llm-inference/GuidedGeneration.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | vLLM: 4 | 5 | https://github.com/vllm-project/vllm/issues/3536 6 | 7 | 8 | 9 | https://arxiv.org/pdf/2307.09702 10 | 11 | Efficient Guided Generation for Large Language Models 12 | 13 | 14 | 15 | https://github.com/dottxt-ai/outlines 16 | 17 | 18 | 19 | 20 | 21 | 理解大语言模型 (Large Language Model) 中的引导式生成 (Guided Generation) 22 | 23 | https://zhuanlan.zhihu.com/p/706892979 24 | 25 | 26 | 使用结构化生成工具实现格式可控的 LLMs 输出 27 | https://zhuanlan.zhihu.com/p/700691767 -------------------------------------------------------------------------------- /llm-inference/KV-Cache优化.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://github.com/NVIDIA/kvpress 5 | 6 | 7 | -------------------------------------------------------------------------------- /llm-inference/Mooncake.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving 4 | 5 | https://github.com/kvcache-ai/Mooncake 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-inference/NanoFlow.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://github.com/efeslab/Nanoflow 5 | 6 | 7 | -------------------------------------------------------------------------------- /llm-inference/PD分离.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
https://lmsys.org/blog/2025-05-05-large-scale-ep/ 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /llm-inference/RTP-LLM.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://github.com/alibaba/rtp-llm 4 | 5 | 项目主要基于FasterTransformer,并在此基础上集成了TensorRT-LLM的部分kernel实现。FasterTransformer和TensorRT-LLM为我们提供了可靠的性能保障。Flash-Attention2和cutlass也在我们持续的性能优化过程中提供了大量帮助。我们的continuous batching和increment decoding参考了vllm的实现;采样参考了transformers,投机采样部分集成了Medusa的实现,多模态部分集成了llava和qwen-vl的实现。 6 | 7 | 8 | -------------------------------------------------------------------------------- /llm-inference/ascend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-inference/ascend/.DS_Store -------------------------------------------------------------------------------- /llm-inference/ascend/mindformers/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-inference/ascend/mindformers/.DS_Store -------------------------------------------------------------------------------- /llm-inference/ascend/mindformers/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | 6 | ``` 7 | 8 | 9 | 10 | 11 | ## 参考文档 12 | 13 | - chatglm3: https://gitee.com/mindspore/mindformers/blob/dev/docs/model_cards/glm3.md 14 | - baichuan2: https://gitee.com/mindspore/mindformers/blob/dev/research/baichuan2/baichuan2.md 15 | 16 | -------------------------------------------------------------------------------- /llm-inference/ascend/mindformers/baichuan2/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /llm-inference/ascend/mindformers/chatglm3/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-inference/ascend/mindformers/chatglm3/README.md -------------------------------------------------------------------------------- /llm-inference/deepspeed-mii/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | DeepSpeed-FastGen 是 DeepSpeed-MII 和 DeepSpeed-Inference 的协同组合. 
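下面是 DeepSpeed-MII 非持久化 pipeline 的最小使用示意(模型名仅为示例,实际接口与参数以 MII 官方文档为准):

```
import mii

# 非持久化方式:在当前进程内加载模型并直接生成
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")
response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
print(response)
```

若需要常驻服务,可改用 mii.serve / mii.client 的持久化部署方式(同样以官方文档为准)。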
8 | 9 | 10 | 11 | ## DeepSpeed-MII 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-inference/faster-transformer/gpt/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-inference/faster-transformer/gpt/README.md -------------------------------------------------------------------------------- /llm-inference/faster-transformer/llama/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## LLaMA 6 | 7 | - https://github.com/NVIDIA/FasterTransformer/issues/506 8 | - https://github.com/NVIDIA/FasterTransformer/pull/575 9 | - https://github.com/void-main/FasterTransformer 10 | - https://github.com/void-main/fastertransformer_backend 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-inference/huggingface-tgi/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://github.com/huggingface/text-generation-inference 4 | 5 | -------------------------------------------------------------------------------- /llm-inference/huggingface-transformer/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://huggingface.co/blog/how-to-generate 5 | - https://huggingface.co/blog/zh/how-to-generate 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /llm-inference/lightllm/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | https://github.com/ModelTC/lightllm 4 | 5 | https://github.com/ModelTC/lightllm/blob/main/docs/LightLLM.md 6 | 7 | 8 | - lightllm代码解读——显存管理机制:https://zhuanlan.zhihu.com/p/667730434 9 | - lightllm代码解读——之模型推理:https://zhuanlan.zhihu.com/p/666731524 10 | 11 | 12 | 13 | 14 | 15 | 16 | - https://github.com/ModelTC/lightllm/tree/main 17 | 18 | 19 | - LightLLM:纯Python超轻量高性能LLM推理框架: https://mp.weixin.qq.com/s/-wMLMGAHkxeyDYkixqni9Q 20 | 21 | 22 | -------------------------------------------------------------------------------- /llm-inference/lmdeploy/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://hub.docker.com/r/openmmlab/lmdeploy-builder/tags 5 | - https://hub.docker.com/r/openmmlab/lmdeploy/tags 6 | 7 | 8 | - https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/serving/api_server.md 9 | - https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/cli/utils.py#L64 10 | 11 | 12 | 13 | 14 | 15 | 请求队列 16 | - 推理请求首先先加入到请求队列中 17 | Persistent线程 18 | 1. 若批处理中有空闲槽位,从队列拉取请求,尽量填满空闲槽位。若无,继续对当前批处理中的请求进行Forward 19 | 2. 
批次每Forward完一次 20 | - 判断是否有请求推理结束。结束的请求,发送结果,释放槽位 21 | - 转步骤1 -------------------------------------------------------------------------------- /llm-inference/lmdeploy/功能.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | https://lmdeploy.readthedocs.io/en/latest/inference/turbomind_config.html 8 | 9 | 10 | ## prefix caching switch 11 | 12 | 13 | 14 | 15 | ## kv quantization and inference switch 16 | 17 | 18 | 19 | 20 | ## long context switch 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /llm-inference/offload.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | - https://huggingface.co/docs/accelerate/concept_guides/big_model_inference 8 | - https://huggingface.co/docs/transformers/big_models 9 | 10 | 11 | 12 | 13 | - Efficient and Economic Large Language Model Inference with Attention Offloading 14 | - https://arxiv.org/pdf/2405.01814 15 | 16 | 17 | - DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale 18 | - https://arxiv.org/pdf/2207.00032 19 | 20 | 21 | FlexFlow 22 | - https://github.com/flexflow/FlexFlow 23 | 24 | FlexGen 25 | - https://github.com/FMInference/FlexGen 26 | 27 | 28 | 29 | 30 | kv cache offload: 31 | 32 | - https://github.com/NVIDIA/TensorRT-LLM/blob/a96cccafcf6365c128f004f779160951f8c0801c/docs/source/kv_cache_reuse.md 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /llm-inference/sglang/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - [SGLang 后端代码解析](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/sglang/code-walk-through/readme-CN.md) 7 | 8 | 9 | 10 | ## 量化 11 | 12 | 13 | https://docs.sglang.ai/backend/quantization.html 14 | 15 | 16 | 17 | 18 | 19 | 20 | ## 镜像 21 | 22 | - https://github.com/sgl-project/sglang/blob/072df753546b77438479f18a05e691fad91d7f9c/.github/workflows/release-docker.yml#L60 23 | - https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile 24 | - https://hub.docker.com/r/lmsysorg/sglang/tags 25 | 26 | 27 | ``` 28 | docker build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache 29 | docker push lmsysorg/sglang:${tag}${tag_suffix} 30 | ``` 31 | 32 | 33 | 34 | - docker pull lmsysorg/sglang:v0.4.5-cu125 35 | -------------------------------------------------------------------------------- /llm-inference/sglang/source-code.md: -------------------------------------------------------------------------------- 1 | 2 | - 参考:https://blog.csdn.net/sdujava2011/article/details/138312278 3 | 4 | ######## 5 | 6 | git clone git@github.com:liguodongiot/sglang.git 7 | 8 | 9 | git remote add upstream https://github.com/sgl-project/sglang.git 10 | 11 | 12 | # 拉取原始仓库数据 13 | git fetch upstream --tags 14 | 15 | # 如果你的主分支不是叫master,就把前面的master换成你的名字,比如main之类 16 | git rebase upstream/main 17 | 18 | # 推送 19 | git push 20 | 21 | # 推送tags 22 | git push --tags 23 | 24 | 25 | 26 | ######## 27 | 28 | 29 | # 查看所有tag 30 | git tag 31 | 32 | # 4. 
检出指定的 tag 到新分支 33 | # 替换 'tag_name' 为你想要的 tag 名称 34 | 35 | git checkout -b dev-code-0.4.1 0.4.1 36 | 37 | # 将新分支推送到远程仓库 38 | git push -u origin dev-code-0.4.1 39 | 40 | 41 | -------------------------------------------------------------------------------- /llm-inference/sglang/项目代码结构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /llm-inference/tensorrt-llm/FP8.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://nvidia.github.io/TensorRT-LLM/reference/precision.html#fp8-hopper 4 | 5 | 6 | 7 | - https://nvidia.github.io/TensorRT-Model-Optimizer/ 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /llm-inference/tensorrt-llm/Memory Usage of TensorRT-LLM.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://nvidia.github.io/TensorRT-LLM/reference/memory.html#understand-inference-time-gpu-memory-usage 4 | 5 | 6 | 7 | 8 | 权重、内部激活张量和 I/O 张量。 9 | 10 | 对于 I/O 张量,主要内存占用来自 KV 缓存张量。 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /llm-inference/tensorrt-llm/Triton服务启动参数.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /llm-inference/tensorrt/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 镜像: 6 | 7 | - https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch 8 | 9 | 10 | python tar包安装: 11 | 12 | - https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#installing-pip 13 | - https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar 14 | 15 | 16 | - TensorRT: https://github.com/NVIDIA/TensorRT 17 | - Torch-TensorRT: https://github.com/pytorch/TensorRT 18 | 19 | 20 | -------------------------------------------------------------------------------- /llm-inference/triton/REAEME.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver 6 | 7 | 8 | 9 | - https://catalog.ngc.nvidia.com/containers?filters=&orderBy=weightPopularDESC&query=&page=&pageSize= 10 | - https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver 11 | 12 | 13 | 14 | 15 | - https://github.com/triton-inference-server/backend 16 | - https://github.com/triton-inference-server/server 17 | 18 | 19 | 20 | 21 | 22 | 23 | ``` 24 | docker run --privileged --gpus all -it --net=host -v ${PWD}:/workspace/ nvcr.io/nvidia/tritonserver:23.05-py3-sdk bash 25 | ``` 26 | -------------------------------------------------------------------------------- /llm-inference/triton/resnet50/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "resnet50" 2 | platform: "pytorch_libtorch" 3 | max_batch_size : 0 4 | input [ 5 | { 6 | name: "input__0" 7 | data_type: TYPE_FP32 8 | dims: [ 3, 224, 224 ] 9 | reshape { shape: [ 1, 3, 224, 224 ] } 10 | } 11 | ] 12 | output [ 13 | { 14 | name: "output__0" 15 | data_type: TYPE_FP32 16 | dims: [ 1, 1000 ,1, 1] 17 | reshape { shape: [ 1, 1000 ] } 18 | label_filename: "labels.txt" 19 | } 20 | ] 21 | 
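按照上面的 config.pbtxt,可以用 tritonclient 向已启动的 Triton 服务发送一次推理请求做连通性验证(示意代码,假设服务端已加载 resnet50 模型且 HTTP 端口为 8000):

```
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# 构造与 config.pbtxt 中 input__0 一致的输入:FP32、[3, 224, 224]
image = np.random.rand(3, 224, 224).astype(np.float32)
inputs = [httpclient.InferInput("input__0", list(image.shape), "FP32")]
inputs[0].set_data_from_numpy(image)
outputs = [httpclient.InferRequestedOutput("output__0")]

result = client.infer(model_name="resnet50", inputs=inputs, outputs=outputs)
print(result.as_numpy("output__0").shape)   # 1000 类 logits
```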
-------------------------------------------------------------------------------- /llm-inference/triton/resnet50/resnet50_convert_torchscript.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.models as models 3 | 4 | resnet50 = models.resnet50(pretrained=True) 5 | resnet50.eval() 6 | image = torch.randn(1, 3, 244, 244) 7 | resnet50_traced = torch.jit.trace(resnet50, image) 8 | resnet50(image) 9 | # resnet50_traced.save('/workspace/model/resnet50/model.pt') 10 | torch.jit.save(resnet50_traced, "/workspace/model/resnet50/model.pt") 11 | -------------------------------------------------------------------------------- /llm-inference/vllm/FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://github.com/vllm-project/vllm/issues/5001 7 | 8 | ``` 9 | pip3 install vllm 10 | pip3 install nvidia-cublas-cu12==12.3.4.1 11 | ``` 12 | 13 | -------------------------------------------------------------------------------- /llm-inference/vllm/REF.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /llm-inference/vllm/源码.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 入口 LLMEngine 5 | 6 | 7 | 8 | Worker 类 执行 9 | 10 | 核心执行逻辑: ModelRunner execute_model 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | ## CacheEngine 19 | 20 | 管理KV缓存 21 | 22 | 这个类负责初始化和管理GPU和CPU的KV缓存。它还提供了执行KV缓存操作的方法,比如交换和复制。 23 | 24 | 25 | 26 | ## CacheConfig 27 | 28 | 配置KV缓存 29 | 30 | - block_size:缓存块的大小,以token数量计算。 31 | - gpu_memory_utilization:用于vLLM执行的GPU内存的占比。 32 | - swap_space:每个GPU的CPU交换空间大小(以GiB为单位)。 33 | - cache_dtype:kv缓存存储的数据类型。 34 | - is_attention_free:模型是否是attention-fre模型。 35 | - num_gpu_blocks_override:要使用的GPU块数量。如果指定,将覆盖配置文件中的GPU块数量。如果为None,则不执行任何操作。 36 | - sliding_window:KV缓存的滑动窗口大小。不能与启用前缀缓存一起使用。 37 | - enable_prefix_caching:是否启用前缀缓存。 38 | - cpu_offload_gb:CPU卸载缓冲区的大小(以GiB为单位)。 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /llm-inference/web/fastapi/README.md: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | uvicorn llm-server:app --host 0.0.0.0 --port 8000 --reload 4 | ``` 5 | 6 | 7 | ``` 8 | from typing import Optional 9 | from pydantic import BaseModel, Field 10 | 11 | # 定义请求的数据模型 12 | class Item(BaseModel): 13 | name: str = Field(..., description="物品名称") 14 | description: Optional[str] = Field(None, description="物品描述") 15 | price: float = Field(..., description="物品价格") 16 | 17 | 18 | from fastapi import FastAPI, HTTPException, Request 19 | 20 | app = FastAPI() 21 | 22 | # 创建POST路由 23 | @app.post("/items/") 24 | async def create_item(item: Item): 25 | # 这里你可以处理接收到的数据,比如保存到数据库 26 | # 作为示例,我们只是打印出来 27 | print(item) 28 | 29 | 30 | 31 | uvicorn main:app --reload 32 | ``` 33 | -------------------------------------------------------------------------------- /llm-inference/web/flask/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | curl --request POST \ 5 | --url http://10.193.195.107:5000/predict \ 6 | --header 'content-type: application/json' \ 7 | --data '{"input_text": "保持健康的秘诀"}' 8 | -------------------------------------------------------------------------------- /llm-inference/web/sanic/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | RuntimeError: Start method 'spawn' was requested, but 'fork' was already set. 5 | 6 | - 解决方法:https://sanic.dev/en/guide/running/manager.html#sanic-and-start-methods 7 | - python多进程(一)Fork模式和Spawn模式的优缺点: https://blog.csdn.net/weixin_42575811/article/details/134041691 8 | 9 | 10 | 11 | 12 | 13 | - https://ida3.cn/zh/guide/deployment/running.html#sanic-%E6%9C%8D%E5%8A%A1%E5%99%A8-sanic-server 14 | 15 | 16 | sanic server.app --host=0.0.0.0 --port=1337 --workers=4 17 | -------------------------------------------------------------------------------- /llm-inference/xinference/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://inference.readthedocs.io/zh-cn/latest/index.html 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-inference/分离式推理架构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 分离式推理架构 9 | 10 | 11 | https://github.com/LLMServe/DistServe 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-interview/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## [大模型基础](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/base.md) 4 | 5 | ## [大模型结构](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-algo.md) 6 | 7 | ## [大模型训练](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-train.md) 8 | 9 | ## [大模型微调](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-ft.md) 10 | 11 | ## [大模型评估](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-eval.md) 12 | 13 | ## [大模型压缩](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-compress.md) 14 | 15 | ## [大模型推理](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-inference.md) 16 | 17 | ## [大模型应用](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/llm-app.md) 18 | 19 | ## [大模型综合性问题](https://github.com/liguodongiot/llm-action/blob/main/llm-interview/comprehensive.md) 20 | -------------------------------------------------------------------------------- /llm-interview/comprehensive.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 解决显存不足的方法有哪些? 5 | 6 | 训练: 7 | 8 | 推理: 9 | 10 | - 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-interview/llm-app.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /llm-interview/llm-compress.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## 量化 7 | 8 | ### 常见的大模型量化方法有哪些? 9 | 10 | ## 剪枝 11 | 12 | 13 | 14 | ## 蒸馏 15 | 16 | -------------------------------------------------------------------------------- /llm-interview/llm-eval.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | > 如何评测生成,改写等开放性任务? 
4 | 5 | 指导思想,开放性任务的写作能力这类任务本身就很主观,我们不太方便用Rouge或者BLEU这样的评价指标,因为它本身就不能体现模型的核心能力,而且与人类基准就是不对齐的(偏离实际需求)。 6 | 7 | 8 | 从更贴近实际需求的角度来说,Elo的方式还是最合理的; 9 | 10 | 或者如果你的模型的核心业务就是生成/改写/总结,那你本身就应该有一套业务逻辑的评价指标来评测你的模型——以你的业务需求为导向。 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /llm-interview/llm-ft.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 微调 5 | 6 | 7 | 8 | 9 | > 介绍下 LoRA、AdaLoRA、QLoRA 这几种高效微调方法及其特点 10 | 11 | 12 | > 在LoRA中,A和B低秩矩阵的初始化方法,对A采用高斯初始化,对B采用零矩阵初始化,目的是让训练刚开始时BA的值为0,这样不会给模型带来额外的噪声。那么,对A做零矩阵初始化,对B做高斯初始化行不行呢?反正看起来只要让初始化为0就行? 13 | 14 | 当前作者还没有发现转换初始化方式产生的显著区别,只要这两者中任意一者为0,另一者不为0即可。 15 | 16 | 参考:https://github.com/microsoft/LoRA/issues/98 17 | 18 | 19 | 20 | 21 | 22 | > 介绍下 Prefix Tuning、Prompt Tuning、P-Tuning、P-Tuning v2 这四种高效微调方法的区别与联系? 23 | 24 | 1. Prompt Tuning和P-Tuning都是只在Embbedding层加入虚拟Token。而 Prefix Tuning、P-Tuning v2 会在每一层都加入虚拟Token,从而引入了更多的可训练参数;通过加入到更深层结构中的Prompt,能给模型预测带来更直接的影响。 25 | 2. P-Tuning通过 LSTM + MLP 去编码这些virtual token,再输入到模型,可以让模型收敛更快。 26 | 3. Prefix Tuning 为了防止直接更新 Prefix 的参数(virtual token)导致训练不稳定和性能下降的情况,在Prefix层前面加了MLP结构,训练完成后,只保留Prefix的参数。 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /llm-interview/llm-rlhf.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## RLHF 6 | 7 | > RLHF 完整训练过程是什么?RL建模过程中涉及到几个模型? 8 | 9 | 10 | > RLHF 过程中RM随着训练过程的进行,得分越来越高,效果就一定好吗? 11 | 12 | > 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /llm-localization/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/.DS_Store -------------------------------------------------------------------------------- /llm-localization/README.md: -------------------------------------------------------------------------------- 1 | # 大模型国产化适配 2 | 3 | 4 | 5 | 6 | ## 昇腾 7 | 8 | - [大模型国产化适配-华为昇腾AI全栈软硬件平台总结](https://github.com/liguodongiot/llm-action/blob/main/docs/llm_localization/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%BD%E4%BA%A7%E5%8C%96%E9%80%82%E9%85%8D-%E5%8D%8E%E4%B8%BA%E6%98%87%E8%85%BEAI%E5%85%A8%E6%A0%88%E8%BD%AF%E7%A1%AC%E4%BB%B6%E5%B9%B3%E5%8F%B0%E6%80%BB%E7%BB%93.md) 9 | 10 | - https://gitee.com/ascend/ModelLink 11 | 12 | 13 | - 昇腾Ascend处理器相关介绍:https://huahuaboy.blog.csdn.net/article/details/127171363 14 | - AI芯片:华为Ascend(昇腾)910结构分析:https://blog.csdn.net/evolone/article/details/100061616 15 | 16 | 17 | ## 海光 18 | 19 | 20 | 21 | 22 | ## 寒武纪 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /llm-localization/ascend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/ascend/.DS_Store -------------------------------------------------------------------------------- /llm-localization/ascend/FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | docker: Error response from daemon: failed to create shim task: OCI runtime create failed: unable to retrieve OCI runtime error (open 
/var/run/docker/containerd/daemon/io.containerd.runtime.v2.task/moby/579418211a825ef5c7fcf5becdbe90804f0ed7862d9c59663995f9dd463937b4/log.json: no such file or directory): /usr/local/Ascend/Ascend-Docker-Runtime/ascend-docker-runtime did not terminate successfully: exit status 1: 2024/07/24 09:59:29 owner not right /usr/bin/runc 1000 7 | 8 | 9 | 10 | 11 | 错误信息表明/usr/bin/runc这个文件的所有权不正确,即它不是由root用户拥有或者它的所属用户不是1000。Docker在创建并运行容器时需要runc这个二进制文件,如果权限设置不当,Docker将无法正确执行。 12 | 13 | 14 | 解决办法: 15 | 16 | 17 | 查看权限 18 | 19 | ls -lah /usr/bin/runc 20 | 21 | 22 | 修改权限 23 | 24 | sudo chown root:root /usr/bin/runc 25 | 26 | -------------------------------------------------------------------------------- /llm-localization/ascend/HCCL.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/apiref/hcclapiref/hcclapi_07_0001.html 6 | 7 | HCCL提供了Python与C++两种语言的接口,其中Python语言的接口用于实现TensorFlow网络在昇腾AI处理器执行分布式优化;C++语言接口用于实现OPBase模式下的框架适配,实现分布式能力,例如HCCL单算子API嵌入到PyTorch后端代码中,PyTorch用户直接使用PyTorch原生集合通信API,即可实现分布式能力。 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /llm-localization/ascend/MacOS环境.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | ``` 8 | conda create -n mindspore-venv python=3.8 -y 9 | conda activate mindspore-venv 10 | 11 | 12 | pip install torch transformers 13 | 14 | pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.2.14/MindSpore/cpu/aarch64/mindspore-2.2.14-cp38-cp38-macosx_11_0_arm64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.tuna.tsinghua.edu.cn/simple 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /llm-localization/ascend/MindSpore-note.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Graph模式:静态图模式或者图模式,将神经网络模型编译成一整张图,然后下发执行。该模式利用图优化等技术提高运行性能,同时有助于规模部署和跨平台运行。 7 | 8 | PyNative模式:动态图模式,将神经网络中的各个算子逐一下发执行,方便用户编写和调试神经网络模型。 9 | 10 | 11 | 12 | 13 | 二者的主要区别也十分的明显。 14 | 15 | 使用场景:Graph模式需要一开始就构建好网络结构,然后框架做整图优化和执行,比较适合网络固定没有变化,且需要高性能的场景。而PyNative模式逐行执行算子,支持单独求梯度。 16 | 17 | 网络执行:Graph模式和PyNative模式在执行相同的网络和算子时,精度效果是一致的。由于Graph模式运用了图优化、计算图整图下沉等技术,Graph模式执行网络的性能和效率更高。 18 | 19 | 代码调试:在脚本开发和网络流程调试中,推荐使用PyNative模式进行调试。在PyNative模式下,可以方便地设置断点,获取网络执行的中间结果,也可以通过pdb的方式对网络进行调试。而Graph模式无法设置断点,只能先指定算子进行打印,然后在网络执行完成后查看输出结果。 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /llm-localization/ascend/ascend-c/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 算子API:https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha001/quickstart/quickstart/quickstart_18_0001.html 4 | 5 | --- 6 | 7 | 192.168.137.101 8 | 255.255.255.0 9 | 10 | 11 | 连接开发板: 12 | - ssh root@192.168.137.100 13 | - Mind@123 14 | 15 | 16 | --- 17 | 18 | cd /home/HwHiAiUser/samples/samples/operator/AddCustomSample/FrameworkLaunch/AddCustom 19 | bash build.sh 20 | 21 | 22 | cd build_out 23 | ./custom_opp_ubuntu_aarch64.run 24 | 25 | 26 | cd /home/HwHiAiUser/samples/samples/operator/AddCustomSample/FrameworkLaunch/AclNNInvocation 27 | bash run.sh 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /llm-localization/ascend/ascend-dmi.md: 
-------------------------------------------------------------------------------- 1 | # Ascend-DMI 2 | 主要为Atlas产品的标卡、板卡及模组类产品提供带宽测试、算力测试、功耗测试等功能。 3 | 4 | 本系统通过调用底层DCMI(设备控制管理接口)/DSMI(设备系统管理接口)以及ACL(Ascend Computing Language,昇腾计算语言)相关接口完成相关检测功能,对于系统级别的信息查询通过调用系统提供的通用库来实现,用户使用工具时通过配置参数来实现不同的测试功能。 5 | 6 | 参考:https://www.hiascend.com/document/detail/zh/canncommercial/321/othertools/ascenddmi/ascenddmi_000002.html 7 | 8 | ## 带宽测试 9 | 10 | 11 | ## 算力测试 12 | 13 | 14 | ## 功耗测试 15 | 16 | 17 | ## 设备实时状态查询 18 | 19 | 20 | ## 故障诊断 21 | 22 | 23 | ## 软硬件版本兼容性测试 24 | 25 | 26 | ## 设备拓扑检测 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /llm-localization/ascend/ascend-docker-runtime.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 昇腾docker runtime仓库,在docker容器场景下,使用昇腾NPU,提供更简单的设备和依赖路径挂载方法。 5 | 6 | 7 | https://gitee.com/ascend/ascend-docker-runtime 8 | 9 | 10 | 11 | 安装:https://www.hiascend.com/document/detail/zh/mindx-dl/300/dluserguide/clusterscheduling/dlug_installation_02_000025.html 12 | 13 | 14 | Ascend Docker Runtime组件参考信息说明: 15 | 16 | https://www.hiascend.com/document/detail/zh/mindx-dl/300/dluserguide/clusterscheduling/dlug_installation_02_000036.html 17 | 18 | -------------------------------------------------------------------------------- /llm-localization/ascend/ascend-docker.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | docker login -u 15708484031 ascendhub.huawei.com 7 | 8 | 9 | docker pull ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC2-centos7 10 | ``` 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /llm-localization/ascend/fabric-insight/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /llm-localization/ascend/firefly.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Xformers 4 | 5 | 6 | 7 | 8 | 9 | 10 | ``` 11 | conda activate llm-dev 12 | source /usr/local/Ascend/ascend-toolkit/set_env.sh 13 | 14 | cd /workspace/llm-train 15 | ``` 16 | 17 | 18 | ``` 19 | docker exec -it pytorch_ubuntu_dev bash 20 | conda activate llm-dev 21 | source /usr/local/Ascend/ascend-toolkit/set_env.sh 22 | cd /workspace/llm-train 23 | 24 | 25 | sh run_all_npu.sh 26 | ``` 27 | 28 | 29 | 30 | ``` 31 | 32 | sh run_lora_npu.sh 33 | ``` 34 | 35 | 36 | ``` 37 | docker start pytorch_ubuntu_upgrade 38 | 39 | docker exec -it pytorch_ubuntu_upgrade bash 40 | . /usr/local/Ascend/ascend-toolkit/set_env.sh 41 | conda activate llm-dev 42 | 43 | cd /workspace/llm-train 44 | 45 | sh run_all_npu.sh 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /llm-localization/ascend/log.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 2023.08.10: 6 | 1. 目前使用PyTorch或者MindSpore针对LLaMA-13B可以进行多机多卡训练(内部版本)。 7 | 2. 
目前,针对LLaMA-13B,训练完成之后的模型进行推理,框架层面还没有适配。 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/ascend/mindformers/.DS_Store -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/baichuan2/baichuan2训练.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://gitee.com/mindspore/mindformers/blob/r1.0/research/baichuan2/baichuan2.md 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/env.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | docker pull --platform=arm64 swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.0_mindspore2.2.11:aarch_20240125 6 | ``` 7 | 8 | 9 | ``` 10 | docker run -it -u root \ 11 | --ipc=host \ 12 | --network=host \ 13 | --device=/dev/davinci0 \ 14 | --device=/dev/davinci1 \ 15 | --device=/dev/davinci2 \ 16 | --device=/dev/davinci3 \ 17 | --device=/dev/davinci4 \ 18 | --device=/dev/davinci5 \ 19 | --device=/dev/davinci6 \ 20 | --device=/dev/davinci7 \ 21 | --device=/dev/davinci_manager \ 22 | --device=/dev/devmm_svm \ 23 | --device=/dev/hisi_hdc \ 24 | -v /var/log/npu/:/usr/slog \ 25 | -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ 26 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ 27 | --name mindformers_dev \ 28 | swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.0_mindspore2.2.11:aarch_20240125 \ 29 | /bin/bash 30 | ``` 31 | 32 | ``` 33 | git clone -b dev https://gitee.com/mindspore/mindformers.git 34 | cd mindformers 35 | bash build.sh 36 | ``` 37 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/llama/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## 训练 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/qwen/qwen1训练.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://gitee.com/mindspore/mindformers/blob/r1.0/research/qwen/qwen.md 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/qwen1.5/qwen1.5训练.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - https://gitee.com/mindspore/mindformers/blob/r1.0/research/qwen1_5/qwen1_5.md 6 | 7 | 8 | 9 | 10 | 11 | 12 | docker pull swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.1_mindspore2.3rc2:20240511 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindformers/trick.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 16 = 2 * 1 * 8 * 1). 
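# 举例(数值为假设):若 batch_size=2、data_parallel=8、micro_batch_num=8、micro_batch_interleave_num=1,
# 则 global_batch_size = 2 * 8 * 8 * 1 = 128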
6 | 7 | 8 | 9 | batch_size : 数据批次大小 10 | 11 | micro_batch_num:流水线并行的微批次数量。pipeline_stage大于1(即开启流水线并行)时使用,此处需满足micro_batch_num >= pipeline_stage 12 | 13 | 14 | micro_batch_interleave_num: batch_size的拆分份数,多副本并行开关,通常在模型并行时使用,用于优化model_parallel时产生的通信损耗,纯流水并行时不建议使用。 15 | 16 | 17 | # compute throughput (samples/s/p) 每一步每一卡每一秒能处理的样本数 18 | throughput = self.global_batch_size / self.device_num / (per_step_seconds / 1000) 19 | ``` 20 | 21 | 22 | 23 | 24 | deepspeed: 25 | 26 | global_train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of GPUs -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/ascend/mindie/.DS_Store -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/config/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/ascend/mindie/config/.DS_Store -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/docker/llm-server.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | source /usr/local/Ascend/ascend-toolkit/set_env.sh 5 | source /usr/local/Ascend/mindie/set_env.sh 6 | source /usr/local/Ascend/llm_model/set_env.sh 7 | 8 | 9 | export PYTHONPATH=/usr/local/Ascend/llm_model:$PYTHONPATH 10 | cd /usr/local/Ascend/mindie/latest/mindie-service/bin 11 | 12 | ./mindieservice_daemon 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/docker/mindie-1.0.Dockerfile: -------------------------------------------------------------------------------- 1 | #FROM ascendhub.huawei.com/public-ascendhub/mindie-service-env:1.0.RC1-800I-A2-aarch64 2 | FROM ascendhub.huawei.com/public-ascendhub/mindie-service-env:v2 3 | 4 | ENV APP_DIR=/workspace 5 | 6 | RUN mkdir -p $APP_DIR 7 | 8 | # COPY qwen1.5-14b.json /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json 9 | COPY baichuan2-7b.json /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json 10 | 11 | COPY llm-server.sh $APP_DIR 12 | 13 | RUN chmod -R 777 $APP_DIR/llm-server.sh 14 | 15 | ENTRYPOINT $APP_DIR/llm-server.sh 16 | 17 | # docker build --network=host -f mindie-1.0.Dockerfile -t ascendhub.huawei.com/public-ascendhub/mindie-service-online:v1.0 . 18 | # docker build --network=host -f mindie-1.0.Dockerfile -t ascendhub.huawei.com/public-ascendhub/mindie-service-online:v1.1 .
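
下面是一个构建并启动上述镜像的示意(镜像 tag、容器名以及挂载的 NPU 卡编号均为假设值,需按实际环境调整;设备与驱动的挂载方式参照本仓库 mindformers/env.md 中的 docker run 写法):

```
# 构建镜像(与上方 Dockerfile 注释中的命令一致)
docker build --network=host -f mindie-1.0.Dockerfile \
    -t ascendhub.huawei.com/public-ascendhub/mindie-service-online:v1.0 .

# 启动容器(示例:仅挂载 0、1 两张 NPU 卡,并挂载宿主机驱动与 npu-smi)
docker run -d --name mindie-service-demo \
    --network=host \
    --device=/dev/davinci0 \
    --device=/dev/davinci1 \
    --device=/dev/davinci_manager \
    --device=/dev/devmm_svm \
    --device=/dev/hisi_hdc \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
    ascendhub.huawei.com/public-ascendhub/mindie-service-online:v1.0
```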
-------------------------------------------------------------------------------- /llm-localization/ascend/mindie/docker/mindie-all-1.0.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM ascendhub.huawei.com/public-ascendhub/mindie:1.0.RC1-800I-A2-aarch64 3 | 4 | # USER root 5 | 6 | COPY driver /usr/local/Ascend/driver 7 | 8 | RUN ls -al /usr/local/Ascend/driver 9 | 10 | ENV APP_DIR=/workspace 11 | 12 | RUN mkdir -p $APP_DIR 13 | 14 | COPY install_and_enable_cann.sh /opt/package/install_and_enable_cann.sh 15 | 16 | RUN cd /opt/package && ls -al && cat /opt/package/install_and_enable_cann.sh && source ./install_and_enable_cann.sh 17 | 18 | RUN pip install transformers==4.37.2 -i https://pypi.tuna.tsinghua.edu.cn/simple 19 | 20 | COPY qwen1.5-14b.json /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json 21 | 22 | COPY llm-server.sh $APP_DIR 23 | 24 | RUN chmod -R 777 $APP_DIR/llm-server.sh 25 | 26 | ENTRYPOINT ["$APP_DIR/llm-server.sh"] 27 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/docker/mindie-env-1.0.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM ascendhub.huawei.com/public-ascendhub/mindie:1.0.RC1-800I-A2-aarch64 3 | 4 | USER root 5 | 6 | ENV APP_DIR=/workspace 7 | 8 | RUN mkdir -p $APP_DIR 9 | 10 | RUN cd /opt/package && ls -al && source ./install_and_enable_cann.sh 11 | 12 | RUN pip install transformers==4.37.2 -i https://pypi.tuna.tsinghua.edu.cn/simple 13 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/mindie-1.0.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ascendhub.huawei.com/public-ascendhub/mindie:1.0.RC1-800I-A2-aarch64 2 | 3 | RUN cd /opt/package && source install_and_enable_cann.sh \ 4 | && source /usr/local/Ascend/ascend-toolkit/set_env.sh \ 5 | && source /usr/local/Ascend/mindie/set_env.sh \ 6 | && source /usr/local/Ascend/llm_model/set_env.sh 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindie/性能调优.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 910b4 llama-7b 10g KV CACHE 6 | 7 | 8 | Total Block Num = 160 9 | 10 | 11 | 12 | 13 | Block Num = Ceil(输入Token数/Block Size)+Ceil(最大输出Token数/Block Size) 14 | 15 | 16 | 560/4 + 512/4 = 9 17 | 18 | batch_size: 20 19 | 20 | 21 | 22 | 23 | 910B3 llama-7b 30g KV CACHE 24 | 25 | Total Block Num = 480 26 | 27 | 28 | 29 | 560/4 + 512/4 = 9 30 | 31 | 32 | 33 | batch_size: 50 34 | 35 | 36 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindspore/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://gitee.com/mindspore/mindspore 7 | 8 | 9 | 10 | ``` 11 | import numpy as np 12 | import mindspore.context as context 13 | from mindspore import Tensor 14 | from mindspore.ops import functional as F 15 | 16 | context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU") 17 | 18 | x = Tensor(np.ones([1,3,3,4]).astype(np.float32)) 19 | y = Tensor(np.ones([1,3,3,4]).astype(np.float32)) 20 | print(F.tensor_add(x, y)) 21 | 22 | ``` 23 | 24 | 25 | 26 | 27 | 
https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.3.0rc1/MindSpore/unified/aarch64/mindspore-2.3.0rc1-cp39-cp39-linux_aarch64.whl 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindspore/bert.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## bert 7 | 8 | ``` 9 | pip install wikiextractor 10 | python -m wikiextractor.WikiExtractor -o -b 11 | ``` 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /llm-localization/ascend/mindspore/reference.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://gitee.com/mindspore/mindspore 5 | - https://gitee.com/mindspore/mindformers 6 | - https://gitee.com/mindspore/mindformers.git 7 | - https://www.mindspore.cn/install 8 | 9 | 10 | 11 | 12 | 13 | - https://www.mindspore.cn/tutorials/zh-CN/r2.2/beginner/train.html 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /llm-localization/ascend/modellink/dataset.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://huggingface.co/docs/datasets/access 5 | - HuggingFace学习笔记--datasets的使用:https://blog.csdn.net/weixin_43863869/article/details/134653171 6 | - Huggingface处理数据排除异常值:https://blog.csdn.net/qq_18555105/article/details/130291508 7 | - HuggingFace数据集Datasets:https://baijiahao.baidu.com/s?id=1753227115689844799&wfr=spider&for=pc 8 | - Huggingface详细入门介绍之dataset库: https://zhuanlan.zhihu.com/p/554678463 -------------------------------------------------------------------------------- /llm-localization/ascend/modellink/qwen.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ``` 7 | git clone https://gitee.com/ascend/ModelLink.git 8 | git clone https://github.com/NVIDIA/Megatron-LM.git 9 | cd Megatron-LM 10 | git checkout -f bcce6f 11 | cp -r megatron ../ModelLink/ 12 | cd .. 13 | cd ModelLink 14 | mkdir logs 15 | mkdir model_from_hf 16 | mkdir dataset 17 | mkdir ckpt 18 | ``` 19 | 20 | 21 | ``` 22 | # python3.8 23 | conda create -n test python=3.8 24 | conda activate test 25 | 26 | # 安装 torch 和 torch_npu 27 | pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl 28 | pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl 29 | pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl 30 | 31 | # 安装加速库 32 | git clone https://gitee.com/ascend/AscendSpeed.git 33 | cd AscendSpeed 34 | git checkout 224ae35e8fc96778f957029d1371ddb623452a50 35 | pip install -r requirements.txt 36 | pip install -e . 37 | cd .. 
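# 注:上面将 AscendSpeed 固定到指定 commit,通常是为了与文中使用的 ModelLink 版本保持兼容(此为常见做法,具体以官方文档为准)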
38 | 39 | # 安装其余依赖库 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | 44 | -------------------------------------------------------------------------------- /llm-localization/ascend/msmodelslim/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://gitee.com/ascend/msit/tree/master/msmodelslim/msmodelslim/pytorch/llm_ptq 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /llm-localization/ascend/msmodelslim/llm_quant/calib_set.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/ascend/msmodelslim/llm_quant/calib_set.json -------------------------------------------------------------------------------- /llm-localization/ascend/network.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 集群网络 4 | Atlas 900 AI集群采用“HCCS、 PCIe 4.0、100G以太”三类高速互联方式,百TB全互联无阻塞专属参数同步网络,降低网络时延,梯度同步时延缩短10~70%。 5 | 6 | 在AI服务器内部,昇腾910 AI处理器之间通过HCCS高速总线互联;昇腾910 AI处理器和CPU之间以最新的PCIe 4.0(速率16Gbps)技术互联,其速率是业界主流采用的PCIe 3.0(8.0Gbps)技术的两倍,使得数据传输更加快速和高效。在集群层面,采用面向数据中心的CloudEngine 8800系列交换机,提供单端口100Gbps的交换速率,将集群内的所有AI服务器接入高速交换网络。 7 | 8 | 独创iLossless 智能无损交换算法,对集群内的网络流量进行实时的学习训练,实现网络0丢包与E2E μs级时延。 9 | 10 | ## 系统级调优 11 | 12 | Atlas 900 AI集群通过华为集合通信库和作业调度平台,整合HCCS、 PCIe 4.0 和100G RoCE三种高速接口,充分释放昇腾910 AI处理器的强大性能。 13 | 14 | 华为集合通信库提供训练网络所需的分布式并行库,通信库+网络拓扑+训练算法进行系统级调优,实现集群线性度>80%,极大提升了作业调度效率。 15 | -------------------------------------------------------------------------------- /llm-localization/ascend/npu监控.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - AI模型运维——GPU性能监控NVML和DCGM 5 | - https://www.cnblogs.com/maxgongzuo/p/12582286.html 6 | 7 | dcgm exporter 监控GPU 8 | 9 | NPU-Exporter 监控NPU 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-localization/ascend/peft/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | 7 | 8 | ASCEND_RT_VISIBLE_DEVICES=6 python train_bloom_lora.py 9 | 10 | 11 | ``` -------------------------------------------------------------------------------- /llm-localization/ascend/pytorch/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/ascend/pytorch/.DS_Store -------------------------------------------------------------------------------- /llm-localization/ascend/pytorch/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## 训练 6 | 7 | 8 | - https://gitee.com/mindspore/mindformers/blob/dev/docs/model_cards/llama.md 9 | - https://gitee.com/ascend/ModelZoo-PyTorch/tree/master/PyTorch/built-in/foundation/LLaMA-13B 10 | - https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/PyTorch/built-in/foundation/Qwen-7B/test/qwen_7B_64p.sh 11 | 12 | 13 | 14 | 15 | 16 | ``` 17 | ASCEND_VISIBLE_DEVICES=6 python train_bloom_lora.py 18 | ``` 19 | 20 | 21 | ## baichuan2 22 | 23 | 24 | ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], -------------------------------------------------------------------------------- /llm-localization/ascend/standford-alpaca/ds_config_zero3.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "contiguous_gradients": true, 5 | "stage3_max_live_parameters": 0, 6 | "stage3_max_reuse_distance": 0, 7 | "stage3_prefetch_bucket_size": 0, 8 | "stage3_param_persistence_threshold": 1e2, 9 | "reduce_bucket_size": 1e2, 10 | "stage3_gather_16bit_weights_on_model_save": true 11 | }, 12 | "fp16": { 13 | "enabled": true, 14 | "auto_cast": false, 15 | "loss_scale": 0, 16 | "initial_scale_power": 32, 17 | "loss_scale_window": 1000, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1 20 | }, 21 | "train_batch_size": 128, 22 | "gradient_accumulation_steps": 8, 23 | "train_micro_batch_size_per_gpu": 2, 24 | "wall_clock_breakdown": false 25 | } -------------------------------------------------------------------------------- /llm-localization/ascend/standford-alpaca/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | rouge_score 3 | fire 4 | #openai 5 | transformers>=4.28.1 6 | #torch 7 | sentencepiece 8 | tokenizers>=0.13.3 9 | #wandb 10 | tensorboardX 11 | deepspeed 12 | -------------------------------------------------------------------------------- /llm-localization/ascend/transformers/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - https://gitee.com/ascend/transformers/ 5 | 6 | 7 | 8 | ``` 9 | pip3 install -U transformers 10 | ``` 11 | 12 | ``` 13 | git clone https://github.com/huggingface/transformers.git 14 | cd examples/pytorch/text-classification 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | 19 | ``` 20 | export TASK_NAME=mrpc 21 | 22 | python run_glue.py \ 23 | --model_name_or_path bert-base-cased \ 24 | --task_name $TASK_NAME \ 25 | --do_train \ 26 | --do_eval \ 27 | --max_seq_length 128 \ 28 | --per_device_train_batch_size 32 \ 29 | --learning_rate 2e-5 \ 30 | --num_train_epochs 3 \ 31 | --output_dir /tmp/$TASK_NAME/ 32 | 33 | ``` -------------------------------------------------------------------------------- /llm-localization/ascend/ubuntu操作系统.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 更换源 5 | - https://blog.csdn.net/zwcslj/article/details/134322879 6 | 7 | 8 | ``` 9 | 先备份旧的源 10 | 11 | 设置新的镜像源 12 | vim /etc/apt/sources.list 13 | 14 | 15 | 更新软件包列表并升级已安装的软件包 16 | apt-get update 17 | 18 | 19 | apt-get upgrade 20 | 21 | ``` 22 | 23 | 24 | 25 | ``` 26 | docker pull ubuntu:20.04 27 | 28 | docker pull ubuntu:22.04 29 | ``` 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /llm-localization/ascend/vllm-ascend-npu.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://github.com/vllm-project/vllm/issues/7692 5 | 6 | 7 | -------------------------------------------------------------------------------- /llm-localization/ascend/昇腾卡-soc版本.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ``` 4 | SOC版本:设备类型 5 | 100: "910PremiumA", 101: "910ProA", 102: "910A", 103: "910ProB", 104: "910B", 6 | 200: "310P1", 201: "310P2", 202: "310P3", 203: "310P4", 7 | 220: "910B1", 221: "910B2", 222: "910B3", 223: "910B4", 8 | 240: "310B1", 241: "310B2", 242: "310B3", 9 | 250: "910C1", 251: "910C2", 252: "910C3", 253: "910C4" 10 | ``` 11 | -------------------------------------------------------------------------------- /llm-localization/ascend/昇腾卡注意事项.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | --privileged 特权模型下,昇腾或者英伟达的 docker runtime 中会默认分配本机所有卡。 5 | 6 | 7 | 8 | - ASCEND_VISIBLE_DEVICES 容器级控制卡 9 | - ASCEND_RT_VISIBLE_DEVICES 进程级控制卡 类似于 CUDA_VISIBLE_DEVICES 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /llm-localization/ascend/服务器配置.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 最低配置: 5 | 6 | ``` 7 | Atlas 800 9000 A2 8 | CPU:4 * 鲲鹏920 48核@2.6GHZ 9 | GPU:8 * Ascend 910B4 10 | 内存:512G 11 | 硬盘:2 * 480G SSD,2 * 1.92 T PCIe SSD 12 | ``` 13 | 14 | 推荐配置: 15 | 16 | ``` 17 | Atlas 800 9000 A2 18 | CPU:4*鲲鹏920 48核@2.6GHZ 19 | GPU:8 * Ascend 910B3 20 | 内存:1T 21 | 硬盘:2 * 960G SSD,2 * 3.84T PCIe SSD 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /llm-localization/ascend/环境安装.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 安装Python-3.9.2 5 | 6 | 7 | ``` 8 | wget https://www.python.org/ftp/python/3.9.2/Python-3.9.2.tgz 9 | tar -zxvf Python-3.9.2.tgz 10 | 11 | 12 | cd Python-3.9.2 13 | ./configure --prefix=/usr/local/python3.9.2 --with-ssl-default-suites=openssl --enable-shared CFLAGS=-fPIC 14 | make 15 | sudo make install 16 | 17 | 18 | #用于设置python3.9.2库文件路径 19 | export LD_LIBRARY_PATH=/usr/local/python3.9.2/lib:$LD_LIBRARY_PATH 20 | #如果用户环境存在多个python3版本,则指定使用python3.9.2版本 21 | export PATH=/usr/local/python3.9.2/bin:$PATH 22 | 23 | python3 --version 24 | pip3 --version 25 | ``` 26 | 27 | 28 | 29 | 30 | ## 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /llm-localization/ascend/达芬奇架构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - 基本概念: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha001/devguide/opdevg/ascendcopdevg/atlas_ascendc_10_0009.html 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /llm-localization/modelscope/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 微调库: 6 | - https://github.com/modelscope/swift 7 | - https://github.com/modelscope/swift/blob/main/docs/source/LLM/NPU%E6%8E%A8%E7%90%86%E4%B8%8E%E5%BE%AE%E8%B0%83%E6%9C%80%E4%BD%B3%E5%AE%9E%E8%B7%B5.md 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /llm-localization/paddle/PaddleNLP.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /llm-localization/tianshuzhixin/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-localization/tianshuzhixin/README.md -------------------------------------------------------------------------------- /llm-localization/tianshuzhixin/ixsmi.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | ixsmi 7 | 8 | ixsmi -q -l | grep Used.[^G] 9 | 10 | ixsmi -q -i 0 -l | grep Used.[^G] 11 | ``` 12 | 13 | 14 | ``` 15 | ixsmi topo --matrix 16 | ``` 17 | 18 | 19 | 20 
| ``` 21 | ixsmi -L 22 | ``` -------------------------------------------------------------------------------- /llm-maas/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - 智谱AI:https://open.bigmodel.cn/ 5 | - OpenAI: https://chat.openai.com/ 6 | - 文心一言:https://yiyan.baidu.com/ 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-optimizer/FlashAttention.md: -------------------------------------------------------------------------------- 1 | 2 | FlashAttention 3 | 4 | 5 | Block-sparse FlashAttention 6 | 7 | 8 | FlashAttention2 9 | 10 | 11 | FlashAttention3 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /llm-optimizer/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | - FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion: https://arxiv.org/abs/2406.06858 8 | 9 | 10 | 11 | 12 | 13 | 14 | - BatchLLM: Optimizing Large Batched LLM Inference with Global Prefix Sharing and Throughput-oriented Token Batching 15 | https://arxiv.org/abs/2412.03594 16 | 17 | 18 | BlendServe: Optimizing Offline Inference for Auto-regressive Large Models with Resource-aware Batching 19 | 20 | https://arxiv.org/pdf/2411.16102 -------------------------------------------------------------------------------- /llm-optimizer/kv-cache.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - [H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models](https://arxiv.org/pdf/2306.14048) 7 | 8 | https://github.com/FMInference/H2O 9 | 10 | 11 | - [](https://arxiv.org/pdf/2310.01801) 12 | 13 | 14 | -------------------------------------------------------------------------------- /llm-optimizer/xformers.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | https://github.com/facebookresearch/xformers 4 | 5 | 6 | -------------------------------------------------------------------------------- /llm-tools/nvtx.md: -------------------------------------------------------------------------------- 1 | 2 | - https://nvtx.readthedocs.io/en/latest/annotate.html 3 | 4 | 5 | 装饰器: 6 | 7 | ``` 8 | @nvtx.annotate(message="my_message", color="blue") 9 | def my_func(): 10 | pass 11 | ``` 12 | 13 | 14 | 上下文管理器: 15 | 16 | ``` 17 | with nvtx.annotate(message="my_message", color="green"): 18 | pass 19 | ``` 20 | 21 | 22 | 范围: 23 | 24 | ``` 25 | rng = nvtx.start_range(message="my_message", color="blue") 26 | # ... do something ... 
# 27 | nvtx.end_range(rng) 28 | 29 | ``` 30 | 31 | 与start_range类似,但可以嵌套: 32 | 33 | ``` 34 | nvtx.push_range("batch " + str(i),"blue") 35 | 36 | nvtx.pop_range() 37 | ``` -------------------------------------------------------------------------------- /llm-train/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/.DS_Store -------------------------------------------------------------------------------- /llm-train/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ## 训练 6 | 7 | - https://github.com/yangjianxin1/Firefly/ 8 | - https://github.com/hiyouga/LLaMA-Factory 9 | 10 | 11 | 12 | 13 | - Characterization of Large Language Model Development in the Datacenter:https://arxiv.org/pdf/2403.07648 14 | - MegaScale: Scaling Large Language Model Training 15 | to More Than 10,000 GPUs:https://www.usenix.org/system/files/nsdi24-jiang-ziheng.pdf 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /llm-train/ascend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/ascend/.DS_Store -------------------------------------------------------------------------------- /llm-train/chatglm-lora/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## ChatGLM-Tuning 3 | 4 | - code: `https://github.com/mymusise/ChatGLM-Tuning.git` 5 | - commit id : `997393046a49510e6cda36962f9a399297959311` 6 | 7 | 8 | -------------------------------------------------------------------------------- /llm-train/chatglm/README.md: -------------------------------------------------------------------------------- 1 | ## ChatGLM-6B 2 | - code: `https://github.com/THUDM/ChatGLM-6B ` 3 | - commit id: `8633db1503fc3b0edc1d035f64aa35dce5d97969` 4 | 5 | 6 | 7 | ## 问题 8 | 9 | ChatGLM-6B 使用 P-Tuning v2 和 Deepspeed 数据并行 loss 不一致的问题,请查看该[issue](https://github.com/THUDM/ChatGLM-6B/issues/644)。 10 | -------------------------------------------------------------------------------- /llm-train/chatglm/deepspeed.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": "auto", 3 | "zero_allow_untested_optimizer": true, 4 | "fp16": { 5 | "enabled": "auto", 6 | "loss_scale": 0, 7 | "initial_scale_power": 16, 8 | "loss_scale_window": 1000, 9 | "hysteresis": 2, 10 | "min_loss_scale": 1 11 | }, 12 | "zero_optimization": { 13 | "stage": 2, 14 | "allgather_partitions": true, 15 | "allgather_bucket_size": 5e8, 16 | "overlap_comm": false, 17 | "reduce_scatter": true, 18 | "reduce_bucket_size": 5e8, 19 | "contiguous_gradients" : true 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /llm-train/chatglm/ds_train_finetune.sh: -------------------------------------------------------------------------------- 1 | LR=1e-4 2 | 3 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 4 | 5 | deepspeed --num_gpus=8 --master_port $MASTER_PORT main.py \ 6 | --deepspeed deepspeed.json \ 7 | --do_train \ 8 | --train_file /data/nfs/llm/data/AdvertiseGen/train.json \ 9 | --test_file /data/nfs/llm/data/AdvertiseGen/dev.json \ 10 | --prompt_column content \ 11 | --response_column summary \ 12 | --overwrite_cache \ 13 | 
--model_name_or_path /data/nfs/llm/model/chatglm-6b \ 14 | --output_dir /home/guodong.li/output/adgen-chatglm-6b-ft-$LR \ 15 | --overwrite_output_dir \ 16 | --max_source_length 64 \ 17 | --max_target_length 64 \ 18 | --per_device_train_batch_size 30 \ 19 | --per_device_eval_batch_size 1 \ 20 | --gradient_accumulation_steps 2 \ 21 | --predict_with_generate \ 22 | --num_train_epochs 10 \ 23 | --logging_steps 10 \ 24 | --save_steps 1000 \ 25 | --learning_rate $LR \ 26 | --fp16 27 | 28 | -------------------------------------------------------------------------------- /llm-train/chatglm/evaluate.sh: -------------------------------------------------------------------------------- 1 | PRE_SEQ_LEN=128 2 | CHECKPOINT=adgen-chatglm-6b-pt-128-2e-2 3 | STEP=3000 4 | 5 | 6 | # --model_name_or_path /data/nfs/llm/model/chatglm-6b \ 7 | # --ptuning_checkpoint /home/guodong.li/output/adgen-chatglm-6b-pt-128-2e-2/checkpoint-500 \ 8 | # --predict_with_generate \ 9 | 10 | 11 | CUDA_VISIBLE_DEVICES=1 python3 main.py \ 12 | --do_predict \ 13 | --validation_file /data/nfs/llm/data/AdvertiseGen/dev.json \ 14 | --test_file /data/nfs/llm/data/AdvertiseGen/dev.json \ 15 | --overwrite_cache \ 16 | --prompt_column content \ 17 | --response_column summary \ 18 | --model_name_or_path /home/guodong.li/output/adgen-chatglm-6b-ft-1e-4 \ 19 | --output_dir /home/guodong.li/output/adgen-chatglm-6b-ft-1e-4 \ 20 | --overwrite_output_dir \ 21 | --max_source_length 64 \ 22 | --max_target_length 64 \ 23 | --per_device_eval_batch_size 1 \ 24 | --pre_seq_len $PRE_SEQ_LEN \ 25 | --quantization_bit 4 26 | 27 | -------------------------------------------------------------------------------- /llm-train/chatglm/evaluate_finetune.sh: -------------------------------------------------------------------------------- 1 | CHECKPOINT=adgen-chatglm-6b-ft-1e-4 2 | STEP=3000 3 | 4 | CUDA_VISIBLE_DEVICES=0 python3 main.py \ 5 | --do_predict \ 6 | --validation_file AdvertiseGen/dev.json \ 7 | --test_file AdvertiseGen/dev.json \ 8 | --overwrite_cache \ 9 | --prompt_column content \ 10 | --response_column summary \ 11 | --model_name_or_path ./output/$CHECKPOINT/checkpoint-$STEP \ 12 | --output_dir ./output/$CHECKPOINT \ 13 | --overwrite_output_dir \ 14 | --max_source_length 256 \ 15 | --max_target_length 256 \ 16 | --per_device_eval_batch_size 1 \ 17 | --predict_with_generate \ 18 | --fp16_full_eval 19 | -------------------------------------------------------------------------------- /llm-train/chatglm/train.sh: -------------------------------------------------------------------------------- 1 | PRE_SEQ_LEN=128 2 | LR=2e-2 3 | 4 | CUDA_VISIBLE_DEVICES=0 python3 main.py \ 5 | --do_train \ 6 | --train_file /data/nfs/llm/data/AdvertiseGen/dev.json \ 7 | --validation_file /data/nfs/llm/data/AdvertiseGen/dev.json \ 8 | --prompt_column content \ 9 | --response_column summary \ 10 | --overwrite_cache \ 11 | --model_name_or_path /data/nfs/llm/model/chatglm-6b \ 12 | --output_dir /home/guodong.li/output/adgen-chatglm-6b-pt-$PRE_SEQ_LEN-$LR \ 13 | --overwrite_output_dir \ 14 | --max_source_length 64 \ 15 | --max_target_length 64 \ 16 | --per_device_train_batch_size 128 \ 17 | --per_device_eval_batch_size 8 \ 18 | --gradient_accumulation_steps 16 \ 19 | --predict_with_generate \ 20 | --num_train_epochs 10 \ 21 | --logging_steps 10 \ 22 | --save_steps 100 \ 23 | --learning_rate $LR \ 24 | --pre_seq_len $PRE_SEQ_LEN 25 | -------------------------------------------------------------------------------- /llm-train/chatglm/train_ptuningv2_dp.sh: 
-------------------------------------------------------------------------------- 1 | PRE_SEQ_LEN=128 2 | LR=2e-2 3 | 4 | deepspeed --include localhost:1,2,3 --master_port 29001 main.py \ 5 | --deepspeed deepspeed.json \ 6 | --do_train \ 7 | --train_file /data/nfs/llm/data/AdvertiseGen/train.json \ 8 | --validation_file /data/nfs/llm/data/AdvertiseGen/dev.json \ 9 | --prompt_column content \ 10 | --response_column summary \ 11 | --overwrite_cache \ 12 | --model_name_or_path /data/nfs/llm/model/chatglm-6b \ 13 | --output_dir /home/guodong.li/output/adgen-chatglm-6b-pt \ 14 | --overwrite_output_dir \ 15 | --max_source_length 64 \ 16 | --max_target_length 64 \ 17 | --per_device_train_batch_size 128 \ 18 | --per_device_eval_batch_size 8 \ 19 | --gradient_accumulation_steps 16 \ 20 | --predict_with_generate \ 21 | --num_train_epochs 10 \ 22 | --logging_steps 10 \ 23 | --save_steps 100 \ 24 | --learning_rate $LR \ 25 | --pre_seq_len $PRE_SEQ_LEN 26 | -------------------------------------------------------------------------------- /llm-train/chinese-llama-alpaca/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## Chinese-LLaMA-Alpaca 5 | 6 | - 源码地址: https://github.com/ymcui/Chinese-LLaMA-Alpaca/ 7 | - commit id : 3e2f2529a4dc0d7567f46f1b2d3431a7d063588b 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-train/deepspeedchat/llama/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - Llama/Llama-2 : https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /llm-train/firefly/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | - https://github.com/yangjianxin1/Firefly 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## 启动脚本 16 | 17 | 18 | ``` 19 | 20 | sh test_bash_getopts.sh -j /data/usw/web -m /opt/data/web 21 | 22 | sh test_bash_getopts.sh -j /data/usw/web -m /opt/data/web -u 23 | 24 | sh test_bash_getopts.sh -h 25 | 26 | sh test_bash_getopts.sh ? 27 | 28 | 29 | # sh bootstrap.sh -train_dataset_path /data/usw/web2 -pre_model_path /opt/data/web2 -model_output_path /opt/data/web3 -train_metrics_path /opt/data/web4 30 | 31 | 32 | sh bootstrap.sh -h 33 | sh bootstrap.sh -d /data/usw/web2 -p /opt/data/web2 -o /opt/data/web3 -m /opt/data/web4 34 | ``` 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /llm-train/firefly/bootstrap.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | func() { 5 | echo "Usage:" 6 | echo "test.sh [-d TRAIN_DATASET_PATH] [-p PRE_MODEL_PATH] [-o MODEL_OUTPUT_PATH] [-m MODEL_METRICS_PATH]" 7 | echo "Description:" 8 | echo "TRAIN_DATASET_PATH, 训练数据集路径." 9 | echo "PRE_MODEL_PATH, 预训练模型路径." 10 | echo "MODEL_OUTPUT_PATH, 模型输出模型." 11 | echo "MODEL_METRICS_PATH, 模型指标路径." 12 | exit -1 13 | } 14 | 15 | 16 | while getopts 'd:p:o:m' OPT; do 17 | case $OPT in 18 | d) TRAIN_DATASET_PATH="$OPTARG";; 19 | p) PRE_MODEL_PATH="$OPTARG";; 20 | o) MODEL_OUTPUT_PATH="$OPTARG";; 21 | m) MODEL_METRICS_PATH="$OPTARG";; 22 | h) func;; 23 | ?) 
func;; 24 | esac 25 | done 26 | 27 | echo "TRAIN_DATASET_PATH: $TRAIN_DATASET_PATH ,PRE_MODEL_PATH:$PRE_MODEL_PATH , MODEL_OUTPUT_PATH:$MODEL_OUTPUT_PATH ,MODEL_METRICS_PATH:$MODEL_METRICS_PATH" 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /llm-train/firefly/test_bash_getopts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | func() { 4 | echo "Usage:" 5 | echo "test.sh [-j S_DIR] [-m D_DIR]" 6 | echo "Description:" 7 | echo "S_DIR,the path of source." 8 | echo "D_DIR,the path of destination." 9 | exit -1 10 | } 11 | 12 | upload="false" 13 | 14 | while getopts 'h:j:m:u' OPT; do 15 | case $OPT in 16 | j) S_DIR="$OPTARG";; 17 | m) D_DIR="$OPTARG";; 18 | u) upload="true";; 19 | h) func;; 20 | ?) func;; 21 | esac 22 | done 23 | 24 | echo $S_DIR 25 | echo $D_DIR 26 | echo $upload -------------------------------------------------------------------------------- /llm-train/fp8.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://huggingface.co/docs/accelerate/v0.30.0/en/usage_guides/low_precision_training 4 | - https://huggingface.co/docs/accelerate/v0.30.0/en/concept_guides/low_precision_training 5 | 6 | 7 | 8 | 9 | - https://github.com/huggingface/transformers/issues/20991 10 | 11 | 12 | 13 | - https://zhuanlan.zhihu.com/p/700874387 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 微软 4 | - [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) 5 | - 目前,已支持llama/llama2 6 | 7 | 8 | 模型转换: 9 | - https://github.com/microsoft/Megatron-DeepSpeed/tree/main/tools/convert_checkpoint 10 | - https://github.com/huggingface/transformers/tree/main/src/transformers/models/megatron_gpt2 11 | 12 | ## bigscience 13 | 14 | - [Megatron-DeepSpeed](https://github.com/bigscience-workshop/Megatron-DeepSpeed) 15 | - 训练过程:https://github.com/bigscience-workshop/bigscience 16 | - https://huggingface.co/blog/zh/bloom-megatron-deepspeed 17 | - 支持bloom 18 | - 未支持序列并行 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/bigscience/bloom-note.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/microsoft/H800多机多卡训练坑点.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA 11.8 及以上 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 多机多卡:torchrun 13 | 14 | - https://pytorch.org/docs/stable/elastic/run.html 15 | 16 | For multi-node training you need to specify: 17 | 18 | --rdzv-id: A unique job id (shared by all nodes participating in the job) 19 | 20 | --rdzv-backend: An implementation of torch.distributed.elastic.rendezvous.RendezvousHandler 21 | 22 | --rdzv-endpoint: The endpoint where the rendezvous backend is running; usually in form host:port. 23 | 24 | 25 | 26 | 心跳检测失败 27 | 28 | - The node 'xxx' has failed to send a keep-alive heartbeat to the rendezvous 'xxx' due to an error of type RendezvousConnectionError. 
29 | - https://github.com/pytorch/pytorch/issues/76287 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/microsoft/pip.conf: -------------------------------------------------------------------------------- 1 | # This file has been autogenerated or modified by NVIDIA PyIndex. 2 | # In case you need to modify your PIP configuration, please be aware that 3 | # some configuration files may have a priority order. Here are the following 4 | # files that may exists in your machine by order of priority: 5 | # 6 | # [Priority 1] Site level configuration files 7 | # 1. `/usr/pip.conf` 8 | # 9 | # [Priority 2] User level configuration files 10 | # 1. `/root/.config/pip/pip.conf` 11 | # 2. `/root/.pip/pip.conf` 12 | # 13 | # [Priority 3] Global level configuration files 14 | # 1. `/etc/pip.conf` 15 | # 2. `/etc/xdg/pip/pip.conf` 16 | 17 | 18 | [global] 19 | index-url = http://nexus3.xxx.com/repository/pypi/simple 20 | 21 | [install] 22 | trusted-host = nexus3.xxx.com 23 | 24 | timeout = 90 25 | 26 | no-cache-dir = true 27 | 28 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/microsoft/slurm/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | srun -p h800-ib-1 --mpi=pmix_v3 -N 2 --gres=gpu:8 env 7 | ``` 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/microsoft/代码.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | ``` 8 | from deepspeed.accelerator import get_accelerator 9 | get_accelerator().device_name() == 'cuda': 10 | 11 | ``` -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/microsoft/训练日志分析.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## fp16 5 | 6 | `megatron/training.py` 7 | 8 | 日志: 9 | ``` 10 | FusedAdam ( 11 | Parameter Group 0 12 | betas: (0.9, 0.95) 13 | bias_correction: True 14 | eps: 1e-08 15 | lr: 0.0 16 | lr_mult: 1.0 17 | name: wd_no_scale_lr 18 | step: 1 19 | wd_mult: 1.0 20 | weight_decay: 0.1 21 | 22 | Parameter Group 1 23 | betas: (0.9, 0.95) 24 | bias_correction: True 25 | eps: 1e-08 26 | lr: 0.0 27 | lr_mult: 1.0 28 | name: no_wd_no_scale_lr 29 | step: 1 30 | wd_mult: 0.0 31 | weight_decay: 0.0 32 | ) 33 | 34 | 35 | ``` 36 | -------------------------------------------------------------------------------- /llm-train/megatron-deepspeed/source-code.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | build_train_valid_test_datasets 7 | 8 | 单数据集 9 | _build_train_valid_test_datasets 10 | 11 | 多数据集 12 | 13 | 14 | 15 | 16 | 17 | ``` 18 | -------------------------------------------------------------------------------- /llm-train/megatron/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | ## 基于Megatron-LM实现的项目 7 | 8 | - [CodeGeeX](https://github.com/THUDM/CodeGeeX) 9 | 10 | - [如何使用 Megatron-LM 训练语言模型](https://huggingface.co/blog/zh/megatron-training):数据预处理,训练,模型转换,推理等 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | ### 数据加载 19 | 20 | Megatron-LM 带有一个高效的 DataLoader,其中数据在训练前被 tokenize 和 shuffle。它还将数据拆分为带有索引的编号序列,并将索引存储,因此 tokenize 
只需要计算一次。为了构建索引,首先根据训练参数计算每个 epoch 的数量,并创建一个排序,然后对数据进行 shuffle 操作。这与大多数情况不同,我们通常迭代整个数据集直到其用尽,然后重复第二个 epoch 。这平滑了学习曲线并节省了训练时间。 21 | 22 | 23 | ### 融合 CUDA 内核 24 | 当一个计算在 GPU 上运行时,必要的数据会从内存中取出并加载到 GPU 上,然后计算结果被保存回内存。简单来说,融合内核的思想是: 将通常由 PyTorch 单独执行的类似操作组合成一个单独的硬件操作。因此可以将多个离散计算合并为一个,从而减少在多个离散计算中的内存移动次数。 25 | 26 | 27 | 当 f、g 和 h 融合在一个内核中时,f 和 g 的中间结果 x' 和 y' 存储在 GPU 寄存器中并立即被 h 使用。但是如果不融合,x' 和 y' 就需要复制到内存中,然后由 h 加载。因此,融合 CUDA 内核显着加快了计算速度。此外,Megatron-LM 还使用 Apex 的 AdamW 融合实现,它比 PyTorch 实现更快。 28 | 29 | 虽然我们可以在 transformers 中自定义 Megatron-LM 中的 DataLoader 和 Apex 的融合优化器,但自定义融合 CUDA 内核对新手来说太不友好了。 30 | 31 | 32 | -------------------------------------------------------------------------------- /llm-train/megatron/codegeex/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | docker pull codegeex/codegeex:latest 7 | # To enable GPU support, clarify device ids with --device 8 | docker run --gpus '"device=0,1"' -it --ipc=host --name=codegeex codegeex/codegeex 9 | ``` 10 | 11 | ``` 12 | git clone git@github.com:THUDM/CodeGeeX.git 13 | cd CodeGeeX 14 | pip install -e . 15 | ``` 16 | 17 | 18 | -------------------------------------------------------------------------------- /llm-train/megatron/codegeex/pic/CodeGeeX模型架构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/megatron/codegeex/pic/CodeGeeX模型架构.png -------------------------------------------------------------------------------- /llm-train/megatron/codegeex/pic/CodeGeeX训练配置.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/megatron/codegeex/pic/CodeGeeX训练配置.png -------------------------------------------------------------------------------- /llm-train/megatron/gpt2/merge_ck_and_inference/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - checkpoint_loader_megatron.py 5 | - checkpoint_saver_megatron.py 6 | 7 | ``` 8 | margs.vocab_file = '/workspace/model/gpt2-vocab/gpt2-vocab.json' 9 | margs.merge_file = '/workspace/model/gpt2-vocab/gpt2-merges.txt' 10 | ``` 11 | -------------------------------------------------------------------------------- /llm-train/megatron/gpt2/merge_ck_and_inference/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
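# 用法示例(地址与端口为假设值):python text_generation_cli.py 127.0.0.1:5000
# 说明:该脚本是 Megatron-LM 文本生成服务(如由 run_text_generation_server.py 启动的服务)的简易命令行客户端,
# 循环读取 prompt 和生成 token 数,向 http://<host:port>/api 发送 PUT 请求,并打印返回的生成文本。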
2 | import sys 3 | import json 4 | import requests 5 | 6 | 7 | if __name__ == "__main__": 8 | url = sys.argv[1] 9 | url = 'http://' + url + '/api' 10 | headers = {'Content-Type': 'application/json'} 11 | 12 | while True: 13 | sentence = input("Enter prompt: ") 14 | tokens_to_generate = int(eval(input("Enter number of tokens to generate: "))) 15 | 16 | data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate} 17 | response = requests.put(url, data=json.dumps(data), headers=headers) 18 | 19 | if response.status_code != 200: 20 | print(f"Error {response.status_code}: {response.json()['message']}") 21 | else: 22 | print("Megatron Response: ") 23 | print(response.json()['text'][0]) 24 | -------------------------------------------------------------------------------- /llm-train/megatron/kernel_fusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/megatron/kernel_fusion.png -------------------------------------------------------------------------------- /llm-train/megatron/pretrain.xmind: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/megatron/pretrain.xmind -------------------------------------------------------------------------------- /llm-train/paddle/paddlenlp/baichuan2/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM 6 | tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Base") 7 | model = AutoModelForCausalLM.from_pretrained("baichuan-inc/Baichuan2-7B-Base", dtype="float16") 8 | ``` -------------------------------------------------------------------------------- /llm-train/peft/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/peft/.DS_Store -------------------------------------------------------------------------------- /llm-train/peft/Prefix-Tuning.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ``` 6 | from peft import PrefixEncoder, PrefixTuningConfig 7 | 8 | config = PrefixTuningConfig( 9 | peft_type="PREFIX_TUNING", 10 | task_type="SEQ_2_SEQ_LM", 11 | num_virtual_tokens=20, 12 | token_dim=768, 13 | num_transformer_submodules=1, 14 | num_attention_heads=12, 15 | num_layers=12, 16 | encoder_hidden_size=768, 17 | ) 18 | 19 | prefix_encoder = PrefixEncoder(config) 20 | ``` 21 | 22 | 23 | 24 | https://huggingface.co/JackFram/llama-68m 25 | -------------------------------------------------------------------------------- /llm-train/peft/clm/accelerate_ds_zero3_cpu_offload_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | dynamo_backend: 'NO' 13 | fsdp_config: {} 14 | machine_rank: 0 15 | main_training_function: main 16 | megatron_lm_config: {} 17 | mixed_precision: 
'no' 18 | num_machines: 1 19 | num_processes: 1 20 | rdzv_backend: static 21 | same_network: true 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /llm-train/peft/conditional_generation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/peft/conditional_generation/README.md -------------------------------------------------------------------------------- /llm-train/pytorch/Pytorch源码解读.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - PyTorch 源码解读系列:https://zhuanlan.zhihu.com/p/328674159 6 | - PyTorch训练推理:https://www.zhihu.com/column/gemfield 7 | - PyTorch 分布式:https://juejin.cn/post/7026144707591815175 8 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ``` 5 | docker run -dt --name pytorch_env_cu117 --restart=always --gpus all \ 6 | --network=host \ 7 | --shm-size 4G \ 8 | -v /home/gdong/workspace/code:/workspace/code \ 9 | -v /home/gdong/workspace/data:/workspace/data \ 10 | -v /home/gdong/workspace/model:/workspace/model \ 11 | -v /home/gdong/workspace/output:/workspace/output \ 12 | -v /home/gdong/workspace/package:/workspace/package \ 13 | -w /workspace \ 14 | pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel \ 15 | /bin/bash 16 | 17 | 18 | docker exec -it pytorch_env_cu117 bash 19 | 20 | ``` 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/api.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/data-parallel/minGPT-ddp/multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode-example 4 | #SBATCH --partition=a800 #分区 5 | #SBATCH --output=log/%j.out #日志 6 | #SBATCH --error=log/%j.err #日志 7 | 8 | #SBATCH -N 2 9 | #SBATCH --ntasks=2 10 | #SBATCH --gres=gpu:4 11 | ##SBATCH --cpus-per-task=4 12 | 13 | NODELIST=$(scontrol show hostname $SLURM_JOB_NODELIST) 14 | # 对第一个节点赋值为主节点 15 | MASTER_NODE=$(head -n 1 <<< "$NODELIST") 16 | # 计数器 17 | NODE_COUNT=0 18 | # 一共的节点数 19 | NODE_NUM=($(echo $NODELIST | tr " " "\n" | wc -l)) 20 | 21 | # 打印 22 | echo $SLURM_NODEID 23 | echo $NODELIST 24 | echo $MASTER_NODE 25 | echo $NODE_NUM 26 | 27 | 28 | export NCCL_IB_DISABLE=1 29 | export NCCL_SOCKET_IFNAME=bond0 30 | 31 | 32 | srun --mpi=pmix_v3 singularity run --nv --pwd /workspaces/examples-main/distributed/minGPT-ddp/mingpt -B /data/hpc/home/guodong.li/:/workspaces:rw pytorch-multinode.sif torchrun --nproc_per_node=4 main.py 33 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/data-parallel/sbatch_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode-example 4 | #SBATCH --nodes=4 5 | #SBATCH --ntasks=4 6 | #SBATCH --gpus-per-task=1 7 | #SBATCH --cpus-per-task=4 8 | 9 | nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) 10 | nodes_array=($nodes) 11 | head_node=${nodes_array[0]} 12 | 
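# Resolve the head node's IP with a one-task srun pinned to that node; every
# rank later passes it to torchrun as the c10d rendezvous endpoint on port 29500.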
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 13 | 14 | echo Node IP: $head_node_ip 15 | export LOGLEVEL=INFO 16 | 17 | srun torchrun \ 18 | --nnodes 4 \ 19 | --nproc_per_node 1 \ 20 | --rdzv_id $RANDOM \ 21 | --rdzv_backend c10d \ 22 | --rdzv_endpoint $head_node_ip:29500 \ 23 | /shared/examples/multinode_torchrun.py 50 10 24 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/pipeline-parallel/4-使用DDP与流水线并行训练Transformer模型.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 定义模型 4 | 5 | PositionalEncoding 模块注入一些有关序列中标记的相对或绝对位置的信息。 6 | 7 | 位置编码与嵌入具有相同的维度,因此可以将两者相加。 在这里,我们使用不同频率的正弦和余弦函数。 8 | 9 | 10 | 11 | 12 | 13 | 14 | 在本教程中,我们将在两个 GPU 上拆分 Transformer 模型,并使用流水线并行来训练模型。 15 | 16 | 除此之外,我们还使用分布式数据并行来训练该流水线的两个副本。 17 | 18 | 我们有一个进程驱动跨 GPU 0 和 1 的流水线,另一个进程驱动跨 GPU 2 和 3 的流水线。 19 | 20 | 然后,这两个进程都使用分布式数据并行来训练两个副本。 该模型与使用 nn.Transformer 和 TorchText 进行序列到序列建模教程中使用的模型完全相同,但分为两个阶段。 21 | 22 | 参数数量最多的是 nn.TransformerEncoder 层。 nn.TransformerEncoder 本身由 nlayers 个 nn.TransformerEncoderLayer 组成。 因此,我们的重点是 nn.TransformerEncoder,并拆分模型,使 nn.TransformerEncoderLayer 的一半位于一个 GPU 上,另一半位于另一个 GPU 上。 23 | 24 | 为此,我们将编码器和解码器部分提取到单独的模块中,然后构建一个代表原始 Transformer 模块的 nn.Sequential。 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/rpc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - DISTRIBUTED PIPELINE PARALLELISM USING RPC:https://pytorch.org/tutorials/intermediate/dist_pipeline_parallel_tutorial.html 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/sequence-parallelism/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/pytorch/distribution/sequence-parallelism/README.md -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/tensor-parallel/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torch.distributed as dist 6 | import torch.multiprocessing as mp 7 | import torch.nn as nn 8 | 9 | 10 | def setup(rank, world_size): 11 | os.environ["MASTER_ADDR"] = "localhost" 12 | os.environ["MASTER_PORT"] = "12355" 13 | 14 | # initialize the process group 15 | dist.init_process_group("nccl", rank=rank, world_size=world_size) 16 | torch.cuda.set_device(rank) 17 | 18 | 19 | def cleanup(): 20 | dist.destroy_process_group() 21 | 22 | 23 | class ToyModel(nn.Module): 24 | def __init__(self): 25 | super(ToyModel, self).__init__() 26 | self.net1 = nn.Linear(10, 32) 27 | self.relu = nn.ReLU() 28 | self.net2 = nn.Linear(32, 5) 29 | 30 | def forward(self, x): 31 | return self.net2(self.relu(self.net1(x))) 32 | 33 | -------------------------------------------------------------------------------- /llm-train/pytorch/distribution/torchrun.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://pytorch.org/docs/stable/elastic/run.html 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | ``` 12 | torchrun 13 | --nnodes=$NUM_NODES 14 | --nproc-per-node=$NUM_TRAINERS 15 | --max-restarts=3 16 | --rdzv-id=$JOB_ID 17 | 
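# rendezvous backend (c10d is recommended) and its host:port endpoint;
# all nodes in the job must point at the same endpoint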
--rdzv-backend=c10d 18 | --rdzv-endpoint=$HOST_NODE_ADDR 19 | YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) 20 | ``` 21 | 22 | 23 | 24 | For multi-node training you need to specify: 25 | 26 | --rdzv-id: A unique job id (shared by all nodes participating in the job) 27 | 28 | --rdzv-backend: An implementation of torch.distributed.elastic.rendezvous.RendezvousHandler 29 | 30 | --rdzv-endpoint: The endpoint where the rendezvous backend is running; usually in form host:port. 31 | 32 | Currently c10d (recommended), etcd-v2, and etcd (legacy) rendezvous backends are supported out of the box. To use etcd-v2 or etcd, setup an etcd server with the v2 api enabled (e.g. --enable-v2). 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /llm-train/pytorch/resource.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | --- 5 | 6 | - PyTorch工程的最佳实践:https://zhuanlan.zhihu.com/p/371978706 7 | 8 | 9 | 框架不统一的问题: 10 | 11 | 代码评审方面,无法有效提高代码评审效率; 12 | 代码复用方面,根本无法复用; 13 | 项目交接方面,问题逐渐呈现; 14 | 模型部署方面,一言难尽; 15 | 经验复用方面,想都别想; 16 | 以及人员备份、性能调试、客户技术支持等。 17 | 18 | 19 | 20 | 21 | 流水线环节: 22 | 23 | 数据集 24 | 数据增强 25 | 模型搭建和裁剪 26 | 损失函数 27 | 超参和训练 28 | 移植和部署 29 | 30 | 31 | --- 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /llm-train/pytorch/torchrun.md: -------------------------------------------------------------------------------- 1 | 2 | - https://pytorch.org/docs/stable/elastic/run.html 3 | 4 | # torchrun 5 | 6 | 7 | 8 | ## 环境变量 9 | 10 | ``` 11 | 12 | ``` 13 | 14 | 15 | -------------------------------------------------------------------------------- /llm-train/slurm/deepspeed/pp-standalone-singularity.slurm: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/sh 3 | 4 | #SBATCH --job-name=multinode-deepspeed-singularity # name 5 | 6 | #SBATCH -N 1 7 | #SBATCH --ntasks=4 8 | #SBATCH --gres=gpu:4 9 | #SBATCH --cpus-per-task=4 10 | 11 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 12 | #SBATCH --partition=a800 # 分区 13 | #SBATCH --output=log/%j.out # 日志 14 | #SBATCH --error=log/%j.err # 日志 15 | 16 | export GPUS_PER_NODE=4 17 | export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 18 | export MASTER_PORT=9901 19 | 20 | export NCCL_IB_DISABLE=1 21 | export NCCL_SOCKET_IFNAME=bond0 22 | export CC=/opt/hpcx/ompi/bin/mpicc 23 | 24 | echo $MASTER_ADDR 25 | 26 | 27 | singularity run --nv --pwd /workspaces/DeepSpeedExamples-20230430/training/pipeline_parallelism \ 28 | -B /data/hpc/home/guodong.li/:/workspaces:rw \ 29 | deepspeed.sif \ 30 | deepspeed --master_port $MASTER_PORT train.py --deepspeed_config=ds_config.json -p 2 --steps=200 31 | 32 | 33 | -------------------------------------------------------------------------------- /llm-train/slurm/pytorch/mingpt-singularity-multinode.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=multinode-example 4 | #SBATCH --partition=a800 #分区 5 | #SBATCH --output=log/%j.out #日志 6 | #SBATCH --error=log/%j.err #日志 7 | 8 | #SBATCH -N 2 9 | #SBATCH --ntasks=2 10 | #SBATCH --gres=gpu:4 11 | #SBATCH --cpus-per-task=4 12 | 13 | NODELIST=$(scontrol show hostname $SLURM_JOB_NODELIST) 14 | # 对第一个节点赋值为主节点 15 | MASTER_NODE=$(head -n 1 <<< "$NODELIST") 16 | # 计数器 17 | NODE_COUNT=0 18 | # 一共的节点数 19 | NODE_NUM=($(echo $NODELIST | tr " " "\n" | wc -l)) 20 | 21 | # 打印 22 | 
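# echo node id, node list, master node and node count so a failed multi-node
# run can be diagnosed from the Slurm log; the NCCL settings below disable
# InfiniBand transport and pin NCCL's socket traffic to the bond0 interface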
echo $SLURM_NODEID 23 | echo $NODELIST 24 | echo $MASTER_NODE 25 | echo $NODE_NUM 26 | 27 | 28 | export NCCL_IB_DISABLE=1 29 | export NCCL_SOCKET_IFNAME=bond0 30 | 31 | 32 | srun --mpi=pmix_v3 singularity run --nv --pwd /workspaces/examples-main/distributed/minGPT-ddp/mingpt -B /data/hpc/home/guodong.li/:/workspaces:rw pytorch-multinode.sif torchrun --nproc_per_node=4 main.py 33 | 34 | -------------------------------------------------------------------------------- /llm-train/vicuna/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/llm-train/vicuna/README.md -------------------------------------------------------------------------------- /llmops/FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | ## k8s 的 pod 中下载 s3 对象存储的数据网络不通? 12 | 13 | 可能的问题: 14 | 1. 服务器的时间和s3对象存储服务器时间不一致 15 | 2. 异构集群导致网络镜像拉取有问题,比如:昇腾NPU服务器需要calico网络镜像是aarch64架构,却下载了 x86 架构的镜像,导致容器启动不成功。 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /llmops/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 千帆大模型平台: 7 | 8 | - https://cloud.baidu.com/product/wenxinworkshop 9 | - https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Xlkb0e6eu 10 | 11 | 12 | 13 | --- 14 | 15 | 数据集对应关系说明: 16 | 17 | https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Wlj3vd4gy 18 | 19 | 20 | --- 21 | 22 | 模型评估 23 | 24 | 25 | 人工评估: 26 | 综合人类专家的主观见解、经验等从不同评价维度对模型回复进行打分,用于评估模型回复的效果。 27 | 28 | 自动评估: 29 | 对⽣成式⼤模型的输出效果进⾏全⽅位评价,提供⾯向事实类或开放性问答的多种打分模式; 30 | 31 | 32 | 自动规则打分 33 | 计算模型预测结果与真实标注的文本相似度指标(例如ROUGE、BLUE等),适合标准选择题或简单问答场景。 34 | 35 | 36 | 自动裁判员打分 37 | 使用能力更强的大模型作为裁判员,对被评估模型的生成结果进行自动化打分,适用于开放性或复杂问答场景。 38 | 39 | -------------------------------------------------------------------------------- /llmops/kubernetes.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 修改共享内存:https://www.alibabacloud.com/help/zh/eci/user-guide/mount-an-emptydir-volume-to-modify-the-shm-size-of-a-pod?spm=a2c63.p38356.0.0.1c055267llapbW 6 | 7 | swap(交换内存)和shm(共享内存)的区别:https://blog.csdn.net/songyu0120/article/details/89169987 8 | 9 | - tmpfs使用内存空间而swap使用物理存储空间 10 | 11 | 12 | 训练过程内存碎片化问题 13 | 14 | - export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512' 15 | - 一文读懂 PyTorch 显存管理机制:https://zhuanlan.zhihu.com/p/486360176 16 | 17 | 18 | RuntimeError: CUDA out of memory.一些调bug路程:https://zhuanlan.zhihu.com/p/581606031 19 | 20 | 21 | 22 | max_split_size_mb 阻止原生(native)分配器分割大于此大小(MB)的块。这可以减少碎片,并允许某些边缘工作负载在内存不耗尽的情况下完成。性能代价从 "零 "到 "大量 "不等,取决于分配模式。 23 | 默认值没有限制,即所有块都可以分割。memory_stats()和 memory_summary()方法可用于调整。如果工作负载因 "内存不足 "而终止,并显示大量未活动的分割块,则应在万不得已时使用该选项。 max_split_size_mb 只对 backend:native 有意义。在使用 backend:cudaMallocAsync 时,max_split_size_mb 将被忽略。 24 | 25 | - 官方:https://pytorch.org/docs/stable/notes/cuda.html 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /llmops/tq-llm/train/FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## FAQ 4 | 5 | 6 | ### baichuan2报错 7 | 8 | - 'BitsAndBytesConfig' object is not subscriptable 9 | 10 | https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/discussions/2 11 | 12 | 13 | 14 | - AttributeError: 'BaichuanTokenizer' object has no attribute 'sp_model' 15 | 16 | 降低版本到4.34.0及以下 : 
pip install transformers==4.34.0 17 | 18 | 19 | 20 | ## Pytorch 21 | 22 | - RuntimeError: DataLoader worker (pid xxxxx) is killed by signal: Killed. 23 | 24 | 可能共享内存太小了 25 | 26 | --shm-size 4G 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /llmops/使用docker进行多机多卡训练.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - Docker容器中DeepSpeed多机多卡集群分布式训练大模型实践:https://cloud.baidu.com/article/3273769 -------------------------------------------------------------------------------- /llmops/模型推理平台方案.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | GPU解决方案: 4 | - https://github.com/NVIDIA/gpu-operator 5 | 6 | 7 | IB解决方案: 8 | 9 | - https://github.com/Mellanox/network-operator 10 | - https://docs.nvidia.com/networking/display/cokan10/network+operator 11 | 12 | 13 | -------------------------------------------------------------------------------- /mkdir-dir-file.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | dir=$1 4 | 5 | 6 | # sh mkdir-dir-file.sh llm-algo/chatglm3 7 | 8 | mkdir $dir 9 | 10 | touch $dir/README.md 11 | touch $dir/reference.md 12 | 13 | 14 | tree -h $dir 15 | 16 | 17 | -------------------------------------------------------------------------------- /paper/LESS-选择有影响力的数据进行目标指令精调.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - LESS: Selecting Influential Data for Targeted Instruction Tuning 6 | - https://github.com/princeton-nlp/LESS 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /paper/LLM增强LLMS.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - https://github.com/lucidrains/CALM-pytorch 4 | - LLM AUGMENTED LLMS:EXPANDING CAPABILITIES THROUGH COMPOSITION 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /paper/PagedAttention.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 许多LLM的参数大小超过了单个GPU的容量,因此有必要将它们跨分布式GPU进行分割在以模型并行方式下执行。这就要求内存管理器能够处理分布式内存。vLLM通过支持Megatron-LM风格的张量模型并行策略来在分布式设置中有效工作。这种策略遵循SPMD(单程序多数据)执行计划,其中线性层被分割以执行块矩阵乘法,并且GPU通过allreduce操作不断同步中间结果。具体来说,注意力运算在注意力头维度上被分割,每个SPMD进程处理多头注意力中的一个子集。 15 | 16 | 我们观察到,即使在模型并行执行的情况下,每个模型分片仍然处理相同的输入Token集,因此需要相同位置的KV缓存。因此,vLLM在调度器内设置了一个单一的KV缓存管理器,如图4所示。 17 | 不同的GPU workers 共享管理器以及从逻辑块到物理块的映射。这个共同的映射允许 GPU workers 使用调度器为每个输入请求提供的物理块来执行模型。虽然每个GPU worker 有相同的物理块ID,但worker只存储其相应注意力头的KV缓存的一部分。 18 | 19 | 在每一步中,调度器首先为每个请求准备包含输入Token ID的消息,以及每个请求的块表。 20 | 接下来,调度器将此控制消息广播给GPU workers。然后,GPU工作器开始使用输入Token ID执行模型。在注意力层中,GPU工作器根据控制消息中的块表读取KV缓存。在执行过程中,GPU工作器使用all-reduce通信原语同步中间结果,而不需要调度器的协调。最后,GPU工作器将本次迭代的采样Token发送回调度器。总之,GPU工作器不需要在内存管理上进行同步,它们只需要在每个解码迭代开始时以及步骤输入时接收所有的内存管理信息即可。 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /paper/data/LESS-选择有影响力的数据进行目标指令精调.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - LESS: Selecting Influential Data for Targeted Instruction Tuning 6 | - 
https://github.com/princeton-nlp/LESS 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /paper/inference/llm-in-a-flash.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | LLM in a Flash: Efficient Inference Techniques With Limited Memory 4 | 5 | https://arxiv.org/abs/2312.11514 6 | 7 | 8 | https://medium.com/@marketing_novita.ai/llm-in-a-flash-efficient-inference-techniques-with-limited-memory-5f0a404794b0 9 | 10 | 11 | -------------------------------------------------------------------------------- /paper/inference/迈向高效的生成式大语言模型服务综述-从算法到系统.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | LLM 推理优化技术分类: 4 | 5 | 算法创新 6 | 解码算法 7 | Non-autoregressive Decoding 8 | Speculative Decoding 9 | Early Exiting 10 | Cascade Inference 11 | 架构设计 12 | Config Downsizing 13 | Attention Simplification 14 | Recurrent Unit 15 | Activation Sharing 16 | Conditional Computing 17 | 模型压缩 18 | 知识蒸馏 19 | 网络剪枝 20 | 系统优化 21 | 低比特量化 22 | 并行计算 23 | 模型并行 24 | Decentralized Inference 25 | 内存管理 26 | 请求调度 27 | Kernel 优化 28 | Kernel 融合 29 | Tailored Attention 30 | Sampling Optimization 31 | Variable Sequence length 32 | 自动编译 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /paper/llm对齐综述.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | A Comprehensive Survey of LLM Alignment Techniques: RLHF, RLAIF, PPO, DPO and More 5 | https://arxiv.org/abs/2407.16216 6 | 7 | 8 | 9 | 10 | # 笔记 11 | 12 | https://f46522gm22.feishu.cn/docx/HfxXdnEheouZ42xAoWYcy1q8nSd 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /paper/moe/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | - https://github.com/GCYZSL/MoLA 7 | 8 | 9 | 10 | - 大模型微调新范式:当LoRA遇见MoE:https://zhuanlan.zhihu.com/p/683637455 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /paper/parameter-pruning/LLM-Pruner.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | - LLM-Pruner:https://github.com/horseee/LLM-Pruner 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /paper/parameter-pruning/SparseGPT.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - SparseGPT:https://github.com/IST-DASLab/sparsegpt 4 | 5 | 6 | -------------------------------------------------------------------------------- /paper/parameter-pruning/Wanda.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /paper/parameter-pruning/公式.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | - MarkDown数学公式基本语法:https://blog.csdn.net/qq_38342510/article/details/124064158 6 | - 史上最全Markdown公式、符号总结:https://blog.csdn.net/weixin_42782150/article/details/104878759 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /paper/training/Reducing Activation Recomputation in Large Transformer Models.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Reducing Activation Recomputation in Large Transformer Models:https://arxiv.org/pdf/2205.05198 6 | 7 | **选择性激活重计算**(selective activation recomputation),是一种策略,即只对那些**占用大量内存但重新计算成本不高的Transformer层的部分激活进行存储和重计算**。例如,在自注意力机制中,某些操作(如: $QK^T$矩阵乘法、softmax、softmax dropout和对V的注意力)会产生较大的激活,但每个输入元素所需的浮点运算次数却相对较低。通过选择性地存储这些激活,可以在使用较少内存的同时,以较低的计算开销重新计算未存储的激活。 8 | 9 | 10 | 11 | 通过结合使用序列并行性(sequence parallelism)和张量并行性(tensor parallelism),以及选择性激活重计算,论文中的方法能够在减少5倍激活内存需求的同时,将由激活重计算引起的执行时间开销降低90%以上。这使得在大规模参数的语言模型上训练变换器模型变得更加高效。 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /pic/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/.DS_Store -------------------------------------------------------------------------------- /pic/llm-action-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm-action-v2.png -------------------------------------------------------------------------------- /pic/llm-action-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm-action-v3.png -------------------------------------------------------------------------------- /pic/llm/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm/.DS_Store -------------------------------------------------------------------------------- /pic/llm/model/llm-famliy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm/model/llm-famliy.jpg -------------------------------------------------------------------------------- /pic/llm/model/llm-timeline-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm/model/llm-timeline-v2.png -------------------------------------------------------------------------------- /pic/llm/train/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm/train/.DS_Store -------------------------------------------------------------------------------- /pic/llm/train/pretrain/llm-pretrain-pipeline-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm/train/pretrain/llm-pretrain-pipeline-v2.png -------------------------------------------------------------------------------- /pic/llm/train/sft/peft方法.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/llm/train/sft/peft方法.jpg 
-------------------------------------------------------------------------------- /pic/wechat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/wechat.jpeg -------------------------------------------------------------------------------- /pic/wx-gzh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/wx-gzh.png -------------------------------------------------------------------------------- /pic/wx.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/wx.jpg -------------------------------------------------------------------------------- /pic/公众号.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liguodongiot/llm-action/9f0dde5738c3fe2b4ada3e58c1b2f43dd5d9651e/pic/公众号.jpeg --------------------------------------------------------------------------------