├── .dockerignore ├── .github └── workflows │ └── Build.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── README_EN.md ├── docker-compose.yaml ├── docs ├── benchmark.md ├── custom.md ├── custom_op.md ├── deepseek.md ├── demo_arguments.md ├── english_custom.md ├── english_demo_arguments.md ├── faq.md ├── ftllm.md ├── llama_cookbook.md ├── mixforward.md ├── models.md ├── qwen3.md ├── rocm.md ├── tfacc.md ├── version.md └── wechat_group0.jpg ├── example ├── Android │ └── LLMAssistant │ │ ├── .gitignore │ │ ├── .idea │ │ ├── .gitignore │ │ ├── .name │ │ ├── compiler.xml │ │ ├── dbnavigator.xml │ │ ├── deploymentTargetDropDown.xml │ │ ├── gradle.xml │ │ ├── misc.xml │ │ └── vcs.xml │ │ ├── app │ │ ├── .gitignore │ │ ├── build.gradle │ │ ├── libs │ │ │ ├── arm64-v8a │ │ │ │ └── libassistant.so │ │ │ └── armeabi-v7a │ │ │ │ └── libassistant.so │ │ ├── proguard-rules.pro │ │ ├── release │ │ │ ├── app-arm64-v8a-release-unsigned.apk │ │ │ ├── app-armeabi-v7a-release-unsigned.apk │ │ │ ├── app-universal-release-unsigned.apk │ │ │ └── app-x86-release-unsigned.apk │ │ └── src │ │ │ ├── androidTest │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── doujiao │ │ │ │ └── xiaozhihuiassistant │ │ │ │ └── ExampleInstrumentedTest.java │ │ │ ├── main │ │ │ ├── AndroidManifest.xml │ │ │ ├── cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LLMChat.cpp │ │ │ │ ├── LLMChat.h │ │ │ │ ├── main.cpp │ │ │ │ └── native-lib.cpp │ │ │ ├── java │ │ │ │ └── com │ │ │ │ │ └── doujiao │ │ │ │ │ ├── core │ │ │ │ │ └── AssistantCore.java │ │ │ │ │ └── xiaozhihuiassistant │ │ │ │ │ ├── ChatMessage.java │ │ │ │ │ ├── MainActivity.java │ │ │ │ │ ├── adapter │ │ │ │ │ ├── BaseViewHolder.java │ │ │ │ │ └── MyAdapter.java │ │ │ │ │ ├── utils │ │ │ │ │ ├── PrefUtil.java │ │ │ │ │ ├── StatusBarUtils.java │ │ │ │ │ └── UriUtils.java │ │ │ │ │ └── widget │ │ │ │ │ ├── ChatPromptViewManager.java │ │ │ │ │ ├── Location.java │ │ │ │ │ ├── PromptView.java │ │ │ │ │ ├── PromptViewHelper.java │ │ │ │ │ └── location │ │ │ │ │ ├── BottomCenterLocation.java │ │ │ │ │ ├── ICalculateLocation.java │ │ │ │ │ ├── TopCenterLocation.java │ │ │ │ │ ├── TopLeftLocation.java │ │ │ │ │ └── TopRightLocation.java │ │ │ └── res │ │ │ │ ├── drawable-v24 │ │ │ │ └── ic_launcher_foreground.xml │ │ │ │ ├── drawable │ │ │ │ ├── btnbg.xml │ │ │ │ ├── editbg.xml │ │ │ │ └── ic_launcher_background.xml │ │ │ │ ├── layout │ │ │ │ ├── activity_item_left.xml │ │ │ │ ├── activity_item_right.xml │ │ │ │ └── activity_main.xml │ │ │ │ ├── mipmap-anydpi-v26 │ │ │ │ ├── ic_launcher.xml │ │ │ │ └── ic_launcher_round.xml │ │ │ │ ├── mipmap-hdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-mdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-xhdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── mipmap-xxhdpi │ │ │ │ ├── glm.png │ │ │ │ ├── ic_launcher.webp │ │ │ │ ├── ic_launcher_round.webp │ │ │ │ └── me.png │ │ │ │ ├── mipmap-xxxhdpi │ │ │ │ ├── ic_launcher.webp │ │ │ │ └── ic_launcher_round.webp │ │ │ │ ├── values-night │ │ │ │ └── themes.xml │ │ │ │ └── values │ │ │ │ ├── colors.xml │ │ │ │ ├── strings.xml │ │ │ │ └── themes.xml │ │ │ └── test │ │ │ └── java │ │ │ └── com │ │ │ └── doujiao │ │ │ └── xiaozhihuiassistant │ │ │ └── ExampleUnitTest.java │ │ ├── build.gradle │ │ ├── gradle.properties │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ └── 
settings.gradle ├── FastllmStudio │ └── cli │ │ ├── cli.cpp │ │ ├── ui.cpp │ │ └── ui.h ├── Qui │ ├── FastLLM.cpp │ ├── bin │ │ ├── Qt5Core.dll │ │ ├── Qt5Gui.dll │ │ ├── Qt5Widgets.dll │ │ ├── Qui.exe │ │ ├── fastllm_cpu.exe │ │ ├── fastllm_cuda.exe │ │ ├── path.txt │ │ ├── platforms │ │ │ └── qwindows.dll │ │ ├── qui_cn.qm │ │ └── styles │ │ │ └── qwindowsvistastyle.dll │ └── src │ │ ├── Qui.cpp │ │ ├── Qui.h │ │ ├── Qui.pro │ │ ├── Qui.ui │ │ ├── main.cpp │ │ └── qui_cn.ts ├── README.md ├── Win32Demo │ ├── StringUtils.h │ ├── Win32Demo.cpp │ ├── Win32Demo.sln │ ├── Win32Demo.vcxproj │ ├── bin │ │ └── web │ │ │ ├── css │ │ │ ├── github-markdown-light.min.css │ │ │ ├── github.min.css │ │ │ ├── katex.min.css │ │ │ └── texmath.css │ │ │ ├── index.html │ │ │ └── js │ │ │ ├── highlight.min.js │ │ │ ├── katex.min.js │ │ │ ├── markdown-it-link-attributes.min.js │ │ │ ├── markdown-it.min.js │ │ │ └── texmath.js │ ├── fastllm-gpu.vcxproj │ ├── fastllm-gpu.vcxproj.filters │ ├── fastllm.vcxproj │ ├── fastllm.vcxproj.filters │ └── httplib.h ├── apiserver │ └── apiserver.cpp ├── benchmark │ ├── benchmark.cpp │ └── prompts │ │ ├── beijing.txt │ │ └── hello.txt ├── openai_server │ ├── README.md │ ├── fastllm_completion.py │ ├── openai_api_server.py │ ├── protocal │ │ ├── __init__.py │ │ └── openai_protocol.py │ └── requirements.txt ├── python │ ├── custom_model.py │ └── qwen2.py └── webui │ ├── httplib.h │ ├── web │ ├── css │ │ ├── github-markdown-light.min.css │ │ ├── github.min.css │ │ ├── katex.min.css │ │ └── texmath.css │ ├── index.html │ └── js │ │ ├── highlight.min.js │ │ ├── katex.min.js │ │ ├── markdown-it-link-attributes.min.js │ │ ├── markdown-it.min.js │ │ └── texmath.js │ └── webui.cpp ├── include ├── device.h ├── devices │ ├── cpu │ │ ├── alivethreadpool.h │ │ ├── computeutils.h │ │ ├── cpudevice.h │ │ └── cputhreadpool.h │ ├── cuda │ │ ├── cudadevice.h │ │ ├── fastllm-cuda.cuh │ │ └── fastllm-hip.h │ ├── multicuda │ │ ├── fastllm-multicuda.cuh │ │ └── multicudadevice.h │ ├── numa │ │ ├── computeserver.h │ │ ├── fastllm-numa.h │ │ ├── kvcache.h │ │ └── numadevice.h │ ├── tfacc │ │ ├── fastllm-tfacc.h │ │ └── tfaccdevice.h │ └── tops │ │ └── topsdevice.h ├── executor.h ├── fastllm.h ├── graph.h ├── model.h ├── models │ ├── basellm.h │ ├── bert.h │ ├── chatglm.h │ ├── cogvlm.h │ ├── deepseekv2.h │ ├── factoryllm.h │ ├── glm.h │ ├── graphllm.h │ ├── internlm2.h │ ├── llama.h │ ├── minicpm.h │ ├── minicpm3.h │ ├── moe.h │ ├── moss.h │ ├── phi3.h │ ├── qwen.h │ ├── qwen3.h │ ├── qwen3_moe.h │ └── xlmroberta.h ├── template.h └── utils │ ├── armMath.h │ ├── avxMath.h │ └── utils.h ├── install.sh ├── main.cpp ├── make_whl.sh ├── make_whl_rocm.sh ├── pyfastllm ├── README.md ├── examples │ ├── cli_low_level.py │ ├── cli_simple.py │ ├── convert_model.py │ ├── test_chatglm2.py │ ├── test_chatglm2_cpp.py │ ├── test_chatglm2_func.py │ ├── test_ops.py │ ├── web_api.py │ └── web_api_client.py ├── fastllm │ ├── __init__.py │ ├── convert.py │ ├── functions │ │ ├── __init__.py │ │ ├── custom_ops.py │ │ ├── fastllm_ops.py │ │ ├── numpy_ops.py │ │ └── util.py │ ├── hub │ │ ├── __init__.py │ │ └── chatglm2.py │ ├── models.py │ ├── nn │ │ ├── __init__.py │ │ ├── base_module.py │ │ └── modules.py │ └── utils │ │ ├── __init__.py │ │ ├── converter.py │ │ ├── quantizer.py │ │ └── writer.py ├── install.sh └── setup.py ├── requirements-server.txt ├── simple_install.sh ├── src ├── device.cpp ├── devices │ ├── cpu │ │ ├── avx512bf16.cpp │ │ ├── avx512vnni.cpp │ │ ├── cpudevice.cpp │ │ ├── cpudevicebatch.cpp │ │ └── 
linear.cpp │ ├── cuda │ │ ├── cudadevice.cpp │ │ ├── cudadevicebatch.cpp │ │ └── fastllm-cuda.cu │ ├── multicuda │ │ ├── fastllm-multicuda.cu │ │ └── multicudadevice.cpp │ ├── numa │ │ ├── computeserver.cpp │ │ ├── fastllm-numa.cpp │ │ ├── kvcache.cpp │ │ └── numadevice.cpp │ ├── tfacc │ │ ├── fastllm-tfacc.cpp │ │ └── tfaccdevice.cpp │ └── tops │ │ └── topsdevice.cpp ├── executor.cpp ├── fastllm.cpp ├── graph.cpp ├── model.cpp ├── models │ ├── basellm.cpp │ ├── bert.cpp │ ├── chatglm.cpp │ ├── cogvlm.cpp │ ├── deepseekv2.cpp │ ├── glm.cpp │ ├── graph │ │ ├── fastllmjson.cpp │ │ ├── gemma2.cpp │ │ ├── minicpm3.cpp │ │ ├── phi3.cpp │ │ ├── qwen2.cpp │ │ └── telechat.cpp │ ├── graphllm.cpp │ ├── internlm2.cpp │ ├── llama.cpp │ ├── minicpm.cpp │ ├── minicpm3.cpp │ ├── moe.cpp │ ├── moss.cpp │ ├── phi3.cpp │ ├── qwen.cpp │ ├── qwen3.cpp │ ├── qwen3_moe.cpp │ └── xlmroberta.cpp ├── pybinding.cpp └── template.cpp ├── test ├── basic │ ├── config.py │ ├── forward_check.py │ └── tokenizer_check.py ├── cmmlu │ ├── README.md │ ├── baichuan.py │ ├── categories.py │ ├── chatglm.py │ ├── eval.py │ ├── minicpm3.py │ ├── qwen.py │ └── qwq.py └── ops │ ├── cppOps.cpp │ └── tokenizerTest.cpp ├── third_party ├── hipify_torch │ ├── LICENSE.txt │ ├── README.md │ ├── cmake │ │ └── Hipify.cmake │ ├── hipify_cli.py │ ├── hipify_torch │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── cuda_to_hip_mappings.py │ │ ├── hipify_python.py │ │ └── version.py │ ├── setup.py │ ├── test │ │ └── test_installation.py │ └── tools │ │ └── replace_cuda_with_hip_files.py ├── json11 │ ├── json11.cpp │ └── json11.hpp └── tfacc │ ├── driver │ └── tfacc2 │ │ ├── Makefile │ │ ├── build_driver.sh │ │ ├── modules.order │ │ ├── tfacc2.c │ │ └── tfacc2.h │ ├── launch.py │ ├── pull.sh │ └── server ├── tools ├── fastllm_pytools │ ├── __init__.py │ ├── chat.py │ ├── cli.py │ ├── download.py │ ├── export.py │ ├── hf_model.py │ ├── llm.py │ ├── openai_server │ │ ├── fastllm_completion.py │ │ ├── fastllm_embed.py │ │ ├── fastllm_model.py │ │ ├── fastllm_reranker.py │ │ └── protocal │ │ │ └── openai_protocol.py │ ├── server.py │ ├── torch2flm.py │ ├── ui.py │ ├── util.py │ ├── web_demo.py │ └── webui.py ├── scripts │ ├── alpaca2flm.py │ ├── baichuan2_2flm.py │ ├── baichuan2flm.py │ ├── bert2flm.py │ ├── chatglm_export.py │ ├── cli_demo.py │ ├── glm_export.py │ ├── llama3_to_flm.py │ ├── llamalike2flm.py │ ├── minicpm2flm.py │ ├── moss_export.py │ ├── qwen2flm.py │ ├── setup.py │ ├── setup_rocm.py │ └── web_demo.py └── src │ ├── pytools.cpp │ ├── pytools_t2s.cpp │ └── quant.cpp ├── whl_docker └── Dockerfile └── whl_docker_rocm ├── 24.04 └── Dockerfile └── Dockerfile /.dockerignore: -------------------------------------------------------------------------------- 1 | ./models 2 | ./build/ -------------------------------------------------------------------------------- /.github/workflows/Build.yml: -------------------------------------------------------------------------------- 1 | name: Action Build 2 | on: [push] 3 | 4 | jobs: 5 | build: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - name: Set up JDK 10 | uses: actions/setup-java@v1 11 | with: 12 | java-version: '11' 13 | 14 | - name: Checkout code 15 | uses: actions/checkout@v2 16 | 17 | #- name: Build with arm64-v8a 18 | # run: | 19 | # wget -q https://dl.google.com/android/repository/android-ndk-r22b-linux-x86_64.zip 20 | # unzip android-ndk-r22b-linux-x86_64.zip 21 | # export NDK=$GITHUB_WORKSPACE/android-ndk-r22b 22 | # mkdir build-android 23 | # cd build-android 24 | #ls 
${NDK}/build/cmake/android.toolchain.cmake 25 | # cmake -DCMAKE_MAKE_PROGRAM=/usr/bin/make -DCMAKE_CXX_COMPILER=/usr/bin/g++ -DCMAKE_TOOLCHAIN_FILE=${NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_CXX_FLAGS=-march=armv8.2a+dotprod .. 26 | # make -j -B 27 | # cp main fastllm-main-android 28 | 29 | - name: Build with x86 30 | run: | 31 | mkdir build-x86 32 | cd build-x86 33 | cmake .. -DUSE_CUDA=OFF 34 | make -j $(nproc) 35 | cp main fastllm-main-x86_64 36 | 37 | - name: Export and Upload Artifact 38 | uses: actions/upload-artifact@v4 39 | with: 40 | name: Output 41 | path: | 42 | build-android/fastllm-main-android 43 | build-x86/fastllm-main-x86_64 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.pyc 3 | token 4 | /cmake-build-debug/ 5 | /build* 6 | /pyfastllm/build/ 7 | /pyfastllm/dist/ 8 | /.idea/ 9 | /.vscode/ 10 | /example/Win32Demo/bin/*.* 11 | /example/Win32Demo/Win32 12 | /example/Win32Demo/x64 13 | /example/Win32Demo/*.filters 14 | /example/Win32Demo/*.user 15 | /example/Win32Demo/.vs 16 | /example/Android/LLMAssistant/*.iml 17 | /example/Android/LLMAssistant/.gradle 18 | /example/Android/LLMAssistant/local.properties 19 | /example/Android/LLMAssistant/.idea/caches 20 | /example/Android/LLMAssistant/.idea/libraries 21 | /example/Android/LLMAssistant/.idea/modules.xml 22 | /example/Android/LLMAssistant/.idea/workspace.xml 23 | /example/Android/LLMAssistant/.idea/navEditor.xml 24 | /example/Android/LLMAssistant/.idea/assetWizardSettings.xml 25 | /example/Android/LLMAssistant/.DS_Store 26 | /example/Android/LLMAssistant/build 27 | /example/Android/LLMAssistant/captures 28 | /example/Android/LLMAssistant/.externalNativeBuild 29 | /example/Android/LLMAssistant/.cxx 30 | /example/Android/LLMAssistant/local.properties 31 | /test/cmmlu/results/ 32 | /models/ 33 | /localtest/ 34 | /third_party/tfacc/driver/tfacc2/result 35 | /.chainlit 36 | /.files 37 | /src/devices/hip 38 | /src/devices/multihip 39 | /test/mmlu 40 | *.o -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/pybind11"] 2 | path = third_party/pybind11 3 | url = https://github.com/pybind/pybind11.git 4 | branch = v2.10.5 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1-labs 2 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 3 | 4 | # Update Apt repositories 5 | RUN apt-get update 6 | 7 | # Install and configure Python 8 | RUN apt-get -y --no-install-recommends install wget build-essential python3.10 python3-pip 9 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 10 | RUN pip install setuptools streamlit-chat 11 | 12 | ENV WORKDIR /fastllm 13 | 14 | # Install cmake 15 | RUN wget -c https://cmake.org/files/LatestRelease/cmake-3.28.3-linux-x86_64.sh && bash ./cmake-3.28.3-linux-x86_64.sh --skip-license --prefix=/usr/ 16 | 17 | WORKDIR $WORKDIR 18 | ADD . $WORKDIR/ 19 | 20 | RUN mkdir $WORKDIR/build && cd build && cmake .. 
-DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native && make -j && cd tools && python setup.py install
21 |
22 | CMD /fastllm/build/webui -p /models/chatglm2-6b-int8.flm
23 |
-------------------------------------------------------------------------------- /docker-compose.yaml: --------------------------------------------------------------------------------
1 | version: '3.8'
2 | services:
3 |   fastllm:
4 |     build:
5 |       context: .
6 |       args:
7 |         DOCKER_BUILDKIT: 0
8 |     # privileged: true
9 |       platforms:
10 |         - "linux/amd64"
11 |       tags:
12 |         - "fastllm:v0.9"
13 |     restart: always
14 |     ports:
15 |       - 11234:8081
16 |     volumes:
17 |       - ./models/:/models/
18 |     command: /fastllm/build/webui -p /models/chatglm2-6b-int8.flm -w ./example/webui/web
19 |
20 |
-------------------------------------------------------------------------------- /docs/benchmark.md: --------------------------------------------------------------------------------
1 | ## Inference Speed
2 |
3 | You can measure speed with the benchmark program; inference speed varies somewhat with configuration and input.
4 |
5 | For example:
6 |
7 | ``` sh
8 | ./benchmark -p ~/chatglm-6b-int4.flm -f ../example/benchmark/prompts/beijing.txt -b 1
9 | ./benchmark -p ~/chatglm-6b-int8.flm -f ../example/benchmark/prompts/beijing.txt -b 1
10 | ./benchmark -p ~/chatglm-6b-fp16.flm -f ../example/benchmark/prompts/hello.txt -b 512 -l 18
11 | ```
12 |
13 | | Model | Data precision | Platform | Batch | Max inference speed (tokens / s) |
14 | |-----------------:|---------|--------------------|-----------|---------------------:|
15 | | ChatGLM-6b-int4 | float32 | RTX 4090 | 1 | 176 |
16 | | ChatGLM-6b-int8 | float32 | RTX 4090 | 1 | 121 |
17 | | ChatGLM-6b-fp16 | float32 | RTX 4090 | 64 | 2919 |
18 | | ChatGLM-6b-fp16 | float32 | RTX 4090 | 256 | 7871 |
19 | | ChatGLM-6b-fp16 | float32 | RTX 4090 | 512 | 10209 |
20 | | ChatGLM-6b-int4 | float32 | Xiaomi 10 Pro - 4 Threads | 1 | 4 ~ 5 |
-------------------------------------------------------------------------------- /docs/custom.md: --------------------------------------------------------------------------------
1 | ### Custom Models
2 |
3 | Models not natively supported by the Fastllm framework can be supported by defining a custom model structure.
4 |
5 | A Python custom model needs only a single Python file describing the model structure; see the implementation in [QWEN](../example/python/qwen2.py) for reference.
6 |
7 | ### Using a Python custom model
8 |
9 | When using ftllm.chat, ftllm.webui, or ftllm.server, pass the --custom argument to specify the custom model file.
10 |
11 | Suppose our model is in the `~/Qwen2-7B-Instruct/` directory and the custom model file is `~/qwen2.py`.
12 |
13 | Then you can use the command
14 |
15 | ``` sh
16 | python3 -m ftllm.chat -t 16 -p ~/Qwen2-7B-Instruct/ --custom ~/qwen2.py
17 | ```
18 |
19 | to load the Qwen2 model through the custom model file; server and webui work the same way.
20 |
21 | ### Writing a Python custom model
22 |
23 | A custom model requires a model description class that inherits from ftllm.llm.ComputeGraph.
24 |
25 | This corresponds to the following code in [QWEN](../example/python/qwen2.py):
26 |
27 | ``` python
28 | from ftllm.llm import ComputeGraph
29 | class Qwen2Model(ComputeGraph):
30 | ```
31 |
32 | At the end of the file, the `__model__` variable must be defined to point at the class describing the custom model structure, corresponding to:
33 |
34 | ``` python
35 | __model__ = Qwen2Model
36 | ```
37 |
38 | The model description class must implement a build method, which fetches the model parameters and describes the computation flow.
39 |
40 | The example code illustrates this:
41 |
42 | ``` python
43 | class Qwen2Model(ComputeGraph):
44 |     def build(self):
45 |         # 1. Get weight, data, config
46 |         weight, data, config = self.weight, self.data, self.config
47 |
48 |         # 2. Set some config values
49 |         config["max_positions"] = 128000
50 |
51 |         # 3. Describe the computation flow
52 |         head_dim = config["hidden_size"] // config["num_attention_heads"]
53 |         self.Embedding(data["inputIds"], weight["model.embed_tokens.weight"], data["hiddenStates"]);
54 |         # The computation flow follows; see the example code for details
55 | ```
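Putting the pieces together, a complete custom-model file is just the description class plus the `__model__` marker. The following minimal skeleton is a sketch that assumes only what is quoted above — `Embedding` is the one operator shown here, and the full operator list lives in [custom model operators](./custom_op.md):

``` python
# my_model.py -- minimal custom-model skeleton (illustrative sketch)
from ftllm.llm import ComputeGraph

class MyModel(ComputeGraph):
    def build(self):
        weight, data, config = self.weight, self.data, self.config

        # Any config value missing from config.json must be assigned here;
        # "max_positions" (the maximum context length) is mandatory.
        config["max_positions"] = 128000

        # Describe the computation flow with graph ops; each op reads and
        # writes named entries of data/weight, e.g. the token embedding lookup:
        self.Embedding(data["inputIds"], weight["model.embed_tokens.weight"], data["hiddenStates"])
        # ... attention / MLP blocks follow, built from the ops in custom_op.md ...

# Required: tells ftllm which class describes the model structure.
__model__ = MyModel
```

Such a file would be loaded exactly like the Qwen2 example, e.g. `python3 -m ftllm.chat -p <model_dir> --custom my_model.py`.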
56 |
57 | #### `self.config`
58 |
59 | The model configuration, read by default from the `config.json` file in the model folder.
60 |
61 | The build method may modify config values; for example, changing `max_positions` changes the context length.
62 |
63 | Some models' `config.json` files use different variable names, which must be assigned to config manually during build.
64 |
65 | For example, the TeleChat7B model's configuration has no `max_positions` variable; it uses `seq_length` for the length instead, so the build method must assign it as follows:
66 |
67 | ``` python
68 | self.config["max_positions"] = self.config["seq_length"]
69 | ```
70 |
71 | The following config variables must be assigned (no action is needed if config.json already uses the same name):
72 |
73 | ``` python
74 | self.config["max_positions"] # the maximum context length
75 | ```
76 |
77 | #### `self.weight`
78 |
79 | Represents the weight data.
80 |
81 | `self.weight[weightName]` refers to the parameter named weightName in the model files (matching the parameter names in the .safetensors files of the HF model folder).
82 |
83 | #### ```self.data```
84 |
85 | Represents the intermediate and input variables of the computation flow.
86 |
87 | `self.data[dataName]` refers to the intermediate variable named dataName; `dataName` may be any string other than the input variable names below.
88 |
89 | Input variables:
90 |
91 | ``` python
92 | data["inputIds"] # input tokens
93 | data["positionIds"] # position information
94 | data["attentionMask"] # mask information
95 | data["sin"] # sin for rotary embedding
96 | data["cos"] # cos for rotary embedding
97 | data["atype"] # data type used during inference
98 | data["pastKey."][i] # key cache of block i
99 | data["pastValue."][i] # value cache of block i
100 | ```
101 |
102 | #### Computation flow and operators
103 |
104 | Use the operator-adding functions of the ComputeGraph base class to describe the computation flow.
105 |
106 | The currently supported operators are documented in [custom model operators](./custom_op.md).
107 |
108 | ### C++ custom models
109 |
110 | (The C++ custom model interface is still being revised...)
111 |
-------------------------------------------------------------------------------- /docs/demo_arguments.md: --------------------------------------------------------------------------------
1 | # Fastllm Python Demo Argument Reference
2 |
3 | ## Common arguments
4 |
5 | Model-related options, usable by the OpenAI API server, the WebUI, and the chat demo.
6 |
7 | - **Model path (`-p, --path`)**: Specifies the model path, either a fastllm model file or a Hugging Face model folder. For example:
8 | ```bash
9 | --path ~/Qwen2-7B-Instruct/ # Read the model from ~/Qwen2-7B-Instruct/; this must be a standard Hugging Face-format model downloaded from HuggingFace, ModelScope, or another site. Formats such as AWQ and GPTQ are not supported yet.
10 | --path ~/model.flm # Read the model from ~/model.flm, a model file in Fastllm format.
11 | ```
12 | - **Inference type (`--atype`)**: Sets the intermediate computation type; may be `float16` or `float32`.
13 | - **Weight type (`--dtype`)**: Specifies the model weight type, used when reading Hugging Face models. May be `float16`, `int8`, `int4`, or `int4g` (int4 grouped quantization). For example:
14 | ```bash
15 | --dtype float16 # Use float16 weights (no quantization)
16 | --dtype int8 # Quantize online to int8 weights
17 | --dtype int4g128 # Quantize online to grouped int4 weights (128 weights per group)
18 | --dtype int4g256 # Quantize online to grouped int4 weights (256 weights per group)
19 | --dtype int4 # Quantize online to int4 weights
20 | ```
21 | - **Device (`--device`)**: Specifies the device the server uses; may be `cpu`, `cuda`, or any additionally compiled device type.
22 | - **CUDA Embedding (`--cuda_embedding`)**: If this flag is set and the device is `cuda`, the embedding operation runs on the CUDA device. This slightly improves speed but also increases GPU memory usage; recommended only when GPU memory is plentiful.
23 | - **KV cache limit (`--kv_cache_limit`)**: Sets the maximum KV cache usage. If this argument is omitted or set to `auto`, the framework handles it automatically. Manual examples:
24 | ```bash
25 | --kv_cache_limit 5G # Set to 5G
26 | --kv_cache_limit 100M # Set to 100M
27 | --kv_cache_limit 168K # Set to 168K
28 | ```
29 | - **Max batch size (`--max_batch`)**: Sets how many requests are processed at once. If omitted, the framework handles it automatically.
30 | - **Thread count (`-t, --threads`)**: Sets the number of CPU threads. This strongly affects speed when the device is `cpu`; with `cuda` the effect is small, mainly on model loading speed.
31 | - **Custom model description file (`--custom`)**: Specifies the Python file describing a custom model. See [custom models](custom.md) for details.
32 |
33 | ## OpenAI API server arguments
34 | - **Model name (`--model_name`)**: The name of the deployed model; API calls are validated against this name.
35 | - **API server host (`--host`)**: Sets the API server's host address.
36 | - **API server port (`--port`)**: Sets the API server's port.
37 |
38 |
39 | ## Web UI arguments
40 | - **Port (`--port`)**: Sets the WebUI port.
41 | - **Page title (`--title`)**: Sets the WebUI page title.
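Since the API server described above speaks the OpenAI protocol, any standard OpenAI client can talk to it. Below is a minimal sketch; the model name, host, port, and api_key are placeholders that must match whatever you passed via `--model_name`, `--host`, and `--port` (and the server's api_key, if one was set):

``` python
# Minimal client call against the fastllm OpenAI-style API server (sketch).
# Assumes a server started along the lines of:
#   python3 -m ftllm.server -p ~/model.flm --model_name my-model --host 127.0.0.1 --port 8080
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="my-model",  # checked against --model_name
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)
```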
-------------------------------------------------------------------------------- /docs/ftllm.md: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/docs/ftllm.md
-------------------------------------------------------------------------------- /docs/mixforward.md: --------------------------------------------------------------------------------
1 | # Mixed Inference Guide
2 |
3 | This document uses the `DeepSeek-V3-0324-INT4` model as an example to show how mixed inference can squeeze the most out of your hardware.
4 |
5 | ## Basic usage
6 |
7 | Suppose we deploy `DeepSeek-V3-0324-INT4` on a machine with two 48G GPUs; the usual command is:
8 |
9 |
10 | ```
11 | ftllm server fastllm/DeepSeek-V3-0324-INT4
12 | ```
13 |
14 | By default this runs the MoE part of the model on the CPU and the non-MoE part on CUDA, equivalent to the following command:
15 |
16 | ```
17 | ftllm server fastllm/DeepSeek-V3-0324-INT4 --device cuda --moe_device cpu
18 | ```
19 |
20 | (Note: the optimizations below currently only work for mixed `cuda` + `cpu` inference; `numa` cannot use them.)
21 |
22 | ## Running some MoE layers on a single GPU
23 |
24 | With the command above there is plenty of GPU memory left over; by setting `moe_device` we can pin part of the MoE layers
25 | to run on CUDA:
26 |
27 | ```
28 | ftllm server fastllm/DeepSeek-V3-0324-INT4 --device cuda --moe_device "{'cuda':1,'cpu':19}"
29 | ```
30 |
31 | Here `moe_device` is set to `"{'cuda':1,'cpu':19}"`, meaning `1/20` of the MoE layers run on CUDA and `19/20` run on the CPU.
32 |
33 | This slightly improves decode speed, but may reduce the maximum context length.
34 |
35 | ## Running some MoE layers on multiple GPUs
36 |
37 | The following command uses multiple GPUs to accelerate part of the MoE layers:
38 |
39 | ```
40 | ftllm server fastllm/DeepSeek-V3-0324-INT4 --device cuda --moe_device "{'multicuda:0,1':15,'cpu':85}"
41 | ```
42 |
43 | Here `moe_device` is set to `"{'multicuda:0,1':15,'cpu':85}"`, meaning `15/100` of the MoE layers run tensor-parallel across GPUs 0 and 1, and `85/100` run on the CPU.
44 |
45 | This further improves decode speed.
46 |
47 | (You can reasonably stop reading here, but keep going if you want the fancier tricks.)
48 |
49 | ## Running some MoE layers with hybrid tensor parallelism
50 |
51 | The following command uses hybrid tensor parallelism to accelerate part of the MoE layers:
52 |
53 | ```
54 | ftllm server fastllm/DeepSeek-V3-0324-INT4 --device cuda --moe_device "{'multicuda:0:3,1:3,cpu:2':15,'cpu':85}"
55 | ```
56 |
57 | Here `moe_device` is set to `"{'multicuda:0:3,1:3,cpu:2':15,'cpu':85}"`, meaning:
58 | - `15/100` of the MoE layers use hybrid tensor parallelism: both GPUs and the CPU work simultaneously, with `3/8` of the compute on GPU 0, `3/8` on GPU 1, and `2/8` on the CPU.
59 | - `85/100` of the MoE layers run on the CPU.
60 |
61 | In theory this improves decode speed even further, but the current implementation is inefficient and still slower than the previous step; further optimization is planned.
62 |
-------------------------------------------------------------------------------- /docs/qwen3.md: --------------------------------------------------------------------------------
1 | ## Qwen3 Models
2 |
3 | Qwen3 is a model family from Alibaba.
4 |
5 | ### Installing Fastllm
6 |
7 | - PIP install
8 |
9 | On Linux you can try installing directly via pip:
10 | ```
11 | pip install ftllm -U
12 | ```
13 | If the install fails, see [building from source](../README.md#安装).
14 |
15 | ### Examples
16 |
17 | #### Command-line chat:
18 |
19 | ```
20 | ftllm run fastllm/Qwen3-235B-A22B-INT4MIX
21 | ftllm run Qwen/Qwen3-30B-A3B
22 | ```
23 |
24 | #### webui:
25 |
26 | ```
27 | ftllm webui fastllm/Qwen3-235B-A22B-INT4MIX
28 | ftllm webui Qwen/Qwen3-30B-A3B
29 | ```
30 |
31 | #### api server (OpenAI style):
32 |
33 | ```
34 | ftllm server fastllm/Qwen3-235B-A22B-INT4MIX
35 | ftllm server Qwen/Qwen3-30B-A3B
36 | ```
37 |
38 | #### Recommended arguments
39 |
40 | If needed, the following arguments can be added to the run command.
41 |
42 | - Thinking mode: a Qwen3-specific mode, enabled by default; it can be disabled via the enable_thinking argument, after which the model generates no thinking content. For example:
43 |
44 | ```bash
45 | ftllm server Qwen/Qwen3-30B-A3B --enable_thinking false
46 | ```
47 |
48 | - Inference device: non-MoE models use the GPU by default. If GPU memory is insufficient and you want pure-CPU inference, set `--device cpu`, or `--device numa` for multi-NUMA acceleration.
49 | - Quantization: for the Qwen3 family we currently recommend `--dtype int4g256` for 4-bit quantization and `--dtype int8` for 8-bit quantization.
50 |
51 |
52 | - MoE models (Qwen3-30B-A3B, Qwen3-235B-A22B) use mixed CPU+GPU inference by default; to use CUDA inference, specify the device argument, e.g.
53 | ``` bash
54 | ftllm server Qwen/Qwen3-30B-A3B --device cuda --dtype int4g256
55 | ftllm server Qwen/Qwen3-30B-A3B --device cuda --dtype int8
56 | ```
57 |
58 | - For more argument details, see [common arguments](../README.md#常用参数).
59 |
60 | #### NUMA acceleration
61 |
62 | To use a single NUMA node, binding it with numactl is recommended.
63 |
64 | Multi-NUMA acceleration can be enabled via an environment variable (the PIP build can enable it directly; source builds must be compiled with the -DUSE_NUMA=ON option):
65 |
66 | ```
67 | export FASTLLM_USE_NUMA=ON
68 | # export FASTLLM_NUMA_THREADS=27 # optional; sets the number of threads started per NUMA node
69 | ```
70 |
71 | #### Local models
72 |
73 | You can launch a locally downloaded model (original models, AWQ models, and FASTLLM models are supported; GGUF models are not yet). Assuming the local model path is `/mnt/Qwen/Qwen3-30B-A3B`,
74 | it can be launched as follows (webui and server work the same way):
75 |
76 | ```
77 | ftllm run /mnt/Qwen/Qwen3-30B-A3B
78 | ```
79 |
80 | ### Model download
81 |
82 | Download a model locally with:
83 |
84 | ```
85 | ftllm download Qwen/Qwen3-30B-A3B
86 | ```
87 |
-------------------------------------------------------------------------------- /docs/rocm.md: --------------------------------------------------------------------------------
1 | # Building with ROCm
2 |
3 | ## 0. Supported platforms
4 |
5 | ROCm builds are currently supported on Linux only.
6 |
7 | Currently supported GPU models:
8 |
9 | - AMD Radeon Instinct MI series, e.g. MI50, MI100, MI210
10 | - AMD Radeon RDNA RX 7000 gaming and workstation card series, e.g. W7800, W7900
11 | - Hygon GPUs such as the K100 (unverified, theoretically workable)
12 |
13 | ## 1. Install ROCm and find your ROCm arch
14 |
15 | Follow the [official ROCm documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/) to install ROCm.
16 |
17 | You can find your GPU's ROCm arch in the LLVM target column of the [architecture list](https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html).
18 |
19 | Architectures of common GPUs:
20 | | Arch | Family | Representative products | Recommended ROCm version |
21 | |----------|-----------|---------------------------------------------|----------------|
22 | | gfx900 | GCN5.0 | Radeon Instinct MI25 | ❌ unsupported |
23 | | gfx906 | GCN5.1 | Radeon VII, Instinct MI50 | [6.3.3](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.3.3/install/quick-start.html) |
24 | | gfx908 | CDNA | Radeon Instinct MI100 | [6.4.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/install/quick-start.html) |
25 | | gfx90a | CDNA2 | Radeon Instinct MI210/MI250/MI250X | [6.4.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/install/quick-start.html) |
26 | | gfx942 | CDNA3 | Instinct MI300A/MI300X/MI325X | [6.4.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/install/quick-start.html) |
27 | | gfx1030 | RDNA2 | Radeon PRO W6800/V620 | [6.4.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/install/quick-start.html) |
28 | | gfx1100 | RDNA3 | Radeon PRO W7800/W7900, RX 7900 XT/XTX/GRE | [6.4.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/install/quick-start.html) |
29 | | gfx1101 | RDNA3 | Radeon PRO V710 | [6.4.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/install/quick-start.html) |
30 |
31 |
32 |
33 | Join the architectures you want to build for with `;` and pass them via the `-DROCM_ARCH` parameter. The default is `gfx908;gfx90a;gfx1100`.
34 |
35 | Note that some GPUs (e.g. the RX 6000 series and the MI50) do not support `rocwmma` matrix-multiplication acceleration; if even one GPU in the list lacks `rocwmma` support, the build will not use `rocwmma`.
36 |
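If you are unsure which arch your installed GPU actually reports, querying the ROCm runtime is often quicker than the spec table. A small sketch (`rocminfo` ships with ROCm; its output format may vary between versions):

``` sh
# List the gfx targets of all GPUs visible to ROCm (assumes ROCm is installed)
rocminfo | grep -oE 'gfx[0-9a-f]+' | sort -u
```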
37 | ## 2. Build
38 |
39 | ``` sh
40 | bash install.sh -DUSE_ROCM=ON -DROCM_ARCH="gfx908;gfx90a;gfx1100"
41 | ```
42 |
43 | ## TODO
44 |
45 | - [ ] Verification on Hygon GPUs
46 | - [ ] `rocwmma` support, enabling matrix-multiplication acceleration
47 |
48 | ## Acknowledgements
49 |
50 | [leavelet](https://github.com/leavelet) contributed the ROCm support.
51 |
-------------------------------------------------------------------------------- /docs/tfacc.md: --------------------------------------------------------------------------------
1 | ## About TFACC
2 |
3 | TFACC is the AI compute platform of ThinkForce's 7000-series processors; it can be used to accelerate LLM inference on TF 7000-series processors.
4 |
5 | ## Quick start
6 |
7 | ### Load the driver
8 |
9 | ``` sh
10 | cd fastllm/third_party/tfacc/driver/tfacc2
11 | ./build_driver.sh
12 | modprobe tfacc2
13 | ```
14 |
15 | ### Start the TFACC compute service
16 |
17 | ``` sh
18 | cd fastllm/third_party/tfacc
19 | python3 ./launch.py 4 & # the argument is the number of NUMA nodes; set it according to your specific 7000-series server model
20 | ```
21 |
22 | ### Build
23 |
24 | Building with cmake is recommended; install a C++ compiler, make, and cmake first.
25 |
26 | gcc 9.4 or newer and cmake 3.23 or newer are recommended.
27 |
28 | Build with the following command:
29 |
30 | ``` sh
31 | bash install.sh -DUSE_TFACC=ON
32 | ```
33 |
34 | ### Run the demos
35 |
36 | We assume a model named `model.flm` has already been obtained (see [obtaining a model](#模型获取); first-time users can download a pre-converted model).
37 |
38 | After building, the following demos are available in the build directory:
39 |
40 | ``` sh
41 | # now in the fastllm/build directory
42 |
43 | # command-line chat program with a typewriter effect (Linux only)
44 | ./main -p model.flm
45 |
46 | # simple webui using streaming output + dynamic batching, supporting concurrent access
47 | ./webui -p model.flm --port 1234
48 |
49 | # Python command-line chat program, demonstrating model creation and streaming dialogue
50 | python tools/cli_demo.py -p model.flm
51 |
52 | # simple Python webui; install streamlit-chat first
53 | streamlit run tools/web_demo.py model.flm
54 |
55 | ```
56 |
57 | For more features and interfaces, see the [full documentation](../README.md)
-------------------------------------------------------------------------------- /docs/version.md: --------------------------------------------------------------------------------
1 | ## V0.1.2.0
2 |
3 | - Standardized the version number a.b.c.d
4 |   - a is reserved, currently 0
5 |   - b is the major version
6 |   - c is the minor version
7 |   - d is the bug-fix release number
8 |
9 | ## V0.0.1.2
10 |
11 | - Optimized NUMA acceleration
12 | - Slightly improved prefill and decode speed
13 | - Added hybrid tensor parallelism for MoE; see the [mixed inference guide](mixforward.md)
14 | - Fixed some multicuda bugs; hybrid tensor parallelism now supports all precisions
15 | - Fixed some bugs in the C++ Jinja templates; the built-in tokenizers of Qwen3, DS, and other model families are now supported
16 |
17 | ## V0.0.1.1
18 |
19 | - Added `FP8_E4M3` precision (works on both new and old hardware)
20 | - MoE models can set mixed precision with `--moe_dtype`
21 | - `pip` installation now works in `ROCM` environments
22 | - Fixed some bugs in the C++ Jinja templates
23 | - The api server's default output token count was raised from 8K to 32K
24 |
25 | ## V0.0.1.0
26 |
27 | - Added support for the Qwen3 models [deployment guide](qwen3.md)
28 | - Optimized GPU memory usage of the DeepSeek models
29 | - Added the `--cache_fast` parameter to control whether the GPU-memory cache is used
30 |
31 | ## V0.0.0.9
32 |
33 | - Optimized the multi-turn conversation cache when using DeepSeek models
34 | - Slightly improved multi-concurrency speed of the DeepSeek models
35 | - Reduced GPU memory consumption during DeepSeek prefill, supporting longer contexts
36 | - Added INT8 quantization for DeepSeek models (`--dtype int8` with the original model, or `--dtype int8` at export)
37 | - Suppressed the "None of PyTorch, TensorFlow >= 2.0 ..." warning message
38 | - Added the `--cache_dir` parameter to specify the cache directory
39 | - The server gained a `--hide_input` parameter to hide request contents in the logs
40 | - The webui gained a `--max_token` parameter to set the maximum output, and a --think parameter to force thinking
41 |
42 | ## V0.0.0.8
43 |
44 | - The api server gained an api_key parameter to set an api_key
45 | - The api server supports some composite inputs
46 | - Improved prefill speed of MoE models
47 | - Added the --version parameter to show the version number
48 |
49 | ## V0.0.0.7
50 |
51 | - Added a config option; models can be launched via a config.json file
52 | - Improved MoE model speed
53 |
54 | ## V0.0.0.6
55 |
56 | - Lowered the GLIBC requirement; the PIP package is compatible with more systems
57 | - The PIP package supports more architectures (currently down to SM_52)
58 |
59 | ## V0.0.0.5
60 |
61 | - Updated the docs, adding notes on cases where pip installs cannot be used
62 | - Chat mode automatically reads the model's generation config file
63 | - Fixed kv_cache_limit miscalculation in some cases
64 |
65 | ## V0.0.0.4
66 |
67 | - Added the ftllm run, chat, webui, server interfaces
-------------------------------------------------------------------------------- /docs/wechat_group0.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/docs/wechat_group0.jpg
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.gitignore: --------------------------------------------------------------------------------
1 | *.iml
2 | .gradle
3 | /local.properties
4 | /.idea/caches
5 | /.idea/libraries
6 | /.idea/modules.xml
7 | /.idea/workspace.xml
8 | /.idea/navEditor.xml
9 | /.idea/assetWizardSettings.xml
10 | .DS_Store
11 | /build
12 | /captures
13 | .externalNativeBuild
14 | .cxx
15 | local.properties
16 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/.gitignore: --------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/.name: --------------------------------------------------------------------------------
1 | XiaoZhihuiAssistant
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/compiler.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/deploymentTargetDropDown.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/gradle.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 19 | 20 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/misc.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/.idea/vcs.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/.gitignore: --------------------------------------------------------------------------------
1 | /build
--------------------------------------------------------------------------------
/example/Android/LLMAssistant/app/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'com.android.application' 3 | } 4 | 5 | android { 6 | compileSdk 30 7 | 8 | defaultConfig { 9 | applicationId "com.doujiao.xiaozhihuiassistant" 10 | minSdk 21 11 | targetSdk 26 12 | versionCode 1 13 | versionName "1.0" 14 | 15 | testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner" 16 | externalNativeBuild { 17 | cmake { 18 | cppFlags '-std=c++11' 19 | } 20 | } 21 | ndk { 22 | abiFilters 'arm64-v8a','armeabi-v7a','x86' 23 | } 24 | } 25 | 26 | buildTypes { 27 | release { 28 | minifyEnabled false 29 | proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' 30 | } 31 | } 32 | compileOptions { 33 | sourceCompatibility JavaVersion.VERSION_1_8 34 | targetCompatibility JavaVersion.VERSION_1_8 35 | } 36 | externalNativeBuild { 37 | cmake { 38 | path file('src/main/cpp/CMakeLists.txt') 39 | version '3.18.1' 40 | } 41 | } 42 | // sourceSets { 43 | // main { 44 | // // jnilib 45 | // jniLibs.srcDirs = ['libs'] 46 | // } 47 | // } 48 | splits { 49 | abi { 50 | enable true 51 | reset() 52 | include 'arm64-v8a', 'armeabi-v7a','x86' 53 | universalApk true 54 | } 55 | } 56 | buildFeatures { 57 | viewBinding true 58 | } 59 | } 60 | 61 | dependencies { 62 | 63 | implementation 'com.android.support:appcompat-v7:28.0.0' 64 | implementation 'com.android.support:recyclerview-v7:28.0.0' 65 | implementation 'com.android.support.constraint:constraint-layout:2.0.4' 66 | testImplementation 'junit:junit:4.13.2' 67 | androidTestImplementation 'com.android.support.test:runner:1.0.2' 68 | androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2' 69 | } -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/libs/arm64-v8a/libassistant.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/example/Android/LLMAssistant/app/libs/arm64-v8a/libassistant.so -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/libs/armeabi-v7a/libassistant.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/example/Android/LLMAssistant/app/libs/armeabi-v7a/libassistant.so -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 2 | # You can control the set of applied configuration files using the 3 | # proguardFiles setting in build.gradle. 4 | # 5 | # For more details, see 6 | # http://developer.android.com/guide/developing/tools/proguard.html 7 | 8 | # If your project uses WebView with JS, uncomment the following 9 | # and specify the fully qualified class name to the JavaScript interface 10 | # class: 11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 12 | # public *; 13 | #} 14 | 15 | # Uncomment this to preserve the line number information for 16 | # debugging stack traces. 
17 | #-keepattributes SourceFile,LineNumberTable 18 | 19 | # If you keep the line number information, uncomment this to 20 | # hide the original source file name. 21 | #-renamesourcefileattribute SourceFile -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/release/app-arm64-v8a-release-unsigned.apk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/example/Android/LLMAssistant/app/release/app-arm64-v8a-release-unsigned.apk -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/release/app-armeabi-v7a-release-unsigned.apk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/example/Android/LLMAssistant/app/release/app-armeabi-v7a-release-unsigned.apk -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/release/app-universal-release-unsigned.apk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/example/Android/LLMAssistant/app/release/app-universal-release-unsigned.apk -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/release/app-x86-release-unsigned.apk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ztxz16/fastllm/170090720d2585f065b3dd95bdafa79dcf6913e8/example/Android/LLMAssistant/app/release/app-x86-release-unsigned.apk -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/androidTest/java/com/doujiao/xiaozhihuiassistant/ExampleInstrumentedTest.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant; 2 | 3 | import android.content.Context; 4 | import android.support.test.InstrumentationRegistry; 5 | import android.support.test.runner.AndroidJUnit4; 6 | 7 | import org.junit.Test; 8 | import org.junit.runner.RunWith; 9 | 10 | import static org.junit.Assert.*; 11 | 12 | /** 13 | * Instrumented test, which will execute on an Android device. 14 | * 15 | * @see Testing documentation 16 | */ 17 | @RunWith(AndroidJUnit4.class) 18 | public class ExampleInstrumentedTest { 19 | @Test 20 | public void useAppContext() { 21 | // Context of the app under test. 
22 |         Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
23 |         assertEquals("com.doujiao.xiaozhihuiassistant", appContext.getPackageName());
24 |     }
25 | }
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/AndroidManifest.xml: --------------------------------------------------------------------------------
1 | 2 | 4 | 5 | 6 | 7 | 8 | 16 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/cpp/LLMChat.cpp: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "LLMChat.h"
6 |
7 | #include "model.h"
8 | //void(^ __nonnull RuntimeChat)(int index,const char* _Nonnull content) = NULL; // real-time callback
9 |
10 | static int modeltype = 0;
11 | static char* modelpath = NULL;
12 | static std::unique_ptr chatGlm = NULL;
13 | static int sRound = 0;
14 | static std::string history;
15 | static RuntimeResultMobile g_callback = NULL;
16 |
17 | std::string initGptConf(const char* modelPath,int threads) {
18 |     fastllm::SetThreads(threads);
19 |     LOG_Debug("@@init llmpath:%s\n",modelPath);
20 |     chatGlm = fastllm::CreateLLMModelFromFile(modelPath);
21 |     if(chatGlm != NULL)
22 |     {
23 |         std::string modelName = chatGlm->model_type;
24 |         LOG_Debug("@@model name:%s\n",modelName.c_str());
25 |         return modelName;
26 |     }
27 |     LOG_Debug("@@CreateLLMModelFromFile failed.");
28 |     return "";
29 | }
30 |
31 | int chat(const char* prompt, RuntimeResultMobile chatCallback) {
32 |     std::string ret = "";
33 |     g_callback = chatCallback;
34 |     LOG_Debug("@@init llm:type:%d,prompt:%s\n",modeltype,prompt);
35 |     std::string input(prompt);
36 |
37 |     if (input == "reset") {
38 |         history = "";
39 |         sRound = 0;
40 |         g_callback(0,"Done!");
41 |         g_callback(-1,"");
42 |         return 0;
43 |     }
44 |
45 |     ret = chatGlm->Response(chatGlm->MakeInput(history, sRound, input), [](int index, const char* content) {
46 |         g_callback(index,content);
47 |     });
48 |     history = chatGlm->MakeHistory(history, sRound, input, ret);
49 |     sRound++;
50 |
51 |     long len = ret.length();
52 |     return len;
53 | }
54 |
55 | void uninitLLM()
56 | {
57 | }
58 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/cpp/LLMChat.h: --------------------------------------------------------------------------------
1 | //
2 | // LLMChat.h
3 | // LLMChat
4 | //
5 | // Created by 胡其斌 on 2023/5/18.
6 | //
7 |
8 | #ifdef __cplusplus
9 | extern "C" {
10 | #endif
11 |
12 | #include <android/log.h>
13 | #define LOG_Debug(...) __android_log_print(ANDROID_LOG_DEBUG, "Assistant", __VA_ARGS__)
14 |
15 | typedef void(* RuntimeResultMobile)(int index,const char* content);
16 |
17 | std::string initGptConf(const char* modelPath,int threads);
18 | int chat(const char* prompt, RuntimeResultMobile chatCallback);
19 | void uninitLLM();
20 |
21 | #ifdef __cplusplus
22 | }
23 | #endif
24 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/core/AssistantCore.java: --------------------------------------------------------------------------------
1 | package com.doujiao.core;
2 |
3 | import android.support.annotation.Keep;
4 | import android.util.Log;
5 |
6 | public class AssistantCore {
7 |
8 |     private static AssistantCore instance = null;
9 |     private static runtimeResult mRuntimeRes = null;
10 |
11 |     static {
12 |         System.loadLibrary("assistant");
13 |     }
14 |
15 |     /* singleton accessor */
16 |     public static AssistantCore getInstance(){
17 |         if(instance == null){
18 |             instance = new AssistantCore();
19 |         }
20 |
21 |         return instance;
22 |     }
23 |
24 |     public String initLLM(String path,runtimeResult callback) {
25 |         mRuntimeRes = callback;
26 |         return initLLMConfig(path,8);
27 |     }
28 |
29 |     @Keep
30 |     public void reportChat(String content,int index) {
31 |         Log.d("@@@","recv:"+content+",index:"+index);
32 |         if (mRuntimeRes != null) {
33 |             mRuntimeRes.callbackResult(index,content);
34 |         }
35 |     }
36 |
37 |     public interface runtimeResult {
38 |         void callbackResult(int index,String content);
39 |     }
40 |
41 |     private native String initLLMConfig(String path,int threads);
42 |     public native int chat(String prompt);
43 |     public native int uninitLLM();
44 | }
45 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/ChatMessage.java: --------------------------------------------------------------------------------
1 | package com.doujiao.xiaozhihuiassistant;
2 |
3 | /**
4 |  * Created by chenpengfei on 2016/10/27.
5 |  */
6 | public class ChatMessage {
7 |
8 |     private String content;
9 |
10 |     private int type;
11 |
12 |     public ChatMessage(String content, int type) {
13 |         this.content = content;
14 |         this.type = type;
15 |     }
16 |
17 |     public ChatMessage(String content) {
18 |         this(content, 1);
19 |     }
20 |
21 |
22 |     public String getContent() {
23 |         return content;
24 |     }
25 |
26 |     public void setContent(String content) {
27 |         this.content = content;
28 |     }
29 |
30 |     public int getType() {
31 |         return type;
32 |     }
33 |
34 |     public void setType(int type) {
35 |         this.type = type;
36 |     }
37 |
38 |
39 | }
40 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/adapter/BaseViewHolder.java: --------------------------------------------------------------------------------
1 | package com.doujiao.xiaozhihuiassistant.adapter;
2 |
3 | import android.support.v7.widget.RecyclerView;
4 | import android.view.View;
5 |
6 | /**
7 |  * Created by chenpengfei on 2016/10/27.
8 |  */
9 | public class BaseViewHolder extends RecyclerView.ViewHolder {
10 |
11 |     private View iv;
12 |
13 |     public BaseViewHolder(View itemView) {
14 |         super(itemView);
15 |         iv = itemView;
16 |     }
17 |
18 |     public View findViewById(int id) {
19 |         return iv.findViewById(id);
20 |     }
21 | }
22 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/utils/PrefUtil.java: --------------------------------------------------------------------------------
1 | package com.doujiao.xiaozhihuiassistant.utils;
2 |
3 | import android.content.Context;
4 | import android.content.SharedPreferences;
5 |
6 | public class PrefUtil {
7 |     private static final String SF_NAME = "com.doujiao.llm.config";
8 |     private static final String MOLE_PATH = "llm_path";
9 |     private static SharedPreferences mPref;
10 |
11 |     public static void initPref(Context context) {
12 |         if (mPref == null) {
13 |             mPref = context.getSharedPreferences(SF_NAME, Context.MODE_PRIVATE);
14 |         }
15 |     }
16 |
17 |     public static void setModelPath(String path) {
18 |         if (mPref != null) {
19 |             mPref.edit().putString(MOLE_PATH,path).apply();
20 |         }
21 |     }
22 |
23 |     public static String getModelPath() {
24 |         if (mPref != null) {
25 |             return mPref.getString(MOLE_PATH,"");
26 |         }
27 |         return "";
28 |     }
29 | }
30 |
-------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/utils/StatusBarUtils.java: --------------------------------------------------------------------------------
1 | package com.doujiao.xiaozhihuiassistant.utils;
2 |
3 | import android.content.ClipData;
4 | import android.content.ClipboardManager;
5 | import android.content.Context;
6 | import android.graphics.Color;
7 | import android.os.Build;
8 | import android.support.v7.app.ActionBar;
9 | import android.support.v7.app.AppCompatActivity;
10 | import android.view.View;
11 | import android.view.Window;
12 | import android.view.WindowManager;
13 |
14 | public class StatusBarUtils {
15 |     public static void setTranslucentStatus(AppCompatActivity activity) {
16 |         if (Build.VERSION.SDK_INT >= 21) {
17 |             View decorView = activity.getWindow().getDecorView();
18 |             int option = View.SYSTEM_UI_FLAG_LAYOUT_FULLSCREEN
19 |                     | View.SYSTEM_UI_FLAG_LAYOUT_STABLE;
20 |             decorView.setSystemUiVisibility(option);
21 |             activity.getWindow().setStatusBarColor(Color.TRANSPARENT);
22 |         }
23 |         ActionBar actionBar = activity.getSupportActionBar();
24 |         actionBar.hide();
25 |     }
26 |
27 |     public static void hideStatusBar(Window window, boolean darkText) {
28 |         window.clearFlags(WindowManager.LayoutParams.FLAG_TRANSLUCENT_STATUS);
29 |         window.addFlags(WindowManager.LayoutParams.FLAG_DRAWS_SYSTEM_BAR_BACKGROUNDS);
30 |         window.setStatusBarColor(Color.TRANSPARENT);
31 |
32 |         int flag = View.SYSTEM_UI_FLAG_LAYOUT_STABLE;
33 |         if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M && darkText) {
34 |             flag = View.SYSTEM_UI_FLAG_LIGHT_STATUS_BAR;
35 |         }
36 |
37 |         window.getDecorView().setSystemUiVisibility(flag |
38 |                 View.SYSTEM_UI_FLAG_LAYOUT_FULLSCREEN);
39 |     }
40 |
41 |     public static boolean copyStr2ClibBoard(Context context, String copyStr) {
42 |         try {
43 |             // obtain the clipboard manager
44 |             ClipboardManager cm = (ClipboardManager) context.getSystemService(Context.CLIPBOARD_SERVICE);
45 |             // create a plain-text ClipData
46 |             ClipData mClipData = ClipData.newPlainText("Label", copyStr);
47 |             // put the ClipData onto the system clipboard
48 |             cm.setPrimaryClip(mClipData);
49
| return true; 50 | } catch (Exception e) { 51 | return false; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/ChatPromptViewManager.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget; 2 | 3 | import android.app.Activity; 4 | import android.view.View; 5 | 6 | /** 7 | * Created by chenpengfei on 2016/11/2. 8 | */ 9 | public class ChatPromptViewManager extends PromptViewHelper.PromptViewManager { 10 | 11 | public ChatPromptViewManager(Activity activity, String[] dataArray, Location location) { 12 | super(activity, dataArray, location); 13 | } 14 | 15 | public ChatPromptViewManager(Activity activity) { 16 | this(activity, new String[]{"复制"}, Location.TOP_LEFT); 17 | } 18 | 19 | public ChatPromptViewManager(Activity activity, Location location) { 20 | this(activity, new String[]{"复制"}, location); 21 | } 22 | 23 | 24 | @Override 25 | public View inflateView() { 26 | return new PromptView(activity); 27 | } 28 | 29 | @Override 30 | public void bindData(View view, String[] dataArray) { 31 | if(view instanceof PromptView) { 32 | PromptView promptView = (PromptView) view; 33 | promptView.setContentArray(dataArray); 34 | promptView.setOnItemClickListener(new PromptView.OnItemClickListener() { 35 | @Override 36 | public void onItemClick(int position) { 37 | if(onItemClickListener != null) onItemClickListener.onItemClick(position); 38 | } 39 | }); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/Location.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget; 2 | 3 | import com.doujiao.xiaozhihuiassistant.widget.location.BottomCenterLocation; 4 | import com.doujiao.xiaozhihuiassistant.widget.location.ICalculateLocation; 5 | import com.doujiao.xiaozhihuiassistant.widget.location.TopCenterLocation; 6 | import com.doujiao.xiaozhihuiassistant.widget.location.TopLeftLocation; 7 | import com.doujiao.xiaozhihuiassistant.widget.location.TopRightLocation; 8 | 9 | /** 10 | * Created by chenpengfei on 2016/11/2. 
11 | */ 12 | public enum Location { 13 | 14 | TOP_LEFT(1), 15 | TOP_CENTER(2), 16 | TOP_RIGHT(3), 17 | BOTTOM_LEFT(4), 18 | BOTTOM_CENTER(5), 19 | BOTTOM_RIGHT(6); 20 | 21 | ICalculateLocation calculateLocation; 22 | 23 | private Location(int type) { 24 | switch (type) { 25 | case 1: 26 | calculateLocation = new TopLeftLocation(); 27 | break; 28 | case 2: 29 | calculateLocation = new TopCenterLocation(); 30 | break; 31 | case 3: 32 | calculateLocation = new TopRightLocation(); 33 | break; 34 | case 4: 35 | calculateLocation = new TopLeftLocation(); 36 | break; 37 | case 5: 38 | calculateLocation = new BottomCenterLocation(); 39 | break; 40 | case 6: 41 | calculateLocation = new TopLeftLocation(); 42 | break; 43 | } 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/location/BottomCenterLocation.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget.location; 2 | 3 | import android.view.View; 4 | 5 | /** 6 | * Created by chenpengfei on 2016/11/2. 7 | */ 8 | public class BottomCenterLocation implements ICalculateLocation { 9 | 10 | @Override 11 | public int[] calculate(int[] srcViewLocation, View srcView, View promptView) { 12 | int[] location = new int[2]; 13 | int offset = (promptView.getWidth() - srcView.getWidth()) / 2; 14 | location[0] = srcViewLocation[0] - offset; 15 | location[1] = srcViewLocation[1] + promptView.getHeight(); 16 | return location; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/location/ICalculateLocation.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget.location; 2 | 3 | import android.view.View; 4 | 5 | /** 6 | * Created by chenpengfei on 2016/11/2. 7 | */ 8 | public interface ICalculateLocation { 9 | 10 | int[] calculate(int[] srcViewLocation, View srcView, View promptView); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/location/TopCenterLocation.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget.location; 2 | 3 | import android.view.View; 4 | 5 | /** 6 | * Created by chenpengfei on 2016/11/2. 7 | */ 8 | public class TopCenterLocation implements ICalculateLocation { 9 | 10 | @Override 11 | public int[] calculate(int[] srcViewLocation, View srcView, View promptView) { 12 | int[] location = new int[2]; 13 | int offset = (promptView.getWidth() - srcView.getWidth()) / 2; 14 | location[0] = srcViewLocation[0] - offset; 15 | location[1] = srcViewLocation[1] - promptView.getHeight(); 16 | return location; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/location/TopLeftLocation.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget.location; 2 | 3 | import android.view.View; 4 | 5 | /** 6 | * Created by chenpengfei on 2016/11/2. 
7 | */ 8 | public class TopLeftLocation implements ICalculateLocation { 9 | 10 | @Override 11 | public int[] calculate(int[] srcViewLocation, View srcView, View promptView) { 12 | int[] location = new int[2]; 13 | location[0] = srcViewLocation[0]; 14 | location[1] = srcViewLocation[1] - promptView.getHeight(); 15 | return location; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/java/com/doujiao/xiaozhihuiassistant/widget/location/TopRightLocation.java: -------------------------------------------------------------------------------- 1 | package com.doujiao.xiaozhihuiassistant.widget.location; 2 | 3 | import android.view.View; 4 | 5 | /** 6 | * Created by chenpengfei on 2016/11/2. 7 | */ 8 | public class TopRightLocation implements ICalculateLocation { 9 | 10 | @Override 11 | public int[] calculate(int[] srcViewLocation, View srcView, View promptView) { 12 | int[] location = new int[2]; 13 | int offset = promptView.getWidth() - srcView.getWidth(); 14 | location[0] = srcViewLocation[0] - offset; 15 | location[1] = srcViewLocation[1] - promptView.getHeight(); 16 | return location; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/res/drawable-v24/ic_launcher_foreground.xml: -------------------------------------------------------------------------------- 1 | 7 | 8 | 9 | 15 | 18 | 21 | 22 | 23 | 24 | 30 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/res/drawable/btnbg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/res/drawable/editbg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/res/layout/activity_item_left.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 11 | 12 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/res/layout/activity_item_right.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 13 | 14 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /example/Android/LLMAssistant/app/src/main/res/layout/activity_main.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 17 | 21 | 26 |