├── CMakeLists.txt ├── README.md ├── build-linux.sh ├── main.cpp ├── speech_to_text.cpp ├── speech_to_text.h ├── text_to_Speech.cpp ├── text_to_Speech.h └── 演示视频.mp4 /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(llm_demo) 3 | 4 | set(CMAKE_CXX_FLAGS " -O3 -w -std=c++11 -DNDEBUG -DKENLM_MAX_ORDER=6 ") 5 | set(CMAKE_C_FLAGS " -O3 -w -std=c++11 -DNDEBUG -DKENLM_MAX_ORDER=6 ") 6 | 7 | # 添加所有源文件 8 | set(SOURCE_FILES 9 | src/main.cpp 10 | src/text_to_Speech.cpp 11 | 12 | src/tn/glog/src/demangle.cc 13 | src/tn/glog/src/logging.cc 14 | src/tn/glog/src/raw_logging.cc 15 | src/tn/glog/src/symbolize.cc 16 | src/tn/glog/src/utilities.cc 17 | src/tn/glog/src/vlog_is_on.cc 18 | src/tn/glog/src/signalhandler.cc 19 | src/tn/gflags/src/gflags.cc 20 | src/tn/gflags/src/gflags_reporting.cc 21 | src/tn/gflags/src/gflags_completions.cc 22 | src/tn/openfst/src/lib/compat.cc 23 | src/tn/openfst/src/lib/flags.cc 24 | src/tn/openfst/src/lib/fst.cc 25 | src/tn/openfst/src/lib/fst-types.cc 26 | src/tn/openfst/src/lib/mapped-file.cc 27 | src/tn/openfst/src/lib/properties.cc 28 | src/tn/openfst/src/lib/symbol-table.cc 29 | src/tn/openfst/src/lib/symbol-table-ops.cc 30 | src/tn/openfst/src/lib/util.cc 31 | src/tn/openfst/src/lib/weight.cc 32 | src/tn/processor.cc 33 | src/tn/token_parser.cc 34 | src/tn/utf8_string.cc 35 | src/engipa/EnglishText2Id.cpp 36 | src/engipa/InitIPASymbols.cpp 37 | src/engipa/alphabet.cpp 38 | src/engipa/ipa.cpp 39 | src/hz2py/hanzi2phoneid.cpp 40 | src/hz2py/Hanz2Piny.cpp 41 | src/hz2py/pinyinmap.cpp 42 | src/nn_op/nn_conv1d.cpp 43 | src/nn_op/nn_softmax.cpp 44 | src/nn_op/nn_layer_norm.cpp 45 | src/nn_op/nn_relu.cpp 46 | src/nn_op/nn_gelu.cpp 47 | src/nn_op/nn_tanh.cpp 48 | src/nn_op/nn_flip.cpp 49 | src/nn_op/nn_cumsum.cpp 50 | src/nn_op/nn_softplus.cpp 51 | src/nn_op/nn_clamp_min.cpp 52 | src/nn_op/nn_sigmoid.cpp 53 | src/nn_op/nn_conv1d_transposed.cpp 54 | src/nn_op/nn_leaky_relu.cpp 55 | src/platform/tts_file_io.cpp 56 | src/platform/tts_logger.cpp 57 | src/utils/utils.cpp 58 | src/modules/iStft.cpp 59 | src/modules/hann.cpp 60 | src/modules/attention_encoder.cpp 61 | src/modules/multi_head_attention.cpp 62 | src/modules/ffn.cpp 63 | src/modules/ConvFlow.cpp 64 | src/modules/DDSConv.cpp 65 | src/modules/ElementwiseAffine.cpp 66 | src/modules/random_gen.cpp 67 | src/modules/ResidualCouplingLayer.cpp 68 | src/modules/ResBlock1.cpp 69 | src/modules/WN.cpp 70 | src/modules/pqmf.cpp 71 | src/models/TextEncoder.cpp 72 | src/models/StochasticDurationPredictor.cpp 73 | src/models/FixDurationPredictor.cpp 74 | src/models/DurationPredictor_base.cpp 75 | src/models/ResidualCouplingBlock.cpp 76 | src/models/Generator_base.cpp 77 | src/models/Generator_hifigan.cpp 78 | src/models/Generator_MS.cpp 79 | src/models/Generator_Istft.cpp 80 | src/models/Generator_MBB.cpp 81 | src/models/SynthesizerTrn.cpp 82 | 83 | SummerAsr-master2/src/am/am.cpp 84 | SummerAsr-master2/src/asr/asr.cpp 85 | SummerAsr-master2/src/decoder/ctc_beam_search_decoder.cpp 86 | SummerAsr-master2/src/feat/extract_feat.cpp 87 | SummerAsr-master2/src/feat/hanning.cpp 88 | SummerAsr-master2/src/nn/nn.cpp 89 | SummerAsr-master2/src/decoder/scorer.cpp 90 | SummerAsr-master2/src/vad/vad.cpp 91 | SummerAsr-master2/src/vad/vad_internal_api.c 92 | SummerAsr-master2/src/lm/bhiksha.cc 93 | SummerAsr-master2/src/lm/binary_format.cc 94 | SummerAsr-master2/src/lm/config.cc 95 | SummerAsr-master2/src/lm/lm_exception.cc 96 | SummerAsr-master2/src/lm/model.cc 97 | SummerAsr-master2/src/lm/quantize.cc 98 | SummerAsr-master2/src/lm/read_arpa.cc 99 | SummerAsr-master2/src/lm/search_hashed.cc 100 | SummerAsr-master2/src/lm/search_trie.cc 101 | SummerAsr-master2/src/lm/sizes.cc 102 | SummerAsr-master2/src/lm/trie.cc 103 | SummerAsr-master2/src/lm/trie_sort.cc 104 | SummerAsr-master2/src/lm/value_build.cc 105 | SummerAsr-master2/src/lm/virtual_interface.cc 106 | SummerAsr-master2/src/lm/vocab.cc 107 | SummerAsr-master2/src/util/bit_packing.cc 108 | SummerAsr-master2/src/util/ersatz_progress.cc 109 | SummerAsr-master2/src/util/exception.cc 110 | SummerAsr-master2/src/util/file.cc 111 | SummerAsr-master2/src/util/file_piece.cc 112 | SummerAsr-master2/src/util/float_to_string.cc 113 | SummerAsr-master2/src/util/integer_to_string.cc 114 | SummerAsr-master2/src/util/mmap.cc 115 | SummerAsr-master2/src/util/murmur_hash.cc 116 | SummerAsr-master2/src/util/parallel_read.cc 117 | SummerAsr-master2/src/util/pool.cc 118 | SummerAsr-master2/src/util/read_compressed.cc 119 | SummerAsr-master2/src/util/scoped.cc 120 | SummerAsr-master2/src/util/spaces.cc 121 | SummerAsr-master2/src/util/string_piece.cc 122 | SummerAsr-master2/src/util/usage.cc 123 | SummerAsr-master2/src/util/double-conversion/bignum.cc 124 | SummerAsr-master2/src/util/double-conversion/bignum-dtoa.cc 125 | SummerAsr-master2/src/util/double-conversion/cached-powers.cc 126 | SummerAsr-master2/src/util/double-conversion/diy-fp.cc 127 | SummerAsr-master2/src/util/double-conversion/double-conversion.cc 128 | SummerAsr-master2/src/util/double-conversion/fast-dtoa.cc 129 | SummerAsr-master2/src/util/double-conversion/fixed-dtoa.cc 130 | SummerAsr-master2/src/util/double-conversion/strtod.cc 131 | SummerAsr-master2/src/librnnoise/celt_lpc.c 132 | SummerAsr-master2/src/librnnoise/denoise.c 133 | SummerAsr-master2/src/librnnoise/kiss_fft.c 134 | SummerAsr-master2/src/librnnoise/pitch.c 135 | SummerAsr-master2/src/librnnoise/rnn.c 136 | SummerAsr-master2/src/librnnoise/rnn_data.c 137 | 138 | 139 | ) 140 | 141 | add_executable(${PROJECT_NAME} ${SOURCE_FILES}) 142 | 143 | # 设置包含目录 144 | include_directories( 145 | ${CMAKE_SOURCE_DIR}/include 146 | ${CMAKE_SOURCE_DIR}/src/tn/glog/src/glog 147 | ${CMAKE_SOURCE_DIR}/src 148 | ./eigen-3.4.0 149 | ./src/tn/header 150 | ./include 151 | ./src/header 152 | /usr/include 153 | ./SummerAsr-master2/include 154 | ./SummerAsr-master2/src 155 | ./SummerAsr-master2/src/until 156 | ) 157 | 158 | # RKLLM API 路径 159 | set(RKLLM_API_PATH "${CMAKE_SOURCE_DIR}/../../runtime/Linux/librkllm_api") 160 | include_directories(${RKLLM_API_PATH}/include) 161 | 162 | # 设置 RKLLM 库路径 163 | set(RKLLM_RT_LIB ${RKLLM_API_PATH}/aarch64/librkllmrt.so) 164 | target_link_libraries(${PROJECT_NAME} ${RKLLM_RT_LIB}) 165 | 166 | # 确保链接 pocketsphinx 和 sphinxbase 167 | find_package(PkgConfig REQUIRED) 168 | pkg_check_modules(POCKETSPHINX REQUIRED pocketsphinx) 169 | pkg_check_modules(SPHINXBASE REQUIRED sphinxbase) 170 | 171 | include_directories(${POCKETSPHINX_INCLUDE_DIRS} ${SPHINXBASE_INCLUDE_DIRS}) 172 | link_directories(${POCKETSPHINX_LIBRARY_DIRS} ${SPHINXBASE_LIBRARY_DIRS}) 173 | 174 | # 链接所需的库 175 | target_link_libraries(${PROJECT_NAME} ${POCKETSPHINX_LIBRARIES} ${SPHINXBASE_LIBRARIES}) 176 | 177 | # 设置 espeak 库和包含目录 178 | set(ESPEAK_INCLUDE_DIRS /usr/include/espeak) 179 | set(ESPEAK_LIBRARIES /usr/lib/aarch64-linux-gnu/libespeak.so) 180 | 181 | include_directories(${ESPEAK_INCLUDE_DIRS}) 182 | target_link_libraries(${PROJECT_NAME} ${ESPEAK_LIBRARIES}) 183 | 184 | # 添加 PortAudio 185 | find_library(PORTAUDIO_LIBRARY NAMES portaudio PATHS /usr/lib/aarch64-linux-gnu) 186 | include_directories(/usr/include) # 添加 PortAudio 头文件的路径 187 | target_link_libraries(${PROJECT_NAME} ${PORTAUDIO_LIBRARY}) 188 | 189 | # 可能需要调整的其他库的路径和设置 190 | 191 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rkllm_talking 2 | 3 | # Description 4 | - rkllm_talking is a standalone compiled voice communication system based on a large model, which can run locally without requiring an internet connection. 5 | - The voice input and output of this project reference the Github projects SummerASR and SummerTTS. The Github links to the reference projects are as follows: 6 | - SummerASR: https://github.com/huakunyang/SummerAsr 7 | - SummerTTS: https://github.com/huakunyang/SummerTTS 8 | - This project has been successfully compiled and run on RK3588's Ubuntu Debian 10. 9 | - The code for running the project and the cmake-related files are on Github. Since the entire project is too large, the full project can be downloaded from Baidu Netdisk. 10 | - Link: https://pan.baidu.com/s/1g1ucz1efKthsmbCbum4xWQ?pwd=2141 11 | Extraction code: 2141 12 | - The project's demo video is also included in the open-source repository on Github. 13 | 14 | # Usage Instructions 15 | - Download the project from Baidu Netdisk. 16 | - Compile the project (if cmake reports an error, you may need to clear the cmake cache). The specific command to execute is `./build-linux.sh`. 17 | - Place the "models" folder from the project directory at the same level as the compiled executable file. In the project, it is `./build/build_linux_aarch64_Release`. 18 | - One thing to note: to clear the cache, you can directly delete the "build" directory, but when re-running, you need to place the "models" folder from the project directory at the same level as the executable file, which is `./build/build_linux_aarch64_Release`. 19 | - The project directory structure is as follows: 20 | 21 | ├──build-android.sh 22 | 23 | ├──build-linux.sh 24 | 25 | ├──CMakeLists.txt 26 | 27 | ├──eigen-3.4.0 28 | 29 | ├──include 30 | 31 | ├──models 32 | 33 | ├── Readme.md 34 | 35 | ├──src 36 | 37 | └── SummerAsr-master2 38 | 39 | - The rkllm deployment requires an NPU kernel version of v0.9.6 or higher. 40 | - The project uses Qwen-1_8B, and you need to download it for usage: 41 | - `git lfs install` 42 | - `git clone https://huggingface.co/Qwen/Qwen-1_8B-Chat` 43 | - After installing the PortAudio and espeak libraries, you can run the following command in the `build_linux_aarch64_Release` directory: 44 | - `ulimit -HSn 10240` 45 | - `./llm_demo path/qwen.rkllm ./models/single_speaker_fast.bin test.wav` 46 | 47 | # Project Dependencies 48 | - PortAudio, espeak, RKLLM 49 | 50 | # Contact Information 51 | - QQ: 2867191922 52 | - WeChat (weixin): zjq15396069991 53 | - Email: zhujiaqi@tiangong.edu.cn 54 | 55 | # Acknowledgements 56 | This project references and uses the following solutions, and we express our gratitude: 57 | - SummerASR (https://github.com/huakunyang/SummerAsr) 58 | - SummerTTS (https://github.com/huakunyang/SummerTTS) 59 | - Wildfire Embedded AI Tutorial for RKLLM Deployment (https://doc.embedfire.com/linux/rk356x/Ai/zh/latest/lubancat_ai/env/rkllm.html) 60 | 61 | 62 | ======================================================================================================================================================================================== 63 | 64 | 65 | 66 | # 说明 67 | - rkllm_talking 是一个独立编译的基于大模型的语音交流系统,是一个可以本地运行的系统,不需要连接网络 68 | - 该项目的语音输入和输出参考自Github项目 SummerASR 和 SummerTTS , 参考项目的Github链接如下 69 | - SummerASR:https://github.com/huakunyang/SummerAsr 70 | - SummerTTS:https://github.com/huakunyang/SummerTTS 71 | - 本项目在 RK3588 的 Ubuntu Debin10 上编译运行通过 72 | - 在github上的是项目运行的代码和cmake相关文件,因整个项目太大,故要获取整个项目的话需要在百度网盘上下载 73 | - 链接:https://pan.baidu.com/s/1g1ucz1efKthsmbCbum4xWQ?pwd=2141 74 | 提取码:2141 75 | - 项目的演示视频也置入于github开源中 76 | 77 | 78 | 79 | # 使用说明 80 | - 从百度网盘上下载本项目 81 | - 进行编译(如果cmake报错可能需要清除Cmake缓存),具体执行命令为./build-linux.sh 82 | - 将项目目录中的models置入编译后输出的可执行文件的同级目录下,项目中为./build/build_linux_aarch64_Release 83 | - 需要注意的一点是:清除缓存可以直接删除build目录,但在再次运行时需要将项目目录中的models置入可执行文件的同级目录下,项目为./build/build_linux_aarch64_Release 84 | - 项目目录结构为 85 | 86 | ├──build-android.sh 87 | 88 | ├──build-linux.sh 89 | 90 | ├──CMakeLists.txt 91 | 92 | ├──eigen-3.4.0 93 | 94 | ├──include 95 | 96 | ├──models 97 | 98 | ├── Readme.md 99 | 100 | ├──src 101 | 102 | └── SummerAsr-master2 103 | 104 | - 其中rkllm部署需要v0.9.6 版本以上的NPU内核 105 | - 项目使用Qwen-1_8B,使用时需要下载: 106 | - git lfs install 107 | - git clone https://huggingface.co/Qwen/Qwen-1_8B-Chat 108 | - 在安装完PortAudio和espeak库后可在build_linux_aarch64_Release目录下执行执行以下命令运行: 109 | - ulimit -HSn 10240 110 | - ./llm_demo path/qwen.rkllm ./models/single_speaker_fast.bin test.wav 111 | 112 | # 项目依赖 113 | - PortAudio , espeak , RKLLM 114 | 115 | # 联系方式 116 | - qq:2867191922 117 | - 微信(weixin): zjq15396069991 118 | - 邮箱:zhujiaqi@tiangong.edu.cn 119 | 120 | 121 | # 感谢 122 | 本项目参考并使用了下列方案,在此表示感谢 123 | - SummerASR(https://github.com/huakunyang/SummerAsr) 124 | - SummerTTS(https://github.com/huakunyang/SummerTTS) 125 | - 野火嵌入式AI教程RKLLM部署方案(https://doc.embedfire.com/linux/rk356x/Ai/zh/latest/lubancat_ai/env/rkllm.html) 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /build-linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Debug / Release / RelWithDebInfo 3 | if [[ -z ${BUILD_TYPE} ]]; then 4 | BUILD_TYPE=Release 5 | fi 6 | 7 | GCC_COMPILER_PATH=aarch64-linux-gnu 8 | C_COMPILER=${GCC_COMPILER_PATH}-gcc 9 | CXX_COMPILER=${GCC_COMPILER_PATH}-g++ 10 | STRIP_COMPILER=${GCC_COMPILER_PATH}-strip 11 | 12 | TARGET_ARCH=aarch64 13 | TARGET_PLATFORM=linux 14 | if [[ -n ${TARGET_ARCH} ]]; then 15 | TARGET_PLATFORM=${TARGET_PLATFORM}_${TARGET_ARCH} 16 | fi 17 | 18 | ROOT_PWD=$( cd "$( dirname $0 )" && cd -P "$( dirname "$SOURCE" )" && pwd ) 19 | BUILD_DIR=${ROOT_PWD}/build/build_${TARGET_PLATFORM}_${BUILD_TYPE} 20 | 21 | if [[ ! -d "${BUILD_DIR}" ]]; then 22 | mkdir -p ${BUILD_DIR} 23 | fi 24 | 25 | cd ${BUILD_DIR} 26 | cmake ../.. \ 27 | -DCMAKE_SYSTEM_PROCESSOR=${TARGET_ARCH} \ 28 | -DCMAKE_SYSTEM_NAME=Linux \ 29 | -DCMAKE_C_COMPILER=${C_COMPILER} \ 30 | -DCMAKE_CXX_COMPILER=${CXX_COMPILER} \ 31 | -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ 32 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 33 | 34 | make -j4 35 | 36 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "rkllm.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "text_to_Speech.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | //#include 19 | //#include 20 | #include 21 | #include 22 | #include 23 | #include 24 | //#include 25 | //#include 26 | #include 27 | #include 28 | //#include 29 | #include "extract_feat.h" 30 | #include "asr.h" 31 | #include 32 | 33 | #define SAMPLE_RATE 16000 34 | #define FRAMES_PER_BUFFER 512 35 | #define NUM_CHANNELS 1 36 | #define SAMPLE_FORMAT paInt16 37 | 38 | 39 | #define PROMPT_TEXT_PREFIX "system You are a helpful assistant. user" 40 | #define PROMPT_TEXT_POSTFIX "assistant" 41 | 42 | using namespace std; 43 | LLMHandle llmHandle = nullptr; 44 | char* tts_model_path; 45 | string textfile = "outputllm.txt"; 46 | 47 | char* tts_textfile_path; 48 | 49 | char* tts_wavfile_path; 50 | 51 | string rkllm_answer; 52 | 53 | 54 | 55 | void writeTextToFile(const std::string& filePath, const std::string& content) { 56 | std::ofstream outFile(filePath); 57 | if (!outFile.is_open()) { 58 | std::cerr << "无法打开文件以写入: " << filePath << std::endl; 59 | return; 60 | } 61 | outFile << content; 62 | outFile.close(); 63 | } 64 | 65 | 66 | void exit_handler(int signal) 67 | { 68 | if (llmHandle != nullptr) 69 | { 70 | { 71 | cout << "程序即将退出" << endl; 72 | LLMHandle _tmp = llmHandle; 73 | llmHandle = nullptr; 74 | rkllm_destroy(_tmp); 75 | } 76 | exit(signal); 77 | } 78 | } 79 | 80 | void callback(RKLLMResult *result, void *userdata, LLMCallState state) 81 | { 82 | if (state == LLM_RUN_FINISH) 83 | { 84 | 85 | printf("\n"); 86 | 87 | // 读取文件内容并去掉空格和换行符 88 | //std::ifstream infile(textfile); 89 | //std::stringstream buffer; 90 | //buffer << infile.rdbuf(); 91 | //std::string file_content = buffer.str(); 92 | // infile.close(); 93 | 94 | //调用文字转语音的函数 95 | // text_to_speech(tts_textfile_path , tts_model_path , tts_wavfile_path); 96 | 97 | 98 | // 去掉空格和换行符 99 | // file_content.erase(std::remove_if(file_content.begin(), file_content.end(), ::isspace), file_content.end()); 100 | 101 | // 将处理后的内容存储到变量中 102 | //std::string processed_content = file_content; 103 | //打印变量来测试 104 | //printf("%s" , processed_content.c_str()); 105 | 106 | 107 | 108 | // 清空文件内容 109 | //std::ofstream clear_file(textfile, std::ofstream::out | std::ofstream::trunc); 110 | //clear_file.close(); 111 | 112 | // 调用 text_to_speech 函数输出语音 113 | // text_to_speech(processed_content , tts_model_path); 114 | } 115 | else if (state == LLM_RUN_ERROR) 116 | { 117 | printf("\\run error\n"); 118 | } 119 | else 120 | { 121 | printf("%s", result->text); 122 | 123 | // 将 result->text 去掉空格和换行符 124 | std::string clean_text(result->text); 125 | clean_text.erase(std::remove_if(clean_text.begin(), clean_text.end(), ::isspace), clean_text.end()); 126 | // 将处理后的内容添加到变量中 127 | rkllm_answer += clean_text; 128 | 129 | 130 | } 131 | } 132 | 133 | 134 | //语音输入识别的相关定义开始 135 | const int max_record_duration = 5; // 固定录音时长5秒 136 | const int total_frames = SAMPLE_RATE * max_record_duration; 137 | 138 | PaStream *stream; 139 | 140 | 141 | //语音输入识别的相关定义结束 142 | 143 | //PaStream *stream;//声明音频流 144 | 145 | int main(int argc, char **argv) 146 | { 147 | /* 148 | if (argc != 3) 149 | { 150 | printf("Usage: %s \n", argv[0]); 151 | return -1; 152 | }*/ 153 | 154 | 155 | if (argc != 4) 156 | { 157 | printf("Usage: %s \n" , argv[0]); 158 | } 159 | 160 | 161 | string rkllm_model(argv[1]); 162 | string tts_model(argv[2]); 163 | string output_text_file = "output_text.txt"; // 固定的文本文件路径 164 | 165 | string speech_input;//语音输入字符串 166 | 167 | tts_model_path = argv[2]; 168 | // tts_textfile_path = argv[3]; 169 | tts_wavfile_path = argv[3]; 170 | 171 | signal(SIGINT, exit_handler); 172 | printf("rkllm init start\n"); 173 | 174 | //设置参数及初始化 175 | RKLLMParam param = rkllm_createDefaultParam(); 176 | param.model_path = rkllm_model.c_str(); 177 | param.num_npu_core = 2; 178 | param.top_k = 1; 179 | param.max_new_tokens = 256; 180 | param.max_context_len = 512; 181 | param.logprobs = false; 182 | param.top_logprobs = 5; 183 | param.use_gpu = false; 184 | rkllm_init(&llmHandle, param, callback); 185 | printf("rkllm init success\n"); 186 | 187 | vector pre_input; 188 | pre_input.push_back("把下面的现代文翻译成文言文:到了春风和煦,阳光明媚的时候,湖面平静,没有惊涛骇浪,天色湖光相连,一片碧绿,广阔无际;沙洲上的鸥鸟,时而飞翔,时而停歇,美丽的鱼游来游去,岸上与小洲上的花草,青翠欲滴。"); 189 | pre_input.push_back("以咏梅为题目,帮我写一首古诗,要求包含梅花、白雪等元素。"); 190 | pre_input.push_back("上联: 江边惯看千帆过"); 191 | pre_input.push_back("把这句话翻译成中文:Knowledge can be acquired from many sources. These include books, teachers and practical experience, and each has its own advantages. The knowledge we gain from books and formal education enables us to learn about things that we have no opportunity to experience in daily life. We can also develop our analytical skills and learn how to view and interpret the world around us in different ways. Furthermore, we can learn from the past by reading books. In this way, we won't repeat the mistakes of others and can build on their achievements."); 192 | pre_input.push_back("把这句话翻译成英文:RK3588是新一代高端处理器,具有高算力、低功耗、超强多媒体、丰富数据接口等特点"); 193 | pre_input.push_back("通过语音输入问题"); 194 | cout << "\n**********************可输入以下问题对应序号获取回答/或自定义输入********************\n" 195 | << endl; 196 | for (int i = 0; i < (int)pre_input.size(); i++) 197 | { 198 | cout << "[" << i << "] " << pre_input[i] << endl; 199 | } 200 | // cout << "[voice] 通过语音输入问题" << endl; // 添加语音选项提示 201 | // cout << "\n*************************************************************************\n" 202 | // << endl; 203 | 204 | 205 | // speech_to_text(); 206 | 207 | 208 | while (true) 209 | { 210 | std::string input_str; 211 | printf("\n"); 212 | printf("user: "); 213 | std::getline(std::cin, input_str); 214 | if (input_str == "exit") 215 | { 216 | break; 217 | } 218 | 219 | for (int i = 0; i < (int)pre_input.size(); i++) 220 | { 221 | //如果输入语音的话,就调用语音输入 222 | // 223 | if(i == 4) 224 | { 225 | printf("语音输入\r\n"); 226 | 227 | //调用语音输入函数 228 | // Initialize PortAudio 229 | PaError err = Pa_Initialize(); 230 | 231 | if (err != paNoError) { 232 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 233 | return -1; 234 | } 235 | 236 | err = Pa_OpenDefaultStream(&stream, 237 | NUM_CHANNELS, 238 | 0, 239 | SAMPLE_FORMAT, 240 | SAMPLE_RATE, 241 | FRAMES_PER_BUFFER, 242 | NULL, 243 | NULL); 244 | 245 | if (err != paNoError) { 246 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 247 | return -1; 248 | } 249 | printf("请开始说话\r\n"); 250 | err = Pa_StartStream(stream); 251 | if (err != paNoError) { 252 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 253 | return -1; 254 | } 255 | 256 | std::vector wavBuf; 257 | int16_t buffer[FRAMES_PER_BUFFER]; 258 | int frames_recorded = 0; 259 | 260 | 261 | while (frames_recorded < total_frames) { 262 | err = Pa_ReadStream(stream, buffer, FRAMES_PER_BUFFER); 263 | if (err != paNoError) { 264 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 265 | break; 266 | } 267 | 268 | wavBuf.insert(wavBuf.end(), buffer, buffer + FRAMES_PER_BUFFER); 269 | frames_recorded += FRAMES_PER_BUFFER; 270 | } 271 | 272 | err = Pa_StopStream(stream); 273 | if (err != paNoError) { 274 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 275 | } 276 | //printf("stream will close \r\n"); 277 | Pa_CloseStream(stream); 278 | Pa_Terminate(); 279 | //printf("stream is closed\r\n"); 280 | struct timeval start, end; 281 | float timeuse, wavDur = static_cast(wavBuf.size()) / SAMPLE_RATE; 282 | //printf("timeuse is cleared\r\n"); 283 | gettimeofday(&start, NULL); 284 | //printf("model will load\r\n"); 285 | void *asrData = asrInit("./models/am.model", "./models/char.txt", "./models/lm.model", false); 286 | gettimeofday(&end, NULL); 287 | timeuse = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; 288 | std::cout << "Model loading time costs: " << std::setprecision(6) << timeuse / 1000000 << 's' << "\n"; 289 | 290 | gettimeofday(&start, NULL); 291 | std::string asrResult = asrRun_without_vad(asrData, wavBuf.data(), wavBuf.size()); 292 | gettimeofday(&end, NULL); 293 | 294 | std::cout << "Asr Result: " << asrResult << "\n"; 295 | timeuse = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; 296 | std::cout << "Wav duration: " << std::setprecision(6) << wavDur << "s, Asr Decoding time costs: " << std::setprecision(6) << timeuse / 1000000 << "s, RTF: " << std::setprecision(6) << (timeuse / 1000000) / wavDur << "\n"; 297 | 298 | 299 | //将语音内容置入大模型 300 | input_str = asrResult; 301 | 302 | //speech_input = speech_to_text(); 303 | // 打印 speech_input 的内容 304 | //std::cout << "Speech Input: " << speech_input << std::endl; 305 | // 清空 speech_input 的内容 306 | //speech_input.clear(); 307 | } 308 | else if(input_str == to_string(i)) 309 | { 310 | input_str = pre_input[i]; // 如果用户输入的是序号,则将其替换为对应的预设输入 311 | cout << input_str << endl; 312 | } 313 | } 314 | // 拼接前缀和后缀以生成完整的输入文本 315 | // string text = PROMPT_TEXT_PREFIX + input_str + PROMPT_TEXT_POSTFIX; 316 | string text = input_str; 317 | 318 | 319 | // 调用大模型 320 | printf("robot: "); 321 | rkllm_run(llmHandle, input_str.c_str(), NULL); 322 | 323 | // 等待回调完成并保存文本 324 | std::this_thread::sleep_for(std::chrono::seconds(1)); // 确保回调完成 325 | 326 | // 打印处理后的内容 327 | printf("Processed Content: %s\n", rkllm_answer.c_str()); 328 | 329 | 330 | text_to_speech(rkllm_answer.c_str() , tts_model_path , tts_wavfile_path); 331 | 332 | 333 | 334 | 335 | 336 | // 清空处理后的内容 337 | rkllm_answer.clear(); 338 | 339 | 340 | 341 | 342 | // 读取大模型输出文本 343 | // std::ifstream inFile("output_text.txt"); 344 | // if (!inFile) 345 | // { 346 | // cerr << "无法读取文本文件" << endl; 347 | // continue; 348 | // } 349 | // std::string output_text((std::istreambuf_iterator(inFile)), 350 | // std::istreambuf_iterator()); 351 | 352 | // 调用 text_to_speech 生成语音 353 | // text_to_speech(output_text, tts_model); 354 | 355 | //inFile.close(); 356 | } 357 | 358 | rkllm_destroy(llmHandle); 359 | 360 | return 0; 361 | } 362 | 363 | 364 | -------------------------------------------------------------------------------- /speech_to_text.cpp: -------------------------------------------------------------------------------- 1 | #include "speech_to_text.h" // 包含语音转文字功能的头文件 2 | #include // 标准输入输出库 3 | #include // 用于获取时间的库 4 | #include // 用于标准输入输出流操作的库 5 | #include // 文件流操作库 6 | #include // 用于控制输出格式的库 7 | #include // 包含数据类型定义的库 8 | #include // 用于文件状态操作的库 9 | #include // 目录操作库 10 | #include // POSIX 操作系统 API 库 11 | #include // 用于字符串操作的库,包含 strstr 函数 12 | #include // 时间库 13 | #include // 标准整数类型库 14 | #include // 向量容器库 15 | #include "extract_feat.h" // 特征提取相关的头文件 16 | #include "asr.h" // 自动语音识别(ASR)相关头文件 17 | #include // PortAudio 库,用于音频输入输出 18 | 19 | // 定义采样率、每个缓冲区的帧数、音频通道数和采样格式 20 | #define SAMPLE_RATE 16000 // 采样率为16000Hz 21 | #define FRAMES_PER_BUFFER 512 // 每个缓冲区包含512帧 22 | #define NUM_CHANNELS 1 // 单声道(一个通道) 23 | #define SAMPLE_FORMAT paInt16 // 采样格式为16位整型 24 | 25 | 26 | PaStream *stream; 27 | 28 | /** 29 | * 函数:speech_to_text 30 | * 描述:录制音频并将其转换为文字。 31 | * 返回值:std::string,识别到的文本结果。 32 | */ 33 | std::string speech_to_text() { 34 | const int max_record_duration = 5; // 最大录音时长为5秒 35 | const int total_frames = SAMPLE_RATE * max_record_duration; // 总帧数,根据录音时长和采样率计算 36 | string asrResult; 37 | // 初始化 PortAudio 库 38 | PaError err = Pa_Initialize(); 39 | if (err != paNoError) { 40 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 41 | return ""; 42 | } 43 | 44 | // 打开默认音频流 45 | //PaStream *stream; 46 | err = Pa_OpenDefaultStream(&stream, 47 | NUM_CHANNELS, // 输入通道数 48 | 0, // 输出通道数(这里不需要输出) 49 | SAMPLE_FORMAT, // 采样格式 50 | SAMPLE_RATE, // 采样率 51 | FRAMES_PER_BUFFER, // 每个缓冲区的帧数 52 | NULL, // 没有回调函数 53 | NULL); // 没有回调函数的用户数据 54 | if (err != paNoError) { 55 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 56 | return ""; 57 | } 58 | printf("开始录制\r\n"); 59 | // 开始音频流 60 | err = Pa_StartStream(stream); 61 | if (err != paNoError) { 62 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 63 | return ""; 64 | } 65 | 66 | // 用于存储录制的音频数据的向量 67 | std::vector wavBuf; 68 | int16_t buffer[FRAMES_PER_BUFFER]; // 临时缓冲区,用于存储从音频流中读取的帧 69 | int frames_recorded = 0; // 已录制的帧数 70 | 71 | 72 | printf("开始存储录制数据\r\n"); 73 | 74 | // 循环录制音频,直到达到最大帧数 75 | while (frames_recorded < total_frames) { 76 | err = Pa_ReadStream(stream, buffer, FRAMES_PER_BUFFER); // 从流中读取数据 77 | if (err != paNoError) { 78 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 79 | break; 80 | } 81 | 82 | std::cout << "Read " << frames_recorded << " frames" << std::endl; 83 | 84 | // 将读取的音频数据添加到 wavBuf 中 85 | wavBuf.insert(wavBuf.end(), buffer, buffer + FRAMES_PER_BUFFER); 86 | frames_recorded += FRAMES_PER_BUFFER; // 更新已录制的帧数 87 | } 88 | std::cout << "Total frames recorded: " << frames_recorded << std::endl; 89 | 90 | std::cout << "Wav buffer size: " << wavBuf.size() << std::endl; 91 | std::cout << "Wav buffer pointer: " << static_cast(wavBuf.data()) << std::endl; 92 | 93 | // 停止音频流 94 | err = Pa_StopStream(stream); 95 | printf("音频流停止"); 96 | if (err != paNoError) { 97 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 98 | } 99 | Pa_CloseStream(stream); // 关闭音频流 100 | Pa_Terminate(); // 终止 PortAudio 库 101 | 102 | // 获取开始时间和结束时间用于计算时间消耗 103 | struct timeval start, end; 104 | float timeuse, wavDur = static_cast(wavBuf.size()) / SAMPLE_RATE; // 计算录音的时长 105 | 106 | gettimeofday(&start, NULL); // 获取模型加载开始时间 107 | void *asrData = asrInit("../model/am.model", "../model/char.txt", "../model/lm.model", false); // 初始化 ASR 模型 108 | gettimeofday(&end, NULL); // 获取模型加载结束时间 109 | timeuse = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; // 计算模型加载时间 110 | std::cout << "Model loading time costs: " << std::setprecision(6) << timeuse / 1000000 << 's' << "\n"; // 输出模型加载时间 111 | 112 | gettimeofday(&start, NULL); // 获取 ASR 开始时间 113 | try { 114 | asrResult = asrRun_without_vad(asrData, wavBuf.data(), wavBuf.size()); 115 | } catch (const std::exception &e) { 116 | std::cerr << "Exception caught: " << e.what() << std::endl; 117 | } 118 | // std::string asrResult = asrRun_without_vad(asrData, wavBuf.data(), wavBuf.size()); // 运行 ASR 识别,传入录音数据 119 | gettimeofday(&end, NULL); // 获取 ASR 结束时间 120 | 121 | 122 | std::cout << "Wav buffer size: " << wavBuf.size() << std::endl; 123 | std::cout << "Wav buffer pointer: " << wavBuf.data() << std::endl; 124 | 125 | std::cout << "Asr Result: " << asrResult << "\n"; // 输出识别结果 126 | timeuse = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; // 计算 ASR 识别时间 127 | std::cout << "Wav duration: " << std::setprecision(6) << wavDur << "s, Asr Decoding time costs: " << std::setprecision(6) << timeuse / 1000000 << "s, RTF: " << std::setprecision(6) << (timeuse / 1000000) / wavDur << "\n"; // 输出识别时间和实时因子(RTF) 128 | 129 | return asrResult; // 返回识别结果 130 | } 131 | 132 | -------------------------------------------------------------------------------- /speech_to_text.h: -------------------------------------------------------------------------------- 1 | #ifndef SPEECH_TO_TEXT_H 2 | #define SPEECH_TO_TEXT_H 3 | 4 | #include 5 | 6 | std::string speech_to_text(); 7 | 8 | #endif // SPEECH_TO_TEXT_H 9 | 10 | -------------------------------------------------------------------------------- /text_to_Speech.cpp: -------------------------------------------------------------------------------- 1 | #include "SynthesizerTrn.h" 2 | #include "utils.h" 3 | #include "Hanz2Piny.h" 4 | #include "hanzi2phoneid.h" 5 | #include "portaudio.h" 6 | #include "text_to_Speech.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | 14 | using namespace std; 15 | 16 | void convertAudioToWavBuf(char* toBuf, char* fromBuf, int totalAudioLen) { 17 | char* header = toBuf; 18 | int byteRate = 16 * 16000 * 1 / 8; 19 | int totalDataLen = totalAudioLen + 36; 20 | int channels = 1; 21 | int longSampleRate = 16000; 22 | 23 | header[0] = 'R'; // RIFF/WAVE header 24 | header[1] = 'I'; 25 | header[2] = 'F'; 26 | header[3] = 'F'; 27 | header[4] = (char)(totalDataLen & 0xff); 28 | header[5] = (char)((totalDataLen >> 8) & 0xff); 29 | header[6] = (char)((totalDataLen >> 16) & 0xff); 30 | header[7] = (char)((totalDataLen >> 24) & 0xff); 31 | header[8] = 'W'; 32 | header[9] = 'A'; 33 | header[10] = 'V'; 34 | header[11] = 'E'; 35 | header[12] = 'f'; // 'fmt ' chunk 36 | header[13] = 'm'; 37 | header[14] = 't'; 38 | header[15] = ' '; 39 | header[16] = 16; // 4 bytes: size of 'fmt ' chunk 40 | header[17] = 0; 41 | header[18] = 0; 42 | header[19] = 0; 43 | header[20] = 1; // format = 1 44 | header[21] = 0; 45 | header[22] = (char)channels; 46 | header[23] = 0; 47 | header[24] = (char)(longSampleRate & 0xff); 48 | header[25] = (char)((longSampleRate >> 8) & 0xff); 49 | header[26] = (char)((longSampleRate >> 16) & 0xff); 50 | header[27] = (char)((longSampleRate >> 24) & 0xff); 51 | header[28] = (char)(byteRate & 0xff); 52 | header[29] = (char)((byteRate >> 8) & 0xff); 53 | header[30] = (char)((byteRate >> 16) & 0xff); 54 | header[31] = (char)((byteRate >> 24) & 0xff); 55 | header[32] = (char)(1 * 16 / 8); // block align 56 | header[33] = 0; 57 | header[34] = 16; // bits per sample 58 | header[35] = 0; 59 | header[36] = 'd'; 60 | header[37] = 'a'; 61 | header[38] = 't'; 62 | header[39] = 'a'; 63 | header[40] = (char)(totalAudioLen & 0xff); 64 | header[41] = (char)((totalAudioLen >> 8) & 0xff); 65 | header[42] = (char)((totalAudioLen >> 16) & 0xff); 66 | header[43] = (char)((totalAudioLen >> 24) & 0xff); 67 | 68 | memcpy(toBuf + 44, fromBuf, totalAudioLen); 69 | } 70 | 71 | 72 | 73 | int text_to_speech(const char* textstring, char* modelFilePath, char* outputWavFilePath) 74 | { 75 | const Hanz2Piny hanz2piny; 76 | 77 | std::string line(textstring); 78 | 79 | // 如果字符串是UTF-8编码,并且以BOM开头,移除BOM 80 | if (hanz2piny.isStartWithBom(line)) { 81 | line = std::string(line.cbegin() + 3, line.cend()); 82 | } 83 | 84 | float* dataW = NULL; 85 | int32_t modelSize = ttsLoadModel(const_cast(modelFilePath), &dataW); 86 | 87 | SynthesizerTrn* synthesizer = new SynthesizerTrn(dataW, modelSize); 88 | int32_t spkNum = synthesizer->getSpeakerNum(); 89 | std::cout << "Available speakers in the model are " << spkNum << std::endl; 90 | 91 | PaError err = Pa_Initialize(); 92 | if (err != paNoError) { 93 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 94 | return -1; 95 | } 96 | 97 | int32_t dataLen; 98 | int16_t* audioData = synthesizer->infer(line, 0, 1.0, dataLen); 99 | 100 | int totalAudioLen = dataLen * sizeof(int16_t); 101 | char* wavBuf = new char[totalAudioLen + 44]; 102 | convertAudioToWavBuf(wavBuf, reinterpret_cast(audioData), totalAudioLen); 103 | 104 | std::ofstream outFile(outputWavFilePath, std::ios::binary); 105 | outFile.write(wavBuf, totalAudioLen + 44); 106 | outFile.close(); 107 | 108 | // PortAudio playback 109 | PaStream* stream; 110 | err = Pa_OpenDefaultStream(&stream, 0, 1, paInt16, 16000, 256, NULL, NULL); 111 | if (err != paNoError) { 112 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 113 | return -1; 114 | } 115 | 116 | err = Pa_StartStream(stream); 117 | if (err != paNoError) { 118 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 119 | return -1; 120 | } 121 | 122 | err = Pa_WriteStream(stream, audioData, dataLen); 123 | if (err != paNoError) { 124 | std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; 125 | return -1; 126 | } 127 | 128 | Pa_CloseStream(stream); 129 | Pa_Terminate(); 130 | 131 | delete[] wavBuf; 132 | delete synthesizer; 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /text_to_Speech.h: -------------------------------------------------------------------------------- 1 | #ifndef TEXT_TO_SPEECH_H 2 | #define TEXT_TO_SPEECH_H 3 | 4 | #include 5 | 6 | int text_to_speech(const char* textstring, char* modelFilePath, char* outputWavFilePath); 7 | 8 | #endif // TEXT_TO_SPEECH_H 9 | 10 | -------------------------------------------------------------------------------- /演示视频.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dadDR/rkllm_talking/eecb341d03181b3f4052f466aaa9a5a1422a20e9/演示视频.mp4 --------------------------------------------------------------------------------