├── Linux
│   ├── CMakeLists.txt
│   ├── include
│   │   ├── phonemize.h
│   │   └── wavfile.hpp
│   ├── main.cpp
│   └── src
│       └── phonemize.cpp
├── MSCV
│   ├── Vits2-onnx-cpp.sln
│   ├── Vits2-onnx-cpp.vcxproj
│   ├── Vits2-onnx-cpp.vcxproj.filters
│   ├── Vits2-onnx-cpp.vcxproj.user
│   ├── VitsONNX.cpp
│   ├── VitsONNX.h
│   ├── main.cpp
│   ├── phonemize.cpp
│   ├── phonemize.h
│   └── wavfile.h
└── README.md

/Linux/CMakeLists.txt:
--------------------------------------------------------------------------------
# cmake needs this line
cmake_minimum_required(VERSION 3.10)

# Define project name
project(vits)
set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to build with")
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_CXX_EXTENSIONS OFF)

# Find ONNX Runtime. You may need to adjust the HINTS paths below if the
# headers and library are installed somewhere other than /usr/local.
find_path(ONNX_RUNTIME_SESSION_INCLUDE_DIRS onnxruntime_cxx_api.h HINTS /usr/local/include/onnxruntime/)
find_library(ONNX_RUNTIME_LIB onnxruntime HINTS /usr/local/lib)

# Import the prebuilt espeak-ng shared library shipped next to the project.
set(ESPEAK_NG_DIR ${PROJECT_SOURCE_DIR}/espeak-ng)
include_directories(${ESPEAK_NG_DIR}/include/)
add_library(espeak-ng SHARED IMPORTED)
set_property(TARGET espeak-ng PROPERTY IMPORTED_LOCATION ${ESPEAK_NG_DIR}/lib/libespeak-ng.so)

include_directories(${PROJECT_SOURCE_DIR}/include)

file(GLOB_RECURSE SOURCES "${PROJECT_SOURCE_DIR}/main.cpp" "${PROJECT_SOURCE_DIR}/src/*.cpp" "${PROJECT_SOURCE_DIR}/src/*.c" "${PROJECT_SOURCE_DIR}/src/*.h" "${PROJECT_SOURCE_DIR}/src/*.hpp")

add_executable(${PROJECT_NAME} ${SOURCES})

target_include_directories(vits PRIVATE ${ONNX_RUNTIME_SESSION_INCLUDE_DIRS})
target_link_libraries(vits PRIVATE ${ONNX_RUNTIME_LIB} espeak-ng)
--------------------------------------------------------------------------------
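A quick way to validate the paths CMake resolved above is a throwaway sanity-check program. This is a sketch, not a file in the repository (the name sanity_check.cpp is hypothetical); it only queries library metadata from the two dependencies, so it needs no model or voice data:

// sanity_check.cpp -- hypothetical helper, not part of this repo.
// Verifies that the onnxruntime and espeak-ng found by CMake actually link.
#include <iostream>
#include <onnxruntime_cxx_api.h>
#include "espeak-ng/speak_lib.h"

int main() {
  std::cout << "ONNX Runtime version: "
            << OrtGetApiBase()->GetVersionString() << std::endl;
  std::cout << "eSpeak NG info: " << espeak_Info(nullptr) << std::endl;
  return 0;
}
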
/Linux/include/phonemize.h:
--------------------------------------------------------------------------------
#ifndef PHONEMIZE_H_
#define PHONEMIZE_H_

#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

typedef char32_t Phoneme;
typedef std::map<Phoneme, std::vector<Phoneme>> PhonemeMap;

struct eSpeakPhonemeConfig {
  std::string voice = "en-us";

  Phoneme period = U'.';      // CLAUSE_PERIOD
  Phoneme comma = U',';       // CLAUSE_COMMA
  Phoneme question = U'?';    // CLAUSE_QUESTION
  Phoneme exclamation = U'!'; // CLAUSE_EXCLAMATION
  Phoneme colon = U':';       // CLAUSE_COLON
  Phoneme semicolon = U';';   // CLAUSE_SEMICOLON
  Phoneme space = U' ';

  // Remove language switch flags like "(en)"
  bool keepLanguageFlags = false;

  std::shared_ptr<PhonemeMap> phonemeMap;
};

// Phonemizes text using espeak-ng and returns the IPA phonemes for all
// clauses joined into a single string.
//
// Assumes espeak_Initialize has already been called.
std::string phonemize_eSpeak(std::string text, eSpeakPhonemeConfig &config);

// Converts text to the sequence of model input IDs via phonemize_eSpeak.
std::vector<int64_t> text_to_sequence(const std::string& text, eSpeakPhonemeConfig &config);

#endif // PHONEMIZE_H_
--------------------------------------------------------------------------------
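As a usage sketch for the two declarations above (the espeak-ng data path is the same one Linux/main.cpp passes to espeak_Initialize; it is otherwise an assumption):

// Sketch: phonemize one sentence and print its model input IDs.
#include <iostream>
#include "espeak-ng/speak_lib.h"
#include "phonemize.h"

int main() {
  if (espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0,
                        "espeak-ng/share/espeak-ng-data/", 0) < 0)
    return 1;

  eSpeakPhonemeConfig config; // voice defaults to "en-us"
  std::cout << phonemize_eSpeak("Hello world.", config) << std::endl;
  for (int64_t id : text_to_sequence("Hello world.", config))
    std::cout << id << ' ';
  std::cout << std::endl;
  return 0;
}
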
/Linux/include/wavfile.hpp:
--------------------------------------------------------------------------------
#ifndef WAVFILE_H_
#define WAVFILE_H_

#include <cstdint>
#include <ostream>

struct WavHeader {
  uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
  uint32_t chunkSize;
  uint8_t WAVE[4] = {'W', 'A', 'V', 'E'};

  // fmt
  uint8_t fmt[4] = {'f', 'm', 't', ' '};
  uint32_t fmtSize = 16;    // bytes
  uint16_t audioFormat = 1; // PCM
  uint16_t numChannels;     // mono
  uint32_t sampleRate;      // Hertz
  uint32_t bytesPerSec;     // sampleRate * sampleWidth * channels
  uint16_t blockAlign = 2;  // 16-bit mono
  uint16_t bitsPerSample = 16;

  // data
  uint8_t data[4] = {'d', 'a', 't', 'a'};
  uint32_t dataSize;
};

// Write WAV file header only
void writeWavHeader(int sampleRate, int sampleWidth, int channels,
                    uint32_t numSamples, std::ostream &audioFile) {
  WavHeader header;
  header.dataSize = numSamples * sampleWidth * channels;
  header.chunkSize = header.dataSize + sizeof(WavHeader) - 8;
  header.sampleRate = sampleRate;
  header.numChannels = channels;
  header.bytesPerSec = sampleRate * sampleWidth * channels;
  header.blockAlign = sampleWidth * channels;
  audioFile.write(reinterpret_cast<const char *>(&header), sizeof(header));
} /* writeWavHeader */

#endif // WAVFILE_H_
--------------------------------------------------------------------------------
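The header math is easy to check by hand: for one second of 22050 Hz, 16-bit mono audio, dataSize = 22050 * 2 * 1 = 44100 bytes and bytesPerSec = 44100. A small sketch (not part of the repo) that writes a valid one-second silent WAV with this header:

// Sketch: exercise writeWavHeader() with one second of silence.
#include <fstream>
#include <vector>
#include "wavfile.hpp"

int main() {
  const int sampleRate = 22050, sampleWidth = 2, channels = 1;
  std::vector<int16_t> samples(sampleRate * channels, 0); // 1 s of silence

  std::ofstream out("silence.wav", std::ios::binary);
  writeWavHeader(sampleRate, sampleWidth, channels,
                 (uint32_t)samples.size(), out);
  out.write(reinterpret_cast<const char *>(samples.data()),
            samples.size() * sizeof(int16_t));
  return 0;
}
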
/Linux/main.cpp:
--------------------------------------------------------------------------------
#include <algorithm>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <optional>
#include <string>
#include <vector>

#include "wavfile.hpp"
#include <onnxruntime_cxx_api.h>
#include "espeak-ng/speak_lib.h"
#include "phonemize.h"

const std::string instanceName{"vits"};

typedef int64_t SpeakerId;

const float MAX_WAV_VALUE = 32767.0f;

struct ModelSession {
  Ort::Session onnx;
  Ort::AllocatorWithDefaultOptions allocator;
  Ort::SessionOptions options;
  Ort::Env env;

  ModelSession() : onnx(nullptr){};
};

struct SynthesisResult {
  double inferSeconds;
  double audioSeconds;
  double realTimeFactor;
};

struct SynthesisConfig {
  // VITS inference settings
  float noiseScale = 0.667f;
  float lengthScale = 1.0f;
  float noiseW = 0.8f;

  // Audio settings
  int sampleRate = 22050;
  int sampleWidth = 2; // 16-bit
  int channels = 1;    // mono

  // Speaker id from 0 to numSpeakers - 1
  std::optional<SpeakerId> speakerId;

  // Extra silence
  float sentenceSilenceSeconds = 0.2f;
  std::optional<std::map<Phoneme, float>> phonemeSilenceSeconds;
};

void loadModel(std::string modelPath, ModelSession &session, bool useCuda) {
  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                         instanceName.c_str());
  session.env.DisableTelemetryEvents();

  if (useCuda) {
    // Use CUDA provider
    OrtCUDAProviderOptions cuda_options{};
    cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
    session.options.AppendExecutionProvider_CUDA(cuda_options);
  }

  // Slows down performance by ~2x
  // session.options.SetIntraOpNumThreads(1);

  // Roughly doubles load time for no visible inference benefit
  // session.options.SetGraphOptimizationLevel(
  //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

  session.options.SetGraphOptimizationLevel(
      GraphOptimizationLevel::ORT_DISABLE_ALL);

  // Slows down performance very slightly
  // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);

  session.options.DisableCpuMemArena();
  session.options.DisableMemPattern();
  session.options.DisableProfiling();

#ifdef _WIN32
  // NOTE: byte-wise widening is only safe for ASCII model paths.
  auto modelPathW = std::wstring(modelPath.begin(), modelPath.end());
  auto modelPathStr = modelPathW.c_str();
#else
  auto modelPathStr = modelPath.c_str();
#endif

  session.onnx = Ort::Session(session.env, modelPathStr, session.options);
}

void Synthesize(std::vector<int64_t> &phonemeIds,
                SynthesisConfig &synthesisConfig, ModelSession &session,
                std::vector<int16_t> &audioBuffer, SynthesisResult &result) {

  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

  std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
  std::vector<float> scales{synthesisConfig.noiseScale,
                            synthesisConfig.lengthScale,
                            synthesisConfig.noiseW};

  std::vector<Ort::Value> inputTensors;
  std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
      phonemeIdsShape.size()));

  std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
      phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));

  std::vector<int64_t> scalesShape{(int64_t)scales.size()};
  inputTensors.push_back(
      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
                                      scalesShape.data(), scalesShape.size()));

  // Add speaker id.
  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
  std::vector<int64_t> speakerId{
      (int64_t)synthesisConfig.speakerId.value_or(0)};
  std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};

  if (synthesisConfig.speakerId) {
    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
        speakerIdShape.size()));
  }

  // From export_onnx.py
  std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
                                            "sid"};
  std::array<const char *, 1> outputNames = {"output"};

  // Infer
  auto startTime = std::chrono::steady_clock::now();
  auto outputTensors = session.onnx.Run(
      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
      inputTensors.size(), outputNames.data(), outputNames.size());
  auto endTime = std::chrono::steady_clock::now();
  auto inferDuration = std::chrono::duration<double>(endTime - startTime);
  std::cout << "Infer time: " << inferDuration.count() << std::endl;
  result.inferSeconds = inferDuration.count();

  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
    throw std::runtime_error("Invalid output tensors");
  }

  const float *audio = outputTensors.front().GetTensorData<float>();
  auto audioShape =
      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
  int64_t audioCount = audioShape[audioShape.size() - 1];

  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
  result.realTimeFactor = 0.0;
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }

  // Get max audio value for scaling
  float maxAudioValue = 0.01f;
  for (int64_t i = 0; i < audioCount; i++) {
    float audioValue = std::abs(audio[i]);
    if (audioValue > maxAudioValue) {
      maxAudioValue = audioValue;
    }
  }

  // We know the size up front
  audioBuffer.reserve(audioCount);

  // Scale audio to fill range and convert to int16
  float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
  for (int64_t i = 0; i < audioCount; i++) {
    int16_t intAudioValue = static_cast<int16_t>(
        std::clamp(audio[i] * audioScale,
                   static_cast<float>(std::numeric_limits<int16_t>::min()),
                   static_cast<float>(std::numeric_limits<int16_t>::max())));

    audioBuffer.push_back(intAudioValue);
  }

  // Clean up
  for (std::size_t i = 0; i < outputTensors.size(); i++) {
    Ort::detail::OrtRelease(outputTensors[i].release());
  }

  for (std::size_t i = 0; i < inputTensors.size(); i++) {
    Ort::detail::OrtRelease(inputTensors[i].release());
  }
}

int main() {
  // std::cout << "Hello world!" << std::endl;
  std::string text = "Hello world.";           // placeholder sample text
  std::string model_path = "vits2_model.onnx"; // placeholder; same file name as the MSCV demo

  eSpeakPhonemeConfig eSpeakConfig;
  SynthesisConfig syncfig;
  ModelSession session;
  SynthesisResult res;

  std::vector<int16_t> audio;
  std::vector<std::vector<Phoneme>> phonemes;
  std::string espeak_data = "espeak-ng/share/espeak-ng-data/";
  int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0,
                                 espeak_data.c_str(), 0);
  if (result < 0) {
    throw std::runtime_error("Failed to initialize eSpeak");
  }

  std::ofstream audioFile("test.wav", std::ios::binary);
  loadModel(model_path, session, false);
  // std::string test = phonemize_eSpeak(text, eSpeakConfig);
  std::vector<int64_t> Phonemeid = text_to_sequence(text, eSpeakConfig);

  Synthesize(Phonemeid, syncfig, session, audio, res);

  writeWavHeader(syncfig.sampleRate, syncfig.sampleWidth, syncfig.channels,
                 (uint32_t)audio.size(), audioFile);
  audioFile.write((const char *)audio.data(), sizeof(int16_t) * audio.size());
  return 0;
}
--------------------------------------------------------------------------------
/Linux/src/phonemize.cpp:
--------------------------------------------------------------------------------
#include "phonemize.h"
#include "espeak-ng/speak_lib.h"
#include <codecvt>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <unordered_map>

// language -> phoneme -> [phoneme, ...]

std::unordered_map<char16_t, int> _symbol_to_id = {
    {u'_', 0}, {u';', 1}, {u':', 2}, {u',', 3}, {u'.', 4}, {u'!', 5}, {u'?', 6}, {u'¡', 7}, {u'¿', 8}, {u'—', 9}, {u'…', 10},
    {u'"', 11}, {u'«', 12}, {u'»', 13}, {u'“', 14}, {u'”', 15}, {u' ', 16}, {u'A', 17}, {u'B', 18}, {u'C', 19}, {u'D', 20},
    {u'E', 21}, {u'F', 22}, {u'G', 23}, {u'H', 24}, {u'I', 25}, {u'J', 26}, {u'K', 27}, {u'L', 28}, {u'M', 29}, {u'N', 30},
    {u'O', 31}, {u'P', 32}, {u'Q', 33}, {u'R', 34}, {u'S', 35}, {u'T', 36}, {u'U', 37}, {u'V', 38}, {u'W', 39}, {u'X', 40},
    {u'Y', 41}, {u'Z', 42}, {u'a', 43}, {u'b', 44}, {u'c', 45}, {u'd', 46}, {u'e', 47}, {u'f', 48}, {u'g', 49}, {u'h', 50},
    {u'i', 51}, {u'j', 52}, {u'k', 53}, {u'l', 54}, {u'm', 55}, {u'n', 56}, {u'o', 57}, {u'p', 58}, {u'q', 59}, {u'r', 60},
    {u's', 61}, {u't', 62}, {u'u', 63}, {u'v', 64}, {u'w', 65}, {u'x', 66}, {u'y', 67}, {u'z', 68}, {u'ɑ', 69}, {u'ɐ', 70},
    {u'ɒ', 71}, {u'æ', 72}, {u'ɓ', 73}, {u'ʙ', 74}, {u'β', 75}, {u'ɔ', 76}, {u'ɕ', 77}, {u'ç', 78}, {u'ɗ', 79}, {u'ɖ', 80},
    {u'ð', 81}, {u'ʤ', 82}, {u'ə', 83}, {u'ɘ', 84}, {u'ɚ', 85}, {u'ɛ', 86}, {u'ɜ', 87}, {u'ɝ', 88}, {u'ɞ', 89}, {u'ɟ', 90},
    {u'ʄ', 91}, {u'ɡ', 92}, {u'ɠ', 93}, {u'ɢ', 94}, {u'ʛ', 95}, {u'ɦ', 96}, {u'ɧ', 97}, {u'ħ', 98}, {u'ɥ', 99}, {u'ʜ', 100},
    {u'ɨ', 101}, {u'ɪ', 102}, {u'ʝ', 103}, {u'ɭ', 104}, {u'ɬ', 105}, {u'ɫ', 106}, {u'ɮ', 107}, {u'ʟ', 108}, {u'ɱ', 109},
    {u'ɯ', 110}, {u'ɰ', 111}, {u'ŋ', 112}, {u'ɳ', 113}, {u'ɲ', 114}, {u'ɴ', 115}, {u'ø', 116}, {u'ɵ', 117}, {u'ɸ', 118},
    {u'θ', 119}, {u'œ', 120}, {u'ɶ', 121}, {u'ʘ', 122}, {u'ɹ', 123}, {u'ɺ', 124}, {u'ɾ', 125}, {u'ɻ', 126}, {u'ʀ', 127},
    {u'ʁ', 128}, {u'ɽ', 129}, {u'ʂ', 130}, {u'ʃ', 131}, {u'ʈ', 132}, {u'ʧ', 133}, {u'ʉ', 134}, {u'ʊ', 135}, {u'ʋ', 136},
    {u'ⱱ', 137}, {u'ʌ', 138}, {u'ɣ', 139}, {u'ɤ', 140}, {u'ʍ', 141}, {u'χ', 142}, {u'ʎ', 143}, {u'ʏ', 144}, {u'ʑ', 145},
    {u'ʐ', 146}, {u'ʒ', 147}, {u'ʔ', 148}, {u'ʡ', 149}, {u'ʕ', 150}, {u'ʢ', 151}, {u'ǀ', 152}, {u'ǁ', 153}, {u'ǂ', 154},
    {u'ǃ', 155}, {u'ˈ', 156}, {u'ˌ', 157}, {u'ː', 158}, {u'ˑ', 159}, {u'ʼ', 160}, {u'ʴ', 161}, {u'ʰ', 162}, {u'ʱ', 163},
    {u'ʲ', 164}, {u'ʷ', 165}, {u'ˠ', 166}, {u'ˤ', 167}, {u'˞', 168}, {u'↓', 169}, {u'↑', 170}, {u'→', 171}, {u'↗', 172},
    {u'↘', 173}, {u'̩', 175}, {u'ᵻ', 177}
};

std::unordered_map<int, char16_t> _id_to_symbol = {
    {0, u'_'}, {1, u';'}, {2, u':'}, {3, u','}, {4, u'.'}, {5, u'!'}, {6, u'?'}, {7, u'¡'}, {8, u'¿'}, {9, u'—'}, {10, u'…'},
    {11, u'"'}, {12, u'«'}, {13, u'»'}, {14, u'“'}, {15, u'”'}, {16, u' '}, {17, u'A'}, {18, u'B'}, {19, u'C'}, {20, u'D'},
    {21, u'E'}, {22, u'F'}, {23, u'G'}, {24, u'H'}, {25, u'I'}, {26, u'J'}, {27, u'K'}, {28, u'L'}, {29, u'M'}, {30, u'N'},
    {31, u'O'}, {32, u'P'}, {33, u'Q'}, {34, u'R'}, {35, u'S'}, {36, u'T'}, {37, u'U'}, {38, u'V'}, {39, u'W'}, {40, u'X'},
    {41, u'Y'}, {42, u'Z'}, {43, u'a'}, {44, u'b'}, {45, u'c'}, {46, u'd'}, {47, u'e'}, {48, u'f'}, {49, u'g'}, {50, u'h'},
    {51, u'i'}, {52, u'j'}, {53, u'k'}, {54, u'l'}, {55, u'm'}, {56, u'n'}, {57, u'o'}, {58, u'p'}, {59, u'q'}, {60, u'r'},
    {61, u's'}, {62, u't'}, {63, u'u'}, {64, u'v'}, {65, u'w'}, {66, u'x'}, {67, u'y'}, {68, u'z'}, {69, u'ɑ'}, {70, u'ɐ'},
    {71, u'ɒ'}, {72, u'æ'}, {73, u'ɓ'}, {74, u'ʙ'}, {75, u'β'}, {76, u'ɔ'}, {77, u'ɕ'}, {78, u'ç'}, {79, u'ɗ'}, {80, u'ɖ'},
    {81, u'ð'}, {82, u'ʤ'}, {83, u'ə'}, {84, u'ɘ'}, {85, u'ɚ'}, {86, u'ɛ'}, {87, u'ɜ'}, {88, u'ɝ'}, {89, u'ɞ'}, {90, u'ɟ'},
    {91, u'ʄ'}, {92, u'ɡ'}, {93, u'ɠ'}, {94, u'ɢ'}, {95, u'ʛ'}, {96, u'ɦ'}, {97, u'ɧ'}, {98, u'ħ'}, {99, u'ɥ'}, {100, u'ʜ'},
    {101, u'ɨ'}, {102, u'ɪ'}, {103, u'ʝ'}, {104, u'ɭ'}, {105, u'ɬ'}, {106, u'ɫ'}, {107, u'ɮ'}, {108, u'ʟ'}, {109, u'ɱ'}, {110, u'ɯ'},
    {111, u'ɰ'}, {112, u'ŋ'}, {113, u'ɳ'}, {114, u'ɲ'}, {115, u'ɴ'}, {116, u'ø'}, {117, u'ɵ'}, {118, u'ɸ'}, {119, u'θ'}, {120, u'œ'},
    {121, u'ɶ'}, {122, u'ʘ'}, {123, u'ɹ'}, {124, u'ɺ'}, {125, u'ɾ'}, {126, u'ɻ'}, {127, u'ʀ'}, {128, u'ʁ'}, {129, u'ɽ'}, {130, u'ʂ'},
    {131, u'ʃ'}, {132, u'ʈ'}, {133, u'ʧ'}, {134, u'ʉ'}, {135, u'ʊ'}, {136, u'ʋ'}, {137, u'ⱱ'}, {138, u'ʌ'}, {139, u'ɣ'}, {140, u'ɤ'},
    {141, u'ʍ'}, {142, u'χ'}, {143, u'ʎ'}, {144, u'ʏ'}, {145, u'ʑ'}, {146, u'ʐ'}, {147, u'ʒ'}, {148, u'ʔ'}, {149, u'ʡ'}, {150, u'ʕ'},
    {151, u'ʢ'}, {152, u'ǀ'}, {153, u'ǁ'}, {154, u'ǂ'}, {155, u'ǃ'}, {156, u'ˈ'}, {157, u'ˌ'}, {158, u'ː'}, {159, u'ˑ'}, {160, u'ʼ'},
    {161, u'ʴ'}, {162, u'ʰ'}, {163, u'ʱ'}, {164, u'ʲ'}, {165, u'ʷ'}, {166, u'ˠ'}, {167, u'ˤ'}, {168, u'˞'}, {169, u'↓'}, {170, u'↑'},
    {171, u'→'}, {172, u'↗'}, {173, u'↘'}, {175, u'̩'}, {177, u'ᵻ'}
};

// convert UTF-8 string to wstring
std::wstring utf8_to_wstring(const std::string& str) {
  // NOTE: std::codecvt_utf8<wchar_t> is an assumption (the original template
  // argument was lost); it covers the BMP, which includes every symbol above.
  std::wstring_convert<std::codecvt_utf8<wchar_t>> myconv;
  return myconv.from_bytes(str);
}

std::vector<char> char_vector_from_string(const std::string& str) {
  return std::vector<char>(str.begin(), str.end());
}

std::vector<int64_t> text_to_sequence(const std::string& text, eSpeakPhonemeConfig &config) {
  std::vector<int64_t> sequence;
  std::string clean_text = phonemize_eSpeak(text, config);
  std::wstring clean_text_wstring = utf8_to_wstring(clean_text);

  // Map each phoneme character to its model input ID.
  for (char16_t symbol : clean_text_wstring) {
    if (_symbol_to_id.count(symbol) > 0) { // check that the symbol exists in the map
      int symbol_id = _symbol_to_id[symbol];
      sequence.push_back(symbol_id);
    } // else: symbols without an ID are silently skipped
  }
  return sequence;
}

std::string phonemize_eSpeak(std::string text, eSpeakPhonemeConfig &config) {
  auto voice = config.voice;
  int result = espeak_SetVoiceByName(voice.c_str());
  if (result != 0) {
    throw std::runtime_error("Failed to set eSpeak-ng voice");
  }

  // Modified by eSpeak
  std::string textCopy(text);

  std::vector<Phoneme> *sentencePhonemes = nullptr; // unused leftover
  const char *inputTextPointer = textCopy.c_str();
  int terminator = 0; // unused leftover
  std::string res = "";
  while (inputTextPointer != NULL) {
    // espeak_TextToPhonemes consumes one clause per call, advancing
    // inputTextPointer and setting it to NULL once the input is exhausted.
    std::string clausePhonemes(espeak_TextToPhonemes(
        (const void **)&inputTextPointer,
        /*textmode*/ espeakCHARS_AUTO,
        /*phonememode = IPA*/ 0x02));

    res = res + clausePhonemes;
    res = res + ", ";
  } // while inputTextPointer != NULL

  // Replace the trailing ", " with a final period.
  res.pop_back();
  res.pop_back();
  res += ".";
  std::cout << res << std::endl;
  return res;
} /* phonemize_eSpeak */
--------------------------------------------------------------------------------
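To make the table concrete, here is a sketch (not part of the repo) that looks up the hypothetical IPA string "həlˈoʊ" directly against the map defined above; what espeak-ng actually emits depends on the voice and library version:

// Sketch: trace the symbol-to-ID lookup for a hypothetical IPA string.
#include <iostream>
#include <unordered_map>

extern std::unordered_map<char16_t, int> _symbol_to_id; // defined in phonemize.cpp

int main() {
  for (char16_t c : u"həlˈoʊ") {
    if (c != 0 && _symbol_to_id.count(c) > 0)
      std::cout << _symbol_to_id[c] << ' '; // prints: 50 83 54 156 57 135
  }
  std::cout << std::endl;
  return 0;
}
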
/MSCV/Vits2-onnx-cpp.sln:
--------------------------------------------------------------------------------
(Visual Studio solution file referencing Vits2-onnx-cpp.vcxproj; its contents
are not recoverable from this dump.)
--------------------------------------------------------------------------------
/MSCV/Vits2-onnx-cpp.vcxproj:
--------------------------------------------------------------------------------
(MSBuild project file; the XML markup did not survive extraction. Recoverable
settings: Visual Studio 17.0, project GUID {BF2ED1DC-445E-4216-8FE6-2219E3C5548C},
console Application configurations for Debug/Release on Win32 and x64 with
toolset v143, C++17 (stdcpp17), additional compiler options
"/D _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /utf-8", include path
onnxruntime\include;espeak-ng\include;$(IncludePath), library path
onnxruntime\lib;espeak-ng\lib;$(LibraryPath), and linker dependencies
espeak-ng.lib;onnxruntime.lib.)
--------------------------------------------------------------------------------
/MSCV/Vits2-onnx-cpp.vcxproj.filters:
--------------------------------------------------------------------------------
(Standard Visual Studio filters file defining the default "Source Files",
"Header Files", and "Resource Files" groups; XML markup not recoverable.)
--------------------------------------------------------------------------------
/MSCV/Vits2-onnx-cpp.vcxproj.user:
--------------------------------------------------------------------------------
(Empty per-user Visual Studio settings file.)
--------------------------------------------------------------------------------
/MSCV/VitsONNX.cpp:
--------------------------------------------------------------------------------
#include <thread>

#include "VitsONNX.h"

// Byte-wise widening: only safe for ASCII model paths.
static std::basic_string<wchar_t> string_to_wstring(const std::string& str)
{
    std::wstring wide_string_arg2 = std::wstring(str.begin(), str.end());
    std::basic_string<wchar_t> modelFilepath = std::basic_string<wchar_t>(wide_string_arg2);
    return modelFilepath;
}

VitsONNX::VitsONNX(const std::string& onnx_model_path) : m_session(nullptr), m_env(nullptr)
{
    m_env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "Vits_onnxruntime_cpu");
    std::wstring onnx_model_path_wstr = string_to_wstring(onnx_model_path);

    // Use half of the available hardware threads for intra-op parallelism.
    int cpu_processor_num = std::thread::hardware_concurrency();
    cpu_processor_num /= 2;

    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(cpu_processor_num);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    session_options.SetLogSeverityLevel(4);

    // OrtSessionOptionsAppendExecutionProvider_CPU returns nullptr on success.
    if (OrtSessionOptionsAppendExecutionProvider_CPU(session_options, 0) != nullptr)
        std::cerr << "Failed to append CPU execution provider" << std::endl;
    m_session = Ort::Session(m_env, onnx_model_path_wstr.c_str(), session_options);
    //PrintModelInfo(m_session);
    this->phonemizer.Init("en-us");
}

VitsONNX::~VitsONNX() {
}

std::map<std::string, int> VitsONNX::getSynthesisConfig() {
    std::map<std::string, int> SynthesisConfig = {
        {"sampleRate", 22050}, {"sampleWidth", 2}, {"channels", 1}
    };
    return SynthesisConfig;
}

void VitsONNX::PrintModelInfo(Ort::Session& session)
{
    // print the number of model input nodes
    size_t num_input_nodes = session.GetInputCount();
    size_t num_output_nodes = session.GetOutputCount();
    std::cout << "Number of input nodes is: " << num_input_nodes << std::endl;
    std::cout << "Number of output nodes is: " << num_output_nodes << std::endl;

    // print node names (GetInputNameAllocated returns an owning smart pointer,
    // so .get() is required to stream the raw C string)
    Ort::AllocatorWithDefaultOptions allocator;
    std::cout << std::endl;
    for (size_t i = 0; i < num_input_nodes; i++)
        std::cout << "The input op-name " << i << " is: " << session.GetInputNameAllocated(i, allocator).get() << std::endl;
    for (size_t i = 0; i < num_output_nodes; i++)
        std::cout << "The output op-name " << i << " is: " << session.GetOutputNameAllocated(i, allocator).get() << std::endl;

    // print input and output dims
    for (size_t i = 0; i < num_input_nodes; i++)
    {
        std::vector<int64_t> input_dims = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
        std::cout << std::endl << "input " << i << " dim is: ";
        for (size_t j = 0; j < input_dims.size(); j++)
            std::cout << input_dims[j] << " ";
    }
    for (size_t i = 0; i < num_output_nodes; i++)
    {
        std::vector<int64_t> output_dims = session.GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
        std::cout << std::endl << "output " << i << " dim is: ";
        for (size_t j = 0; j < output_dims.size(); j++)
            std::cout << output_dims[j] << " ";
    }
}

std::vector<int16_t> VitsONNX::inference(std::string text_input) {
    std::vector<int16_t> audioBuffer;
    std::vector<int64_t> phonemeIds = this->phonemizer.text_to_sequence(text_input);
    /*for (size_t i = 0; i < phonemeIds.size(); i++) {
        std::cout << phonemeIds[i] << " ";
    }
    std::cout << std::endl;*/

    auto memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

    std::vector<int64_t> phonemeIdLengths{ (int64_t)phonemeIds.size() };

    std::vector<float> scales{ this->noiseScale,
                               this->lengthScale,
                               this->noiseW };

    std::vector<Ort::Value> inputTensors;

    std::vector<int64_t> phonemeIdsShape{ 1, (int64_t)phonemeIds.size() };
    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(memoryInfo, phonemeIds.data(), phonemeIds.size(),
        phonemeIdsShape.data(), phonemeIdsShape.size()));

    std::vector<int64_t> phomemeIdLengthsShape{ (int64_t)phonemeIdLengths.size() };
    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
        phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));

    std::vector<int64_t> scalesShape{ (int64_t)scales.size() };
    inputTensors.push_back(Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
        scalesShape.data(), scalesShape.size()));

    // NOTE: these must stay alive outside the "if" below so the tensor keeps
    // pointing at valid memory.
    std::vector<int64_t> speakerId{ (int64_t)this->speakerId.value_or(0) };
    std::vector<int64_t> speakerIdShape{ (int64_t)speakerId.size() };

    if (this->speakerId) {
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(memoryInfo, speakerId.data(), speakerId.size(),
            speakerIdShape.data(), speakerIdShape.size()));
    }

    // From export_onnx.py
    std::array<const char*, 4> inputNames = { "input", "input_lengths", "scales",
                                              "sid" };
    std::array<const char*, 1> outputNames = { "output" };

    // Infer
    auto startTime = std::chrono::steady_clock::now();

    auto outputTensors = m_session.Run(
        Ort::RunOptions{ nullptr }, inputNames.data(), inputTensors.data(),
        inputTensors.size(), outputNames.data(), outputNames.size());
    auto endTime = std::chrono::steady_clock::now();

    if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
        throw std::runtime_error("Invalid output tensors");
    }
    auto inferDuration = std::chrono::duration<double>(endTime - startTime);
    std::cout << "infer time: " << inferDuration.count() << std::endl;

    const float* audio = outputTensors.front().GetTensorData<float>();
    auto audioShape = outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
    int64_t audioCount = audioShape[audioShape.size() - 1];

    // Get max audio value for scaling
    float maxAudioValue = 0.01f;
    for (int64_t i = 0; i < audioCount; i++) {
        float audioValue = std::abs(audio[i]);
        if (audioValue > maxAudioValue) {
            maxAudioValue = audioValue;
        }
    }

    // We know the size up front
    audioBuffer.reserve(audioCount);

    // Scale audio to fill range and convert to int16
    float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
    for (int64_t i = 0; i < audioCount; i++) {
        int16_t intAudioValue = static_cast<int16_t>(
            std::clamp(audio[i] * audioScale,
                static_cast<float>(std::numeric_limits<int16_t>::min()),
                static_cast<float>(std::numeric_limits<int16_t>::max())));

        audioBuffer.push_back(intAudioValue);
    }
    // Clean up
    for (std::size_t i = 0; i < outputTensors.size(); i++) {
        Ort::detail::OrtRelease(outputTensors[i].release());
    }

    for (std::size_t i = 0; i < inputTensors.size(); i++) {
        Ort::detail::OrtRelease(inputTensors[i].release());
    }

    return audioBuffer;
}
--------------------------------------------------------------------------------
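inference() above prints the inference time but leaves SynthesisResult (declared in VitsONNX.h below) unused. A sketch of how the timing block inside inference() could populate it, mirroring what the Linux Synthesize() does with inferDuration, audioCount, and the sampleRate member; this fragment is not present in the original:

// Sketch: fragment to slot in near the end of inference().
SynthesisResult result;
result.inferSeconds = inferDuration.count();
result.audioSeconds = (double)audioCount / (double)sampleRate;
result.realTimeFactor =
    result.audioSeconds > 0 ? result.inferSeconds / result.audioSeconds : 0.0;
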
/MSCV/VitsONNX.h:
--------------------------------------------------------------------------------
#pragma once
#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

#include "onnxruntime_cxx_api.h"
#include "cpu_provider_factory.h"
#include "phonemize.h"

struct SynthesisResult {
    double inferSeconds = 0;
    double audioSeconds = 0;
    double realTimeFactor = 0;
};

class VitsONNX
{
public:
    VitsONNX() = delete;
    VitsONNX(const std::string& onnx_model_path);
    virtual ~VitsONNX();

    std::map<std::string, int> getSynthesisConfig();
    std::vector<int16_t> inference(std::string text_input);

private:
    void PrintModelInfo(Ort::Session& session);
    // Element types below are assumed; both members are currently unused.
    std::vector<std::string> Phonemes;
    std::vector<int64_t> PhonemeIDs;
private:
    Ort::Env m_env;
    Ort::Session m_session;

    std::optional<int64_t> speakerId;
    float noiseScale = 0.667f;
    float lengthScale = 1.0f;
    float noiseW = 0.8f;

    // Audio settings
    int sampleRate = 22050;
    int sampleWidth = 2; // 16-bit
    int channels = 1;    // mono

    const float MAX_WAV_VALUE = 32767.0f;

    PhonemizerEngine phonemizer;
};
--------------------------------------------------------------------------------
/MSCV/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <string>
#include "VitsONNX.h"
#include "wavfile.h"
#include <fstream>

int main() {
    const std::string model_path = "vits2_model.onnx";

    // Load the model (and initialize eSpeak) once, not on every loop iteration.
    VitsONNX vitsmodel(model_path);
    std::map<std::string, int> synthesisConfig = vitsmodel.getSynthesisConfig();

    while (true) {
        std::string text = "";
        std::cout << "Text input: ";
        std::getline(std::cin, text);
        std::cout << std::endl;

        if (text == "quit") {
            break;
        }

        std::vector<int16_t> audio = vitsmodel.inference(text);
        //std::cout << synthesisConfig["sampleWidth"];

        // Re-open (truncate) the file each time so test.wav always holds a
        // single, valid WAV for the most recent utterance.
        std::ofstream audioFile("test.wav", std::ios::binary);
        writeWavHeader(synthesisConfig["sampleRate"], synthesisConfig["sampleWidth"],
                       synthesisConfig["channels"], (uint32_t)audio.size(), audioFile);
        audioFile.write((const char*)audio.data(), sizeof(int16_t) * audio.size());
    }
    return 0;
}
--------------------------------------------------------------------------------
/MSCV/phonemize.cpp:
--------------------------------------------------------------------------------
#include "phonemize.h"

// convert UTF-8 string to wstring
static std::wstring utf8_to_wstring(const std::string& str) {
    // NOTE: std::codecvt_utf8<wchar_t> is an assumption (the original template
    // argument was lost); it covers the BMP, which is enough for these symbols.
    std::wstring_convert<std::codecvt_utf8<wchar_t>> myconv;
    return myconv.from_bytes(str);
}

PhonemizerEngine::PhonemizerEngine() {
}

void PhonemizerEngine::Init(std::string voice) {
    std::string espeak_data = "espeak-ng/share/espeak-ng-data/";
    int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0,
                                   espeak_data.c_str(), 0);
    if (result < 0) {
        throw std::runtime_error("Failed to initialize eSpeak");
    }
    int setvoice = espeak_SetVoiceByName(voice.c_str());
    if (setvoice != 0) {
        throw std::runtime_error("Failed to set eSpeak-ng voice");
    }
}

PhonemizerEngine::~PhonemizerEngine() {
}

std::vector<int64_t> PhonemizerEngine::text_to_sequence(const std::string& text) {
    std::vector<int64_t> sequence;
    std::string clean_text = phonemize_eSpeak(text);
    std::wstring clean_text_wstring = utf8_to_wstring(clean_text);

    for (char16_t symbol : clean_text_wstring) {
        if (this->_symbol_to_id.count(symbol) > 0) {
            int symbol_id = _symbol_to_id[symbol];
            sequence.push_back(symbol_id);
        }
    }
    return sequence;
}

std::string PhonemizerEngine::phonemize_eSpeak(std::string text) {
    std::string textCopy(text);

    std::vector<char32_t>* sentencePhonemes = nullptr; // unused leftover; element type assumed
    const char* inputTextPointer = textCopy.c_str();
    int terminator = 0; // unused leftover
    std::string res = "";
    while (inputTextPointer != NULL) {
        // espeak_TextToPhonemes consumes one clause per call, advancing
        // inputTextPointer and setting it to NULL once the input is exhausted.
        std::string clausePhonemes(espeak_TextToPhonemes(
            (const void**)&inputTextPointer,
            /*textmode*/ espeakCHARS_AUTO,
            /*phonememode = IPA*/ 0x02));

        res = res + clausePhonemes;
        res = res + ", ";
    } // while inputTextPointer != NULL

    // Replace the trailing ", " with a final period.
    res.pop_back();
    res.pop_back();
    res += ".";
    return res;
} /* phonemize_eSpeak */
--------------------------------------------------------------------------------
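Standalone usage of the engine implemented above (and declared in phonemize.h below) is a three-liner; Init() assumes the espeak-ng data directory sits next to the executable:

// Sketch: standalone use of PhonemizerEngine.
PhonemizerEngine phonemizer;
phonemizer.Init("en-us"); // throws if the espeak-ng data or the voice is missing
std::vector<int64_t> ids = phonemizer.text_to_sequence("Hello world.");
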
/MSCV/phonemize.h:
--------------------------------------------------------------------------------
#ifndef PHONEMIZE_H_
#define PHONEMIZE_H_

#include <codecvt>
#include <cstdint>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "espeak-ng/speak_lib.h"

class PhonemizerEngine {
private:

    std::unordered_map<char16_t, int> _symbol_to_id = {
        {u'_', 0}, {u';', 1}, {u':', 2}, {u',', 3}, {u'.', 4}, {u'!', 5}, {u'?', 6}, {u'¡', 7}, {u'¿', 8}, {u'—', 9}, {u'…', 10},
        {u'"', 11}, {u'«', 12}, {u'»', 13}, {u'“', 14}, {u'”', 15}, {u' ', 16}, {u'A', 17}, {u'B', 18}, {u'C', 19}, {u'D', 20},
        {u'E', 21}, {u'F', 22}, {u'G', 23}, {u'H', 24}, {u'I', 25}, {u'J', 26}, {u'K', 27}, {u'L', 28}, {u'M', 29}, {u'N', 30},
        {u'O', 31}, {u'P', 32}, {u'Q', 33}, {u'R', 34}, {u'S', 35}, {u'T', 36}, {u'U', 37}, {u'V', 38}, {u'W', 39}, {u'X', 40},
        {u'Y', 41}, {u'Z', 42}, {u'a', 43}, {u'b', 44}, {u'c', 45}, {u'd', 46}, {u'e', 47}, {u'f', 48}, {u'g', 49}, {u'h', 50},
        {u'i', 51}, {u'j', 52}, {u'k', 53}, {u'l', 54}, {u'm', 55}, {u'n', 56}, {u'o', 57}, {u'p', 58}, {u'q', 59}, {u'r', 60},
        {u's', 61}, {u't', 62}, {u'u', 63}, {u'v', 64}, {u'w', 65}, {u'x', 66}, {u'y', 67}, {u'z', 68}, {u'ɑ', 69}, {u'ɐ', 70},
        {u'ɒ', 71}, {u'æ', 72}, {u'ɓ', 73}, {u'ʙ', 74}, {u'β', 75}, {u'ɔ', 76}, {u'ɕ', 77}, {u'ç', 78}, {u'ɗ', 79}, {u'ɖ', 80},
        {u'ð', 81}, {u'ʤ', 82}, {u'ə', 83}, {u'ɘ', 84}, {u'ɚ', 85}, {u'ɛ', 86}, {u'ɜ', 87}, {u'ɝ', 88}, {u'ɞ', 89}, {u'ɟ', 90},
        {u'ʄ', 91}, {u'ɡ', 92}, {u'ɠ', 93}, {u'ɢ', 94}, {u'ʛ', 95}, {u'ɦ', 96}, {u'ɧ', 97}, {u'ħ', 98}, {u'ɥ', 99}, {u'ʜ', 100},
        {u'ɨ', 101}, {u'ɪ', 102}, {u'ʝ', 103}, {u'ɭ', 104}, {u'ɬ', 105}, {u'ɫ', 106}, {u'ɮ', 107}, {u'ʟ', 108}, {u'ɱ', 109},
        {u'ɯ', 110}, {u'ɰ', 111}, {u'ŋ', 112}, {u'ɳ', 113}, {u'ɲ', 114}, {u'ɴ', 115}, {u'ø', 116}, {u'ɵ', 117}, {u'ɸ', 118},
        {u'θ', 119}, {u'œ', 120}, {u'ɶ', 121}, {u'ʘ', 122}, {u'ɹ', 123}, {u'ɺ', 124}, {u'ɾ', 125}, {u'ɻ', 126}, {u'ʀ', 127},
        {u'ʁ', 128}, {u'ɽ', 129}, {u'ʂ', 130}, {u'ʃ', 131}, {u'ʈ', 132}, {u'ʧ', 133}, {u'ʉ', 134}, {u'ʊ', 135}, {u'ʋ', 136},
        {u'ⱱ', 137}, {u'ʌ', 138}, {u'ɣ', 139}, {u'ɤ', 140}, {u'ʍ', 141}, {u'χ', 142}, {u'ʎ', 143}, {u'ʏ', 144}, {u'ʑ', 145},
        {u'ʐ', 146}, {u'ʒ', 147}, {u'ʔ', 148}, {u'ʡ', 149}, {u'ʕ', 150}, {u'ʢ', 151}, {u'ǀ', 152}, {u'ǁ', 153}, {u'ǂ', 154},
        {u'ǃ', 155}, {u'ˈ', 156}, {u'ˌ', 157}, {u'ː', 158}, {u'ˑ', 159}, {u'ʼ', 160}, {u'ʴ', 161}, {u'ʰ', 162}, {u'ʱ', 163},
        {u'ʲ', 164}, {u'ʷ', 165}, {u'ˠ', 166}, {u'ˤ', 167}, {u'˞', 168}, {u'↓', 169}, {u'↑', 170}, {u'→', 171}, {u'↗', 172},
        {u'↘', 173}, {u'̩', 175}, {u'ᵻ', 177}
    };

    // Phonemizes text using espeak-ng and returns the IPA phonemes for all
    // clauses joined into a single string.
    //
    // Assumes espeak_Initialize has already been called (see Init()).
private:
    std::string phonemize_eSpeak(std::string text);
public:
    PhonemizerEngine();
    void Init(std::string voice);
    ~PhonemizerEngine();
    std::vector<int64_t> text_to_sequence(const std::string& text);
};
#endif // PHONEMIZE_H_
--------------------------------------------------------------------------------
/MSCV/wavfile.h:
--------------------------------------------------------------------------------
#ifndef WAVFILE_H_
#define WAVFILE_H_

#include <cstdint>
#include <ostream>

struct WavHeader {
    uint8_t RIFF[4] = { 'R', 'I', 'F', 'F' };
    uint32_t chunkSize;
    uint8_t WAVE[4] = { 'W', 'A', 'V', 'E' };

    // fmt
    uint8_t fmt[4] = { 'f', 'm', 't', ' ' };
    uint32_t fmtSize = 16;    // bytes
    uint16_t audioFormat = 1; // PCM
    uint16_t numChannels;     // mono
    uint32_t sampleRate;      // Hertz
    uint32_t bytesPerSec;     // sampleRate * sampleWidth * channels
    uint16_t blockAlign = 2;  // 16-bit mono
    uint16_t bitsPerSample = 16;

    // data
    uint8_t data[4] = { 'd', 'a', 't', 'a' };
    uint32_t dataSize;
};

// Write WAV file header only
static void writeWavHeader(int sampleRate, int sampleWidth, int channels,
                           uint32_t numSamples, std::ostream& audioFile) {
    WavHeader header;
    header.dataSize = numSamples * sampleWidth * channels;
    header.chunkSize = header.dataSize + sizeof(WavHeader) - 8;
    header.sampleRate = sampleRate;
    header.numChannels = channels;
    header.bytesPerSec = sampleRate * sampleWidth * channels;
    header.blockAlign = sampleWidth * channels;
    audioFile.write(reinterpret_cast<const char*>(&header), sizeof(header));
} /* writeWavHeader */

#endif // WAVFILE_H_
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Vits2-onnx-cpp
This repository contains C++ source code for running inference with the VITS2 TTS model on Linux (Ubuntu) and Windows. The model is exported to ONNX format and executed from C++ through the onnxruntime library, with espeak-ng handling text-to-phoneme conversion.

# Special mentions
[@p0p4k](https://github.com/p0p4k) for the vits2 PyTorch implementation (please check his awesome [vits2_pytorch](https://github.com/p0p4k/vits2_pytorch) repo).
--------------------------------------------------------------------------------