├── .gitignore
├── Dockerfile.cuda.cn
├── README.md
├── app.py
├── assets
│   ├── app.js
│   ├── audio_process.js
│   ├── images
│   │   ├── record.svg
│   │   └── speaking.svg
│   ├── index.html
│   └── voice.png
├── examples
│   └── sherpa_examples.py
├── requirements.cuda.txt
├── requirements.txt
├── screenshot.jpg
└── voiceapi
    ├── asr.py
    └── tts.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 | venv
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 | .env*
17 | .vscode/
18 | .idea/
19 | # Mac OS file
20 | .DS_Store
21 | *.pyc
22 | __pycache__/
23 | *~
24 | .venv
25 | frpc.ini
26 |
27 | models/
28 | examples/*.wav
29 | models
30 |
--------------------------------------------------------------------------------
/Dockerfile.cuda.cn:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04
2 |
3 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list
4 | RUN apt-get update -y && apt-get install -y python3 python3-pip libasound2 libcublas-12-6 libcudnn8-dev
5 | RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/web/simple
6 | RUN pip3 install sherpa-onnx==1.11.1+cuda -f https://k2-fsa.github.io/sherpa/onnx/cuda-cn.html
7 |
8 | WORKDIR /app
9 | ADD requirements.cuda.txt /app/
10 | RUN pip3 install -r requirements.cuda.txt
11 |
12 | ADD . /app/
13 | ENTRYPOINT ["python3", "app.py"]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # voiceapi - A simple and clean voice transcription/synthesis API with sherpa-onnx
2 |
3 | Thanks to [k2-fsa/sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx), we can easily build a voice API with Python.
4 |
5 |
6 | ## Supported models
7 | | Model | Language | Type | Description |
8 | | -------------------------------------- | ----------------------------- | ----------- | ----------------------------------- |
9 | | zipformer-bilingual-zh-en-2023-02-20 | Chinese + English | Online ASR | Streaming Zipformer, Bilingual |
10 | | sense-voice-zh-en-ja-ko-yue-2024-07-17  | Chinese + English + Japanese + Korean + Cantonese | Offline ASR | SenseVoice, Multilingual            |
11 | | paraformer-trilingual-zh-cantonese-en | Chinese + Cantonese + English | Offline ASR | Paraformer, Trilingual |
12 | | paraformer-en-2024-03-09 | English | Offline ASR | Paraformer, English |
13 | | vits-zh-hf-theresa | Chinese | TTS | VITS, Chinese, 804 speakers |
14 | | melo-tts-zh_en                          | Chinese + English             | TTS         | Melo, Chinese + English, 1 speaker  |
15 | | kokoro-multi-lang-v1_0                  | Chinese + English             | TTS         | Kokoro, Chinese + English, 53 speakers |
16 |
17 | ## Run the app locally
18 | Python 3.10+ is required
19 |
20 | ```shell
21 | python3 -m venv venv
22 | . venv/bin/activate
23 |
24 | pip install -r requirements.txt
25 | python app.py
26 | ```
27 |
28 | Visit `http://localhost:8000/` to see the demo page
29 |
30 | ## Build the CUDA image (for users in China)
31 | ```shell
32 | docker build -t voiceapi:cuda_dev -f Dockerfile.cuda.cn .
33 | ```
34 |
35 | ## Streaming API (via WebSocket)
36 | ### /asr
37 | Send 16-bit PCM audio data to the server, and the server will return transcription results.
38 | - `samplerate` can be set in the query string, default is 16000.
39 |
40 | The server will return the transcription result in JSON format, with the following fields:
41 | - `text`: the transcription result
42 | - `finished`: whether the segment is finished
43 | - `idx`: the index of the segment
44 |
45 | ```javascript
46 | const ws = new WebSocket('ws://localhost:8000/asr?samplerate=16000');
47 | ws.onopen = () => {
48 | console.log('connected');
50 | };
51 | ws.onmessage = (e) => {
52 | const data = JSON.parse(e.data);
53 | const { text, finished, idx } = data;
54 | // do something with text
55 | // finished is true when the segment is finished
56 | };
57 | // send audio data
58 | // PCM 16bit, with samplerate
59 | ws.send(int16Array.buffer);
60 | ```
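
The same endpoint can also be driven from Python. Below is a minimal client sketch, assuming the third-party `websockets` package and a 16 kHz, mono, 16-bit PCM file named `test.wav` (both are assumptions for illustration, not part of this repo):

```python
import asyncio
import json
import wave

import websockets  # assumed extra dependency: pip install websockets


async def transcribe(path="test.wav"):
    # test.wav is assumed to be 16 kHz, mono, 16-bit PCM
    async with websockets.connect("ws://localhost:8000/asr?samplerate=16000") as ws:

        async def send_pcm():
            with wave.open(path, "rb") as wav:
                chunk_frames = wav.getframerate() // 10  # ~100 ms per chunk
                while True:
                    frames = wav.readframes(chunk_frames)
                    if not frames:
                        break
                    await ws.send(frames)
                    await asyncio.sleep(0.1)  # pace roughly in real time

        async def recv_results():
            # prints results until the connection is closed (Ctrl-C to stop)
            async for message in ws:
                r = json.loads(message)
                print(r["idx"], r["text"], "(finished)" if r["finished"] else "")

        await asyncio.gather(send_pcm(), recv_results())


asyncio.run(transcribe())
```
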
61 | ### /tts
62 | Send text to the server, and the server will return the synthesized audio data.
63 | - `samplerate` can be set in the query string, default is 16000.
64 | - `sid` is the Speaker ID, default is 0.
65 | - `speed` is the speed of the synthesized audio, default is 1.0.
66 | - `chunk_size` is the size of the audio chunk, default is 1024.
67 |
68 | The server returns the synthesized audio as binary WebSocket messages:
69 | - The audio data is 16-bit PCM, sent in chunks of `chunk_size` bytes.
70 | - When synthesis finishes, the server sends the result in JSON format, with the following fields:
71 |     - `elapsed`: the elapsed time
72 |     - `progress`: the progress of the synthesis
73 |     - `duration`: the duration of the synthesized audio
74 |     - `size`: the size of the synthesized audio data
75 |
76 | ```javascript
77 | const ws = new WebSocket('ws://localhost:8000/tts?samplerate=16000');
78 | ws.onopen = () => {
79 | console.log('connected');
80 | ws.send('Your text here');
81 | };
82 | ws.onmessage = (e) => {
83 | if (e.data instanceof Blob) {
84 | // Chunked audio data
85 | e.data.arrayBuffer().then((arrayBuffer) => {
86 | const int16Array = new Int16Array(arrayBuffer);
87 | let float32Array = new Float32Array(int16Array.length);
88 | for (let i = 0; i < int16Array.length; i++) {
89 | float32Array[i] = int16Array[i] / 32768.;
90 | }
91 | playNode.port.postMessage({ message: 'audioData', audioData: float32Array });
92 | });
93 | } else {
94 | // The server will return the synthesized result
95 | const {elapsed, progress, duration, size } = JSON.parse(e.data);
96 | this.elapsedTime = elapsed;
97 | }
98 | };
99 | ```
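
Outside the browser, the stream can be consumed from Python and written to a WAV file. A minimal sketch, again assuming the third-party `websockets` package (the output filename is arbitrary):

```python
import asyncio
import json
import wave

import websockets  # assumed extra dependency: pip install websockets


async def synthesize(text, out_path="tts_out.wav", samplerate=16000):
    uri = f"ws://localhost:8000/tts?samplerate={samplerate}&sid=0&speed=1.0"
    pcm = bytearray()
    async with websockets.connect(uri) as ws:
        await ws.send(text)
        async for message in ws:
            if isinstance(message, (bytes, bytearray)):
                pcm.extend(message)  # a chunk of 16-bit PCM audio
            else:
                # final JSON result: elapsed / progress / duration / size
                print(json.loads(message))
                break

    with wave.open(out_path, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)  # 16-bit samples
        wav.setframerate(samplerate)
        wav.writeframes(bytes(pcm))


asyncio.run(synthesize("Hello, world!"))
```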
100 |
101 | ## Non-streaming API
102 | ### /tts
103 | Send text to the server, and the server will return the synthesized audio data.
104 |
105 | - `text` is the text to be synthesized.
106 | - `samplerate` can be set in the query string, default is 16000.
107 | - `sid` is the Speaker ID, default is 0.
108 | - `speed` is the speed of the synthesized audio, default is 1.0.
109 |
110 | ```shell
111 | curl -X POST "http://localhost:8000/tts" \
112 | -H "Content-Type: application/json" \
113 | -d '{
114 | "text": "Hello, world!",
115 | "sid": 0,
116 | "samplerate": 16000
117 | }' -o helloworld.wav
118 | ```
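
The same request from Python, sketched with the `requests` package (an assumed extra dependency, not listed in `requirements.txt`):

```python
import requests  # assumed extra dependency: pip install requests

resp = requests.post(
    "http://localhost:8000/tts",
    json={"text": "Hello, world!", "sid": 0, "samplerate": 16000, "speed": 1.0},
    timeout=120,
)
resp.raise_for_status()

# the response body is a WAV file
with open("helloworld.wav", "wb") as f:
    f.write(resp.content)
```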
119 |
120 | ## Download models
121 | All models are stored in the `models` directory.
122 | Only download the models you need, and extract each downloaded archive into `models`. The defaults are:
123 | - ASR model: `sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17` (SenseVoice, offline, multilingual; see `--asr-model`)
124 | - TTS model: `vits-zh-hf-theresa` (VITS, Chinese; see `--tts-model`)
125 |
126 | ### silero_vad.onnx
127 | > Silero VAD is required for ASR
128 | ```bash
129 | mkdir -p silero_vad
130 | cd silero_vad
131 | curl -SL -o silero_vad.onnx https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
132 | ```
133 |
134 | ### FireRedASR-AED-L
135 | ```bash
136 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
137 | ```
138 | ### kokoro-multi-lang-v1_0
139 | ```bash
140 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
141 | ```
142 |
143 | ### vits-zh-hf-theresa
144 | ```bash
145 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-hf-theresa.tar.bz2
146 | ```
147 |
148 | ### vits-melo-tts-zh_en
149 | ```bash
150 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2
151 | ```
152 | ### sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
153 | ```bash
154 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
155 | ```
156 |
157 | ### sherpa-onnx-paraformer-trilingual-zh-cantonese-en
158 | ```bash
159 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
160 | ```
161 | ### whisper
162 | ```bash
163 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
164 | ```
165 | ### sensevoice
166 | ```bash
167 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
168 | ```
169 |
170 | ### sherpa-onnx-streaming-paraformer-bilingual-zh-en
171 | ```bash
172 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
173 | ```
174 |
179 | ### sherpa-onnx-paraformer-en
180 | ```bash
181 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-en-2024-03-09.tar.bz2
182 | ```
183 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 | from fastapi import FastAPI, HTTPException, Request, WebSocket, WebSocketDisconnect, Query
3 | from fastapi.responses import HTMLResponse, StreamingResponse
4 | from fastapi.staticfiles import StaticFiles
5 | import asyncio
6 | import logging
7 | from pydantic import BaseModel, Field
8 | import uvicorn
9 | from voiceapi.tts import TTSResult, start_tts_stream, TTSStream
10 | from voiceapi.asr import start_asr_stream, ASRStream, ASRResult
12 | import argparse
13 | import os
14 |
15 | app = FastAPI()
16 | logger = logging.getLogger(__file__)
17 |
18 |
19 | @app.websocket("/asr")
20 | async def websocket_asr(websocket: WebSocket,
21 | samplerate: int = Query(16000, title="Sample Rate",
22 | description="The sample rate of the audio."),):
23 | await websocket.accept()
24 |
25 | asr_stream: ASRStream = await start_asr_stream(samplerate, args)
26 | if not asr_stream:
27 | logger.error("failed to start ASR stream")
28 | await websocket.close()
29 | return
30 |
31 | async def task_recv_pcm():
32 | while True:
33 | pcm_bytes = await websocket.receive_bytes()
34 | if not pcm_bytes:
35 | return
36 | await asr_stream.write(pcm_bytes)
37 |
38 | async def task_send_result():
39 | while True:
40 | result: ASRResult = await asr_stream.read()
41 | if not result:
42 | return
43 | await websocket.send_json(result.to_dict())
44 | try:
45 | await asyncio.gather(task_recv_pcm(), task_send_result())
46 | except WebSocketDisconnect:
47 | logger.info("asr: disconnected")
48 | finally:
49 | await asr_stream.close()
50 |
51 |
52 | @app.websocket("/tts")
53 | async def websocket_tts(websocket: WebSocket,
54 | samplerate: int = Query(16000,
55 | title="Sample Rate",
56 | description="The sample rate of the generated audio."),
57 | interrupt: bool = Query(True,
58 | title="Interrupt",
59 | description="Interrupt the current TTS stream when a new text is received."),
60 | sid: int = Query(0,
61 | title="Speaker ID",
62 | description="The ID of the speaker to use for TTS."),
63 | chunk_size: int = Query(1024,
64 | title="Chunk Size",
65 | description="The size of the chunk to send to the client."),
66 | speed: float = Query(1.0,
67 | title="Speed",
68 | description="The speed of the generated audio."),
69 | split: bool = Query(True,
70 | title="Split",
71 | description="Split the text into sentences.")):
72 |
73 | await websocket.accept()
74 | tts_stream: TTSStream = None
75 |
76 | async def task_recv_text():
77 | nonlocal tts_stream
78 | while True:
79 | text = await websocket.receive_text()
80 | if not text:
81 | return
82 |
83 | if interrupt or not tts_stream:
84 | if tts_stream:
85 | await tts_stream.close()
86 | logger.info("tts: stream interrupt")
87 |
88 | tts_stream = await start_tts_stream(sid, samplerate, speed, args)
89 | if not tts_stream:
90 | logger.error("tts: failed to allocate tts stream")
91 | await websocket.close()
92 | return
93 | logger.info(f"tts: received: {text} (split={split})")
94 | await tts_stream.write(text, split)
95 |
96 | async def task_send_pcm():
97 | nonlocal tts_stream
98 | while not tts_stream:
99 | # wait for tts stream to be created
100 | await asyncio.sleep(0.1)
101 |
102 | while True:
103 | result: TTSResult = await tts_stream.read()
104 | if not result:
105 | return
106 |
107 | if result.finished:
108 | await websocket.send_json(result.to_dict())
109 | else:
110 | for i in range(0, len(result.pcm_bytes), chunk_size):
111 | await websocket.send_bytes(result.pcm_bytes[i:i+chunk_size])
112 |
113 | try:
114 | await asyncio.gather(task_recv_text(), task_send_pcm())
115 | except WebSocketDisconnect:
116 | logger.info("tts: disconnected")
117 | finally:
118 | if tts_stream:
119 | await tts_stream.close()
120 |
121 |
122 | class TTSRequest(BaseModel):
123 | text: str = Field(..., title="Text",
124 | description="The text to be converted to speech.",
125 | examples=["Hello, world!"])
126 | sid: int = Field(0, title="Speaker ID",
127 | description="The ID of the speaker to use for TTS.")
128 | samplerate: int = Field(16000, title="Sample Rate",
129 | description="The sample rate of the generated audio.")
130 | speed: float = Field(1.0, title="Speed",
131 | description="The speed of the generated audio.")
132 |
133 |
134 | @app.post("/tts",
135 | description="Generate speech audio from text.",
136 | response_class=StreamingResponse, responses={200: {"content": {"audio/wav": {}}}})
137 | async def tts_generate(req: TTSRequest):
138 | if not req.text:
139 | raise HTTPException(status_code=400, detail="text is required")
140 |
141 | tts_stream = await start_tts_stream(req.sid, req.samplerate, req.speed, args)
142 | if not tts_stream:
143 | raise HTTPException(
144 | status_code=500, detail="failed to start TTS stream")
145 |
146 | r = await tts_stream.generate(req.text)
147 | return StreamingResponse(r, media_type="audio/wav")
148 |
149 |
150 | if __name__ == "__main__":
151 | models_root = './models'
152 |
153 | for d in ['.', '..', '../..']:
154 | if os.path.isdir(f'{d}/models'):
155 | models_root = f'{d}/models'
156 | break
157 |
158 | parser = argparse.ArgumentParser()
159 | parser.add_argument("--port", type=int, default=8000, help="port number")
160 | parser.add_argument("--addr", type=str,
161 | default="0.0.0.0", help="serve address")
162 |
163 | parser.add_argument("--asr-provider", type=str,
164 | default="cpu", help="asr provider, cpu or cuda")
165 | parser.add_argument("--tts-provider", type=str,
166 | default="cpu", help="tts provider, cpu or cuda")
167 |
168 | parser.add_argument("--threads", type=int, default=2,
169 | help="number of threads")
170 |
171 | parser.add_argument("--models-root", type=str, default=models_root,
172 | help="model root directory")
173 |
174 | parser.add_argument("--asr-model", type=str, default='sensevoice',
175 | help="ASR model name: zipformer-bilingual, sensevoice, paraformer-trilingual, paraformer-en, fireredasr")
176 |
177 | parser.add_argument("--asr-lang", type=str, default='zh',
178 | help="ASR language, zh, en, ja, ko, yue")
179 |
180 | parser.add_argument("--tts-model", type=str, default='vits-zh-hf-theresa',
181 | help="TTS model name: vits-zh-hf-theresa, vits-melo-tts-zh_en, kokoro-multi-lang-v1_0")
182 |
183 | args = parser.parse_args()
184 |
185 | if args.tts_model == 'vits-melo-tts-zh_en' and args.tts_provider == 'cuda':
186 | logger.warning(
187 |             "vits-melo-tts-zh_en does not support CUDA, falling back to CPU")
188 | args.tts_provider = 'cpu'
189 |
190 | app.mount("/", app=StaticFiles(directory="./assets", html=True), name="assets")
191 |
192 | logging.basicConfig(format='%(levelname)s: %(asctime)s %(name)s:%(lineno)s %(message)s',
193 | level=logging.INFO)
194 | uvicorn.run(app, host=args.addr, port=args.port)
195 |
--------------------------------------------------------------------------------
/assets/app.js:
--------------------------------------------------------------------------------
1 | const demoapp = {
2 |     text: '讲个冷笑话吧,要很好笑的那种。',  // "Tell me a cold joke, a really funny one."
3 | recording: false,
4 | asrWS: null,
5 | currentText: null,
6 | disabled: false,
7 | elapsedTime: null,
8 | logs: [{ idx: 0, text: 'Happily here at ruzhila.cn.' }],
9 | async init() {
10 | },
11 | async dotts() {
12 | let audioContext = new AudioContext({ sampleRate: 16000 })
13 | await audioContext.audioWorklet.addModule('./audio_process.js')
14 |
15 | const ws = new WebSocket('/tts');
16 | ws.onopen = () => {
17 | ws.send(this.text);
18 | };
19 | const playNode = new AudioWorkletNode(audioContext, 'play-audio-processor');
20 | playNode.connect(audioContext.destination);
21 |
22 | this.disabled = true;
23 | ws.onmessage = async (e) => {
24 | if (e.data instanceof Blob) {
25 | e.data.arrayBuffer().then((arrayBuffer) => {
26 | const int16Array = new Int16Array(arrayBuffer);
27 | let float32Array = new Float32Array(int16Array.length);
28 | for (let i = 0; i < int16Array.length; i++) {
29 | float32Array[i] = int16Array[i] / 32768.;
30 | }
31 | playNode.port.postMessage({ message: 'audioData', audioData: float32Array });
32 | });
33 | } else {
34 | this.elapsedTime = JSON.parse(e.data)?.elapsed;
35 | this.disabled = false;
36 | }
37 | }
38 | },
39 |
40 | async stopasr() {
41 | if (!this.asrWS) {
42 | return;
43 | }
44 | this.asrWS.close();
45 | this.asrWS = null;
46 | this.recording = false;
47 | if (this.currentText) {
48 | this.logs.push({ idx: this.logs.length + 1, text: this.currentText });
49 | }
50 | this.currentText = null;
51 |
52 | },
53 |
54 | async doasr() {
55 | const audioConstraints = {
56 | video: false,
57 | audio: true,
58 | };
59 |
60 | const mediaStream = await navigator.mediaDevices.getUserMedia(audioConstraints);
61 |
62 | const ws = new WebSocket('/asr');
63 | let currentMessage = '';
64 |
65 | ws.onopen = () => {
66 | this.logs = [];
67 | };
68 |
69 | ws.onmessage = (e) => {
70 | const data = JSON.parse(e.data);
71 | const { text, finished, idx } = data;
72 |
73 | currentMessage = text;
74 | this.currentText = text
75 |
76 | if (finished) {
77 | this.logs.push({ text: currentMessage, idx: idx });
78 | currentMessage = '';
79 | this.currentText = null
80 | }
81 | };
82 |
83 | let audioContext = new AudioContext({ sampleRate: 16000 })
84 | await audioContext.audioWorklet.addModule('./audio_process.js')
85 |
86 | const recordNode = new AudioWorkletNode(audioContext, 'record-audio-processor');
87 | recordNode.connect(audioContext.destination);
88 | recordNode.port.onmessage = (event) => {
89 | if (ws && ws.readyState === WebSocket.OPEN) {
90 | const int16Array = event.data.data;
91 | ws.send(int16Array.buffer);
92 | }
93 | }
94 | const source = audioContext.createMediaStreamSource(mediaStream);
95 | source.connect(recordNode);
96 | this.asrWS = ws;
97 | this.recording = true;
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/assets/audio_process.js:
--------------------------------------------------------------------------------
1 | class PlayerAudioProcessor extends AudioWorkletProcessor {
2 | constructor() {
3 | super();
4 | this.buffer = new Float32Array();
5 | this.port.onmessage = (event) => {
6 | let newFetchedData = new Float32Array(this.buffer.length + event.data.audioData.length);
7 | newFetchedData.set(this.buffer, 0);
8 | newFetchedData.set(event.data.audioData, this.buffer.length);
9 | this.buffer = newFetchedData;
10 | };
11 | }
12 |
13 | process(inputs, outputs, parameters) {
14 | const output = outputs[0];
15 | const channel = output[0];
16 | const bufferLength = this.buffer.length;
17 | for (let i = 0; i < channel.length; i++) {
18 | channel[i] = (i < bufferLength) ? this.buffer[i] : 0;
19 | }
20 | this.buffer = this.buffer.slice(channel.length);
21 | return true;
22 | }
23 | }
24 |
25 | class RecordAudioProcessor extends AudioWorkletProcessor {
26 | constructor() {
27 | super();
28 | }
29 |
30 | process(inputs, outputs, parameters) {
31 | const channel = inputs[0][0];
32 | if (!channel || channel.length === 0) {
33 | return true;
34 | }
35 | const int16Array = new Int16Array(channel.length);
36 | for (let i = 0; i < channel.length; i++) {
37 | int16Array[i] = channel[i] * 32767;
38 | }
39 | this.port.postMessage({ data: int16Array });
40 | return true
41 | }
42 | }
43 |
44 | registerProcessor('play-audio-processor', PlayerAudioProcessor);
45 | registerProcessor('record-audio-processor', RecordAudioProcessor);
--------------------------------------------------------------------------------
/assets/images/record.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/images/speaking.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/index.html:
--------------------------------------------------------------------------------
(page markup not captured in this dump; the file renders the "VoiceAPI Demo / ruzhila.cn" page, with
"Click to record!" and "Click to stop recording!" controls bound to the handlers in assets/app.js)
--------------------------------------------------------------------------------