├── .gitignore
├── 1.启动ASR服务-SSL.bat
├── 1.启动ASR服务.bat
├── 2.启动WebUI.bat
├── 7.激活conda环境.bat
├── README.md
├── funasr_client_api.py
├── funasr_wss_client.py
├── funasr_wss_server.py
├── requirements.txt
├── requirements_client.txt
├── requirements_server.txt
├── web
│   ├── index.html
│   ├── main.js
│   ├── pcm.js
│   ├── recorder-core.js
│   ├── wav.js
│   └── wsconnecter.js
└── webui.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | build/
3 | *.egg-info/
4 | *.so
5 | *.mp4
6 |
7 | tmp*
8 | trial*/
9 |
10 | data
11 | data_utils/face_tracking/3DMM/*
12 | data_utils/face_parsing/79999_iter.pth
13 |
14 | pretrained
15 | *.mp4
16 | .DS_Store
17 | workspace/log_ngp.txt
18 | .idea
19 |
20 | Miniconda3/
21 | hf_download/
22 |
23 | *.pth
24 | *.pt
25 | *log.txt
26 | log.txt
27 |
28 | wav2lip/results/
29 |
30 | metahuman-stream*
31 | *.zip
32 | FunASR_WS/
33 | *.pem
--------------------------------------------------------------------------------
/1.启动ASR服务-SSL.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 |
4 | SET CONDA_PATH=.\Miniconda3
5 |
6 | REM Activate the base environment
7 | CALL %CONDA_PATH%\Scripts\activate.bat %CONDA_PATH%
8 |
9 | SET KMP_DUPLICATE_LIB_OK=TRUE
10 | SET CONDA_PATH=.\Miniconda3
11 | set HF_ENDPOINT=https://hf-mirror.com
12 | set HF_HOME=%CD%\hf_download
13 | set MODELSCOPE_CACHE=%CD%\hf_download
14 |
15 | set disable_update=True
16 |
17 | python funasr_wss_server.py --port 10096 --certfile "cert.pem" --keyfile "key.pem" --asr_model iic/SenseVoiceSmall --asr_model_revision master --asr_model_online iic/SenseVoiceSmall --asr_model_online_revision master
18 |
19 | cmd /k
--------------------------------------------------------------------------------
/1.启动ASR服务.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 |
4 | SET CONDA_PATH=.\Miniconda3
5 |
6 | REM Activate the base environment
7 | CALL %CONDA_PATH%\Scripts\activate.bat %CONDA_PATH%
8 |
9 | SET KMP_DUPLICATE_LIB_OK=TRUE
10 | SET CONDA_PATH=.\Miniconda3
11 | set HF_ENDPOINT=https://hf-mirror.com
12 | set HF_HOME=%CD%\hf_download
13 | set MODELSCOPE_CACHE=%CD%\hf_download
14 |
15 | set disable_update=True
16 |
17 | python funasr_wss_server.py --port 10096 --certfile "" --asr_model iic/SenseVoiceSmall --asr_model_revision master --asr_model_online iic/SenseVoiceSmall --asr_model_online_revision master
18 |
19 | cmd /k
--------------------------------------------------------------------------------
/2.启动WebUI.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | chcp 65001
3 |
4 | SET CONDA_PATH=.\Miniconda3
5 |
6 | REM Activate the base environment
7 | CALL %CONDA_PATH%\Scripts\activate.bat %CONDA_PATH%
8 |
9 | SET KMP_DUPLICATE_LIB_OK=TRUE
10 | SET CONDA_PATH=.\Miniconda3
11 | set HF_ENDPOINT=https://hf-mirror.com
12 | set HF_HOME=%CD%\hf_download
13 | set MODELSCOPE_CACHE=%CD%\hf_download
14 |
15 | set disable_update=True
16 |
17 | start "" "http://127.0.0.1:8101/web/index.html"
18 |
19 | python webui.py
20 |
21 | cmd /k
--------------------------------------------------------------------------------
/7.激活conda环境.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | SET CONDA_PATH=.\Miniconda3
4 |
5 | REM Activate the base environment
6 | CALL %CONDA_PATH%\Scripts\activate.bat %CONDA_PATH%
7 |
8 | cmd /k
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A WebSocket server adapted from the official FunASR demo, with an HTTP front end served via FastAPI, for real-time ASR testing in the browser
2 |
3 | Install dependencies:
4 | ```shell
5 | pip install -r requirements.txt
6 | ```
7 |
8 | Start the ASR service:
9 | ```shell
10 | python funasr_wss_server.py
11 | ```
12 |
13 | Start the WebUI:
14 | ```shell
15 | python webui.py
16 | ```
17 |
18 |
19 | Open in the browser:
20 | ```shell
21 | http://127.0.0.1:8101
22 | ```
23 |
24 | Preview:
25 | 
26 |
27 |
28 |
29 | # Service with websocket-python
30 |
31 | This is a demo of the funasr pipeline with the websocket python API. It supports offline, online, and 2pass (unified offline/online) speech recognition.
32 |
33 | ## For the Server
34 |
35 | ### Install the modelscope and funasr
36 |
37 | ```shell
38 | pip install -U modelscope funasr
39 | # Users in China can install from the SJTU mirror:
40 | # pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
41 | git clone https://github.com/alibaba/FunASR.git && cd FunASR
42 | ```
43 |
44 | ### Install the requirements for server
45 |
46 | ```shell
47 | cd runtime/python/websocket
48 | pip install -r requirements_server.txt
49 | ```
50 |
51 | ### Start server
52 |
53 | ##### API-reference
54 | ```shell
55 | python funasr_wss_server.py \
56 | --port [port id] \
57 | --asr_model [asr model_name] \
58 | --asr_model_online [asr model_name] \
59 | --punc_model [punc model_name] \
60 | --ngpu [0 or 1] \
61 | --ncpu [1 or 4] \
62 | --certfile [path of certfile for ssl] \
63 | --keyfile [path of keyfile for ssl]
64 | ```
65 | ##### Usage examples
66 | ```shell
67 | python funasr_wss_server.py --port 10095
68 | ```
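| 
| To serve over SSL, pass a certificate and key (this mirrors `1.启动ASR服务-SSL.bat` in this repo; the `.pem` files must already exist, and passing `--certfile ""` disables SSL, as `1.启动ASR服务.bat` does):
| ```shell
| python funasr_wss_server.py --port 10096 --certfile cert.pem --keyfile key.pem
| ```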
69 |
70 | ## For the client
71 |
72 | Install the requirements for client
73 | ```shell
74 | git clone https://github.com/alibaba/FunASR.git && cd FunASR
75 | cd funasr/runtime/python/websocket
76 | pip install -r requirements_client.txt
77 | ```
78 | If you want to infer from videos, install `ffmpeg`
79 | ```shell
80 | apt-get install -y ffmpeg #ubuntu
81 | # yum install -y ffmpeg # centos
82 | # brew install ffmpeg # mac
83 | # winget install ffmpeg # windows
84 | pip3 install websockets ffmpeg-python
85 | ```
86 |
87 | ### Start client
88 | #### API-reference
89 | ```shell
90 | python funasr_wss_client.py \
91 | --host [ip_address] \
92 | --port [port id] \
93 | --chunk_size ["5,10,5"=600ms, "8,8,4"=480ms] \
94 | --chunk_interval [send interval; one send covers chunk_size[1]*60/chunk_interval ms of audio] \
95 | --words_max_print [max number of words to print] \
96 | --audio_in [if set, load from wav.scp; otherwise record from the microphone] \
97 | --output_dir [if set, write the results to output_dir] \
98 | --mode [`online` for streaming asr, `offline` for non-streaming, `2pass` for unifying streaming and non-streaming asr] \
99 | --thread_num [thread_num for send data]
100 | ```
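| 
| Each unit in `chunk_size` is one 60 ms frame: `"5,10,5"` gives a 600 ms (10 × 60 ms) streaming chunk, and the outer values act as left/right context for the streaming model. The server derives its VAD chunk from the middle value as `chunk_size[1] * 60 / chunk_interval` ms per send.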
101 |
102 | #### Usage examples
103 | ##### ASR offline client
104 | Recording from the microphone
105 | ```shell
106 | # --chunk_interval: "10" -> 600/10=60ms, "5" -> 600/5=120ms, "20" -> 600/20=30ms
107 | python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode offline
108 | ```
109 | Loading from wav.scp (Kaldi style)
110 | ```shell
111 | # --chunk_interval: "10" -> 600/10=60ms, "5" -> 600/5=120ms, "20" -> 600/20=30ms
112 | python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode offline --audio_in "./data/wav.scp" --output_dir "./results"
113 | ```
114 |
115 | ##### ASR streaming client
116 | Recording from the microphone
117 | ```shell
118 | # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
119 | python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode online --chunk_size "5,10,5"
120 | ```
121 | Loading from wav.scp (Kaldi style)
122 | ```shell
123 | # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
124 | python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode online --chunk_size "5,10,5" --audio_in "./data/wav.scp" --output_dir "./results"
125 | ```
126 |
127 | ##### ASR offline/online 2pass client
128 | Recording from the microphone
129 | ```shell
130 | # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
131 | python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode 2pass --chunk_size "8,8,4"
132 | ```
133 | Loading from wav.scp (Kaldi style)
134 | ```shell
135 | # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
136 | python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode 2pass --chunk_size "8,8,4" --audio_in "./data/wav.scp" --output_dir "./results"
137 | ```
138 |
139 | #### Websocket api
140 | ```python
141 | # class Funasr_websocket_recognizer example in 3 steps
142 | # 1. create a recognizer
143 | rcg = Funasr_websocket_recognizer(host="127.0.0.1", port="30035", is_ssl=True, mode="2pass")
144 | # 2. send pcm data to the asr engine and get the asr result
145 | text = rcg.feed_chunk(data)
146 | print("text", text)
147 | # 3. get the last result, with timeout=3
148 | text = rcg.close(timeout=3)
149 | print("text", text)
150 | ```
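| 
| A fuller sketch that streams a local wav file in 60 ms chunks (it mirrors the `__main__` block of `funasr_client_api.py` in this repo; the wav path is a placeholder and the file is assumed to be 16 kHz, 16-bit mono):
| ```python
| import time
| import wave
| 
| from funasr_client_api import Funasr_websocket_recognizer
| 
| with wave.open("asr_example.wav", "rb") as wav_file:  # placeholder path
|     audio_bytes = wav_file.readframes(wav_file.getnframes())
| 
| stride = int(60 * 10 / 10 / 1000 * 16000 * 2)  # 60 ms at 16 kHz, 16-bit mono = 1920 bytes
| rcg = Funasr_websocket_recognizer(host="127.0.0.1", port="10095", is_ssl=True, mode="2pass", chunk_size="0,10,5")
| for beg in range(0, len(audio_bytes), stride):
|     text = rcg.feed_chunk(audio_bytes[beg:beg + stride], wait_time=0.02)
|     if len(text) > 0:
|         print("text", text)
|     time.sleep(0.05)
| print("text", rcg.close(timeout=3))  # flush and fetch the final result
| ```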
151 |
152 | ## Acknowledgements
153 | 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
154 | 2. We acknowledge [zhaoming](https://github.com/zhaomingwork/FunASR/tree/fix_bug_for_python_websocket) for contributing the websocket service.
155 | 3. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service of offline model.
156 |
--------------------------------------------------------------------------------
/funasr_client_api.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
3 | Reserved. MIT License (https://opensource.org/licenses/MIT)
4 |
5 | 2022-2023 by zhaomingwork@qq.com
6 | """
7 |
8 | # pip install websocket-client
9 | import ssl
10 | from websocket import ABNF
11 | from websocket import create_connection
12 | from queue import Queue
13 | import threading
14 | import traceback
15 | import json
16 | import time
17 | import numpy as np
18 |
19 |
20 | # class for recognizer in websocket
21 | class Funasr_websocket_recognizer:
22 | """
23 | python asr recognizer lib
24 |
25 | """
26 |
27 | def __init__(
28 | self,
29 | host="127.0.0.1",
30 | port="30035",
31 | is_ssl=True,
32 | chunk_size="0, 10, 5",
33 | chunk_interval=10,
34 | mode="offline",
35 | wav_name="default",
36 | ):
37 | """
38 | host: server host ip
39 | port: server port
40 | is_ssl: True for the wss protocol, False for ws
41 | """
42 | try:
43 | if is_ssl == True:
44 | ssl_context = ssl.SSLContext()
45 | ssl_context.check_hostname = False
46 | ssl_context.verify_mode = ssl.CERT_NONE
47 | uri = "wss://{}:{}".format(host, port)
48 | ssl_opt = {"cert_reqs": ssl.CERT_NONE}
49 | else:
50 | uri = "ws://{}:{}".format(host, port)
51 | ssl_context = None
52 | ssl_opt = None
53 | self.host = host
54 | self.port = port
55 |
56 | self.msg_queue = Queue() # used for recognized result text
57 |
58 | print("connect to url", uri)
59 | self.websocket = create_connection(uri, ssl=ssl_context, sslopt=ssl_opt)
60 |
61 | self.thread_msg = threading.Thread(
62 | target=Funasr_websocket_recognizer.thread_rec_msg, args=(self,)
63 | )
64 | self.thread_msg.start()
65 | chunk_size = [int(x) for x in chunk_size.split(",")]
68 |
69 | message = json.dumps(
70 | {
71 | "mode": mode,
72 | "chunk_size": chunk_size,
73 | "encoder_chunk_look_back": 4,
74 | "decoder_chunk_look_back": 1,
75 | "chunk_interval": chunk_interval,
76 | "wav_name": wav_name,
77 | "is_speaking": True,
78 | }
79 | )
80 |
81 | self.websocket.send(message)
82 |
83 | print("send json", message)
84 |
85 | except Exception as e:
86 | print("Exception:", e)
87 | traceback.print_exc()
88 |
89 | # thread for receiving messages
90 | def thread_rec_msg(self):
91 | try:
92 | while True:
93 | msg = self.websocket.recv()
94 | if msg is None or len(msg) == 0:
95 | continue
96 | msg = json.loads(msg)
97 |
98 | self.msg_queue.put(msg)
99 | except Exception as e:
100 | print("client closed")
101 |
102 | # feed data to the asr engine; wait_time is how long to wait for a result before timing out
103 | def feed_chunk(self, chunk, wait_time=0.01):
104 | try:
105 | self.websocket.send(chunk, ABNF.OPCODE_BINARY)
106 | # loop to check if there is a message, timeout in 0.01s
107 | while True:
108 | msg = self.msg_queue.get(timeout=wait_time)
109 | if self.msg_queue.empty():
110 | break
111 |
112 | return msg
113 | except:
114 | return ""
115 |
116 | def close(self, timeout=1):
117 | message = json.dumps({"is_speaking": False})
118 | self.websocket.send(message)
119 | # sleep for timeout seconds to wait for result
120 | time.sleep(timeout)
121 | msg = ""
122 | while not self.msg_queue.empty():
123 | msg = self.msg_queue.get()
124 |
125 | self.websocket.close()
126 | # only return the last msg
127 | return msg
128 |
129 |
130 | if __name__ == "__main__":
131 |
132 | print("example for Funasr_websocket_recognizer")
133 | import wave
134 |
135 | wav_path = "/Users/zhifu/Downloads/modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
136 | with wave.open(wav_path, "rb") as wav_file:
137 | params = wav_file.getparams()
138 | frames = wav_file.readframes(wav_file.getnframes())
139 | audio_bytes = bytes(frames)
140 |
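| # the stride below is 60 * 10 / 10 = 60 ms of audio; at 16 kHz, 16-bit mono that is 1920 bytes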
141 | stride = int(60 * 10 / 10 / 1000 * 16000 * 2)
142 | chunk_num = (len(audio_bytes) - 1) // stride + 1
143 | # create a recognizer
144 | rcg = Funasr_websocket_recognizer(
145 | host="127.0.0.1", port="10095", is_ssl=True, mode="2pass", chunk_size="0,10,5"
146 | )
147 | # loop to send chunks
148 | for i in range(chunk_num):
149 |
150 | beg = i * stride
151 | data = audio_bytes[beg : beg + stride]
152 |
153 | text = rcg.feed_chunk(data, wait_time=0.02)
154 | if len(text) > 0:
155 | print("text", text)
156 | time.sleep(0.05)
157 |
158 | # get last message
159 | text = rcg.close(timeout=3)
160 | print("text", text)
161 |
--------------------------------------------------------------------------------
/funasr_wss_client.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | import os
3 | import time
4 | import websockets, ssl
5 | import asyncio
6 |
7 | # import threading
8 | import argparse
9 | import json
10 | import traceback
11 | from multiprocessing import Process
12 |
13 | # from funasr.fileio.datadir_writer import DatadirWriter
14 |
15 | import logging
16 |
17 | logging.basicConfig(level=logging.ERROR)
18 |
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument(
21 | "--host", type=str, default="localhost", required=False, help="host ip, localhost, 0.0.0.0"
22 | )
23 | parser.add_argument("--port", type=int, default=10095, required=False, help="grpc server port")
24 | parser.add_argument("--chunk_size", type=str, default="5, 10, 5", help="chunk")
25 | parser.add_argument("--encoder_chunk_look_back", type=int, default=4, help="chunk")
26 | parser.add_argument("--decoder_chunk_look_back", type=int, default=0, help="chunk")
27 | parser.add_argument("--chunk_interval", type=int, default=10, help="chunk")
28 | parser.add_argument(
29 | "--hotword",
30 | type=str,
31 | default="",
32 | help="hotword file path, one hotword perline (e.g.:阿里巴巴 20)",
33 | )
34 | parser.add_argument("--audio_in", type=str, default=None, help="audio_in")
35 | parser.add_argument("--audio_fs", type=int, default=16000, help="audio_fs")
36 | parser.add_argument(
37 | "--send_without_sleep",
38 | action="store_true",
39 | default=True,
40 | help="if audio_in is set, send_without_sleep",
41 | )
42 | parser.add_argument("--thread_num", type=int, default=1, help="thread_num")
43 | parser.add_argument("--words_max_print", type=int, default=10000, help="chunk")
44 | parser.add_argument("--output_dir", type=str, default=None, help="output_dir")
45 | parser.add_argument("--ssl", type=int, default=1, help="1 for ssl connect, 0 for no ssl")
46 | parser.add_argument("--use_itn", type=int, default=1, help="1 for using itn, 0 for not itn")
47 | parser.add_argument("--mode", type=str, default="2pass", help="offline, online, 2pass")
48 |
49 | args = parser.parse_args()
50 | args.chunk_size = [int(x) for x in args.chunk_size.split(",")]
51 | print(args)
52 | # voices = asyncio.Queue()
53 | from queue import Queue
54 |
55 | voices = Queue()
56 | offline_msg_done = False
57 |
58 | if args.output_dir is not None:
59 | # if os.path.exists(args.output_dir):
60 | # os.remove(args.output_dir)
61 |
62 | if not os.path.exists(args.output_dir):
63 | os.makedirs(args.output_dir)
64 |
65 |
66 | async def record_microphone():
67 | is_finished = False
68 | import pyaudio
69 |
70 | # print("2")
71 | global voices
72 | FORMAT = pyaudio.paInt16
73 | CHANNELS = 1
74 | RATE = 16000
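| # one read/send covers 60 * chunk_size[1] / chunk_interval ms of audio (60 ms with the defaults)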
75 | chunk_size = 60 * args.chunk_size[1] / args.chunk_interval
76 | CHUNK = int(RATE / 1000 * chunk_size)
77 |
78 | p = pyaudio.PyAudio()
79 |
80 | stream = p.open(
81 | format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK
82 | )
83 | # hotwords
84 | fst_dict = {}
85 | hotword_msg = ""
86 | if args.hotword.strip() != "":
87 | if os.path.exists(args.hotword):
88 | f_scp = open(args.hotword)
89 | hot_lines = f_scp.readlines()
90 | for line in hot_lines:
91 | words = line.strip().split(" ")
92 | if len(words) < 2:
93 | print("Please checkout format of hotwords")
94 | continue
95 | try:
96 | fst_dict[" ".join(words[:-1])] = int(words[-1])
97 | except ValueError:
98 | print("Please checkout format of hotwords")
99 | hotword_msg = json.dumps(fst_dict)
100 | else:
101 | hotword_msg = args.hotword
102 |
103 | use_itn = True
104 | if args.use_itn == 0:
105 | use_itn = False
106 |
107 | message = json.dumps(
108 | {
109 | "mode": args.mode,
110 | "chunk_size": args.chunk_size,
111 | "chunk_interval": args.chunk_interval,
112 | "encoder_chunk_look_back": args.encoder_chunk_look_back,
113 | "decoder_chunk_look_back": args.decoder_chunk_look_back,
114 | "wav_name": "microphone",
115 | "is_speaking": True,
116 | "hotwords": hotword_msg,
117 | "itn": use_itn,
118 | }
119 | )
120 | # voices.put(message)
121 | await websocket.send(message)
122 | while True:
123 | data = stream.read(CHUNK)
124 | message = data
125 | # voices.put(message)
126 | await websocket.send(message)
127 | await asyncio.sleep(0.005)
128 |
129 |
130 | async def record_from_scp(chunk_begin, chunk_size):
131 | global voices
132 | is_finished = False
133 | if args.audio_in.endswith(".scp"):
134 | f_scp = open(args.audio_in)
135 | wavs = f_scp.readlines()
136 | else:
137 | wavs = [args.audio_in]
138 |
139 | # hotwords
140 | fst_dict = {}
141 | hotword_msg = ""
142 | if args.hotword.strip() != "":
143 | if os.path.exists(args.hotword):
144 | f_scp = open(args.hotword)
145 | hot_lines = f_scp.readlines()
146 | for line in hot_lines:
147 | words = line.strip().split(" ")
148 | if len(words) < 2:
149 | print("Please checkout format of hotwords")
150 | continue
151 | try:
152 | fst_dict[" ".join(words[:-1])] = int(words[-1])
153 | except ValueError:
154 | print("Please checkout format of hotwords")
155 | hotword_msg = json.dumps(fst_dict)
156 | else:
157 | hotword_msg = args.hotword
158 | print(hotword_msg)
159 |
160 | sample_rate = args.audio_fs
161 | wav_format = "pcm"
162 | use_itn = True
163 | if args.use_itn == 0:
164 | use_itn = False
165 |
166 | if chunk_size > 0:
167 | wavs = wavs[chunk_begin : chunk_begin + chunk_size]
168 | for wav in wavs:
169 | wav_splits = wav.strip().split()
170 |
171 | wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
172 | wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
173 | if not len(wav_path.strip()) > 0:
174 | continue
175 | if wav_path.endswith(".pcm"):
176 | with open(wav_path, "rb") as f:
177 | audio_bytes = f.read()
178 | elif wav_path.endswith(".wav"):
179 | import wave
180 |
181 | with wave.open(wav_path, "rb") as wav_file:
182 | params = wav_file.getparams()
183 | sample_rate = wav_file.getframerate()
184 | frames = wav_file.readframes(wav_file.getnframes())
185 | audio_bytes = bytes(frames)
186 | else:
187 | wav_format = "others"
188 | with open(wav_path, "rb") as f:
189 | audio_bytes = f.read()
190 |
191 | stride = int(60 * args.chunk_size[1] / args.chunk_interval / 1000 * sample_rate * 2)
192 | chunk_num = (len(audio_bytes) - 1) // stride + 1
193 | # print(stride)
194 |
195 | # send first time
196 | message = json.dumps(
197 | {
198 | "mode": args.mode,
199 | "chunk_size": args.chunk_size,
200 | "chunk_interval": args.chunk_interval,
201 | "encoder_chunk_look_back": args.encoder_chunk_look_back,
202 | "decoder_chunk_look_back": args.decoder_chunk_look_back,
203 | "audio_fs": sample_rate,
204 | "wav_name": wav_name,
205 | "wav_format": wav_format,
206 | "is_speaking": True,
207 | "hotwords": hotword_msg,
208 | "itn": use_itn,
209 | }
210 | )
211 |
212 | # voices.put(message)
213 | await websocket.send(message)
214 | is_speaking = True
215 | for i in range(chunk_num):
216 |
217 | beg = i * stride
218 | data = audio_bytes[beg : beg + stride]
219 | message = data
220 | # voices.put(message)
221 | await websocket.send(message)
222 | if i == chunk_num - 1:
223 | is_speaking = False
224 | message = json.dumps({"is_speaking": is_speaking})
225 | # voices.put(message)
226 | await websocket.send(message)
227 |
228 | sleep_duration = (
229 | 0.001
230 | if args.mode == "offline"
231 | else 60 * args.chunk_size[1] / args.chunk_interval / 1000
232 | )
233 |
234 | await asyncio.sleep(sleep_duration)
235 |
236 | if not args.mode == "offline":
237 | await asyncio.sleep(2)
238 | # the offline mode needs to wait until the result message is received
239 |
240 | if args.mode == "offline":
241 | global offline_msg_done
242 | while not offline_msg_done:
243 | await asyncio.sleep(1)
244 |
245 | await websocket.close()
246 |
247 |
248 | async def message(id):
249 | global websocket, voices, offline_msg_done
250 | text_print = ""
251 | text_print_2pass_online = ""
252 | text_print_2pass_offline = ""
253 | if args.output_dir is not None:
254 | ibest_writer = open(
255 | os.path.join(args.output_dir, "text.{}".format(id)), "a", encoding="utf-8"
256 | )
257 | else:
258 | ibest_writer = None
259 | try:
260 | while True:
261 |
262 | meg = await websocket.recv()
263 | meg = json.loads(meg)
264 | wav_name = meg.get("wav_name", "demo")
265 | text = meg["text"]
266 | timestamp = ""
267 | offline_msg_done = meg.get("is_final", False)
268 | if "timestamp" in meg:
269 | timestamp = meg["timestamp"]
270 |
271 | if ibest_writer is not None:
272 | if timestamp != "":
273 | text_write_line = "{}\t{}\t{}\n".format(wav_name, text, timestamp)
274 | else:
275 | text_write_line = "{}\t{}\n".format(wav_name, text)
276 | ibest_writer.write(text_write_line)
277 |
278 | if "mode" not in meg:
279 | continue
280 | if meg["mode"] == "online":
281 | text_print += "{}".format(text)
282 | text_print = text_print[-args.words_max_print :]
283 | os.system("clear")
284 | print("\rpid" + str(id) + ": " + text_print)
285 | elif meg["mode"] == "offline":
286 | if timestamp != "":
287 | text_print += "{} timestamp: {}".format(text, timestamp)
288 | else:
289 | text_print += "{}".format(text)
290 |
291 | # text_print = text_print[-args.words_max_print:]
292 | # os.system('clear')
293 | print("\rpid" + str(id) + ": " + wav_name + ": " + text_print)
294 | offline_msg_done = True
295 | else:
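| # 2pass: online partial results accumulate until the offline (final) pass replaces them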
296 | if meg["mode"] == "2pass-online":
297 | text_print_2pass_online += "{}".format(text)
298 | text_print = text_print_2pass_offline + text_print_2pass_online
299 | else:
300 | text_print_2pass_online = ""
301 | text_print = text_print_2pass_offline + "{}".format(text)
302 | text_print_2pass_offline += "{}".format(text)
303 | text_print = text_print[-args.words_max_print :]
304 | os.system("clear")
305 | print("\rpid" + str(id) + ": " + text_print)
306 | # offline_msg_done=True
307 |
308 | except Exception as e:
309 | print("Exception:", e)
310 | # traceback.print_exc()
311 | # await websocket.close()
312 |
313 |
314 | async def ws_client(id, chunk_begin, chunk_size):
315 | if args.audio_in is None:
316 | chunk_begin = 0
317 | chunk_size = 1
318 | global websocket, voices, offline_msg_done
319 |
320 | for i in range(chunk_begin, chunk_begin + chunk_size):
321 | offline_msg_done = False
322 | voices = Queue()
323 | if args.ssl == 1:
324 | ssl_context = ssl.SSLContext()
325 | ssl_context.check_hostname = False
326 | ssl_context.verify_mode = ssl.CERT_NONE
327 | uri = "wss://{}:{}".format(args.host, args.port)
328 | else:
329 | uri = "ws://{}:{}".format(args.host, args.port)
330 | ssl_context = None
331 | print("connect to", uri)
332 | async with websockets.connect(
333 | uri, subprotocols=["binary"], ping_interval=None, ssl=ssl_context
334 | ) as websocket:
335 | if args.audio_in is not None:
336 | task = asyncio.create_task(record_from_scp(i, 1))
337 | else:
338 | task = asyncio.create_task(record_microphone())
339 | task3 = asyncio.create_task(message(str(id) + "_" + str(i))) # processid+fileid
340 | await asyncio.gather(task, task3)
341 | exit(0)
342 |
343 |
344 | def one_thread(id, chunk_begin, chunk_size):
345 | asyncio.get_event_loop().run_until_complete(ws_client(id, chunk_begin, chunk_size))
346 | asyncio.get_event_loop().run_forever()
347 |
348 |
349 | if __name__ == "__main__":
350 | # for microphone
351 | if args.audio_in is None:
352 | p = Process(target=one_thread, args=(0, 0, 0))
353 | p.start()
354 | p.join()
355 | print("end")
356 | else:
357 | # calculate the number of wavs for each process
358 | if args.audio_in.endswith(".scp"):
359 | f_scp = open(args.audio_in)
360 | wavs = f_scp.readlines()
361 | else:
362 | wavs = [args.audio_in]
363 | for wav in wavs:
364 | wav_splits = wav.strip().split()
365 | wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
366 | wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
367 | audio_type = os.path.splitext(wav_path)[-1].lower()
368 |
369 | total_len = len(wavs)
370 | if total_len >= args.thread_num:
371 | chunk_size = int(total_len / args.thread_num)
372 | remain_wavs = total_len - chunk_size * args.thread_num
373 | else:
374 | chunk_size = 1
375 | remain_wavs = 0
376 |
377 | process_list = []
378 | chunk_begin = 0
379 | for i in range(args.thread_num):
380 | now_chunk_size = chunk_size
381 | if remain_wavs > 0:
382 | now_chunk_size = chunk_size + 1
383 | remain_wavs = remain_wavs - 1
384 | # process i handle wavs at chunk_begin and size of now_chunk_size
385 | p = Process(target=one_thread, args=(i, chunk_begin, now_chunk_size))
386 | chunk_begin = chunk_begin + now_chunk_size
387 | p.start()
388 | process_list.append(p)
389 |
390 | for p in process_list:
391 | p.join()
392 |
393 | print("end")
394 |
--------------------------------------------------------------------------------
/funasr_wss_server.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import websockets
4 | import time
5 | import logging
6 | import tracemalloc
7 | import numpy as np
8 | import argparse
9 | import ssl
10 | from loguru import logger
11 |
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument(
14 | "--host", type=str, default="0.0.0.0", required=False, help="host ip, localhost, 0.0.0.0"
15 | )
16 | parser.add_argument("--port", type=int, default=10095, required=False, help="grpc server port")
17 | parser.add_argument(
18 | "--asr_model",
19 | type=str,
20 | default="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
21 | help="model from modelscope",
22 | )
23 | parser.add_argument("--asr_model_revision", type=str, default="v2.0.4", help="")
24 | parser.add_argument(
25 | "--asr_model_online",
26 | type=str,
27 | default="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
28 | help="model from modelscope",
29 | )
30 | parser.add_argument("--asr_model_online_revision", type=str, default="v2.0.4", help="")
31 | parser.add_argument(
32 | "--vad_model",
33 | type=str,
34 | default="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
35 | help="model from modelscope",
36 | )
37 | parser.add_argument("--vad_model_revision", type=str, default="v2.0.4", help="")
38 | parser.add_argument(
39 | "--punc_model",
40 | type=str,
41 | default="iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
42 | help="model from modelscope",
43 | )
44 | parser.add_argument("--punc_model_revision", type=str, default="v2.0.4", help="")
45 | parser.add_argument("--ngpu", type=int, default=1, help="0 for cpu, 1 for gpu")
46 | parser.add_argument("--device", type=str, default="cuda", help="cuda, cpu")
47 | parser.add_argument("--ncpu", type=int, default=4, help="cpu cores")
48 | parser.add_argument(
49 | "--certfile",
50 | type=str,
51 | default="cert.pem",
52 | required=False,
53 | help="certfile for ssl",
54 | )
55 |
56 | parser.add_argument(
57 | "--keyfile",
58 | type=str,
59 | default="key.pem",
60 | required=False,
61 | help="keyfile for ssl",
62 | )
63 | args = parser.parse_args()
64 |
65 |
66 | websocket_users = set()
67 |
68 | logger.info("模型加载中,请耐心等待...")
69 | from funasr import AutoModel
70 |
71 | # asr
72 | model_asr = AutoModel(
73 | model=args.asr_model,
74 | model_revision=args.asr_model_revision,
75 | ngpu=args.ngpu,
76 | ncpu=args.ncpu,
77 | device=args.device,
78 | disable_pbar=True,
79 | disable_log=True,
80 | disable_update=True,
81 | )
82 | # asr
83 | model_asr_streaming = AutoModel(
84 | model=args.asr_model_online,
85 | model_revision=args.asr_model_online_revision,
86 | ngpu=args.ngpu,
87 | ncpu=args.ncpu,
88 | device=args.device,
89 | disable_pbar=True,
90 | disable_log=True,
91 | disable_update=True,
92 | )
93 | # vad
94 | model_vad = AutoModel(
95 | model=args.vad_model,
96 | model_revision=args.vad_model_revision,
97 | ngpu=args.ngpu,
98 | ncpu=args.ncpu,
99 | device=args.device,
100 | disable_pbar=True,
101 | disable_log=True,
102 | # chunk_size=60,
103 | disable_update=True,
104 | )
105 |
106 | if args.punc_model != "":
107 | model_punc = AutoModel(
108 | model=args.punc_model,
109 | model_revision=args.punc_model_revision,
110 | ngpu=args.ngpu,
111 | ncpu=args.ncpu,
112 | device=args.device,
113 | disable_pbar=True,
114 | disable_log=True,
115 | disable_update=True,
116 | )
117 | else:
118 | model_punc = None
119 |
120 |
121 | logger.info("模型已加载!现在只能同时支持一个客户端!!!!")
122 |
123 |
124 | async def ws_reset(websocket):
125 | logger.info("WS已重置, 总连接数 ", len(websocket_users))
126 |
127 | websocket.status_dict_asr_online["cache"] = {}
128 | websocket.status_dict_asr_online["is_final"] = True
129 | websocket.status_dict_vad["cache"] = {}
130 | websocket.status_dict_vad["is_final"] = True
131 | websocket.status_dict_punc["cache"] = {}
132 |
133 | await websocket.close()
134 |
135 |
136 | async def clear_websocket():
137 | for websocket in websocket_users:
138 | await ws_reset(websocket)
139 | websocket_users.clear()
140 |
141 |
142 | async def ws_serve(websocket, path):
143 | frames = []
144 | frames_asr = []
145 | frames_asr_online = []
146 | global websocket_users
147 | # await clear_websocket()
148 | websocket_users.add(websocket)
149 | websocket.status_dict_asr = {}
150 | websocket.status_dict_asr_online = {"cache": {}, "is_final": False}
151 | websocket.status_dict_vad = {"cache": {}, "is_final": False}
152 | websocket.status_dict_punc = {"cache": {}}
153 | websocket.chunk_interval = 10
154 | websocket.vad_pre_idx = 0
155 | speech_start = False
156 | speech_end_i = -1
157 | websocket.wav_name = "microphone"
158 | websocket.mode = "2pass"
159 | logger.info("新用户已连接")
160 |
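| # protocol: the first text frame is a JSON config; binary frames carry raw PCM audio;
| # a final {"is_speaking": false} text frame flushes the offline result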
161 | try:
162 | async for message in websocket:
163 | if isinstance(message, str):
164 | messagejson = json.loads(message)
165 |
166 | if "is_speaking" in messagejson:
167 | websocket.is_speaking = messagejson["is_speaking"]
168 | websocket.status_dict_asr_online["is_final"] = not websocket.is_speaking
169 | if "chunk_interval" in messagejson:
170 | websocket.chunk_interval = messagejson["chunk_interval"]
171 | if "wav_name" in messagejson:
172 | websocket.wav_name = messagejson.get("wav_name")
173 | if "chunk_size" in messagejson:
174 | chunk_size = messagejson["chunk_size"]
175 | if isinstance(chunk_size, str):
176 | chunk_size = chunk_size.split(",")
177 | websocket.status_dict_asr_online["chunk_size"] = [int(x) for x in chunk_size]
178 | if "encoder_chunk_look_back" in messagejson:
179 | websocket.status_dict_asr_online["encoder_chunk_look_back"] = messagejson[
180 | "encoder_chunk_look_back"
181 | ]
182 | if "decoder_chunk_look_back" in messagejson:
183 | websocket.status_dict_asr_online["decoder_chunk_look_back"] = messagejson[
184 | "decoder_chunk_look_back"
185 | ]
186 | if "hotword" in messagejson:
187 | websocket.status_dict_asr["hotword"] = messagejson["hotwords"]
188 | if "mode" in messagejson:
189 | websocket.mode = messagejson["mode"]
190 |
191 | websocket.status_dict_vad["chunk_size"] = int(
192 | websocket.status_dict_asr_online["chunk_size"][1] * 60 / websocket.chunk_interval
193 | )
194 | if len(frames_asr_online) > 0 or len(frames_asr) >= 0 or not isinstance(message, str):
195 | if not isinstance(message, str):
196 | frames.append(message)
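| # 16 kHz, 16-bit mono PCM -> 32 bytes per millisecond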
197 | duration_ms = len(message) // 32
198 | websocket.vad_pre_idx += duration_ms
199 |
200 | # asr online
201 | frames_asr_online.append(message)
202 | websocket.status_dict_asr_online["is_final"] = speech_end_i != -1
203 | if (
204 | len(frames_asr_online) % websocket.chunk_interval == 0
205 | or websocket.status_dict_asr_online["is_final"]
206 | ):
207 | if websocket.mode == "2pass" or websocket.mode == "online":
208 | audio_in = b"".join(frames_asr_online)
209 | try:
210 | await async_asr_online(websocket, audio_in)
211 | except:
212 | logger.error(f"error in asr streaming, {websocket.status_dict_asr_online}")
213 | frames_asr_online = []
214 | if speech_start:
215 | frames_asr.append(message)
216 | # vad online
217 | try:
218 | speech_start_i, speech_end_i = await async_vad(websocket, message)
219 | except:
220 | logger.error("error in vad")
221 | if speech_start_i != -1:
222 | speech_start = True
223 | beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
224 | frames_pre = frames[-beg_bias:]
225 | frames_asr = []
226 | frames_asr.extend(frames_pre)
227 | # asr punc offline
228 | if speech_end_i != -1 or not websocket.is_speaking:
229 | # logger.info("vad end point")
230 | if websocket.mode == "2pass" or websocket.mode == "offline":
231 | audio_in = b"".join(frames_asr)
232 | try:
233 | await async_asr(websocket, audio_in)
234 | except:
235 | logger.info("error in asr offline")
236 | frames_asr = []
237 | speech_start = False
238 | frames_asr_online = []
239 | websocket.status_dict_asr_online["cache"] = {}
240 | if not websocket.is_speaking:
241 | websocket.vad_pre_idx = 0
242 | frames = []
243 | websocket.status_dict_vad["cache"] = {}
244 | else:
245 | frames = frames[-20:]
246 |
247 | except websockets.ConnectionClosed:
248 | logger.info("ConnectionClosed...", websocket_users, flush=True)
249 | await ws_reset(websocket)
250 | websocket_users.remove(websocket)
251 | except websockets.InvalidState:
252 | logger.info("InvalidState...")
253 | except Exception as e:
254 | logger.info("Exception:", e)
255 |
256 |
257 | async def async_vad(websocket, audio_in):
258 |
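| # returns (speech_start, speech_end) in ms; -1 means that boundary was not detected in this chunk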
259 | segments_result = model_vad.generate(input=audio_in, **websocket.status_dict_vad)[0]["value"]
260 | # logger.info(segments_result)
261 |
262 | speech_start = -1
263 | speech_end = -1
264 |
265 | if len(segments_result) == 0 or len(segments_result) > 1:
266 | return speech_start, speech_end
267 | if segments_result[0][0] != -1:
268 | speech_start = segments_result[0][0]
269 | if segments_result[0][1] != -1:
270 | speech_end = segments_result[0][1]
271 | return speech_start, speech_end
272 |
273 |
274 | async def async_asr(websocket, audio_in):
275 | if len(audio_in) > 0:
276 | # logger.info(len(audio_in))
277 | rec_result = model_asr.generate(input=audio_in, **websocket.status_dict_asr)[0]
278 | # logger.info("offline_asr, ", rec_result)
279 | if model_punc is not None and len(rec_result["text"]) > 0:
280 | # logger.info("offline, before punc", rec_result, "cache", websocket.status_dict_punc)
281 | rec_result = model_punc.generate(
282 | input=rec_result["text"], **websocket.status_dict_punc
283 | )[0]
284 | # logger.info("offline, after punc", rec_result)
285 | if len(rec_result["text"]) > 0:
286 | # logger.info("offline", rec_result)
287 | mode = "2pass-offline" if "2pass" in websocket.mode else websocket.mode
288 | message = json.dumps(
289 | {
290 | "mode": mode,
291 | "text": rec_result["text"],
292 | "wav_name": websocket.wav_name,
293 | "is_final": websocket.is_speaking,
294 | }
295 | )
296 | await websocket.send(message)
297 |
298 | else:
299 | mode = "2pass-offline" if "2pass" in websocket.mode else websocket.mode
300 | message = json.dumps(
301 | {
302 | "mode": mode,
303 | "text": "",
304 | "wav_name": websocket.wav_name,
305 | "is_final": websocket.is_speaking,
306 | }
307 | )
308 | await websocket.send(message)
309 |
310 | async def async_asr_online(websocket, audio_in):
311 | if len(audio_in) > 0:
312 | # logger.info(websocket.status_dict_asr_online.get("is_final", False))
313 | rec_result = model_asr_streaming.generate(
314 | input=audio_in, **websocket.status_dict_asr_online
315 | )[0]
316 | # logger.info("online, ", rec_result)
317 | if websocket.mode == "2pass" and websocket.status_dict_asr_online.get("is_final", False):
318 | return
319 | # websocket.status_dict_asr_online["cache"] = dict()
320 | if len(rec_result["text"]):
321 | mode = "2pass-online" if "2pass" in websocket.mode else websocket.mode
322 | message = json.dumps(
323 | {
324 | "mode": mode,
325 | "text": rec_result["text"],
326 | "wav_name": websocket.wav_name,
327 | "is_final": websocket.is_speaking,
328 | }
329 | )
330 | await websocket.send(message)
331 |
332 |
333 | if len(args.certfile) > 0:
334 | ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
335 |
336 | # Generate with Lets Encrypt, copied to this location, chown to current user and 400 permissions
337 | ssl_cert = args.certfile
338 | ssl_key = args.keyfile
339 |
340 | ssl_context.load_cert_chain(ssl_cert, keyfile=ssl_key)
341 | start_server = websockets.serve(
342 | ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None, ssl=ssl_context
343 | )
344 | else:
345 | start_server = websockets.serve(
346 | ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None
347 | )
348 | asyncio.get_event_loop().run_until_complete(start_server)
349 | asyncio.get_event_loop().run_forever()
350 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi
2 | loguru
3 | uvicorn
4 | websockets==12.0
5 | pydub
6 | onnxruntime
7 | onnx==1.15.0
8 | FunASR==1.1.16
--------------------------------------------------------------------------------
/requirements_client.txt:
--------------------------------------------------------------------------------
1 | websockets
2 | pyaudio
3 |
--------------------------------------------------------------------------------
/requirements_server.txt:
--------------------------------------------------------------------------------
1 | websockets==12.0
2 | pydub
3 | onnxruntime
4 | onnx==1.15.0
5 | FunASR==1.1.16
--------------------------------------------------------------------------------
/web/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | 语音识别
8 |
127 |
128 |
129 |
130 |
131 | 语音识别
132 |
143 |
144 |
157 |
158 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
194 |
195 |
196 |
197 |
200 |
201 |
202 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 | 请点击开始
227 |
228 |
229 |
230 |
231 |
232 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
--------------------------------------------------------------------------------
/web/main.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
3 | * Reserved. MIT License (https://opensource.org/licenses/MIT)
4 | */
5 | /* 2022-2023 by zhaoming,mali aihealthx.com */
6 |
7 |
8 | // Connection: define the socket connection object and the audio object
9 | var wsconnecter = new WebSocketConnectMethod({ msgHandle: getJsonMessage, stateHandle: getConnState });
10 | var audioBlob;
11 |
12 | // Recording: define the recorder object (pcm format)
13 | var rec = Recorder({
14 | type: "pcm",
15 | bitRate: 16,
16 | sampleRate: 16000,
17 | onProcess: recProcess
18 | });
19 |
20 |
21 | var sampleBuf = new Int16Array();
22 | // button event handlers
23 | var btnStart = document.getElementById('btnStart');
24 | btnStart.onclick = record;
25 | var btnStop = document.getElementById('btnStop');
26 | btnStop.onclick = stop;
27 | btnStop.disabled = true;
28 | btnStart.disabled = true;
29 |
30 | btnConnect = document.getElementById('btnConnect');
31 | btnConnect.onclick = start;
32 |
33 | var awsslink = document.getElementById('wsslink');
34 |
35 | var rec_text = ""; // for online rec asr result
36 | var offline_text = ""; // for offline rec asr result
37 | var info_div = document.getElementById('info_div');
38 |
39 | var upfile = document.getElementById('upfile');
40 |
41 | var isfilemode = false; // if it is in file mode
42 | var file_ext = "";
43 | var file_sample_rate = 16000; //for wav file sample rate
44 | var file_data_array; // array to save file data
45 |
46 | var totalsend = 0;
47 |
48 | // data forwarding mode
49 | var data_forward = "none";
50 |
51 |
52 | // build the url
53 | function buildUrl(baseUrl, endpoint) {
54 | // create a new URL object and set its pathname to endpoint
55 | const url = new URL(baseUrl);
56 | url.pathname = new URL(endpoint, 'http://dummy.com').pathname;
57 |
58 | return url.toString();
59 | }
60 |
61 | // var now_ipaddress=window.location.href;
62 | // now_ipaddress=now_ipaddress.replace("https://","wss://");
63 | // now_ipaddress=now_ipaddress.replace("static/index.html","");
64 | // var localport=window.location.port;
65 | // now_ipaddress=now_ipaddress.replace(localport,"10095");
66 | // document.getElementById('wssip').value=now_ipaddress;
67 | addresschange();
68 | function addresschange() {
69 |
70 | var Uri = document.getElementById('wssip').value;
71 | // document.getElementById('info_wslink').innerHTML = "点此处手工授权(IOS手机)";
72 | Uri = Uri.replace(/wss/g, "https");
73 | console.log("addresschange uri=", Uri);
74 |
75 | awsslink.onclick = function () {
76 | window.open(Uri, '_blank');
77 | }
78 |
79 | }
80 |
81 | upfile.onclick = function () {
82 | btnStart.disabled = true;
83 | btnStop.disabled = true;
84 | btnConnect.disabled = false;
85 |
86 | }
87 |
88 | // from https://github.com/xiangyuecn/Recorder/tree/master
89 | var readWavInfo = function (bytes) {
90 | //parse the wav file header, normalized to a 44-byte head
91 | if (bytes.byteLength < 44) {
92 | return null;
93 | };
94 | var wavView = bytes;
95 | var eq = function (p, s) {
96 | for (var i = 0; i < s.length; i++) {
97 | if (wavView[p + i] != s.charCodeAt(i)) {
98 | return false;
99 | };
100 | };
101 | return true;
102 | };
103 |
104 | if (eq(0, "RIFF") && eq(8, "WAVEfmt ")) {
105 |
106 | var numCh = wavView[22];
107 | if (wavView[20] == 1 && (numCh == 1 || numCh == 2)) {//raw pcm, mono or stereo
108 | var sampleRate = wavView[24] + (wavView[25] << 8) + (wavView[26] << 16) + (wavView[27] << 24);
109 | var bitRate = wavView[34] + (wavView[35] << 8);
110 | var heads = [wavView.subarray(0, 12)], headSize = 12;//keep only the required chunks in the head
111 | //locate the data chunk
112 | var dataPos = 0; // 44, or later if there are extra chunks
113 | for (var i = 12, iL = wavView.length - 8; i < iL;) {
114 | if (wavView[i] == 100 && wavView[i + 1] == 97 && wavView[i + 2] == 116 && wavView[i + 3] == 97) {//eq(i,"data")
115 | heads.push(wavView.subarray(i, i + 8));
116 | headSize += 8;
117 | dataPos = i + 8; break;
118 | }
119 | var i0 = i;
120 | i += 4;
121 | i += 4 + wavView[i] + (wavView[i + 1] << 8) + (wavView[i + 2] << 16) + (wavView[i + 3] << 24);
122 | if (i0 == 12) {//fmt
123 | heads.push(wavView.subarray(i0, i));
124 | headSize += i - i0;
125 | }
126 | }
127 | if (dataPos) {
128 | var wavHead = new Uint8Array(headSize);
129 | for (var i = 0, n = 0; i < heads.length; i++) {
130 | wavHead.set(heads[i], n); n += heads[i].length;
131 | }
132 | return {
133 | sampleRate: sampleRate
134 | , bitRate: bitRate
135 | , numChannels: numCh
136 | , wavHead44: wavHead
137 | , dataPos: dataPos
138 | };
139 | };
140 | };
141 | };
142 | return null;
143 | };
144 |
145 | upfile.onchange = function () {
146 | var len = this.files.length;
147 | for (let i = 0; i < len; i++) {
148 |
149 | let fileAudio = new FileReader();
150 | fileAudio.readAsArrayBuffer(this.files[i]);
151 |
152 | file_ext = this.files[i].name.split('.').pop().toLowerCase();
153 | var audioblob;
154 | fileAudio.onload = function () {
155 | audioblob = fileAudio.result;
156 | file_data_array = audioblob;
157 | info_div.innerHTML = '请点击连接进行识别';
158 | }
159 |
160 | fileAudio.onerror = function (e) {
161 | console.log('error' + e);
162 | }
163 | }
164 | // for wav file, we get the sample rate
165 | if (file_ext == "wav")
166 | for (let i = 0; i < len; i++) {
167 |
168 | let fileAudio = new FileReader();
169 | fileAudio.readAsArrayBuffer(this.files[i]);
170 | fileAudio.onload = function () {
171 | audioblob = new Uint8Array(fileAudio.result);
172 |
173 | // for wav file, we can get the sample rate
174 | var info = readWavInfo(audioblob);
175 | console.log(info);
176 | file_sample_rate = info.sampleRate;
177 | }
178 | }
179 | }
180 |
181 | function play_file() {
182 | var audioblob = new Blob([new Uint8Array(file_data_array)], { type: "audio/wav" });
183 | var audio_record = document.getElementById('audio_record');
184 | audio_record.src = (window.URL || webkitURL).createObjectURL(audioblob);
185 | audio_record.controls = true;
186 | //audio_record.play(); //not auto play
187 | }
188 | function start_file_send() {
189 | sampleBuf = new Uint8Array(file_data_array);
190 |
191 | var chunk_size = 960; // for asr chunk_size [5, 10, 5]
192 |
193 | while (sampleBuf.length >= chunk_size) {
194 |
195 | sendBuf = sampleBuf.slice(0, chunk_size);
196 | totalsend = totalsend + sendBuf.length;
197 | sampleBuf = sampleBuf.slice(chunk_size, sampleBuf.length);
198 | wsconnecter.wsSend(sendBuf);
199 | }
200 |
201 | stop();
202 | }
203 |
204 | // change the data forwarding mode
205 | function on_data_forward_change() {
206 | var item = null;
207 | var obj = document.getElementsByName("data_forward");
208 | for (var i = 0; i < obj.length; i++) { // iterate over the radio buttons
209 | if (obj[i].checked) {
210 | item = obj[i].value;
211 | break;
212 | }
213 | }
214 |
215 | data_forward = item;
216 | }
217 |
218 | function on_recoder_mode_change() {
219 | var item = null;
220 | var obj = document.getElementsByName("recoder_mode");
221 | for (var i = 0; i < obj.length; i++) { // iterate over the radio buttons
222 | if (obj[i].checked) {
223 | item = obj[i].value;
224 | break;
225 | }
226 | }
227 | if (item == "mic") {
228 | document.getElementById("mic_mode_div").style.display = 'block';
229 | document.getElementById("rec_mode_div").style.display = 'none';
230 |
231 | btnStart.disabled = true;
232 | btnStop.disabled = true;
233 | btnConnect.disabled = false;
234 | isfilemode = false;
235 | } else {
236 | document.getElementById("mic_mode_div").style.display = 'none';
237 | document.getElementById("rec_mode_div").style.display = 'block';
238 |
239 | btnStart.disabled = true;
240 | btnStop.disabled = true;
241 | btnConnect.disabled = true;
242 | isfilemode = true;
243 | info_div.innerHTML = '请点击选择文件';
244 | }
245 | }
246 |
247 |
248 | function getHotwords() {
249 | var obj = document.getElementById("varHot");
250 |
251 | if (typeof (obj) == 'undefined' || obj == null || obj.value.length <= 0) {
252 | return null;
253 | }
254 | let val = obj.value.toString();
255 |
256 | console.log("hotwords=" + val);
257 | let items = val.split(/[(\r\n)\r\n]+/); //split by \r\n
258 | var jsonresult = {};
259 | const regexNum = /^[0-9]*$/; // test number
260 | for (item of items) {
261 |
262 | let result = item.split(" ");
263 | if (result.length >= 2 && regexNum.test(result[result.length - 1])) {
264 | var wordstr = "";
265 | for (var i = 0; i < result.length - 1; i++)
266 | wordstr = wordstr + result[i] + " ";
267 |
268 | jsonresult[wordstr.trim()] = parseInt(result[result.length - 1]);
269 | }
270 | }
271 | console.log("jsonresult=" + JSON.stringify(jsonresult));
272 | return JSON.stringify(jsonresult);
273 |
274 | }
275 | function getAsrMode() {
276 |
277 | var item = null;
278 | var obj = document.getElementsByName("asr_mode");
279 | for (var i = 0; i < obj.length; i++) { // iterate over the radio buttons
280 | if (obj[i].checked) {
281 | item = obj[i].value;
282 | break;
283 | }
284 | }
285 | if (isfilemode) {
286 | item = "offline";
287 | }
288 | console.log("asr mode" + item);
289 |
290 | return item;
291 | }
292 |
293 | function handleWithTimestamp(tmptext, tmptime) {
294 | console.log("tmptext: " + tmptext);
295 | console.log("tmptime: " + tmptime);
296 | if (tmptime == null || tmptime == "undefined" || tmptext.length <= 0) {
297 | return tmptext;
298 | }
299 | tmptext = tmptext.replace(/。|?|,|、|\?|\.|\ /g, ","); // in case there are a lot of "。"
300 | var words = tmptext.split(","); // split to chinese sentence or english words
301 | var jsontime = JSON.parse(tmptime); //JSON.parse(tmptime.replace(/\]\]\[\[/g, "],[")); // in case there are a lot segments by VAD
302 | var char_index = 0; // index for timestamp
303 | var text_withtime = "";
304 | for (var i = 0; i < words.length; i++) {
305 | if (words[i] == "undefined" || words[i].length <= 0) {
306 | continue;
307 | }
308 | console.log("words===", words[i]);
309 | console.log("words: " + words[i] + ",time=" + jsontime[char_index][0] / 1000);
310 | if (/^[a-zA-Z]+$/.test(words[i])) { // if it is english
311 | text_withtime = text_withtime + jsontime[char_index][0] / 1000 + ":" + words[i] + "\n";
312 | char_index = char_index + 1; //for english, timestamp unit is about a word
313 | }
314 | else {
315 | // if it is chinese
316 | text_withtime = text_withtime + jsontime[char_index][0] / 1000 + ":" + words[i] + "\n";
317 | char_index = char_index + words[i].length; //for chinese, timestamp unit is about a char
318 | }
319 | }
320 | return text_withtime;
321 | }
322 |
323 | const sleep = (delay) => new Promise((resolve) => setTimeout(resolve, delay))
324 | async function is_speaking() {
325 | try {
326 | if (data_forward == "livetalking") {
327 | const response = await fetch(buildUrl(document.getElementById("livetalking_api_url").value, '/is_speaking'), {
328 | body: JSON.stringify({
329 | sessionid: 0,
330 | }),
331 | headers: {
332 | 'Content-Type': 'application/json'
333 | },
334 | method: 'POST'
335 | });
336 | const data = await response.json();
337 | console.log('is_speaking res:', data)
338 | return data.data
339 | } else if (data_forward == "ai_vtuber") {
340 | const response = await fetch(buildUrl(document.getElementById("ai_vtuber_api_url").value, '/get_sys_info'), {
341 | headers: {
342 | 'Content-Type': 'application/json'
343 | },
344 | method: 'GET'
345 | });
346 | const data = await response.json();
347 | console.log('is_speaking res:', data)
348 |
349 | // if both the pending-playback and pending-synthesis counts are 0, the avatar is not speaking
350 | if (data["data"]["audio"]["wait_play_audio_num"] == 0 && data["data"]["audio"]["wait_synthesis_msg_num"] == 0 &&
351 | data["data"]["metahuman-stream"]["wait_play_audio_num"] == 0 && data["data"]["metahuman-stream"]["wait_synthesis_msg_num"] == 0
352 | ) {
353 | return false;
354 | } else {
355 | return true;
356 | }
357 | }
358 |
359 | return false
360 | } catch (error) {
361 | console.error('is_speaking error:', error)
362 | return false
363 | }
364 | }
365 |
366 | async function waitSpeakingEnd() {
367 | if (data_forward == "none") {
368 | return
369 | } else if (data_forward == "livetalking" || data_forward == "ai_vtuber") {
370 | rec.stop() // stop recording
371 | for (let i = 0; i < 10; i++) { // wait up to 10 s for the avatar to start speaking
372 | bspeak = await is_speaking()
373 | if (bspeak) {
374 | break
375 | }
376 | await sleep(1000)
377 | }
378 |
379 | while (true) { // wait until the avatar finishes speaking
380 | bspeak = await is_speaking()
381 | if (!bspeak) {
382 | break
383 | }
384 | await sleep(1000)
385 | }
386 | await sleep(2000)
387 | rec.start()
388 | }
389 | }
390 | // ASR result: parse jsonMsg and append the recognized text to the textarea
391 | function getJsonMessage(jsonMsg) {
392 | //console.log(jsonMsg);
393 | // console.log("message: " + JSON.parse(jsonMsg.data)['text']);
394 | var rectxt = "" + JSON.parse(jsonMsg.data)['text'];
395 | var asrmodel = JSON.parse(jsonMsg.data)['mode'];
396 | var is_final = JSON.parse(jsonMsg.data)['is_final'];
397 | var timestamp = JSON.parse(jsonMsg.data)['timestamp'];
398 | if (asrmodel == "2pass-offline" || asrmodel == "offline") {
399 | // strip tags and special characters
400 | rectxt = rectxt.replace(/<[^>]*>/g, '');
401 |
402 | offline_text = offline_text + rectxt.replace(/ +/g, "") + '\n'; //handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,"");
403 | rec_text = offline_text;
404 |
405 | if (data_forward == "livetalking") {
406 | fetch(buildUrl(document.getElementById("livetalking_api_url").value, '/human'), {
407 | body: JSON.stringify({
408 | text: rectxt.replace(/ +/g, ""),
409 | type: 'chat',
410 | }),
411 | headers: {
412 | 'Content-Type': 'application/json'
413 | },
414 | method: 'POST'
415 | });
416 | } else if (data_forward == "ai_vtuber") {
417 | fetch(buildUrl(document.getElementById("ai_vtuber_api_url").value, '/send'), {
418 | body: JSON.stringify({
419 | type: 'comment',
420 | data: {
421 | "type": 'comment',
422 | "username": '主人',
423 | "content": rectxt.replace(/ +/g, ""),
424 | }
425 | }),
426 | headers: {
427 | 'Content-Type': 'application/json'
428 | },
429 | method: 'POST'
430 | });
431 | }
432 |
433 | waitSpeakingEnd();
434 | }
435 | else {
436 | rec_text = rec_text + rectxt; //.replace(/ +/g,"");
437 | }
438 | var varArea = document.getElementById('varArea');
439 |
440 | // strip tags and special characters
441 | rec_text = rec_text.replace(/<[^>]*>/g, '');
442 | varArea.value = rec_text;
443 | // console.log("offline_text: " + asrmodel + "," + offline_text);
444 | // console.log("rec_text: " + rec_text);
445 | if (isfilemode == true && is_final == true) {
446 | console.log("call stop ws!");
447 | play_file();
448 | wsconnecter.wsStop();
449 |
450 | info_div.innerHTML = "请点击连接";
451 |
452 | btnStart.disabled = true;
453 | btnStop.disabled = true;
454 | btnConnect.disabled = false;
455 | }
456 | }
457 |
458 | // connection state callback
459 | function getConnState(connState) {
460 | if (connState === 0) { //on open
461 |
462 |
463 | info_div.innerHTML = '连接成功!请点击开始';
464 | if (isfilemode == true) {
465 | info_div.innerHTML = '请耐心等待,大文件等待时间更长';
466 | start_file_send();
467 | }
468 | else {
469 | btnStart.disabled = false;
470 | btnStop.disabled = true;
471 | btnConnect.disabled = true;
472 | }
473 | } else if (connState === 1) {
474 | //stop();
475 | } else if (connState === 2) {
476 | stop();
477 | console.log('connection error');
478 |
479 | alert("连接地址" + document.getElementById('wssip').value + "失败,请检查asr地址和端口。或试试界面上手动授权,再连接。");
480 | btnStart.disabled = true;
481 | btnStop.disabled = true;
482 | btnConnect.disabled = false;
483 |
484 | info_div.innerHTML = '请点击连接';
485 | }
486 | }
487 |
488 | function record() {
489 | rec.open(function () {
490 | rec.start();
491 | console.log("开始");
492 | btnStart.disabled = true;
493 | btnStop.disabled = false;
494 | btnConnect.disabled = true;
495 | });
496 | }
497 |
498 | // start, stop and clear actions
499 | function start() {
500 | // clear the display
501 | clear();
502 | // update control states
503 | console.log("isfilemode " + isfilemode);
504 | 
505 | // open the connection
506 | var ret = wsconnecter.wsStart();
507 | // 1 is ok, 0 is error
508 | if (ret == 1) {
509 | info_div.innerHTML = "正在连接asr服务器,请等待...";
510 | isRec = true;
511 | btnStart.disabled = true;
512 | btnStop.disabled = true;
513 | btnConnect.disabled = true;
514 |
515 | return 1;
516 | }
517 | else {
518 | info_div.innerHTML = "请点击开始";
519 | btnStart.disabled = true;
520 | btnStop.disabled = true;
521 | btnConnect.disabled = false;
522 |
523 | return 0;
524 | }
525 | }
526 |
527 |
528 | function stop() {
529 | var chunk_size = new Array(5, 10, 5);
530 | var request = {
531 | "chunk_size": chunk_size,
532 | "wav_name": "h5",
533 | "is_speaking": false,
534 | "chunk_interval": 10,
535 | "mode": getAsrMode(),
536 | "url": document.getElementById('audio_record').src,
537 | };
538 | console.log(request);
539 | if (sampleBuf.length > 0) {
540 | wsconnecter.wsSend(sampleBuf);
541 | console.log("sampleBuf.length" + sampleBuf.length);
542 | sampleBuf = new Int16Array();
543 | }
544 | wsconnecter.wsSend(JSON.stringify(request));
545 |
546 | // update control states
547 |
548 | isRec = false;
549 | info_div.innerHTML = "发送完数据,请等候,正在识别...";
550 |
551 | if (isfilemode == false) {
552 | btnStop.disabled = true;
553 | btnStart.disabled = true;
554 | btnConnect.disabled = true;
555 | //wait 3s for asr result
556 | setTimeout(function () {
557 | console.log("call stop ws!");
558 | wsconnecter.wsStop();
559 | btnConnect.disabled = false;
560 | info_div.innerHTML = "请点击连接";
561 | }, 3000);
562 |
563 | rec.stop(function (blob, duration) {
564 | console.log(blob);
565 | var audioBlob = Recorder.pcm2wav({ sampleRate: 16000, bitRate: 16, blob: blob },
566 | function (theblob, duration) {
567 | console.log(theblob);
568 | var audio_record = document.getElementById('audio_record');
569 | audio_record.src = (window.URL || webkitURL).createObjectURL(theblob);
570 | audio_record.controls = true;
571 | //audio_record.play();
572 | }, function (msg) {
573 | console.log(msg);
574 | }
575 | );
576 | }, function (errMsg) {
577 | console.log("errMsg: " + errMsg);
578 | });
579 | }
580 | // 停止连接
581 | }
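// The {"is_speaking": false} frame built above is the end-of-utterance marker
// that makes the server flush a final result. Its mirror image is the opening
// handshake that wsconnecter.wsStart() sends before any audio; a sketch of
// that first JSON frame, assuming the FunASR 2pass protocol (wsconnecter.js
// is the authoritative source):
//
// {
//   "chunk_size": [5, 10, 5],   // lookback/current/lookahead frames
//   "chunk_interval": 10,
//   "mode": getAsrMode(),       // "2pass" | "online" | "offline"
//   "wav_name": "h5",
//   "is_speaking": true,
//   "itn": getUseITN()
// }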
582 |
583 | function clear() {
584 | var varArea = document.getElementById('varArea');
585 |
586 | varArea.value = "";
587 | rec_text = "";
588 | offline_text = "";
589 | }
590 |
591 | function recProcess(buffer, powerLevel, bufferDuration, bufferSampleRate, newBufferIdx, asyncEnd) {
592 | if (isRec === true) {
593 | var data_48k = buffer[buffer.length - 1];
594 |
595 | var array_48k = new Array(data_48k);
596 | var data_16k = Recorder.SampleData(array_48k, bufferSampleRate, 16000).data;
597 |
598 | sampleBuf = Int16Array.from([...sampleBuf, ...data_16k]);
599 | var chunk_size = 960; // for asr chunk_size [5, 10, 5]
600 | info_div.innerHTML = "" + bufferDuration / 1000 + "s";
601 | while (sampleBuf.length >= chunk_size) {
602 | var sendBuf = sampleBuf.slice(0, chunk_size);
603 | sampleBuf = sampleBuf.slice(chunk_size, sampleBuf.length);
604 | wsconnecter.wsSend(sendBuf);
605 | }
606 | }
607 | }
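// Why chunk_size is 960 samples: each WebSocket frame carries 60 ms of
// 16 kHz mono Int16 PCM (16000 samples/s * 0.06 s = 960 samples, i.e.
// 1920 bytes), which is the frame duration the "chunk_size [5, 10, 5]"
// comment above refers to.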
608 |
609 | function getUseITN() {
610 | var obj = document.getElementsByName("use_itn");
611 | for (var i = 0; i < obj.length; i++) {
612 | if (obj[i].checked) {
613 | return obj[i].value === "true";
614 | }
615 | }
616 | return false;
617 | }
618 |
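Taken together, start(), recProcess(), and stop() define the whole wire
protocol: one JSON config frame, a stream of 1920-byte Int16 PCM frames, then
{"is_speaking": false}. A minimal Node.js sketch of the same exchange
(hypothetical script, not part of this repo; assumes the `ws` npm package, a
FunASR server on port 10096, and a raw 16 kHz mono 16-bit PCM file; use
"ws://" instead when the server was started without a certificate):

const WebSocket = require("ws");
const fs = require("fs");

const sock = new WebSocket("wss://127.0.0.1:10096/", { rejectUnauthorized: false });

sock.on("open", () => {
  // 1) JSON handshake, mirroring the fields main.js sends
  sock.send(JSON.stringify({
    chunk_size: [5, 10, 5], chunk_interval: 10,
    mode: "2pass", wav_name: "node", is_speaking: true, itn: true,
  }));
  // 2) binary audio: 960-sample (1920-byte) chunks, like recProcess()
  const pcm = fs.readFileSync("sample-16k.pcm"); // hypothetical input file
  for (let off = 0; off < pcm.length; off += 1920) {
    sock.send(pcm.subarray(off, off + 1920));
  }
  // 3) end-of-utterance marker, like stop()
  sock.send(JSON.stringify({ is_speaking: false }));
});

sock.on("message", (data) => {
  const msg = JSON.parse(data.toString());
  console.log(msg.mode, msg.text, msg.is_final);
  if (msg.is_final) sock.close(); // final (offline/2pass) result ends the session
});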
--------------------------------------------------------------------------------
/web/pcm.js:
--------------------------------------------------------------------------------
1 | /*
2 | pcm编码器+编码引擎
3 | https://github.com/xiangyuecn/Recorder
4 |
5 | 编码原理:本编码器输出的pcm格式数据其实就是Recorder中的buffers原始数据(经过了重新采样),16位时为LE小端模式(Little Endian),并未经过任何编码处理
6 |
7 | 编码的代码和wav.js区别不大,pcm加上一个44字节wav头即成wav文件;所以要播放pcm就很简单了,直接转成wav文件来播放,已提供转换函数 Recorder.pcm2wav
8 | */
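// As the header says, a playable wav is just this pcm plus a 44-byte RIFF
// header (which is what Recorder.pcm2wav relies on). A minimal sketch of that
// header for 16-bit mono PCM (hypothetical helper, not part of this file):
//
// function wavHeader(sampleRate, pcmByteLen) {
//   var buf = new ArrayBuffer(44), v = new DataView(buf);
//   var str = function (off, s) {
//     for (var i = 0; i < s.length; i++) v.setUint8(off + i, s.charCodeAt(i));
//   };
//   str(0, "RIFF"); v.setUint32(4, 36 + pcmByteLen, true); str(8, "WAVE");
//   str(12, "fmt "); v.setUint32(16, 16, true);
//   v.setUint16(20, 1, true);                 // PCM format tag
//   v.setUint16(22, 1, true);                 // mono
//   v.setUint32(24, sampleRate, true);
//   v.setUint32(28, sampleRate * 2, true);    // byte rate = sr * channels * 16/8
//   v.setUint16(32, 2, true);                 // block align
//   v.setUint16(34, 16, true);                // bits per sample
//   str(36, "data"); v.setUint32(40, pcmByteLen, true);
//   return buf;
// }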
9 | (function(){
10 | "use strict";
11 |
12 | Recorder.prototype.enc_pcm={
13 | stable:true
14 | ,testmsg:"pcm为未封装的原始音频数据,pcm数据文件无法直接播放;支持位数8位、16位(填在比特率里面),采样率取值无限制"
15 | };
16 | Recorder.prototype.pcm=function(res,True,False){
17 | var This=this,set=This.set
18 | ,size=res.length
19 | ,bitRate=set.bitRate==8?8:16;
20 |
21 | var buffer=new ArrayBuffer(size*(bitRate/8));
22 | var data=new DataView(buffer);
23 | var offset=0;
24 |
25 | // 写入采样数据
26 | if(bitRate==8) {
27 | for(var i=0;i<size;i++,offset++) {
28 | //16位样本转8位:取高8位并加128偏置
29 | var val=(res[i]>>8)+128;
30 | data.setInt8(offset,val,true);
31 | };
32 | }else{
33 | for (var i=0;i<size;i++,offset+=2){
34 | data.setInt16(offset,res[i],true);
35 | };
36 | };
37 |
38 | True(new Blob([data.buffer],{type:"audio/pcm"}));
39 | };
[... the rest of pcm.js, including the Recorder.pcm2wav helper that main.js calls, was lost in extraction ...]
--------------------------------------------------------------------------------
/web/recorder-core.js:
--------------------------------------------------------------------------------
[... lines 1-422 were lost in extraction; the text resumes inside the doc comment of Recorder.SampleData ...]
423 | newSampleRate:16000 新采样率,newSampleRate>=pcmSampleRate时不会进行任何处理,小于时会进行重新采样
424 | prevChunkInfo:{} 可选,上次调用时的返回值,用于连续转换,本次调用将从上次结束位置开始进行处理。或可自行定义一个ChunkInfo从pcmDatas指定的位置开始进行转换
425 | option:{ 可选,配置项
426 | frameSize:123456 帧大小,每帧的PCM Int16的数量,采样率转换后的pcm长度为frameSize的整数倍,用于连续转换。目前仅在mp3格式时才有用,frameSize取值为1152,这样编码出来的mp3时长和pcm的时长完全一致,否则会因为mp3最后一帧录音不够填满时添加填充数据导致mp3的时长变长。
427 | frameType:"" 帧类型,一般为rec.set.type,提供此参数时无需提供frameSize,会自动使用最佳的值给frameSize赋值,目前仅支持mp3=1152(MPEG1 Layer3的每帧采采样数),其他类型=1。
428 | 以上两个参数用于连续转换时使用,最多使用一个,不提供时不进行帧的特殊处理,提供时必须同时提供prevChunkInfo才有作用。最后一段数据处理时无需提供帧大小以便输出最后一丁点残留数据。
429 | }
430 |
431 | 返回ChunkInfo:{
432 | //可定义,从指定位置开始转换到结尾
433 | index:0 pcmDatas已处理到的索引
434 | offset:0.0 已处理到的index对应的pcm中的偏移的下一个位置
435 |
436 | //仅作为返回值
437 | frameNext:null||[Int16,...] 下一帧的部分数据,frameSize设置了的时候才可能会有
438 | sampleRate:16000 结果的采样率,<=newSampleRate
439 | data:[Int16,...] 转换后的PCM结果;如果是连续转换,并且pcmDatas中并没有新数据时,data的长度可能为0
440 | }
441 | */
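// Usage sketch of the continuous-conversion contract documented above: feed
// each call's return value back in as prevChunkInfo and every call resumes
// where the previous one stopped (the same threading engineCtx.chunkInfo
// gets in envIn further down; consume() is a hypothetical sink):
//
// var chunkInfo = null;
// function onNewBuffers(buffers, srcSampleRate) {
//   chunkInfo = Recorder.SampleData(buffers, srcSampleRate, 16000, chunkInfo);
//   consume(chunkInfo.data); // Int16Array at 16 kHz, new samples only
// }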
442 | Recorder.SampleData=function(pcmDatas,pcmSampleRate,newSampleRate,prevChunkInfo,option){
443 | prevChunkInfo||(prevChunkInfo={});
444 | var index=prevChunkInfo.index||0;
445 | var offset=prevChunkInfo.offset||0;
446 |
447 | var frameNext=prevChunkInfo.frameNext||[];
448 | option||(option={});
449 | var frameSize=option.frameSize||1;
450 | if(option.frameType){
451 | frameSize=option.frameType=="mp3"?1152:1;
452 | };
453 |
454 | var nLen=pcmDatas.length;
455 | if(index>nLen+1){
456 | CLog("SampleData似乎传入了未重置chunk "+index+">"+nLen,3);
457 | };
458 | var size=0;
459 | for(var i=index;i<nLen;i++){
460 | size+=pcmDatas[i].length;
461 | };
462 | size-=Math.floor(offset);
463 |
464 | //计算抽样步长
465 | var step=pcmSampleRate/newSampleRate;
466 | if(step>1){//新采样低于录音采样,进行抽样
467 | size=Math.floor(size/step);
468 | }else{//新采样高于录音采样不处理,省去了插值处理
469 | step=1;
470 | newSampleRate=pcmSampleRate;
471 | };
472 |
473 | size+=frameNext.length;
474 | var res=new Int16Array(size);
475 | var idx=0;
476 | //添加上一次不够一帧的剩余数据
477 | for(var i=0;i<frameNext.length;i++){
478 | res[idx]=frameNext[i];
479 | idx++;
480 | };
481 |
482 | //处理数据:按step抽样,线性插值
483 | for(;index<nLen;index++){
484 | var o=pcmDatas[index];
485 | var i=offset,il=o.length;
486 | while(i<il){
487 | //当前点=前一个点+到后一个点的增量,音质比直接抽样好
488 | var before=Math.floor(i);
489 | var after=Math.ceil(i);
490 | var atPoint=i-before;
491 |
492 | var beforeVal=o[before];
493 | var afterVal=after<il ? o[after]
494 | : ( //后个点越界,取下一段的首个采样,没有就重复本点
495 | (pcmDatas[index+1]||[beforeVal])[0]||0
496 | );
497 | res[idx]=beforeVal+(afterVal-beforeVal)*atPoint;
498 |
499 | idx++;
500 | i+=step;//抽样
501 | };
502 | offset=i-il;
503 | };
504 |
505 |
506 | //帧处理:取出不足一帧的尾部数据,留待下次拼接
507 | frameNext=null;
508 | var frameNextSize=res.length%frameSize;
509 | if(frameNextSize>0){
510 | var u8Pos=(res.length-frameNextSize)*2;
511 | frameNext=new Int16Array(res.buffer.slice(u8Pos));
512 | res=new Int16Array(res.buffer.slice(0,u8Pos));
513 | };
514 |
515 | return {
516 | index:index
517 | ,offset:offset
518 |
519 | ,frameNext:frameNext
520 | ,sampleRate:newSampleRate
521 | ,data:res
522 | };
523 | };
524 |
525 |
526 | /*计算音量百分比的一个方法
527 | pcmAbsSum: pcm Int16所有采样的绝对值的和
528 | pcmLength: pcm长度
529 | 返回值:0-100,主要当做百分比用
530 | 注意:这个不是分贝,因此没用volume当做名称*/
531 | Recorder.PowerLevel=function(pcmAbsSum,pcmLength){
532 | /*计算音量 https://blog.csdn.net/jody1989/article/details/73480259
533 | 更高灵敏度算法:
534 | 限定最大感应值10000
535 | 线性曲线:低音量不友好
536 | power/10000*100
537 | 对数曲线:低音量友好,但需限定最低感应值
538 | (1+Math.log10(power/10000))*100
539 | */
540 | var power=(pcmAbsSum/pcmLength) || 0;//NaN
541 | var level;
542 | if(power<1251){//1250的结果10%,更小的音量采用线性取值
543 | level=Math.round(power/1250*10);
544 | }else{
545 | level=Math.round(Math.min(100,Math.max(0,(1+Math.log(power/10000)/Math.log(10))*100)));
546 | };
547 | return level;
548 | };
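// Worked examples of the two branches above:
//   power = 625   -> linear branch: Math.round(625/1250*10)      = 5
//   power = 1250  -> linear branch: 10 (where both curves meet)
//   power = 10000 -> log branch: (1+Math.log10(10000/10000))*100 = 100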
549 |
550 | /*计算音量,单位dBFS(满刻度相对电平)
551 | maxSample: 为16位pcm采样的绝对值中最大的一个(计算峰值音量),或者为pcm中所有采样的绝对值的平均值
552 | 返回值:-100~0 (最大值0dB,最小值-100代替-∞)
553 | */
554 | Recorder.PowerDBFS=function(maxSample){
555 | var val=Math.max(0.1, maxSample||0),Pref=0x7FFF;
556 | val=Math.min(val,Pref);
557 | //https://www.logiclocmusic.com/can-you-tell-the-decibel/
558 | //https://blog.csdn.net/qq_17256689/article/details/120442510
559 | val=20*Math.log(val/Pref)/Math.log(10);
560 | return Math.max(-100,Math.round(val));
561 | };
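// Worked examples: maxSample = 0x7FFF -> 20*log10(1)     =   0 dBFS;
// maxSample = 3277 (~10% of full scale) -> 20*log10(0.1) = -20 dBFS;
// maxSample = 0 -> clamped to 0.1 -> about -110, floored to -100.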
562 |
563 |
564 |
565 |
566 | //带时间的日志输出,可设为一个空函数来屏蔽日志输出
567 | //CLog(msg,errOrLogMsg, logMsg...) err为数字时代表日志类型1:error 2:log默认 3:warn,否则当做内容输出,第一个参数不能是对象因为要拼接时间,后面可以接无数个输出参数
568 | Recorder.CLog=function(msg,err){
569 | var now=new Date();
570 | var t=("0"+now.getMinutes()).substr(-2)
571 | +":"+("0"+now.getSeconds()).substr(-2)
572 | +"."+("00"+now.getMilliseconds()).substr(-3);
573 | var recID=this&&this.envIn&&this.envCheck&&this.id;
574 | var arr=["["+t+" "+RecTxt+(recID?":"+recID:"")+"]"+msg];
575 | var a=arguments,console=window.console||{};
576 | var i=2,fn=console.log;
577 | if(typeof(err)=="number"){
578 | fn=err==1?console.error:err==3?console.warn:fn;
579 | }else{
580 | i=1;
581 | };
582 | for(;i1?arr:"");
587 | }else{
588 | fn.apply(console,arr);
589 | };
590 | };
591 | var CLog=function(){ Recorder.CLog.apply(this,arguments); };
592 | var IsLoser=true;try{IsLoser=!console.log.apply;}catch(e){};
593 |
594 |
595 |
596 |
597 | var ID=0;
598 | function initFn(set){
599 | this.id=++ID;
600 |
601 | //如果开启了流量统计,这里将发送一个图片请求
602 | Traffic();
603 |
604 |
605 | var o={
606 | type:"mp3" //输出类型:mp3,wav,wav输出文件尺寸超大不推荐使用,但mp3编码支持会导致js文件超大,如果不需支持mp3可以使js文件大幅减小
607 | ,bitRate:16 //比特率 wav:16或8位,MP3:8kbps 1k/s,16kbps 2k/s 录音文件很小
608 |
609 | ,sampleRate:16000 //采样率,wav格式大小=sampleRate*时间;mp3此项对低比特率有影响,高比特率几乎无影响。
610 | //wav任意值,mp3取值范围:48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000
611 | //采样率参考https://www.cnblogs.com/devin87/p/mp3-recorder.html
612 |
613 | ,onProcess:NOOP //fn(buffers,powerLevel,bufferDuration,bufferSampleRate,newBufferIdx,asyncEnd) buffers=[[Int16,...],...]:缓冲的PCM数据,为从开始录音到现在的所有pcm片段;powerLevel:当前缓冲的音量级别0-100,bufferDuration:已缓冲时长,bufferSampleRate:缓冲使用的采样率(当type支持边录边转码(Worker)时,此采样率和设置的采样率相同,否则不一定相同);newBufferIdx:本次回调新增的buffer起始索引;asyncEnd:fn() 如果onProcess是异步的(返回值为true时),处理完成时需要调用此回调,如果不是异步的请忽略此参数,此方法回调时必须是真异步(不能真异步时需用setTimeout包裹)。onProcess返回值:如果返回true代表开启异步模式,在某些大量运算的场合异步是必须的,必须在异步处理完成时调用asyncEnd(不能真异步时需用setTimeout包裹),在onProcess执行后新增的buffer会全部替换成空数组,因此本回调开头应立即将newBufferIdx到本次回调结尾位置的buffer全部保存到另外一个数组内,处理完成后写回buffers中本次回调的结尾位置。
614 |
615 | //*******高级设置******
616 | //,sourceStream:MediaStream Object
617 | //可选直接提供一个媒体流,从这个流中录制、实时处理音频数据(当前Recorder实例独享此流);不提供时为普通的麦克风录音,由getUserMedia提供音频流(所有Recorder实例共享同一个流)
618 | //比如:audio、video标签dom节点的captureStream方法(实验特性,不同浏览器支持程度不高)返回的流;WebRTC中的remote流;自己创建的流等
619 | //注意:流内必须至少存在一条音轨(Audio Track),比如audio标签必须等待到可以开始播放后才会有音轨,否则open会失败
620 |
621 | //,audioTrackSet:{ deviceId:"",groupId:"", autoGainControl:true, echoCancellation:true, noiseSuppression:true }
622 | //普通麦克风录音时getUserMedia方法的audio配置参数,比如指定设备id,回声消除、降噪开关;注意:提供的任何配置值都不一定会生效
623 | //由于麦克风是全局共享的,所以新配置后需要close掉以前的再重新open
624 | //更多参考: https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints
625 |
626 | //,disableEnvInFix:false 内部参数,禁用设备卡顿时音频输入丢失补偿功能
627 |
628 | //,takeoffEncodeChunk:NOOP //fn(chunkBytes) chunkBytes=[Uint8,...]:实时编码环境下接管编码器输出,当编码器实时编码出一块有效的二进制音频数据时实时回调此方法;参数为二进制的Uint8Array,就是编码出来的音频数据片段,所有的chunkBytes拼接在一起即为完整音频。本实现的想法最初由QQ2543775048提出
629 | //当提供此回调方法时,将接管编码器的数据输出,编码器内部将放弃存储生成的音频数据;环境要求比较苛刻:如果当前环境不支持实时编码处理,将在open时直接走fail逻辑
630 | //因此提供此回调后调用stop方法将无法获得有效的音频数据,因为编码器内没有音频数据,因此stop时返回的blob将是一个字节长度为0的blob
631 | //目前只有mp3格式实现了实时编码,在支持实时处理的环境中将会实时的将编码出来的mp3片段通过此方法回调,所有的chunkBytes拼接到一起即为完整的mp3,此种拼接的结果比mock方法实时生成的音质更加,因为天然避免了首尾的静默
632 | //目前除mp3外其他格式不可以提供此回调,提供了将在open时直接走fail逻辑
633 | };
634 |
635 | for(var k in set){
636 | o[k]=set[k];
637 | };
638 | this.set=o;
639 |
640 | this._S=9;//stop同步锁,stop可以阻止open过程中还未运行的start
641 | this.Sync={O:9,C:9};//和Recorder.Sync一致,只不过这个是非全局的,仅用来简化代码逻辑,无实际作用
642 | };
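// Typical construction overriding the defaults above, roughly the way
// main.js drives this recorder for streaming ASR (sketch; the actual option
// object lives in the unquoted part of main.js):
//
// var rec = Recorder({
//   type: "pcm", bitRate: 16, sampleRate: 16000,
//   onProcess: recProcess  // (buffers, powerLevel, duration, sampleRate, newBufferIdx, asyncEnd)
// });
// rec.open(function(){ rec.start(); });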
643 | //同步锁,控制对Stream的竞争;用于close时中断异步的open;一个对象open如果变化了都要阻止close,Stream的控制权交个新的对象
644 | Recorder.Sync={/*open*/O:9,/*close*/C:9};
645 |
646 | Recorder.prototype=initFn.prototype={
647 | CLog:CLog
648 |
649 | //流相关的数据存储在哪个对象里面;如果提供了sourceStream,数据直接存储在当前对象中,否则存储在全局
650 | ,_streamStore:function(){
651 | if(this.set.sourceStream){
652 | return this;
653 | }else{
654 | return Recorder;
655 | }
656 | }
657 |
658 | //打开录音资源True(),False(msg,isUserNotAllow),需要调用close。注意:此方法是异步的;一般使用时打开,用完立即关闭;可重复调用,可用来测试是否能录音
659 | ,open:function(True,False){
660 | var This=this,streamStore=This._streamStore();
661 | True=True||NOOP;
662 | var failCall=function(errMsg,isUserNotAllow){
663 | isUserNotAllow=!!isUserNotAllow;
664 | This.CLog("录音open失败:"+errMsg+",isUserNotAllow:"+isUserNotAllow,1);
665 | False&&False(errMsg,isUserNotAllow);
666 | };
667 |
668 | var ok=function(){
669 | This.CLog("open ok id:"+This.id);
670 | True();
671 |
672 | This._SO=0;//解除stop对open中的start调用的阻止
673 | };
674 |
675 |
676 | //同步锁
677 | var Lock=streamStore.Sync;
678 | var lockOpen=++Lock.O,lockClose=Lock.C;
679 | This._O=This._O_=lockOpen;//记住当前的open,如果变化了要阻止close,这里假定了新对象已取代当前对象并且不再使用
680 | This._SO=This._S;//记住open过程中的stop,中途任何stop调用后都不能继续open中的start
681 | var lockFail=function(){
682 | //允许多次open,但不允许任何一次close,或者自身已经调用了关闭
683 | if(lockClose!=Lock.C || !This._O){
684 | var err="open被取消";
685 | if(lockOpen==Lock.O){
686 | //无新的open,已经调用了close进行取消,此处应让上次的close明确生效
687 | This.close();
688 | }else{
689 | err="open被中断";
690 | };
691 | failCall(err);
692 | return true;
693 | };
694 | };
695 |
696 | //环境配置检查
697 | var checkMsg=This.envCheck({envName:"H5",canProcess:true});
698 | if(checkMsg){
699 | failCall("不能录音:"+checkMsg);
700 | return;
701 | };
702 |
703 |
704 | //***********已直接提供了音频流************
705 | if(This.set.sourceStream){
706 | if(!Recorder.GetContext()){
707 | failCall("不支持此浏览器从流中获取录音");
708 | return;
709 | };
710 |
711 | Disconnect(streamStore);//可能已open过,直接先尝试断开
712 | This.Stream=This.set.sourceStream;
713 | This.Stream._call={};
714 |
715 | try{
716 | Connect(streamStore);
717 | }catch(e){
718 | failCall("从流中打开录音失败:"+e.message);
719 | return;
720 | }
721 | ok();
722 | return;
723 | };
724 |
725 |
726 | //***********打开麦克风得到全局的音频流************
727 | var codeFail=function(code,msg){
728 | try{//跨域的优先检测一下
729 | window.top.a;
730 | }catch(e){
731 | failCall('无权录音(跨域,请尝试给iframe添加麦克风访问策略,如allow="camera;microphone")');
732 | return;
733 | };
734 |
735 | if(/Permission|Allow/i.test(code)){
736 | failCall("用户拒绝了录音权限",true);
737 | }else if(window.isSecureContext===false){
738 | failCall("浏览器禁止不安全页面录音,可开启https解决");
739 | }else if(/Found/i.test(code)){//可能是非安全环境导致的没有设备
740 | failCall(msg+",无可用麦克风");
741 | }else{
742 | failCall(msg);
743 | };
744 | };
745 |
746 |
747 | //如果已打开并且有效就不要再打开了
748 | if(Recorder.IsOpen()){
749 | ok();
750 | return;
751 | };
752 | if(!Recorder.Support()){
753 | codeFail("","此浏览器不支持录音");
754 | return;
755 | };
756 |
757 | //请求权限,如果从未授权,一般浏览器会弹出权限请求弹框
758 | var f1=function(stream){
759 | //https://github.com/xiangyuecn/Recorder/issues/14 获取到的track.readyState!="live",刚刚回调时可能是正常的,但过一下可能就被关掉了,原因不明。延迟一下保证真异步。对正常浏览器不影响
760 | setTimeout(function(){
761 | stream._call={};
762 | var oldStream=Recorder.Stream;
763 | if(oldStream){
764 | Disconnect(); //直接断开已存在的,旧的Connect未完成会自动终止
765 | stream._call=oldStream._call;
766 | };
767 | Recorder.Stream=stream;
768 | if(lockFail())return;
769 |
770 | if(Recorder.IsOpen()){
771 | if(oldStream)This.CLog("发现同时多次调用open",1);
772 |
773 | Connect(streamStore,1);
774 | ok();
775 | }else{
776 | failCall("录音功能无效:无音频流");
777 | };
778 | },100);
779 | };
780 | var f2=function(e){
781 | var code=e.name||e.message||e.code+":"+e;
782 | This.CLog("请求录音权限错误",1,e);
783 |
784 | codeFail(code,"无法录音:"+code);
785 | };
786 |
787 | var trackSet={
788 | noiseSuppression:false //默认禁用降噪,原声录制,免得移动端表现怪异(包括系统播放声音变小)
789 | ,echoCancellation:false //回声消除
790 | };
791 | var trackSet2=This.set.audioTrackSet;
792 | for(var k in trackSet2)trackSet[k]=trackSet2[k];
793 | trackSet.sampleRate=Recorder.Ctx.sampleRate;//必须指明采样率,不然手机上MediaRecorder采样率16k
794 |
795 | try{
796 | var pro=Recorder.Scope[getUserMediaTxt]({audio:trackSet},f1,f2);
797 | }catch(e){//不能设置trackSet就算了
798 | This.CLog(getUserMediaTxt,3,e);
799 | pro=Recorder.Scope[getUserMediaTxt]({audio:true},f1,f2);
800 | };
801 | if(pro&&pro.then){
802 | pro.then(f1)[CatchTxt](f2); //fix 关键字,保证catch压缩时保持字符串形式
803 | };
804 | }
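// Per the audioTrackSet note above, switching microphones means closing and
// reopening (sketch; pickedId would come from
// navigator.mediaDevices.enumerateDevices()):
//
// rec.close(function(){
//   rec.set.audioTrackSet = { deviceId: pickedId, echoCancellation: true };
//   rec.open(function(){ rec.start(); });
// });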
805 | //关闭释放录音资源
806 | ,close:function(call){
807 | call=call||NOOP;
808 |
809 | var This=this,streamStore=This._streamStore();
810 | This._stop();
811 |
812 | var Lock=streamStore.Sync;
813 | This._O=0;
814 | if(This._O_!=Lock.O){
815 | //唯一资源Stream的控制权已交给新对象,这里不能关闭。此处在每次都弹权限的浏览器内可能存在泄漏,新对象被拒绝权限可能不会调用close,忽略这种不处理
816 | This.CLog("close被忽略(因为同时open了多个rec,只有最后一个会真正close)",3);
817 | call();
818 | return;
819 | };
820 | Lock.C++;//获得控制权
821 |
822 | Disconnect(streamStore);
823 |
824 | This.CLog("close");
825 | call();
826 | }
827 |
828 |
829 |
830 |
831 |
832 | /*模拟一段录音数据,后面可以调用stop进行编码,需提供pcm数据[1,2,3...],pcm的采样率*/
833 | ,mock:function(pcmData,pcmSampleRate){
834 | var This=this;
835 | This._stop();//清理掉已有的资源
836 |
837 | This.isMock=1;
838 | This.mockEnvInfo=null;
839 | This.buffers=[pcmData];
840 | This.recSize=pcmData.length;
841 | This[srcSampleRateTxt]=pcmSampleRate;
842 | return This;
843 | }
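// Usage sketch per the comment above: wrap existing PCM and encode it without
// ever opening the microphone (assumes 16 kHz Int16 samples in pcmInt16):
//
// Recorder({ type: "wav" }).mock(pcmInt16, 16000).stop(function(blob, duration){
//   // blob is a finished wav built from the mocked buffer
// }, function(msg){ console.log(msg); });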
844 | ,envCheck:function(envInfo){//平台环境下的可用性检查,任何时候都可以调用检查,返回errMsg:""正常,"失败原因"
845 | //envInfo={envName:"H5",canProcess:true}
846 | var errMsg,This=this,set=This.set;
847 |
848 | //检测CPU的数字字节序,TypedArray字节序是个迷,直接拒绝罕见的大端模式,因为找不到这种CPU进行测试
849 | var tag="CPU_BE";
850 | if(!errMsg && !Recorder[tag] && window.Int8Array && !new Int8Array(new Int32Array([1]).buffer)[0]){
851 | Traffic(tag); //如果开启了流量统计,这里将发送一个图片请求
852 | errMsg="不支持"+tag+"架构";
853 | };
854 |
855 | //编码器检查环境下配置是否可用
856 | if(!errMsg){
857 | var type=set.type;
858 | if(This[type+"_envCheck"]){//编码器已实现环境检查
859 | errMsg=This[type+"_envCheck"](envInfo,set);
860 | }else{//未实现检查的手动检查配置是否有效
861 | if(set.takeoffEncodeChunk){
862 | errMsg=type+"类型"+(This[type]?"":"(未加载编码器)")+"不支持设置takeoffEncodeChunk";
863 | };
864 | };
865 | };
866 |
867 | return errMsg||"";
868 | }
869 | ,envStart:function(mockEnvInfo,sampleRate){//平台环境相关的start调用
870 | var This=this,set=This.set;
871 | This.isMock=mockEnvInfo?1:0;//非H5环境需要启用mock,并提供envCheck需要的环境信息
872 | This.mockEnvInfo=mockEnvInfo;
873 | This.buffers=[];//数据缓冲
874 | This.recSize=0;//数据大小
875 |
876 | This.envInLast=0;//envIn接收到最后录音内容的时间
877 | This.envInFirst=0;//envIn接收到的首个录音内容的录制时间
878 | This.envInFix=0;//补偿的总时间
879 | This.envInFixTs=[];//补偿计数列表
880 |
881 | //engineCtx需要提前确定最终的采样率
882 | var setSr=set[sampleRateTxt];
883 | if(setSr>sampleRate){
884 | set[sampleRateTxt]=sampleRate;
885 | }else{ setSr=0 }
886 | This[srcSampleRateTxt]=sampleRate;
887 | This.CLog(srcSampleRateTxt+": "+sampleRate+" set."+sampleRateTxt+": "+set[sampleRateTxt]+(setSr?" 忽略"+setSr:""), setSr?3:0);
888 |
889 | This.engineCtx=0;
890 | //此类型有边录边转码(Worker)支持
891 | if(This[set.type+"_start"]){
892 | var engineCtx=This.engineCtx=This[set.type+"_start"](set);
893 | if(engineCtx){
894 | engineCtx.pcmDatas=[];
895 | engineCtx.pcmSize=0;
896 | };
897 | };
898 | }
899 | ,envResume:function(){//和平台环境无关的恢复录音
900 | //重新开始计数
901 | this.envInFixTs=[];
902 | }
903 | ,envIn:function(pcm,sum){//和平台环境无关的pcm[Int16]输入
904 | var This=this,set=This.set,engineCtx=This.engineCtx;
905 | var bufferSampleRate=This[srcSampleRateTxt];
906 | var size=pcm.length;
907 | var powerLevel=Recorder.PowerLevel(sum,size);
908 |
909 | var buffers=This.buffers;
910 | var bufferFirstIdx=buffers.length;//之前的buffer都是经过onProcess处理好的,不允许再修改
911 | buffers.push(pcm);
912 |
913 | //有engineCtx时会被覆盖,这里保存一份
914 | var buffersThis=buffers;
915 | var bufferFirstIdxThis=bufferFirstIdx;
916 |
917 | //卡顿丢失补偿:因为设备很卡的时候导致H5接收到的数据量不够造成播放时候变速,结果比实际的时长要短,此处保证了不会变短,但不能修复丢失的音频数据造成音质变差。当前算法采用输入时间侦测下一帧是否需要添加补偿帧,需要(6次输入||超过1秒)以上才会开始侦测,如果滑动窗口内丢失超过1/3就会进行补偿
918 | var now=Date.now();
919 | var pcmTime=Math.round(size/bufferSampleRate*1000);
920 | This.envInLast=now;
921 | if(This.buffers.length==1){//记下首个录音数据的录制时间
922 | This.envInFirst=now-pcmTime;
923 | };
924 | var envInFixTs=This.envInFixTs;
925 | envInFixTs.splice(0,0,{t:now,d:pcmTime});
926 | //保留3秒的计数滑动窗口,另外超过3秒的停顿不补偿
927 | var tsInStart=now,tsPcm=0;
928 | for(var i=0;i<envInFixTs.length;i++){
929 | var o=envInFixTs[i];
930 | if(now-o.t>3000){
931 | envInFixTs.length=i;
932 | break;
933 | };
934 | tsInStart=o.t;
935 | tsPcm+=o.d;
936 | };
937 | //达到需要的数据量,开始侦测是否需要补偿
938 | var tsInPrev=envInFixTs[1];
939 | var tsIn=now-tsInStart;
940 | var lost=tsIn-tsPcm;
941 | if( lost>tsIn/3 && (tsInPrev&&tsIn>1000 || envInFixTs.length>=6) ){
942 | //丢失过多,开始执行补偿
943 | var addTime=now-tsInPrev.t-pcmTime;//距离上次输入丢失这么多ms
944 | if(addTime>pcmTime/5){//丢失超过本帧的1/5
945 | var fixOpen=!set.disableEnvInFix;
946 | This.CLog("["+now+"]"+(fixOpen?"":"未")+"补偿"+addTime+"ms",3);
947 | This.envInFix+=addTime;
948 |
949 | //用静默进行补偿
950 | if(fixOpen){
951 | var addPcm=new Int16Array(addTime*bufferSampleRate/1000);
952 | size+=addPcm.length;
953 | buffers.push(addPcm);
954 | };
955 | };
956 | };
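// Rough worked example of the trigger above: if the 3 s window saw 600 ms of
// wall time (tsIn=600) but only 240 ms of audio (tsPcm=240), then
// lost=360 > tsIn/3=200, and the gap since the previous frame is padded with
// that many milliseconds of silent samples (unless disableEnvInFix is set).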
957 |
958 |
959 | var sizeOld=This.recSize,addSize=size;
960 | var bufferSize=sizeOld+addSize;
961 | This.recSize=bufferSize;//此值在onProcess后需要修正,可能新数据被修改
962 |
963 |
964 | //此类型有边录边转码(Worker)支持,开启实时转码
965 | if(engineCtx){
966 | //转换成set的采样率
967 | var chunkInfo=Recorder.SampleData(buffers,bufferSampleRate,set[sampleRateTxt],engineCtx.chunkInfo);
968 | engineCtx.chunkInfo=chunkInfo;
969 |
970 | sizeOld=engineCtx.pcmSize;
971 | addSize=chunkInfo.data.length;
972 | bufferSize=sizeOld+addSize;
973 | engineCtx.pcmSize=bufferSize;//此值在onProcess后需要修正,可能新数据被修改
974 |
975 | buffers=engineCtx.pcmDatas;
976 | bufferFirstIdx=buffers.length;
977 | buffers.push(chunkInfo.data);
978 | bufferSampleRate=chunkInfo[sampleRateTxt];
979 | };
980 |
981 | var duration=Math.round(bufferSize/bufferSampleRate*1000);
982 | var bufferNextIdx=buffers.length;
983 | var bufferNextIdxThis=buffersThis.length;
984 |
985 | //允许异步处理buffer数据
986 | var asyncEnd=function(){
987 | //重新计算size,异步的早已减去添加的,同步的需去掉本次添加的然后重新计算
988 | var num=asyncBegin?0:-addSize;
989 | var hasClear=buffers[0]==null;
990 | for(var i=bufferFirstIdx;i<bufferNextIdx;i++){
[... lines 991-1036 were lost in extraction ...]
1037 | if(slowT>10 && This.envInFirst-now>1000){ //1秒后开始onProcess性能监测
1038 | This.CLog(procTxt+"低性能,耗时"+slowT+"ms",3);
1039 | };
1040 |
1041 | if(asyncBegin===true){
1042 | //开启了异步模式,onProcess已接管buffers新数据,立即清空,避免出现未处理的数据
1043 | var hasClear=0;
1044 | for(var i=bufferFirstIdx;i"+res.length+" 花:"+(Date.now()-t1)+"ms");
1250 |
1251 | setTimeout(function(){
1252 | t1=Date.now();
1253 | This[set.type](res,function(blob){
1254 | ok(blob,duration);
1255 | },function(msg){
1256 | err(msg);
1257 | });
1258 | });
1259 | }
1260 |
1261 | };
1262 |
1263 | if(window[RecTxt]){
1264 | CLog("重复引入"+RecTxt,3);
1265 | window[RecTxt].Destroy();
1266 | };
1267 | window[RecTxt]=Recorder;
1268 |
1269 |
1270 |
1271 |
1272 | //=======从WebM字节流中提取pcm数据,提取成功返回Float32Array,失败返回null||-1=====
1273 | var WebM_Extract=function(inBytes, scope){
1274 | if(!scope.pos){
1275 | scope.pos=[0]; scope.tracks={}; scope.bytes=[];
1276 | };
1277 | var tracks=scope.tracks, position=[scope.pos[0]];
1278 | var endPos=function(){ scope.pos[0]=position[0] };
1279 |
1280 | var sBL=scope.bytes.length;
1281 | var bytes=new Uint8Array(sBL+inBytes.length);
1282 | bytes.set(scope.bytes); bytes.set(inBytes,sBL);
1283 | scope.bytes=bytes;
1284 |
1285 | //先读取文件头和Track信息
1286 | if(!scope._ht){
1287 | readMatroskaVInt(bytes, position);//EBML Header
1288 | readMatroskaBlock(bytes, position);//跳过EBML Header内容
1289 | if(!BytesEq(readMatroskaVInt(bytes, position), [0x18,0x53,0x80,0x67])){
1290 | return;//未识别到Segment
1291 | }
1292 | readMatroskaVInt(bytes, position);//跳过Segment长度值
1293 | while(position[0]<bytes.length){
[... lines 1294-1401 were lost in extraction ...]
1402 | if(…>1){//多声道,提取一个声道
1403 | var arr2=[];
1404 | for(var i=0;i<arr.length;…
[... lines 1405-1430 were lost in extraction; the definition of readMatroskaVInt(arr,pos) begins in this span ...]
1431 | if(i>=arr.length)return;
1432 | var b0=arr[i],b2=("0000000"+b0.toString(2)).substr(-8);
1433 | var m=/^(0*1)(\d*)$/.exec(b2);
1434 | if(!m)return;
1435 | var len=m[1].length, val=[];
1436 | if(i+len>arr.length)return;
1437 | for(var i2=0;i2<len;i2++){ val[i2]=arr[i]; i++; };
[... lines 1438-1448 were lost in extraction; the definition of readMatroskaBlock(arr,pos) begins in this span ...]
1449 | if(i+len>arr.length)return;
1450 | for(var i2=0;i2<len;i2++){ … };
[... the end of recorder-core.js and the opening of /web/wav.js were lost in extraction ...]
--------------------------------------------------------------------------------
/web/wav.js:
--------------------------------------------------------------------------------
[... wav.js resumes inside its 8-bit sample-writing loop, mirroring pcm.js ...]
74 | var val=(res[i]>>8)+128;
75 | data.setInt8(offset,val,true);
76 | };
77 | }else{
78 | for (var i=0;i<size;i++,offset+=2){