├── .github
│   └── workflows
│       └── docker-publish.yml
├── Dockerfile
├── README.md
├── assets
│   ├── audio-2.png
│   ├── bohemian.png
│   ├── database.png
│   ├── flow.gif
│   ├── mic.png
│   ├── music.gif
│   ├── translate-2.png
│   └── user.png
├── docs
│   └── flow.gif
├── requirements.txt
├── scripts
│   └── flow.py
└── src
    ├── client.py
    ├── config.py
    ├── docker
    │   ├── a.py
    │   ├── b.py
    │   └── whisper.py
    ├── local_deploy.py
    ├── local_deploy_openai.py
    ├── server.py
    └── utils.py

/.github/workflows/docker-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Publish Docker image
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 | 
 8 | jobs:
 9 |   build-and-push:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v2
13 | 
14 |       - name: Log in to GitHub Container Registry
15 |         uses: docker/login-action@v1
16 |         with:
17 |           registry: ghcr.io
18 |           username: ${{ github.actor }}
19 |           password: ${{ secrets.CR_PAT }}
20 | 
21 |       - name: Build and push Docker image
22 |         uses: docker/build-push-action@v2
23 |         with:
24 |           context: .
25 |           file: ./Dockerfile
26 |           push: true
27 |           tags: ghcr.io/${{ github.repository_owner }}/whisper:latest
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8-slim
 2 | WORKDIR /app/
 3 | COPY requirements.txt /app/
 4 | 
 5 | RUN apt update && apt install -y libpq-dev gcc portaudio19-dev
 6 | RUN pip3 install -r requirements.txt
 7 | RUN pip3 install uvicorn fastapi pydantic python-multipart loguru==0.7.0
 8 | 
 9 | COPY ./src/docker/whisper.py /app/
10 | 
11 | CMD ["uvicorn", "whisper:app", "--host", "0.0.0.0", "--port", "8000"]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Simulating real-time speech transcription with Faster-whisper
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | # Usage
10 | ## 1. Split server and client
11 | Suited to setups where the GPU lives in the cloud.
12 | ### Server
13 | Receives audio from the client, runs speech recognition, and sends the transcription back to the client.
14 | ```bash
15 | git clone https://github.com/ultrasev/stream-whisper
16 | apt -y install libcublas11
17 | cd stream-whisper
18 | pip3 install -r requirements.txt
19 | ```
20 | 
21 | Notes:
22 | - `libcublas11` is a dependency of the NVIDIA CUDA Toolkit; install it if you need the CUDA Toolkit.
23 | - As pointed out by [@muzian666](https://github.com/muzian666), aioredis still does not support Python 3.11; Python 3.8 ~ 3.10 is recommended.
24 | 
25 | Set `REDIS_SERVER` in the `.env` file to your own Redis address, then run `python3 -m src.server` to start the server.
26 | On the first run, the speech-recognition model is downloaded from Hugging Face, which takes a while. Hugging Face gets special treatment from the firewall and downloads are slow, so using a proxy is recommended.
27 | 
28 | 
29 | ### Client
30 | Records audio, sends it to the server, and receives the transcription in return.
31 | 
32 | ```bash
33 | git clone https://github.com/ultrasev/stream-whisper
34 | apt -y install portaudio19-dev
35 | cd stream-whisper
36 | pip3 install -r requirements.txt
37 | ```
38 | 
39 | Notes:
40 | - `portaudio19-dev` is a dependency of pyaudio; ignore it if it is already installed on your system.
41 | 
42 | Again, set `REDIS_SERVER` in the `.env` file to your own Redis address, then run `python3 -m src.client` on the local machine to start the client. Before running, check that the microphone works and can actually record.
43 | 
44 | ## 2. Run locally
45 | If you have a local GPU, you can run `src/local_deploy.py` directly, which runs both the server and the client on the same machine.
46 | ```bash
47 | git clone https://github.com/ultrasev/stream-whisper
48 | apt -y install portaudio19-dev libcublas11
49 | cd stream-whisper
50 | python3 src/local_deploy.py
51 | ```
52 | 
53 | 
54 | # One-command Docker deployment of your own whisper transcription service
55 | ```bash
56 | docker run -d --name whisper \
57 |     -e MODEL=base \
58 |     -p 8000:8000 ghcr.io/ultrasev/whisper
59 | ```
60 | The API is compatible with OpenAI's [speech-to-text spec](https://platform.openai.com/docs/guides/speech-to-text), so it can be called directly with the OpenAI SDK.
61 | 
62 | ```python
63 | from openai import OpenAI
64 | # note the /v1 suffix: the SDK does not add it, and it requires a non-empty api_key
65 | client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
66 | 
67 | audio_file = open("/path/to/file/audio.mp3", "rb")
68 | transcription = client.audio.transcriptions.create(
69 |     model="whisper-1",
70 |     file=audio_file
71 | )
72 | print(transcription.text)
73 | ```
--------------------------------------------------------------------------------
/assets/audio-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/audio-2.png
--------------------------------------------------------------------------------
/assets/bohemian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/bohemian.png
--------------------------------------------------------------------------------
/assets/database.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/database.png
--------------------------------------------------------------------------------
/assets/flow.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/flow.gif
--------------------------------------------------------------------------------
/assets/mic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/mic.png
--------------------------------------------------------------------------------
/assets/music.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/music.gif
--------------------------------------------------------------------------------
/assets/translate-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/translate-2.png
--------------------------------------------------------------------------------
/assets/user.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/assets/user.png
--------------------------------------------------------------------------------
/docs/flow.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ultrasev/stream-whisper/f201b6b52a86a5ef8ddc79d2b021da751d41b31f/docs/flow.gif
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aioredis==2.0.1
2 | PyAudio==0.2.14
3 | webrtcvad==2.0.10
4 | python-dotenv==1.0.0
5 | faster-whisper==0.10.0
--------------------------------------------------------------------------------
/scripts/flow.py:
--------------------------------------------------------------------------------
 1 | from manim import *
 2 | 
 3 | 
 4 | class FlowChart(Scene):
 5 |     def construct(self):
 6 |         # lay out the actors: user, recorder, database, transcriber
 7 |         user = ImageMobject("assets/bohemian.png").scale(0.7).to_edge(LEFT)
 8 |         recorder = ImageMobject(
 9 |             "assets/mic.png").scale(0.4).to_edge(UP)
10 | 
11 |         db = ImageMobject(
12 |             "assets/database.png").scale(0.5).to_edge(RIGHT)
13 |         translate = ImageMobject(
14 |             "assets/translate-2.png").scale(0.3).to_edge(DOWN)
15 | 
16 |         # unused: this arc was immediately overwritten by the straight line
17 |         # below, so it is kept only as a comment for reference:
18 |         # line1 = ArcBetweenPoints(user.get_right(), recorder.get_left(), angle=-PI/4, arc_center=(recorder.get_left() + user.get_right())/2)
19 |         line1 = Line(user.get_right(), recorder.get_left())
20 |         line2 = Line(recorder.get_right(), db.get_left())
21 |         line3 = Line(db.get_left(), translate.get_right())
22 |         self.play(FadeIn(user),
23 |                   FadeIn(recorder),
24 |                   FadeIn(db),
25 |                   FadeIn(translate),
26 |                   Create(line1),
27 |                   Create(line2),
28 |                   Create(line3),
29 |                   run_time=2)
30 | 
31 |         updater = VMobject()
32 |         self.add(updater)
33 |         audio = ImageMobject(
34 |             "assets/audio-2.png").scale(0.1)
35 |         text1 = Text("1. record audio with PyAudio", font="Georgia",
36 |                      font_size=31, weight=BOLD,
37 |                      color=BLUE)
38 |         text1.move_to(user.get_right() + RIGHT*4)
39 |         # text1.rotate(line1.get_angle()*0.8).move_to(line1.get_center() + UP*1)
40 |         updater.add_updater(lambda x: x.become(
41 |             Line(user.get_right(), audio.get_center(),
42 |                  stroke_width=5).set_color(BLUE)))
43 |         self.play(MoveAlongPath(audio, line1),
44 |                   Write(text1),
45 |                   rate_func=linear,
46 |                   run_time=3)
47 |         line1.set_color(BLUE)
48 |         updater.clear_updaters()
49 |         self.play(
50 |             audio.animate.move_to(recorder.get_right()),
51 |             run_time=1
52 |         )
53 | 
54 |         text2 = Text("2. sync audio to database", font="Georgia",
55 |                      font_size=31, weight=BOLD,
56 |                      color=YELLOW)
57 |         text2.rotate(line2.get_angle()).move_to(line2.get_center() + UP*1)
58 |         updater.add_updater(lambda x: x.become(
59 |             Line(recorder.get_right(), audio.get_center(),
60 |                  stroke_width=5).set_color(YELLOW)))
61 |         self.play(MoveAlongPath(audio, line2),
62 |                   Write(text2),
63 |                   rate_func=linear,
64 |                   run_time=3)
65 |         line2.set_color(YELLOW)
66 |         self.wait(1)
67 | 
68 |         text3 = Text("3. transcribe with whisper", font="Georgia",
69 |                      font_size=31, weight=BOLD,
70 |                      color=GREEN)
71 |         text3.move_to(translate.get_right() + RIGHT*3.3)
72 |         updater.add_updater(lambda x: x.become(
73 |             Line(db.get_left(), audio.get_center(),
74 |                  stroke_width=5).set_color(GREEN)))
75 |         self.play(MoveAlongPath(audio, line3),
76 |                   Write(text3),
77 |                   rate_func=linear,
78 |                   run_time=3)
79 |         line3.set_color(GREEN)
80 | 
81 |         self.wait(10)
--------------------------------------------------------------------------------
/src/client.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import collections
  3 | import wave
  4 | from collections import deque
  5 | 
  6 | import aioredis
  7 | import pyaudio
  8 | import webrtcvad
  9 | import logging
 10 | from .utils import asyncformer
 11 | from .config import REDIS_SERVER
 12 | 
 13 | # Audio recording parameters
 14 | FORMAT = pyaudio.paInt16
 15 | CHANNELS = 1
 16 | RATE = 16000
 17 | CHUNK = 256
 18 | FRAME_DURATION = 30  # milliseconds
 19 | FRAME_SIZE = int(RATE * FRAME_DURATION / 1000)
 20 | 
 21 | g_frames = deque(maxlen=100)
 22 | audio = pyaudio.PyAudio()
 23 | logging.basicConfig(level=logging.INFO)
 24 | 
 25 | # for audio recording
 26 | stream = audio.open(format=FORMAT,
 27 |                     channels=CHANNELS,
 28 |                     rate=RATE,
 29 |                     input=True,
 30 |                     frames_per_buffer=CHUNK)
 31 | 
 32 | 
 33 | async def sync_audio():
 34 |     # sync recorded audio to the redis list STS:AUDIOS
 35 |     async with aioredis.from_url(REDIS_SERVER) as redis:
 36 |         while True:
 37 |             if not g_frames:
 38 |                 await asyncio.sleep(0.1)  # yield to the event loop while idle
 39 |                 continue
 40 |             content = g_frames.pop()
 41 |             await redis.rpush('STS:AUDIOS', content)
 42 |             logging.info('Sync audio to redis server')
 43 | 
 44 | 
 45 | def export_wav(data, filename):
 46 |     wf = wave.open(filename, 'wb')
 47 |     wf.setnchannels(CHANNELS)
 48 |     wf.setsampwidth(audio.get_sample_size(FORMAT))
 49 |     wf.setframerate(RATE)
 50 |     wf.writeframes(b''.join(data))
 51 |     wf.close()
 52 | 
 53 | 
 54 | def record_until_silence():
 55 |     frames = collections.deque(maxlen=30)  # keep the most recent 30 frames
 56 |     tmp = collections.deque(maxlen=1000)
 57 |     vad = webrtcvad.Vad()
 58 |     vad.set_mode(1)  # aggressiveness, 0 to 3: 0 is least sensitive, 3 is most
 59 |     triggered = False
 60 |     frames.clear()
 61 |     ratio = 0.5
 62 |     while True:
 63 |         frame = stream.read(FRAME_SIZE)
 64 |         is_speech = vad.is_speech(frame, RATE)
 65 |         if not triggered:
 66 |             frames.append((frame, is_speech))
 67 |             tmp.append(frame)
 68 |             num_voiced = len([f for f, speech in frames if speech])
 69 |             if num_voiced > ratio * frames.maxlen:
 70 |                 logging.info("start recording...")
 71 |                 triggered = True
 72 |                 frames.clear()
 73 |         else:
 74 |             frames.append((frame, is_speech))
 75 |             tmp.append(frame)
 76 |             num_unvoiced = len([f for f, speech in frames if not speech])
 77 |             if num_unvoiced > ratio * frames.maxlen:
 78 |                 logging.info("stop recording...")
 79 |                 export_wav(tmp, 'output.wav')
 80 |                 with open('output.wav', 'rb') as f:
 81 |                     g_frames.appendleft(f.read())
 82 |                 break
 83 | 
 84 | 
 85 | async def record_audio():
 86 |     while True:
 87 |         await asyncformer(record_until_silence)
 88 | 
 89 | 
 90 | async def main():
 91 |     try:
 92 |         record_task = asyncio.create_task(record_audio())
 93 |         sync_task = asyncio.create_task(sync_audio())
 94 |         await asyncio.gather(record_task, sync_task)
 95 |     except KeyboardInterrupt:
 96 |         stream.stop_stream()
 97 |         stream.close()
 98 |         audio.terminate()
 99 | 
100 | 
101 | def api():
102 |     return asyncio.run(main())
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     import fire
107 |     fire.Fire(api)
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from dotenv import load_dotenv
 4 | 
 5 | load_dotenv()
 6 | REDIS_SERVER = os.getenv('REDIS_SERVER')
 7 | 
 8 | if REDIS_SERVER is None:
 9 |     raise EnvironmentError(
10 |         "The REDIS_SERVER environment variable is not set. "
11 |         "Please set it in your .env file or as an environment variable.")
--------------------------------------------------------------------------------
/src/docker/a.py:
--------------------------------------------------------------------------------
 1 | from celery import Celery
 2 | 
 3 | # Configure Celery to use Redis as both broker and result backend
 4 | 
 5 | # NOTE: placeholder URL; never commit real credentials, read them from the environment instead
 6 | amqpurl = 'redis://:<password>@<host>:<port>'
 7 | app = Celery('tasks', broker=amqpurl, backend=amqpurl)
 8 | 
 9 | # Define a task
10 | 
11 | 
12 | @app.task
13 | def add(x, y):
14 |     return x + y
--------------------------------------------------------------------------------
/src/docker/b.py:
--------------------------------------------------------------------------------
1 | from a import add  # the Celery task defined in a.py
2 | 
3 | result = add.delay(4, 4)
4 | print('Waiting for result...')
5 | print('Result:', result.get())
--------------------------------------------------------------------------------
/src/docker/whisper.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import asyncio
  3 | import os
  4 | import typing
  5 | from concurrent.futures import ThreadPoolExecutor
  6 | from io import BytesIO
  7 | 
  8 | import av
  9 | from fastapi import FastAPI, File, HTTPException, UploadFile
 10 | from faster_whisper import WhisperModel
 11 | from loguru import logger
 12 | from starlette.middleware.base import BaseHTTPMiddleware
 13 | from starlette.requests import Request
 14 | from starlette.responses import JSONResponse
 15 | 
 16 | # Accept the following environment variables from Docker
 17 | MODEL_SIZE = os.getenv('MODEL', 'base')
 18 | PROMPT = os.getenv('PROMPT', '基于FastWhisper的低延迟语音转写服务')
 19 | 
 20 | 
 21 | class ValidateFileTypeMiddleware(BaseHTTPMiddleware):
 22 |     async def dispatch(self, request: Request, call_next):
 23 |         if request.method.lower() == "post":
 24 |             try:
 25 |                 logger.info(f"Request: {request.url}")
 26 |                 response = await call_next(request)
 27 |                 return response
 28 |             except av.error.InvalidDataError:
 29 |                 return JSONResponse(status_code=400,
 30 |                                     content={"message": "Invalid file type"})
 31 |             except Exception as e:
 32 |                 return JSONResponse(status_code=500,
 33 |                                     content={"message": str(e)})
 34 |         return await call_next(request)  # pass non-POST requests through
 35 | 
 36 | 
 37 | app = FastAPI()
 38 | app.add_middleware(ValidateFileTypeMiddleware)
 39 | 
 40 | 
 41 | async def asyncformer(sync_func: typing.Callable, *args, **kwargs):
 42 |     loop = asyncio.get_event_loop()
 43 |     with ThreadPoolExecutor() as pool:
 44 |         return await loop.run_in_executor(pool, sync_func, *args, **kwargs)
 45 | 
 46 | 
 47 | class Transcriber:
 48 |     _instance = None
 49 | 
 50 |     def __new__(cls, *args, **kwargs):
 51 |         if cls._instance is None:
 52 |             cls._instance = super(Transcriber, cls).__new__(cls)
 53 |             # Put any initialization here.
 54 |         return cls._instance
 55 | 
 56 |     def __init__(
 57 |             self,
 58 |             model_size: str,
 59 |             device: str = "auto",
 60 |             compute_type: str = "default",
 61 |             prompt: str = PROMPT) -> None:
 62 |         """ FasterWhisper speech transcription
 63 | 
 64 |         Args:
 65 |             model_size (str): model size, one of "tiny", "base", "small", "medium", "large".
 66 |                 See https://github.com/openai/whisper for details.
 67 |             device (str, optional): device to run the model on.
 68 |             compute_type (str, optional): compute type, defaults to "default".
 69 |             prompt (str, optional): initial prompt; use a Simplified Chinese prompt
 70 |                 when transcribing Simplified Chinese.
 71 |         """
 72 |         super().__init__()
 73 |         self.model_size = model_size
 74 |         self.device = device
 75 |         self.compute_type = compute_type
 76 |         self.prompt = prompt
 77 | 
 78 |     def __enter__(self) -> 'Transcriber':
 79 |         # the class is a singleton, so load the model only once and reuse it
 80 |         # across requests instead of reloading it on every call
 81 |         if not hasattr(self, '_model'):
 82 |             self._model = WhisperModel(self.model_size,
 83 |                                        device=self.device,
 84 |                                        compute_type=self.compute_type)
 85 |         return self
 86 | 
 87 |     def __exit__(self, exc_type, exc_value, traceback) -> None:
 88 |         pass
 89 | 
 90 |     async def __call__(self, audio: bytes) -> typing.AsyncGenerator[str, None]:
 91 |         def _process():
 92 |             return self._model.transcribe(BytesIO(audio),
 93 |                                           initial_prompt=self.prompt,
 94 |                                           vad_filter=True)
 95 | 
 96 |         segments, info = await asyncformer(_process)
 97 |         for segment in segments:
 98 |             t = segment.text
 99 |             if self.prompt in t.strip():
100 |                 continue
101 |             if t.strip().replace('.', ''):
102 |                 logger.info(t)
103 |                 yield t
104 | 
105 | 
106 | @app.post("/v1/audio/transcriptions")
107 | async def _transcribe(file: UploadFile = File(...)):
108 |     with Transcriber(MODEL_SIZE) as stt:
109 |         audio = await file.read()
110 |         text = ','.join([seg async for seg in stt(audio)])
111 |         return {"text": text}
--------------------------------------------------------------------------------
/src/local_deploy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Single-script local recording + transcription. It depends on no cloud
  4 | services (e.g., redis, socket), so it also works offline.
  5 | 
  6 | Install dependencies:
  7 |     pip3 install pyaudio webrtcvad faster-whisper
  8 | 
  9 | Run:
 10 |     python3 local_deploy.py
 11 | """
 12 | 
 13 | import collections
 14 | import io
 15 | import logging
 16 | import queue
 17 | import threading
 18 | import typing
 19 | import wave
 20 | from io import BytesIO
 21 | 
 22 | import pyaudio
 23 | import webrtcvad
 24 | from faster_whisper import WhisperModel
 25 | 
 26 | logging.basicConfig(level=logging.INFO,
 27 |                     format='%(name)s - %(levelname)s - %(message)s')
 28 | 
 29 | 
 30 | class Queues:
 31 |     audio = queue.Queue()
 32 |     text = queue.Queue()
 33 | 
 34 | 
 35 | class Transcriber(threading.Thread):
 36 |     def __init__(
 37 |             self,
 38 |             model_size: str,
 39 |             device: str = "auto",
 40 |             compute_type: str = "default",
 41 |             prompt: str = '实时/低延迟语音转写服务,林黛玉、倒拔、杨柳树、鲁迅、周树人、关键词、转写正确') -> None:
 42 |         """ FasterWhisper speech transcription
 43 | 
 44 |         Args:
 45 |             model_size (str): model size, one of "tiny", "base", "small", "medium", "large".
 46 |                 See https://github.com/openai/whisper for details.
 47 |             device (str, optional): device to run the model on.
 48 |             compute_type (str, optional): compute type, defaults to "default".
 49 |             prompt (str, optional): initial prompt; use a Simplified Chinese prompt
 50 |                 when transcribing Simplified Chinese.
 51 |         """
 52 |         super().__init__()
 53 |         self.model_size = model_size
 54 |         self.device = device
 55 |         self.compute_type = compute_type
 56 |         self.prompt = prompt
 57 | 
 58 |     def __enter__(self) -> 'Transcriber':
 59 |         self._model = WhisperModel(self.model_size,
 60 |                                    device=self.device,
 61 |                                    compute_type=self.compute_type)
 62 |         return self
 63 | 
 64 |     def __exit__(self, exc_type, exc_value, traceback) -> None:
 65 |         pass
 66 | 
 67 |     def __call__(self, audio: bytes) -> typing.Generator[str, None, None]:
 68 |         segments, info = self._model.transcribe(BytesIO(audio),
 69 |                                                 initial_prompt=self.prompt,
 70 |                                                 vad_filter=True)
 71 |         # if info.language != "zh":
 72 |         #     return {"error": "transcribe Chinese only"}
 73 |         for segment in segments:
 74 |             t = segment.text
 75 |             if self.prompt in t.strip():
 76 |                 continue
 77 |             if t.strip().replace('.', ''):
 78 |                 yield t
 79 | 
 80 |     def run(self):
 81 |         while True:
 82 |             audio = Queues.audio.get()
 83 |             text = ''
 84 |             for seg in self(audio):
 85 |                 logging.info(seg)
 86 |                 text += seg
 87 |             Queues.text.put(text)
 88 | 
 89 | 
 90 | class AudioRecorder(threading.Thread):
 91 |     """ Audio recorder.
 92 |     Args:
 93 |         channels (int, optional): number of channels, defaults to 1 (mono).
 94 |         sample_rate (int, optional): sampling rate, defaults to 16000 Hz.
 95 |         chunk (int, optional): frames per buffer, defaults to 256.
 96 |         frame_duration (int, optional): duration of each frame in milliseconds, defaults to 30.
 97 |     """
 98 | 
 99 |     def __init__(self,
100 |                  channels: int = 1,
101 |                  sample_rate: int = 16000,
102 |                  chunk: int = 256,
103 |                  frame_duration: int = 30) -> None:
104 |         super().__init__()
105 |         self.sample_rate = sample_rate
106 |         self.channels = channels
107 |         self.chunk = chunk
108 |         self.frame_size = (sample_rate * frame_duration // 1000)
109 |         self.__frames: typing.List[bytes] = []
110 | 
111 |     def __enter__(self) -> 'AudioRecorder':
112 |         self.vad = webrtcvad.Vad()
113 |         # VAD aggressiveness: an integer from 0 to 3, where 0 is the least
114 |         # sensitive to non-speech and 3 the most sensitive
115 |         self.vad.set_mode(1)
116 | 
117 |         self.audio = pyaudio.PyAudio()
118 |         self.sample_width = self.audio.get_sample_size(pyaudio.paInt16)
119 |         self.stream = self.audio.open(format=pyaudio.paInt16,
120 |                                       channels=self.channels,
121 |                                       rate=self.sample_rate,
122 |                                       input=True,
123 |                                       frames_per_buffer=self.chunk)
124 |         return self
125 | 
126 |     def __exit__(self, exc_type, exc_value, traceback) -> None:
127 |         self.stream.stop_stream()
128 |         self.stream.close()
129 |         self.audio.terminate()
130 | 
131 |     def __bytes__(self) -> bytes:
132 |         buf = io.BytesIO()
133 |         with wave.open(buf, 'wb') as wf:
134 |             wf.setnchannels(self.channels)
135 |             wf.setsampwidth(self.sample_width)
136 |             wf.setframerate(self.sample_rate)
137 |             wf.writeframes(b''.join(self.__frames))
138 |             self.__frames.clear()
139 |         return buf.getvalue()
140 | 
141 |     def run(self):
142 |         """ Record audio until silence is detected.
143 |         """
144 |         MAXLEN = 30
145 |         watcher = collections.deque(maxlen=MAXLEN)
146 |         triggered, ratio = False, 0.5
147 |         while True:
148 |             frame = self.stream.read(self.frame_size)
149 |             is_speech = self.vad.is_speech(frame, self.sample_rate)
150 |             watcher.append(is_speech)
151 |             self.__frames.append(frame)
152 |             if not triggered:
153 |                 num_voiced = len([x for x in watcher if x])
154 |                 if num_voiced > ratio * watcher.maxlen:
155 |                     logging.info("start recording...")
156 |                     triggered = True
157 |                     watcher.clear()
158 |                     self.__frames = self.__frames[-MAXLEN:]
159 |             else:
160 |                 num_unvoiced = len([x for x in watcher if not x])
161 |                 if num_unvoiced > ratio * watcher.maxlen:
162 |                     logging.info("stop recording...")
163 |                     triggered = False
164 |                     Queues.audio.put(bytes(self))
165 |                     logging.info("audio task number: {}".format(
166 |                         Queues.audio.qsize()))
167 | 
168 | 
169 | class Chat(threading.Thread):
170 |     def __init__(self, prompt: str) -> None:
171 |         super().__init__()
172 |         self.prompt = prompt
173 | 
174 |     def run(self):
175 |         prompt = "Hey! I'm currently working on my English speaking skills and I was hoping you could help me out. If you notice any mistakes in my expressions or if something I say doesn't sound quite right, could you please correct me? And if everything's fine, just carry on with a normal conversation. I'd really appreciate it if you could reply in a conversational, spoken English style. This way, it feels more like a natural chat. Thanks a lot for your help!"
176 |         while True:
177 |             text = Queues.text.get()
178 |             if text:
179 |                 import os
180 |                 # 'chat' is assumed to be an external CLI available on PATH
181 |                 os.system('chat "{}"'.format(prompt + text))
182 |                 prompt = ""
183 | 
184 | 
185 | def main():
186 |     try:
187 |         with AudioRecorder(channels=1, sample_rate=16000) as recorder:
188 |             with Transcriber(model_size="base") as transcriber:
189 |                 recorder.start()
190 |                 transcriber.start()
191 |                 # chat = Chat("")
192 |                 # chat.start()
193 | 
194 |                 recorder.join()
195 |                 transcriber.join()
196 | 
197 |     except KeyboardInterrupt:
198 |         print("KeyboardInterrupt: terminating...")
199 |     except Exception as e:
200 |         logging.error(e, exc_info=True, stack_info=True)
201 | 
202 | 
203 | if __name__ == "__main__":
204 |     main()
--------------------------------------------------------------------------------
/src/local_deploy_openai.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Single-script local recording + transcription. It depends on no cloud
  4 | services (e.g., redis, socket), so it also works offline.
  5 | 
  6 | Install dependencies:
  7 |     pip3 install pyaudio webrtcvad faster-whisper funasr openai
  8 | 
  9 | Run:
 10 |     python3 local_deploy_openai.py
 11 | """
 12 | from faster_whisper import WhisperModel
 13 | from io import BytesIO
 14 | import typing
 15 | import io
 16 | import collections
 17 | import wave
 18 | import time
 19 | 
 20 | import pyaudio
 21 | import webrtcvad
 22 | import logging
 23 | from funasr import AutoModel  # model that restores punctuation
 24 | 
 25 | # work around the "duplicate OpenMP runtime" crash on some setups
 26 | import os
 27 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 28 | 
 29 | import openai  # the openai<1.0 SDK, which still provides openai.ChatCompletion
 30 | 
 31 | openai.api_key = "sk-********"  # fill in your own key
 32 | 
 33 | logging.basicConfig(
 34 |     level=logging.INFO,
 35 |     format='%(name)s - %(levelname)s - %(message)s')
 36 | 
 37 | # punctuation-restoration model (local FunASR model path)
 38 | model1 = AutoModel(model=r"E:\ct-punc")
 39 | 
 40 | 
 41 | class Transcriber(object):
 42 |     def __init__(self,
 43 |                  model_size: str = r"E:\whisper\faster-whisper-large-v3",
 44 |                  device: str = "auto",
 45 |                  compute_type: str = "default",
 46 |                  prompt: str = '实时/低延迟语音转写服务'
 47 |                  ) -> None:
 48 |         """ FasterWhisper speech transcription
 49 | 
 50 |         Args:
 51 |             model_size (str): model size ("tiny", "base", "small", "medium", "large")
 52 |                 or a local model path. See https://github.com/openai/whisper for details.
 53 |             device (str, optional): device to run the model on.
 54 |             compute_type (str, optional): compute type, defaults to "default".
 55 |             prompt (str, optional): initial prompt; use a Simplified Chinese prompt
 56 |                 when transcribing Simplified Chinese.
 57 |         """
 58 | 
 59 |         self.model_size = model_size
 60 |         self.device = device
 61 |         self.compute_type = compute_type
 62 |         self.prompt = prompt
 63 | 
 64 |     def __enter__(self) -> 'Transcriber':
 65 |         self._model = WhisperModel(self.model_size,
 66 |                                    device=self.device,
 67 |                                    compute_type=self.compute_type)
 68 |         return self
 69 | 
 70 |     def __exit__(self, exc_type, exc_value, traceback) -> None:
 71 |         pass
 72 | 
 73 |     def __call__(self, audio: bytes) -> typing.Generator[str, None, None]:
 74 |         segments, info = self._model.transcribe(BytesIO(audio),
 75 |                                                 initial_prompt=self.prompt)
 76 |         if info.language != "zh":
 77 |             # returning from a generator silently ends the iteration,
 78 |             # so log why nothing was yielded
 79 |             logging.warning("transcribe Chinese only, got '%s'", info.language)
 80 |             return
 81 |         res_all = ""
 82 |         for segment in segments:
 83 |             t = segment.text
 84 |             # restore punctuation with the FunASR model
 85 |             res1 = model1.generate(input=t)
 86 |             res_all = res_all + res1[0]["text"]
 87 |         if res_all.strip().replace('.', ''):
 88 |             yield res_all
 89 | 
 90 | 
 91 | class AudioRecorder(object):
 92 |     """ Audio recorder.
 93 |     Args:
 94 |         channels (int, optional): number of channels, defaults to 1 (mono).
 95 |         sample_rate (int, optional): sampling rate, defaults to 16000 Hz.
 96 |         chunk (int, optional): frames per buffer, defaults to 256.
 97 |         frame_duration (int, optional): duration of each frame in milliseconds, defaults to 30.
 98 |     """
 99 | 
100 |     def __init__(self,
101 |                  channels: int = 1,
102 |                  sample_rate: int = 16000,
103 |                  chunk: int = 256,
104 |                  frame_duration: int = 30) -> None:
105 |         self.sample_rate = sample_rate
106 |         self.channels = channels
107 |         self.chunk = chunk
108 |         self.frame_size = (sample_rate * frame_duration // 1000)
109 |         self.__frames: typing.List[bytes] = []
110 | 
111 |     def __enter__(self) -> 'AudioRecorder':
112 |         self.vad = webrtcvad.Vad()
113 |         # VAD aggressiveness: an integer from 0 to 3, where 0 is the least
114 |         # sensitive to non-speech and 3 the most sensitive
115 |         self.vad.set_mode(1)
116 | 
117 |         self.audio = pyaudio.PyAudio()
118 |         self.sample_width = self.audio.get_sample_size(pyaudio.paInt16)
119 |         self.stream = self.audio.open(format=pyaudio.paInt16,
120 |                                       channels=self.channels,
121 |                                       rate=self.sample_rate,
122 |                                       input=True,
123 |                                       frames_per_buffer=self.chunk)
124 |         return self
125 | 
126 |     def __exit__(self, exc_type, exc_value, traceback) -> None:
127 |         self.stream.stop_stream()
128 |         self.stream.close()
129 |         self.audio.terminate()
130 | 
131 |     def __bytes__(self) -> bytes:
132 |         buf = io.BytesIO()
133 |         with wave.open(buf, 'wb') as wf:
134 |             wf.setnchannels(self.channels)
135 |             wf.setsampwidth(self.sample_width)
136 |             wf.setframerate(self.sample_rate)
137 |             wf.writeframes(b''.join(self.__frames))
138 |             self.__frames.clear()
139 |         return buf.getvalue()
140 | 
141 |     def __iter__(self):
142 |         """ Record audio until silence is detected.
143 |         """
144 |         MAXLEN = 30
145 |         watcher = collections.deque(maxlen=MAXLEN)
146 |         triggered, ratio = False, 0.5
147 |         while True:
148 |             frame = self.stream.read(self.frame_size)
149 |             is_speech = self.vad.is_speech(frame, self.sample_rate)
150 |             watcher.append(is_speech)
151 |             self.__frames.append(frame)
152 |             if not triggered:
153 |                 num_voiced = len([x for x in watcher if x])
154 |                 if num_voiced > ratio * watcher.maxlen:
155 |                     logging.info("start recording...")
156 |                     triggered = True
157 |                     watcher.clear()
158 |                     self.__frames = self.__frames[-MAXLEN:]
159 |             else:
160 |                 num_unvoiced = len([x for x in watcher if not x])
161 |                 if num_unvoiced > ratio * watcher.maxlen:
162 |                     logging.info("stop recording...")
163 |                     triggered = False
164 |                     yield bytes(self)
165 | 
166 | 
167 | def main():
168 |     try:
169 |         with AudioRecorder(channels=1, sample_rate=16000) as recorder:
170 |             # print("recorder")
171 |             with Transcriber(model_size=r"E:\whisper\faster-whisper-large-v3") as transcriber:  # local large-v3 model
172 |                 # print("transcriber")
173 |                 for audio in recorder:
174 |                     # print("audio")
175 |                     for seg in transcriber(audio):
176 |                         # print(seg)
177 |                         print("Q:", seg)
178 |                         # time.sleep(0.5)
179 |                         messages = []
180 |                         system_message = "You are an experienced professional."
181 |                         system_message_dict = {
182 |                             "role": "system",
183 |                             "content": system_message
184 |                         }
185 |                         messages.append(system_message_dict)
186 |                         user_message_dict = {
187 |                             "role": "user",
188 |                             "content": seg
189 |                         }
190 |                         messages.append(user_message_dict)
191 |                         try:
192 |                             response = openai.ChatCompletion.create(
193 |                                 model="gpt-3.5-turbo",
194 |                                 messages=messages
195 |                             )
196 |                             # print(response)
197 |                             reply = response["choices"][0]["message"]["content"]
198 |                             print("++++++++++ looking up an answer, please wait ++++++++++")
199 |                             time.sleep(1)
200 |                             print("GPT:", reply)
201 |                         except Exception:
202 |                             time.sleep(1)
203 |                             print("********** please do not ask so frequently **********")
204 |                         # time.sleep(0.5)
205 |                         logging.info(seg)
206 |                         print("---------- please continue ----------")
207 | 
208 |     except KeyboardInterrupt:
209 |         print("KeyboardInterrupt: terminating...")
210 |     except Exception as e:
211 |         logging.error(e, exc_info=True, stack_info=True)
212 | 
213 | 
214 | if __name__ == "__main__":
215 |     main()
--------------------------------------------------------------------------------
/src/server.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import logging
 3 | import time
 4 | from collections import deque
 5 | 
 6 | import aioredis
 7 | from faster_whisper import WhisperModel
 8 | 
 9 | from .config import REDIS_SERVER
10 | from .utils import asyncformer
11 | 
12 | CONVERSATION = deque(maxlen=100)
13 | MODEL_SIZE = "large-v3"
14 | CN_PROMPT = '聊一下基于faster-whisper的实时/低延迟语音转写服务'
15 | logging.basicConfig(level=logging.INFO)
16 | model = WhisperModel(MODEL_SIZE, device="auto", compute_type="default")
17 | logging.info('Model loaded')
18 | 
19 | 
20 | async def transcribe():
21 |     # consume audio chunks by popping them off the redis list STS:AUDIOS
22 |     def b_transcribe():
23 |         # transcribe audio to text
24 |         start_time = time.time()
25 |         segments, info = model.transcribe("chunk.wav",
26 |                                           beam_size=5,
27 |                                           initial_prompt=CN_PROMPT)
28 |         end_time = time.time()
29 |         period = end_time - start_time
30 |         text = ''
31 |         for segment in segments:
32 |             t = segment.text
33 |             if t.strip().replace('.', ''):
34 |                 text += ', ' + t if text else t
35 |         return text, period
36 | 
37 |     async with aioredis.from_url(REDIS_SERVER) as redis:
38 |         while True:
39 |             length = await redis.llen('STS:AUDIOS')
40 |             if length > 10:
41 |                 # drop the backlog if the consumer falls too far behind
42 |                 await redis.expire('STS:AUDIOS', 1)
43 |             content = await redis.blpop('STS:AUDIOS', timeout=0.1)
44 |             if content is None:
45 |                 continue
46 | 
47 |             # the client uploads WAV bytes; write them to a scratch file
48 |             with open('chunk.wav', 'wb') as f:
49 |                 f.write(content[1])
50 | 
51 |             text, _period = await asyncformer(b_transcribe)
52 |             t = text.strip().replace('.', '')
53 |             logging.info(t)
54 |             CONVERSATION.append(text)
55 | 
56 | 
57 | async def main():
58 |     await asyncio.gather(transcribe())
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     asyncio.run(main())
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import asyncio
 3 | from concurrent.futures import ThreadPoolExecutor
 4 | from typing import Callable, Any
 5 | 
 6 | 
 7 | async def asyncformer(sync_func: Callable, *args, **kwargs) -> Any:
 8 |     loop = asyncio.get_event_loop()
 9 |     with ThreadPoolExecutor() as pool:
10 |         return await loop.run_in_executor(pool, sync_func, *args, **kwargs)
--------------------------------------------------------------------------------
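
A minimal smoke test for the `/v1/audio/transcriptions` route declared in `src/docker/whisper.py`, using plain `requests` instead of the OpenAI SDK shown in the README. This is a sketch under two assumptions: the Docker container from the README is listening on `localhost:8000`, and a local `sample.wav` exists (both hypothetical; adjust to your setup):

```python
# smoke_test.py -- hypothetical helper script, not part of the repository
import requests

# /v1/audio/transcriptions is the route declared in src/docker/whisper.py
URL = "http://localhost:8000/v1/audio/transcriptions"

with open("sample.wav", "rb") as f:  # any audio file ffmpeg/PyAV can decode
    resp = requests.post(URL, files={"file": ("sample.wav", f, "audio/wav")})

resp.raise_for_status()
print(resp.json()["text"])  # the endpoint responds with {"text": "..."}
```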