├── requirements.txt
├── README.md
├── deploy_llamaedge_macos.sh
├── deploy_llamaedge_linux_x86_cuda12.sh
└── app.py

/requirements.txt:
--------------------------------------------------------------------------------
aiofiles==23.2.1
annotated-types==0.7.0
anyio==4.6.2.post1
audioread==3.0.1
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.4.0
click==8.1.7
decorator==5.1.1
distro==1.9.0
fastapi==0.115.4
ffmpy==0.4.0
filelock==3.16.1
fsspec==2024.10.0
gradio==5.5.0
gradio_client==1.4.2
gTTS==2.5.4
h11==0.14.0
httpcore==1.0.6
httpx==0.27.2
huggingface-hub==0.26.2
idna==3.10
Jinja2==3.1.4
jiter==0.7.0
joblib==1.4.2
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.43.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
msgpack==1.1.0
numba==0.60.0
numpy==2.0.2
openai==1.54.3
orjson==3.10.11
packaging==24.2
pandas==2.2.3
pillow==11.0.0
platformdirs==4.3.6
pooch==1.8.2
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
pydub==0.25.1
Pygments==2.18.0
python-dateutil==2.9.0.post0
python-multipart==0.0.12
pytz==2024.2
PyYAML==6.0.2
requests==2.32.3
resampy==0.4.3
rich==13.9.4
ruff==0.7.3
safehttpx==0.1.1
scikit-learn==1.5.2
scipy==1.14.1
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
soundfile==0.12.1
soxr==0.5.0.post1
SpeechRecognition==3.11.0
starlette==0.41.2
threadpoolctl==3.5.0
tomlkit==0.12.0
tqdm==4.67.0
typer==0.13.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
uvicorn==0.32.0
websockets==12.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# :speaking_head: TalkTalk (唠嗑)

> [!NOTE]
> This project is for experimental purposes. If you like it, please give it a star :wink:

## Setup

```bash
git clone https://github.com/LlamaEdge/talktalk.git
cd talktalk
```

- Deploy LlamaEdge API Servers on macOS (Apple Silicon)

```bash
./deploy_llamaedge_macos.sh

# or, specify the ports
./deploy_llamaedge_macos.sh --proxy-port 10086 --llama-port 12345 --whisper-port 12306
```

> Use the `deploy_llamaedge_linux_x86_cuda12.sh` script for Linux machines with Nvidia GPUs and CUDA 12 drivers.

The default ports for `llama-proxy-server`, `llama-api-server` and `whisper-api-server` are `10086`, `12345` and `12306`, respectively. You can change them with the `--proxy-port`, `--llama-port` and `--whisper-port` options.

For those who want to deploy servers step by step manually:

- Install WasmEdge Runtime

```bash
# Install WasmEdge Runtime
curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v 0.14.1
```

- LlamaEdge API Server

```bash
# Download LlamaEdge API Server
curl -LO https://github.com/LlamaEdge/LlamaEdge/releases/download/0.14.15/llama-api-server.wasm

# Download chat model
curl -LO https://huggingface.co/second-state/Qwen2.5-3B-Instruct-GGUF/resolve/main/Qwen2.5-3B-Instruct-Q5_K_M.gguf

# Start LlamaEdge API Server
wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2.5-3B-Instruct-Q5_K_M.gguf \
  llama-api-server.wasm \
  --model-name Qwen2.5-3B-Instruct \
  --prompt-template chatml \
  --ctx-size 32000 \
  --port 12345
```

- LlamaEdge-Whisper API Server

```bash
# Download LlamaEdge-Whisper API Server
curl -LO https://github.com/LlamaEdge/whisper-api-server/releases/download/0.3.2/whisper-api-server.wasm

# Download whisper model
curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin

# Download wasmedge-whisper plugin
mkdir -p wasmedge-whisper/plugin
curl -LO https://github.com/WasmEdge/WasmEdge/releases/download/0.14.1/WasmEdge-plugin-wasi_nn-whisper-0.14.1-darwin_arm64.tar.gz
tar -xzf WasmEdge-plugin-wasi_nn-whisper-0.14.1-darwin_arm64.tar.gz -C wasmedge-whisper/plugin
rm WasmEdge-plugin-wasi_nn-whisper-0.14.1-darwin_arm64.tar.gz

# Start LlamaEdge-Whisper API Server
WASMEDGE_PLUGIN_PATH=$(pwd)/wasmedge-whisper/plugin wasmedge --dir .:. \
  whisper-api-server.wasm -m ggml-medium.bin --port 12306
```

- Proxy Server

```bash
curl -LO https://github.com/LlamaEdge/llama-proxy-server/releases/download/0.1.0/llama-proxy-server.wasm
wasmedge llama-proxy-server.wasm --port 10086

# register chat server
curl -X POST http://localhost:10086/admin/register/chat -d "http://localhost:12345"

# register whisper server
curl -X POST http://localhost:10086/admin/register/whisper -d "http://localhost:12306"
```

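Once both servers are registered, the proxy forwards OpenAI-compatible requests to them — this is how `app.py` talks to the stack. As a minimal sketch (assuming the default proxy port `10086`; the model name `llama` is what the deploy scripts register, so adjust it if you started the server with a different `--model-name`), a chat completion can be requested with nothing beyond the Python standard library:

```python
import json
import urllib.request

PROXY_URL = "http://localhost:10086"  # default --proxy-port


def build_chat_request(base_url: str, user_message: str) -> tuple[str, dict]:
    """Build the endpoint URL and JSON body for a chat completion request."""
    url = f"{base_url.rstrip('/')}/v1/chat/completions"
    body = {
        "messages": [{"role": "user", "content": user_message}],
        "model": "llama",  # assumed: the name registered by the deploy scripts
        "stream": False,
    }
    return url, body


def chat(base_url: str, user_message: str) -> str:
    """Send the request through the proxy and return the assistant's reply."""
    url, body = build_chat_request(base_url, user_message)
    req = urllib.request.Request(
        url,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        payload = json.load(resp)
    return payload["choices"][0]["message"]["content"]


# Example (requires the servers above to be running):
#   print(chat(PROXY_URL, "Say hello in one short sentence."))
```

The same base URL is what you paste into the app's "LlamaEdge API Server URL" field below; the proxy routes `/v1/chat/completions` to the chat server and `/v1/audio/transcriptions` to the whisper server.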

- Install dependencies and start TalkTalk App

```bash
# Optional: create a new virtual environment with conda or other tools
conda create -n talktalk python=3.11
conda activate talktalk

# Install dependencies
pip install -r requirements.txt

# Start TalkTalk App
python app.py
```

Once the app is running, you can visit `http://127.0.0.1:7860` or the `gradio.live` public URL to use the app.

## Talk with TalkTalk

* A person speaking Chinese: https://youtu.be/NFpLShcT7NM
* A person speaking English: https://twitter.com/juntao/status/1857133856144621840

## Future Plan

*TalkTalk* uses `gTTS` for text-to-speech conversion. In the next step, it will be replaced by the [LlamaEdge-TTS API Server](https://github.com/LlamaEdge/tts-api-server).

--------------------------------------------------------------------------------
/deploy_llamaedge_macos.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Default ports
proxy_port=10086
llama_port=12345
whisper_port=12306

# ANSI color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Helper functions
info() {
    printf "${GREEN}$1${NC}\n\n"
}

error() {
    printf "${RED}$1${NC}\n\n"
}

warning() {
    printf "${YELLOW}$1${NC}\n\n"
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --proxy-port)
            proxy_port="$2"
            shift 2
            ;;
        --llama-port)
            llama_port="$2"
            shift 2
            ;;
        --whisper-port)
            whisper_port="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo "Options:"
            echo "  --proxy-port PORT    Set proxy server port (default: 10086)"
            echo "  --llama-port PORT    Set LlamaEdge server port (default: 12345)"
            echo "  --whisper-port PORT  Set LlamaEdge-Whisper server port (default: 12306)"
            echo "  -h, --help           Show this help message"
            exit 0
            ;;
        *)
            error "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Check if ports are valid numbers
if ! [[ "$proxy_port" =~ ^[0-9]+$ ]] || ! [[ "$llama_port" =~ ^[0-9]+$ ]] || ! [[ "$whisper_port" =~ ^[0-9]+$ ]]; then
    error "Ports must be valid numbers"
    exit 1
fi

info "[+] Checking ports ..."
if lsof -Pi :$proxy_port -sTCP:LISTEN -t >/dev/null; then
    error "  * Port $proxy_port is already in use. Please choose another port."
    exit 1
fi
if lsof -Pi :$llama_port -sTCP:LISTEN -t >/dev/null; then
    error "  * Port $llama_port is already in use. Please choose another port."
    exit 1
fi
if lsof -Pi :$whisper_port -sTCP:LISTEN -t >/dev/null; then
    error "  * Port $whisper_port is already in use. Please choose another port."
    exit 1
fi
info "  * All ports are available."

info "[+] Installing WasmEdge Runtime..."
curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v 0.14.1
printf "\n\n"

info "[+] Create api-server directory in the current directory"
if [ -d "api-server" ]; then
    warning "  * api-server directory already exists. Remove it? (y/n)"
    read -r answer
    if [ "$answer" = "y" ]; then
        rm -rf api-server
    else
        exit 1
    fi
fi
mkdir -p api-server

info "[+] Downloading LlamaEdge API Server and model..."
curl -LO# https://github.com/LlamaEdge/LlamaEdge/releases/download/0.14.15/llama-api-server.wasm
if [ ! -f Llama-3.2-3B-Instruct-Q5_K_M.gguf ]; then
    curl -LO https://huggingface.co/second-state/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q5_K_M.gguf
fi
printf "\n\n"

info "[+] Downloading LlamaEdge-Whisper API Server, Whisper model and plugin..."
curl -LO# https://github.com/LlamaEdge/whisper-api-server/releases/download/0.3.2/whisper-api-server.wasm
if [ ! -f ggml-large-v2-q5_0.bin ]; then
    curl -LO# https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q5_0.bin
fi
if [ -d "wasmedge-whisper/plugin" ]; then
    rm -rf wasmedge-whisper/plugin
fi
mkdir -p wasmedge-whisper/plugin
curl -LO# https://github.com/WasmEdge/WasmEdge/releases/download/0.14.1/WasmEdge-plugin-wasi_nn-whisper-0.14.1-darwin_arm64.tar.gz
tar -xzf WasmEdge-plugin-wasi_nn-whisper-0.14.1-darwin_arm64.tar.gz -C wasmedge-whisper/plugin
rm WasmEdge-plugin-wasi_nn-whisper-0.14.1-darwin_arm64.tar.gz
printf "\n\n"

info "[+] Downloading proxy server..."
curl -LO# https://github.com/LlamaEdge/llama-proxy-server/releases/download/0.1.0/llama-proxy-server.wasm
printf "\n\n"

info "[+] Starting servers in background..."
# Start LlamaEdge API Server
wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-3.2-3B-Instruct-Q5_K_M.gguf \
    llama-api-server.wasm \
    --model-name llama \
    --prompt-template llama-3-chat \
    --ctx-size 32000 \
    --port $llama_port &

# Start Whisper API Server
WASMEDGE_PLUGIN_PATH=$(pwd)/wasmedge-whisper/plugin wasmedge --dir .:. whisper-api-server.wasm -m ggml-large-v2-q5_0.bin --task transcribe --port $whisper_port &

# Start Proxy Server
wasmedge llama-proxy-server.wasm --port $proxy_port &

# Wait for servers to start
sleep 5
info "  * Servers started."

info "[+] Registering servers with proxy..."
curl -X POST http://localhost:$proxy_port/admin/register/chat -d "http://localhost:$llama_port"
curl -X POST http://localhost:$proxy_port/admin/register/whisper -d "http://localhost:$whisper_port"
printf "\n\n"

info "[+] Done!"

info ">>> To stop the servers, run 'pkill -f wasmedge'."

exit 0

--------------------------------------------------------------------------------
/deploy_llamaedge_linux_x86_cuda12.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Default ports
proxy_port=10086
llama_port=12345
whisper_port=12306

# ANSI color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Helper functions
info() {
    printf "${GREEN}$1${NC}\n\n"
}

error() {
    printf "${RED}$1${NC}\n\n"
}

warning() {
    printf "${YELLOW}$1${NC}\n\n"
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --proxy-port)
            proxy_port="$2"
            shift 2
            ;;
        --llama-port)
            llama_port="$2"
            shift 2
            ;;
        --whisper-port)
            whisper_port="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo "Options:"
            echo "  --proxy-port PORT    Set proxy server port (default: 10086)"
            echo "  --llama-port PORT    Set LlamaEdge server port (default: 12345)"
            echo "  --whisper-port PORT  Set LlamaEdge-Whisper server port (default: 12306)"
            echo "  -h, --help           Show this help message"
            exit 0
            ;;
        *)
            error "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Check if ports are valid numbers
if ! [[ "$proxy_port" =~ ^[0-9]+$ ]] || ! [[ "$llama_port" =~ ^[0-9]+$ ]] || ! [[ "$whisper_port" =~ ^[0-9]+$ ]]; then
    error "Ports must be valid numbers"
    exit 1
fi

info "[+] Checking ports ..."
if lsof -Pi :$proxy_port -sTCP:LISTEN -t >/dev/null; then
    error "  * Port $proxy_port is already in use. Please choose another port."
    exit 1
fi
if lsof -Pi :$llama_port -sTCP:LISTEN -t >/dev/null; then
    error "  * Port $llama_port is already in use. Please choose another port."
    exit 1
fi
if lsof -Pi :$whisper_port -sTCP:LISTEN -t >/dev/null; then
    error "  * Port $whisper_port is already in use. Please choose another port."
    exit 1
fi
info "  * All ports are available."

info "[+] Installing WasmEdge Runtime..."
curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v 0.14.1
printf "\n\n"

info "[+] Create api-server directory in the current directory"
if [ -d "api-server" ]; then
    warning "  * api-server directory already exists. Remove it? (y/n)"
    read -r answer
    if [ "$answer" = "y" ]; then
        rm -rf api-server
    else
        exit 1
    fi
fi
mkdir -p api-server

info "[+] Downloading LlamaEdge API Server and model..."
curl -LO# https://github.com/LlamaEdge/LlamaEdge/releases/download/0.14.15/llama-api-server.wasm
if [ ! -f Llama-3.2-3B-Instruct-Q5_K_M.gguf ]; then
    curl -LO https://huggingface.co/second-state/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q5_K_M.gguf
fi
printf "\n\n"

info "[+] Downloading LlamaEdge-Whisper API Server, Whisper model and plugin..."
curl -LO# https://github.com/LlamaEdge/whisper-api-server/releases/download/0.3.2/whisper-api-server.wasm
if [ ! -f ggml-large-v2-q5_0.bin ]; then
    curl -LO# https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q5_0.bin
fi
if [ -d "wasmedge-whisper/plugin" ]; then
    rm -rf wasmedge-whisper/plugin
fi
mkdir -p wasmedge-whisper/plugin
curl -LO# https://github.com/WasmEdge/WasmEdge/releases/download/0.14.1/WasmEdge-plugin-wasi_nn-whisper-cuda-12.0-0.14.1-ubuntu20.04_x86_64.tar.gz
tar -xzf WasmEdge-plugin-wasi_nn-whisper-cuda-12.0-0.14.1-ubuntu20.04_x86_64.tar.gz -C wasmedge-whisper/plugin
rm WasmEdge-plugin-wasi_nn-whisper-cuda-12.0-0.14.1-ubuntu20.04_x86_64.tar.gz
printf "\n\n"

info "[+] Downloading proxy server..."
curl -LO# https://github.com/LlamaEdge/llama-proxy-server/releases/download/0.1.0/llama-proxy-server.wasm
printf "\n\n"

info "[+] Starting servers in background..."
# Start LlamaEdge API Server
wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-3.2-3B-Instruct-Q5_K_M.gguf \
    llama-api-server.wasm \
    --model-name llama \
    --prompt-template llama-3-chat \
    --ctx-size 32000 \
    --port $llama_port &

# Start Whisper API Server
WASMEDGE_PLUGIN_PATH=$(pwd)/wasmedge-whisper/plugin wasmedge --dir .:. whisper-api-server.wasm -m ggml-large-v2-q5_0.bin --task transcribe --port $whisper_port &

# Start Proxy Server
wasmedge llama-proxy-server.wasm --port $proxy_port &

# Wait for servers to start
sleep 5
info "  * Servers started."

info "[+] Registering servers with proxy..."
curl -X POST http://localhost:$proxy_port/admin/register/chat -d "http://localhost:$llama_port"
curl -X POST http://localhost:$proxy_port/admin/register/whisper -d "http://localhost:$whisper_port"
printf "\n\n"

info "[+] Done!"

info ">>> To stop the servers, run 'pkill -f wasmedge'."

exit 0

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import json
import os
import re
import tempfile
from pathlib import Path

import gradio as gr
import librosa
import openai
import requests
import soundfile as sf
from gtts import gTTS
from openai import OpenAI

# The workflow is:
# 1. User uploads an audio file
# 2. The audio is transcribed to text using Whisper
# 3. The transcribed text is sent to the chat model
# 4. The chat model's response is converted to speech
# 5. The UI shows:
#    - The AI's audio response
#    - The original transcribed text
#    - The AI's text response
# Note: This uses gTTS (Google Text-to-Speech) for the text-to-speech
# conversion. For better quality, you might want to consider using OpenAI's
# TTS API or other commercial TTS services.


# Initialize OpenAI client (only used by the commented-out OpenAI TTS path below)
client = OpenAI(api_key="GAIA")


def process_audio(audio_file, api_url, input_language):
    print(f"Processing audio file: {audio_file}")
    print(f"Using API URL: {api_url}")
    print(f"Input language: {input_language}")

    # Check and resample if needed
    TARGET_SAMPLE_RATE = 16000

    # Load audio with librosa (automatically handles different formats)
    data, current_sample_rate = librosa.load(
        audio_file, sr=None
    )  # sr=None preserves original sample rate
    print(f"Original sample rate: {current_sample_rate} Hz")

    if current_sample_rate != TARGET_SAMPLE_RATE:
        print(f"Resampling from {current_sample_rate} Hz to {TARGET_SAMPLE_RATE} Hz")
        # High-quality resampling using librosa
        data = librosa.resample(
            y=data,
            orig_sr=current_sample_rate,
            target_sr=TARGET_SAMPLE_RATE,
            res_type="kaiser_best",  # Highest quality resampling
        )

        # Normalize audio to prevent clipping
        data = librosa.util.normalize(data)

        # Save as 32-bit float WAV for better quality
        temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        sf.write(
            temp_wav.name,
            data,
            TARGET_SAMPLE_RATE,
        )
        audio_file = temp_wav.name

    # Step 1: Transcribe audio to text using whisper-api-server transcriptions api
    print("Transcribing audio to text")
    whisper_url = f"{api_url.rstrip('/')}/v1/audio/transcriptions"

    # Build the request payload; the file handle is closed once the request completes
    with open(audio_file, "rb") as audio:
        files = {"file": audio}
        data = {"language": input_language, "max_len": 1000, "split_on_word": "true"}

        # Send the POST request
        response = requests.post(whisper_url, files=files, data=data).json()

    # Use a regex to strip the timestamp markers, keeping only the text
    user_message = re.sub(r"\[.*?\]\s*", "", response["text"])

    print(f"Transcribed text: {user_message}")

    if "temp_wav" in locals():
        os.unlink(temp_wav.name)

    # Step 2: Generate response using llama-api-server chat completions api
    print("Generating chat completions")
    chat_url = f"{api_url.rstrip('/')}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}

    # Build the JSON request body
    data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful AI assistant. You should answer questions as precisely and concisely as possible. The answer should be suitable for speech playback, not for reading.",
            },
            {
                "role": "user",
                "content": user_message,
            },
        ],
        "model": "llama",
        "stream": False,
    }

    # Send the POST request
    chat_completion_response = requests.post(
        chat_url, headers=headers, json=data
    ).json()
    assistant_message = chat_completion_response["choices"][0]["message"]["content"]

    # Print the response content
    print(f"AI Response: {assistant_message}")

    # Alternative: convert response text to speech using OpenAI TTS
    # speech_response = client.audio.speech.create(
    #     model="tts-1", voice="alloy", input=assistant_message
    # )
    # output_file = "response_audio.wav"
    # with open(output_file, "wb") as f:
    #     f.write(speech_response.content)

    # Step 3: Convert response text to speech using gTTS
    print(f"Converting response text to speech using gTTS in {input_language} language")
    tts = gTTS(text=assistant_message, lang=input_language)

    # Save the audio response to a file (gTTS produces MP3 data)
    print("Saving audio response")
    output_file = "response_audio.mp3"
    tts.save(output_file)

    return (
        output_file,
        user_message,
        assistant_message,
    )


# Define Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# AI Conversation Demo")
    gr.Markdown(
        "Upload an audio file or record using your microphone to get an AI response in both audio and text format."
    )

    with gr.Row():
        api_url = gr.Textbox(
            label="LlamaEdge API Server URL",
            placeholder="http://localhost:10086",
            value="http://localhost:10086",
            info="Enter the URL of your LlamaEdge API server",
        )
        input_language = gr.Dropdown(
            choices=["en", "zh", "ja"],
            value="en",
            label="Input Audio Language",
            info="Select the language of your input audio",
        )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Me",
            )
        with gr.Column():
            audio_output = gr.Audio(type="filepath", label="TalkTalk AI")

    with gr.Row():
        submit_btn = gr.Button("Submit")

    with gr.Row():
        user_text = gr.Textbox(label="Me")
        ai_text = gr.Textbox(label="TalkTalk AI")

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, api_url, input_language],
        outputs=[audio_output, user_text, ai_text],
    )

# Launch Gradio app
if __name__ == "__main__":
    iface.launch(share=True)
--------------------------------------------------------------------------------