├── frontend_nextjs ├── components │ ├── noop.ts │ ├── generateEmptyFft.ts │ ├── types │ │ ├── ChatMessage.ts │ │ ├── JsonMessage.ts │ │ ├── AudioConfiguration.ts │ │ ├── index.ts │ │ ├── AudioOutput.ts │ │ ├── PauseAssistantMessage.ts │ │ ├── UserInterruption.ts │ │ ├── ResumeAssistantMessage.ts │ │ ├── PostAssistantMessage.ts │ │ ├── AssistantInput.ts │ │ ├── AudioInput.ts │ │ ├── AssistantMessage.ts │ │ ├── ChatMetadata.ts │ │ ├── Context.ts │ │ ├── UserMessage.ts │ │ └── SessionSettings.ts │ ├── connection-message.ts │ ├── auth.ts │ ├── Card.tsx │ ├── audio-message.ts │ ├── external-link.tsx │ ├── toaster.tsx │ ├── StartCall.tsx │ ├── MicFFT.tsx │ ├── events.ts │ ├── Chat.tsx │ ├── convertFrequencyScale.ts │ ├── useEncoding.ts │ ├── ui │ │ ├── toggle.tsx │ │ └── button.tsx │ ├── useCallDuration.ts │ ├── errors.ts │ ├── logos │ │ ├── BWLogo.tsx │ │ ├── GitHub.tsx │ │ └── FenXuan.tsx │ ├── Expressions.tsx │ ├── Nav.tsx │ ├── CardPanel.tsx │ ├── runtime.ts │ ├── useMessages.ts │ ├── Controls.tsx │ ├── useVoiceClient.ts │ ├── useMicrophone.ts │ ├── Messages.tsx │ └── ChatSocket.ts ├── preview.png ├── public │ ├── favicon.ico │ ├── icons │ │ ├── hat1.jpg │ │ ├── hat2.jpg │ │ └── hat3.jpg │ ├── favicon-16x16.png │ ├── apple-touch-icon.png │ ├── vercel.svg │ └── next.svg ├── postcss.config.mjs ├── .env.example ├── next-env.d.ts ├── app │ ├── page.tsx │ ├── error.tsx │ ├── layout.tsx │ └── globals.css ├── utils │ ├── index.ts │ ├── expressionLabels.ts │ └── expressionColors.ts ├── components.json ├── .gitignore ├── tsconfig.json ├── package.json └── tailwind.config.ts ├── assets ├── pipeline.png ├── pipeline_zh.png ├── proactive_pipeline.png └── proactive_showcase.png ├── backend ├── ref_audio │ ├── ref_wav │ │ ├── ref_audio_1.wav │ │ ├── ref_audio_2.wav │ │ ├── ref_audio_3.wav │ │ └── ref_audio_default.wav │ └── ref.json ├── prompts │ ├── proactivity │ │ ├── initialize.txt │ │ ├── update.txt │ │ ├── summary.txt │ │ ├── panding.txt │ │ ├── fact.txt │ │ ├── inside_conflict.txt │ │ ├── reject.txt │ │ └── nci.txt │ ├── README.md │ └── character │ │ ├── anlingrong.txt │ │ ├── zhangwei.txt │ │ └── zhenzhen.txt ├── requirements_proactivity.txt ├── requirements.txt ├── requirements-rag.txt ├── tests │ ├── conftest.py │ ├── test_siliconflow_tts.py │ ├── test_siliconflow_asr.py │ ├── test_lm.py │ ├── test_llm_api.py │ ├── test_emoji_module.py │ ├── test_nontext_interaction_module.py │ ├── test_virtualcharacter_module.py │ ├── test_memory_module.py │ ├── test_proactivity_module.py │ ├── test_paraformer_asr.py │ ├── test_rag_lmapi.py │ ├── test_uid.py │ ├── test_utils.py │ ├── test_cosyvoice_tts.py │ └── test_llm_uid.py ├── utils.py ├── uid.py ├── README.zh.md └── README.md ├── NOTICE ├── .gitignore ├── .github └── workflows │ └── api_handler.yml └── README.zh.md /frontend_nextjs/components/noop.ts: -------------------------------------------------------------------------------- 1 | export const noop = () => {}; 2 | -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/assets/pipeline.png -------------------------------------------------------------------------------- /assets/pipeline_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/assets/pipeline_zh.png 
-------------------------------------------------------------------------------- /frontend_nextjs/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/preview.png -------------------------------------------------------------------------------- /assets/proactive_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/assets/proactive_pipeline.png -------------------------------------------------------------------------------- /assets/proactive_showcase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/assets/proactive_showcase.png -------------------------------------------------------------------------------- /frontend_nextjs/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/public/favicon.ico -------------------------------------------------------------------------------- /frontend_nextjs/public/icons/hat1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/public/icons/hat1.jpg -------------------------------------------------------------------------------- /frontend_nextjs/public/icons/hat2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/public/icons/hat2.jpg -------------------------------------------------------------------------------- /frontend_nextjs/public/icons/hat3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/public/icons/hat3.jpg -------------------------------------------------------------------------------- /backend/ref_audio/ref_wav/ref_audio_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/backend/ref_audio/ref_wav/ref_audio_1.wav -------------------------------------------------------------------------------- /backend/ref_audio/ref_wav/ref_audio_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/backend/ref_audio/ref_wav/ref_audio_2.wav -------------------------------------------------------------------------------- /backend/ref_audio/ref_wav/ref_audio_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/backend/ref_audio/ref_wav/ref_audio_3.wav -------------------------------------------------------------------------------- /frontend_nextjs/public/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/public/favicon-16x16.png -------------------------------------------------------------------------------- /frontend_nextjs/public/apple-touch-icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/frontend_nextjs/public/apple-touch-icon.png -------------------------------------------------------------------------------- /backend/ref_audio/ref_wav/ref_audio_default.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendilab/CleanS2S/HEAD/backend/ref_audio/ref_wav/ref_audio_default.wav -------------------------------------------------------------------------------- /frontend_nextjs/components/generateEmptyFft.ts: -------------------------------------------------------------------------------- 1 | export function generateEmptyFft(): number[] { 2 | return Array.from({ length: 24 }).map(() => 0); 3 | } 4 | -------------------------------------------------------------------------------- /backend/prompts/proactivity/initialize.txt: -------------------------------------------------------------------------------- 1 | # 任务目标: 2 | - 根据收到的对话和要素名称,从对话中提取与要素名称相关的信息 3 | - 按照常识扩展信息的细节,但不可以添加、减少或改变信息的内容 4 | - 概括信息内容,不超过50字 5 | - 只输出信息,不输出要素名称 6 | 7 | # 可以输出单个或多个字符串 -------------------------------------------------------------------------------- /backend/prompts/proactivity/update.txt: -------------------------------------------------------------------------------- 1 | # 任务目标: 2 | - 根据你收到的对话和要素,提取与要素名称相关的信息 3 | - 按照常识扩展信息的细节,但不可以添加、减少或改变信息的内容 4 | - 总结信息内容,将总结后的信息内容与收到的要素内容对比 5 | - 输出要素内容不包含的部分 6 | 7 | # 可以输出一个或多个字符串 -------------------------------------------------------------------------------- /backend/requirements_proactivity.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==1.5.2 2 | promcse==0.0.2 3 | transformers==4.44.2 4 | datasets==2.19.0 5 | FlagEmbedding==1.3.3 6 | funasr==1.1.6 7 | einops==0.8.0 -------------------------------------------------------------------------------- /backend/prompts/proactivity/summary.txt: -------------------------------------------------------------------------------- 1 | # 任务目标: 2 | - 总结被弹出滑动窗口的历史对话,保留你觉得必要的信息 3 | - 输入的信息包括之前的总结和新的被弹出的对话 4 | - 你总结的信息会在生成未来新的回答时作为参考 5 | - 输出的总结不超过50字,只输出总结,不要在输出中包含其他内容 6 | 7 | # 可以输出单个或多个字符串 -------------------------------------------------------------------------------- /frontend_nextjs/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | nltk==3.9.1 2 | torch>=2.0.1 3 | torchaudio>=2.0.1 4 | protobuf==4.25.4 5 | librosa 6 | rich 7 | websockets 8 | openai>=1.0 9 | transformers>=4.49.0 10 | funasr 11 | loguru 12 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | CleanS2S incorporates the Jina Embeddings model (jinaai/jina-embeddings-v3) 2 | Copyright (c) 2023 Jina AI GmbH. Licensed under CC BY-NC 4.0. 3 | Commercial use restrictions apply to this third-party component. 
-------------------------------------------------------------------------------- /backend/requirements-rag.txt: -------------------------------------------------------------------------------- 1 | langchain>=0.2.15 2 | langchain_community 3 | langchain_openai 4 | sentence-transformers 5 | chromadb==0.5.3 6 | pydub 7 | git+https://github.com/HKUDS/LightRAG.git@6f40fad#egg=lightrag-hku -------------------------------------------------------------------------------- /frontend_nextjs/.env.example: -------------------------------------------------------------------------------- 1 | NEXT_PUBLIC_FX_SEND_HOST= 2 | NEXT_PUBLIC_FX_RECV_HOST= 3 | NEXT_PUBLIC_VCTTS_SEND_HOST= 4 | NEXT_PUBLIC_VCTTS_RECV_HOST= 5 | NEXT_PUBLIC_AGENT_NAME=感染力大师 6 | NEXT_PUBLIC_FX_ENABLE=false -------------------------------------------------------------------------------- /frontend_nextjs/components/types/ChatMessage.ts: -------------------------------------------------------------------------------- 1 | export interface ChatMessage { 2 | /** Role of who is providing the message. */ 3 | role: string; 4 | /** Transcript of the message. */ 5 | content?: string; 6 | } 7 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/JsonMessage.ts: -------------------------------------------------------------------------------- 1 | import { AssistantMessage } from './AssistantMessage'; 2 | import { ChatMetadata } from './ChatMetadata'; 3 | 4 | 5 | export type JsonMessage = 6 | | AssistantMessage 7 | | ChatMetadata 8 | -------------------------------------------------------------------------------- /frontend_nextjs/next-env.d.ts: -------------------------------------------------------------------------------- 1 | /// <reference types="next" /> 2 | /// <reference types="next/image-types/global" /> 3 | 4 | // NOTE: This file should not be edited 5 | // see https://nextjs.org/docs/basic-features/typescript for more information.
6 | -------------------------------------------------------------------------------- /backend/prompts/proactivity/panding.txt: -------------------------------------------------------------------------------- 1 | # 输入 2 | - 用户输入 3 | - 关键事实种类列表 4 | 5 | # 任务 6 | - 推理用户回复中存在哪些确定的事实性内容 7 | - 注意区分用户提到的词,和事实性内容的差别 8 | - 注意区分用户提问和已经确定的事实之间的差异 9 | - 判定事实性内容属于关键事实列表中的哪些关键事实种类 10 | - 输出这些关键事实种类,并组成一个列表 11 | - 你需要以json格式输出列表格式的内容 12 | -------------------------------------------------------------------------------- /frontend_nextjs/components/connection-message.ts: -------------------------------------------------------------------------------- 1 | export type ConnectionMessage = 2 | | { 3 | type: 'socket_connected'; 4 | receivedAt: Date; 5 | } 6 | | { 7 | type: 'socket_disconnected'; 8 | receivedAt: Date; 9 | }; 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | *.DS_Store 4 | .idea/* 5 | __pycache__/* 6 | backend/.env.dev 7 | backend/tmp 8 | backend/persist_db 9 | *output.wav 10 | *ragtmp 11 | backend/examples 12 | backend/s2s_data 13 | backend/vc_data 14 | *pretrained_models 15 | *.wav 16 | -------------------------------------------------------------------------------- /backend/prompts/proactivity/fact.txt: -------------------------------------------------------------------------------- 1 | # 目标 2 | - 从最新对话中提取关键事实,这些事实将用于未来回复用户时作为参考。 3 | - 提取时要假定你自己是一个有实体的人,对一些常规物理信息的认识是必须具备的 4 | - 关键事实是指一些不易改变的现实情况,比如你父母的身体健康情况、相关姓名、日期、时间、场景等 5 | - 从最新的对话提取关键事实时要参考‘旧的关键事实’,重复的关键事实不要输出 6 | - 输出的文字要精炼,可以适当符号比如‘:’,‘-’等等 7 | 8 | # 以list形式输出多个字符串 -------------------------------------------------------------------------------- /backend/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 5 | sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend")) 6 | sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend/tests")) 7 | -------------------------------------------------------------------------------- /frontend_nextjs/app/page.tsx: -------------------------------------------------------------------------------- 1 | import dynamic from "next/dynamic"; 2 | 3 | const Chat = dynamic(() => import("@/components/Chat"), { 4 | ssr: false, 5 | }); 6 | 7 | export default async function Page() { 8 | 9 | return ( 10 |
11 | <Chat /> 12 |
13 | ); 14 | } 15 | -------------------------------------------------------------------------------- /frontend_nextjs/utils/index.ts: -------------------------------------------------------------------------------- 1 | import { type ClassValue, clsx } from "clsx" 2 | import { twMerge } from "tailwind-merge" 3 | 4 | export function cn(...inputs: ClassValue[]) { 5 | return twMerge(clsx(inputs)) 6 | } 7 | 8 | export const keepLastN = <T>(n: number, arr: T[]): T[] => { 9 | if (arr.length <= n) { 10 | return arr; 11 | } 12 | return arr.slice(arr.length - n); 13 | }; 14 | -------------------------------------------------------------------------------- /frontend_nextjs/app/error.tsx: -------------------------------------------------------------------------------- 1 | 'use client'; 2 | 3 | export default function Error () { 4 | return
5 |
6 |

An unexpected error occurred

7 |

Please try again later

8 |
9 |
10 | } -------------------------------------------------------------------------------- /frontend_nextjs/components/types/AudioConfiguration.ts: -------------------------------------------------------------------------------- 1 | export type Encoding = 'linear16'; 2 | 3 | 4 | export interface AudioConfiguration { 5 | /** Encoding format of the audio input, such as `linear16`. */ 6 | encoding: Encoding; 7 | /** Number of audio channels. */ 8 | channels: number; 9 | /** Audio sample rate. Number of samples per second in the audio input, measured in Hertz. */ 10 | sampleRate: number; 11 | } 12 | -------------------------------------------------------------------------------- /frontend_nextjs/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "default", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "app/globals.css", 9 | "baseColor": "slate", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/utils" 16 | } 17 | } -------------------------------------------------------------------------------- /backend/prompts/README.md: -------------------------------------------------------------------------------- 1 | # 介绍 2 | 3 | 存放本系统中用到的所有 system prompt,目前有负责人设构建的 character 部分和负责产出主动式交互流程中间结果的 proactivity 部分。 4 | 5 | ## character 6 | 7 | 目前人物设定有: 8 | 9 | 1. 《甄嬛传》中的 安陵容 -> anlingrong.txt 10 | 11 | 2. 《爱情公寓中》中的 张伟 -> zhangwei.txt 12 | 13 | 3. 虚拟人物 真真 (引用自[“AI 特离谱” 团队](https://waytoagi.feishu.cn/wiki/SY7uwWPiwiK4yXkDasfcky8zn3e))-> zhenzhen.txt 14 | 15 | ## proactivity 16 | 17 | 包括 memory 模块中关键事实提取、对话总结、情况更新等流程需要用到的 system prompt,非文本交互模块中对话方向判定、emoji 选择等流程中需要用到的 system prompt。 -------------------------------------------------------------------------------- /backend/prompts/proactivity/inside_conflict.txt: -------------------------------------------------------------------------------- 1 | # Goal 2 | - 你将接收到一个对话内容、一个要素名称和一个要素内容。你的任务是总结对话内容在要素名称方面的关键信息,并将这个关键信息与要素内容进行对比。如果关键信息与要素内容冲突,输出True;如果不冲突,输出False。 3 | - 如果要素内容为空,则判断为不冲突,输出False 4 | 5 | # 步骤 6 | - 总结关键信息:从对话内容中提取与要素名称相关的关键信息。 7 | - 对比关键信息与要素内容:将提取的关键信息与要素内容进行对比。 8 | - 输出结果:如果关键信息与要素内容冲突,输出True;如果不冲突,输出False。 9 | 10 | # 示例: 11 | 对话内容: "用户:我想要一个红色的苹果。客服:我们有红色的苹果,但也有绿色的苹果。" 12 | 要素名称: "苹果颜色" 13 | 要素内容: "红色" 14 | 15 | # 分析: 16 | 关键信息总结:对话中提到的苹果颜色是“红色”。 17 | 对比:关键信息“红色”与要素内容“红色”不冲突。 18 | 输出结果:False -------------------------------------------------------------------------------- /frontend_nextjs/.gitignore: -------------------------------------------------------------------------------- 1 | # Node.js 2 | node_modules/ 3 | npm-debug.log 4 | yarn-error.log 5 | 6 | # Next.js 7 | /.next/ 8 | /out/ 9 | 10 | # Build files 11 | /.build/ 12 | /.cache/ 13 | /.vercel/ 14 | 15 | # Environment variables 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | # IDE files 22 | .vscode/ 23 | .idea/ 24 | *.iml 25 | 26 | # Logs 27 | logs/ 28 | *.log 29 | 30 | # Dependency lock files 31 | yarn.lock 32 | package-lock.json 33 | 34 | .DS_Store 35 | -------------------------------------------------------------------------------- /frontend_nextjs/components/auth.ts: -------------------------------------------------------------------------------- 1 | import z from 'zod'; 2 | 3 | export const AuthStrategySchema = z.union([ 4 | z.object({ 5 | type: z.literal('apiKey'), 6 | value: z.string({ 7 | 
description: 'API key for the API is required', 8 | }), 9 | }), 10 | z.object({ 11 | type: z.literal('accessToken'), 12 | value: z.string({ 13 | description: 'Access token for the API is required', 14 | }), 15 | }), 16 | ]); 17 | 18 | export type AuthStrategy = z.infer; 19 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/index.ts: -------------------------------------------------------------------------------- 1 | export * from './AssistantInput'; 2 | export * from './AssistantMessage'; 3 | export * from './ChatMessage'; 4 | export * from './ChatMetadata'; 5 | export * from './JsonMessage'; 6 | export * from './UserMessage'; 7 | export * from './AudioInput'; 8 | export * from './AudioOutput'; 9 | export * from './SessionSettings'; 10 | export * from './PauseAssistantMessage'; 11 | export * from './ResumeAssistantMessage'; 12 | export * from './UserInterruption'; 13 | export * from './PostAssistantMessage'; 14 | -------------------------------------------------------------------------------- /backend/ref_audio/ref.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "ref_wav_path": "ref_audio_2.wav", 4 | "prompt_text": "现在这个是新版更新了许多图片,修订了一些错误,还增加了一些内容,这是一套书啊,一共五本,是我和唐志远、蒋彻、陈进、汪田一块写的,我写的是半字幕,这一本。" 5 | }, 6 | { 7 | "ref_wav_path": "ref_audio_3.wav", 8 | "prompt_text": "一年前的这个时候,我认识了玲玲,并知道了他的故事。在过去的整整一年里,她的经历都持续的鼓舞着我,给了我很多的力量与勇气。" 9 | }, 10 | { 11 | "ref_wav_path": "ref_audio_1.wav", 12 | "prompt_text": "我以为已经是卷到头了,没想到七彩虹来了个更劲爆的。" 13 | }, 14 | { 15 | "ref_wav_path": "ref_audio_default.wav", 16 | "prompt_text": "没错,就是那个维基解谜的创始人,这哥们儿这两年过得着实有点惨。" 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /frontend_nextjs/public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/prompts/proactivity/reject.txt: -------------------------------------------------------------------------------- 1 | ## 目标 2 | 你将接收到一个对话内容和一个要素,其中包括要素名称和要素内容。你的任务是分析对话内容与要素内容之间的冲突,并给LLM提供指令来让其进行反驳。 3 | 4 | ## 步骤 5 | - 冲突识别:首先,你需要仔细对比对话内容和要素内容,找出两者之间的冲突部分。 6 | - 反驳与解释:针对每个冲突部分,你需要: 7 | 1. 反驳对话内容:明确指出对话内容中的错误或不一致之处。 8 | 2. 
提供依据和原因:基于要素内容,解释为什么对话内容是错误的,并提供具体的依据和原因。 9 | - 确保你的反驳逻辑清晰、依据充分,并且能够帮助LLM理解如何反驳对话的内容。 10 | 11 | ## 输出格式 12 | 你需要输出:正确的事实+反驳用户说的话,并以你所扮演的角色的身份纠正用户的错误 13 | 14 | ## 示例 15 | 假设对话内容为:用户:这个杯子的价格是50美元。 16 | 要素为:杯子价格:75美元 17 | 分析与反驳: 18 | 冲突识别:对话内容中提到的价格是50美元,而要素内容中明确指出价格是75美元。 19 | 反驳与解释: 20 | 反驳对话内容:用户提到的价格50美元是错误的。 21 | 提供依据和原因:根据要素内容中的价格信息,该产品的实际价格是75美元。因此,对话内容中的50美元价格是不准确的。 22 | 输出:杯子的价格应该是75美元,反驳用户说的话,并以你所扮演的角色的身份纠正用户的错误 -------------------------------------------------------------------------------- /frontend_nextjs/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "lib": ["dom", "dom.iterable", "esnext"], 4 | "allowJs": true, 5 | "skipLibCheck": true, 6 | "strict": true, 7 | "noEmit": true, 8 | "esModuleInterop": true, 9 | "module": "esnext", 10 | "moduleResolution": "bundler", 11 | "resolveJsonModule": true, 12 | "isolatedModules": true, 13 | "jsx": "preserve", 14 | "incremental": true, 15 | "plugins": [ 16 | { 17 | "name": "next" 18 | } 19 | ], 20 | "paths": { 21 | "@/*": ["./*"] 22 | } 23 | }, 24 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 25 | "exclude": ["node_modules"] 26 | } 27 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/AudioOutput.ts: -------------------------------------------------------------------------------- 1 | export interface AudioOutput { 2 | /** The type of message sent through the socket; for an Audio Output message, this may be `audio_output`. */ 3 | type: string; 4 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 5 | customSessionId?: string; 6 | /** ID of the audio output. Allows the Audio Output message to be tracked and referenced. */ 7 | id: string; 8 | /** Base64 encoded audio output. This encoded audio is transmitted to the client, where it can be decoded and played back as part of the user interaction. */ 9 | data: Int16Array; 10 | question?: string; 11 | answer?: string; 12 | end?: boolean; 13 | } 14 | -------------------------------------------------------------------------------- /frontend_nextjs/components/Card.tsx: -------------------------------------------------------------------------------- 1 | interface CardProps { 2 | title: string; 3 | description: string; 4 | icon: string; 5 | } 6 | 7 | const Card = ({ title, description, icon }: CardProps) => { 8 | return ( 9 |
14 | {title} 15 |

{title}

16 |

{description}

17 |
18 | ); 19 | } 20 | 21 | export default Card; 22 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/PauseAssistantMessage.ts: -------------------------------------------------------------------------------- 1 | export interface PauseAssistantMessage { 2 | /** 3 | * The type of message sent through the socket; must be `pause_assistant_message` for our server to correctly identify and process it as a Pause Assistant message. 4 | * 5 | * Once this message is sent, EVI will not respond until a [Resume Assistant message](/reference/empathic-voice-interface-evi/chat/chat#send.Resume%20Assistant%20Message.type) is sent. When paused, EVI won’t respond, but transcriptions of your audio inputs will still be recorded. 6 | */ 7 | type: 'pause_assistant_message'; 8 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 9 | customSessionId?: string; 10 | } 11 | -------------------------------------------------------------------------------- /frontend_nextjs/components/audio-message.ts: -------------------------------------------------------------------------------- 1 | import z from 'zod'; 2 | 3 | export const AudioMessageSchema = z 4 | .object({ 5 | type: z.literal('audio'), 6 | data: z.instanceof(ArrayBuffer), 7 | }) 8 | .transform((obj) => { 9 | return Object.assign(obj, { 10 | receivedAt: new Date(), 11 | }); 12 | }); 13 | 14 | export type AudioMessage = z.infer; 15 | 16 | export const parseAudioMessage = async ( 17 | blob: Blob, 18 | ): Promise => { 19 | return blob 20 | .arrayBuffer() 21 | .then((buffer) => { 22 | return { 23 | type: 'audio' as const, 24 | data: buffer, 25 | receivedAt: new Date(), 26 | }; 27 | }) 28 | .catch(() => { 29 | return null; 30 | }); 31 | }; 32 | -------------------------------------------------------------------------------- /frontend_nextjs/components/external-link.tsx: -------------------------------------------------------------------------------- 1 | export function ExternalLink({ 2 | href, 3 | children 4 | }: { 5 | href: string 6 | children: React.ReactNode 7 | }) { 8 | return ( 9 | 14 | {children} 15 | 27 | 28 | ) 29 | } 30 | -------------------------------------------------------------------------------- /backend/tests/test_siliconflow_tts.py: -------------------------------------------------------------------------------- 1 | from threading import Event 2 | 3 | from s2s_server_pipeline import TTSAPIHandler 4 | 5 | 6 | def test_tts(): 7 | stop_event = Event() 8 | interruption_event = Event() 9 | cur_conn_end_event = Event() 10 | model = TTSAPIHandler( 11 | stop_event, 12 | cur_conn_end_event, 13 | 0, 14 | 0, # placeholder 15 | interruption_event=interruption_event, 16 | ref_dir="ref_audio", 17 | ) 18 | text = "只用一个文件实现的流式全双工语音交互原型智能体!" 
19 | uid = "test_uid" 20 | 21 | inputs = {"text": text, "uid": uid} 22 | 23 | audio_np = model.process(inputs) 24 | 25 | print(audio_np.shape) 26 | 27 | if audio_np is not None: 28 | assert len(audio_np) > 32000 29 | 30 | 31 | if __name__ == "__main__": 32 | test_tts() 33 | -------------------------------------------------------------------------------- /frontend_nextjs/components/toaster.tsx: -------------------------------------------------------------------------------- 1 | 'use client' 2 | 3 | import { toast } from 'react-hot-toast' 4 | 5 | 6 | export function errorToaster(message: string, duration: number = 2000) { 7 | const el = document.documentElement; 8 | const dark = el.classList.contains("dark"); 9 | const lightStyle = { 10 | border: '1px solid #713200', 11 | color: '#713200', 12 | padding: '16px', 13 | } 14 | const darkStyle = { 15 | borderRadius: '10px', 16 | background: '#333', 17 | color: '#fff', 18 | } 19 | toast.error( 20 | message, 21 | { 22 | duration: duration, 23 | style: dark ? darkStyle : lightStyle, 24 | position: "bottom-center", 25 | iconTheme: { 26 | primary: '#713200', 27 | secondary: '#FFFAEE', 28 | }, 29 | } 30 | ) 31 | } 32 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/UserInterruption.ts: -------------------------------------------------------------------------------- 1 | export interface UserInterruption { 2 | /** 3 | * The type of message sent through the socket; for a User Interruption message, this must be `user_interruption`. 4 | * 5 | * This message indicates the user has interrupted the assistant’s response. EVI detects the interruption in real-time and sends this message to signal the interruption event. This message allows the system to stop the current audio playback, clear the audio queue, and prepare to handle new user input. 6 | */ 7 | type: "user_interruption"; 8 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 9 | customSessionId?: string; 10 | /** Unix timestamp of the detected user interruption. */ 11 | time: number; 12 | } 13 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/ResumeAssistantMessage.ts: -------------------------------------------------------------------------------- 1 | export interface ResumeAssistantMessage { 2 | /** 3 | * The type of message sent through the socket; must be `resume_assistant_message` for our server to correctly identify and process it as a Resume Assistant message. 4 | * 5 | * Upon resuming, if any audio input was sent during the pause, EVI will retain context from all messages sent but only respond to the last user message. (e.g., If you ask EVI two questions while paused and then send a `resume_assistant_message`, EVI will respond to the second question and have added the first question to its conversation context.) 6 | */ 7 | type: "resume_assistant_message"; 8 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. 
*/ 9 | customSessionId?: string; 10 | } 11 | -------------------------------------------------------------------------------- /.github/workflows/api_handler.yml: -------------------------------------------------------------------------------- 1 | name: ASR_TTS_LLM_HANDLER 2 | 3 | on: [push] 4 | 5 | jobs: 6 | pytest: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 3.10 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: '3.10' 17 | - name: Install dependencies 18 | run: | 19 | pip install -r backend/requirements.txt 20 | - name: Test asr, tts, llm 21 | env: 22 | ASR_TTS_API_KEY: ${{secrets.ASR_TTS_API_KEY}} 23 | LLM_API_KEY: ${{secrets.LLM_API_KEY}} 24 | run: | 25 | pip install pytest 26 | export PYTHONPATH=$PYTHONPATH:$(pwd) 27 | pytest backend/tests/test_siliconflow_asr.py -s 28 | pytest backend/tests/test_siliconflow_tts.py -s 29 | pytest backend/tests/test_llm_api.py -s 30 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/PostAssistantMessage.ts: -------------------------------------------------------------------------------- 1 | import { ChatMessage } from './ChatMessage'; 2 | 3 | export interface PostAssistantMessage { 4 | /** 5 | * The type of message sent through the socket; for an Assistant Message, this must be `post_assistant_message`. 6 | */ 7 | type: 'post_assistant_message'; 8 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 9 | customSessionId?: string; 10 | /** ID of the assistant message. Allows the Assistant Message to be tracked and referenced. */ 11 | id?: string; 12 | /** Transcript of the message. */ 13 | message: ChatMessage; 14 | /** Indicates if this message was inserted into the conversation as text from an [Assistant Input message](/reference/empathic-voice-interface-evi/chat/chat#send.Assistant%20Input.text). */ 15 | fromText: boolean; 16 | receivedAt?: Date; 17 | end?: boolean; 18 | } 19 | -------------------------------------------------------------------------------- /backend/utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | 4 | 5 | # Function to adjust volume 6 | def adjust_volume(audio, volume_factor: float): 7 | """ 8 | Adjust the volume of an audio file by a given factor. 9 | """ 10 | adjusted_audio = audio * volume_factor 11 | adjusted_audio = np.clip(adjusted_audio, -1.0, 1.0) 12 | return adjusted_audio 13 | 14 | 15 | # Function to change speed 16 | def change_speed(audio, speed: float): 17 | """ 18 | Change the speed of an audio file by a given percentage. 19 | """ 20 | changed_audio = librosa.effects.time_stretch(audio, rate=speed) 21 | return changed_audio 22 | 23 | 24 | # Function to shift pitch 25 | def shift_pitch(audio, sampling_rate, n_steps: int): 26 | """ 27 | Shift the pitch of an audio file by a given number of steps. 
28 | """ 29 | audio_shifted = librosa.effects.pitch_shift(audio, sr=sampling_rate, n_steps=n_steps) 30 | return audio_shifted 31 | -------------------------------------------------------------------------------- /frontend_nextjs/components/StartCall.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | 4 | import { AnimatePresence, motion } from "framer-motion"; 5 | import { Phone } from "lucide-react"; 6 | import { useVoice } from "./VoiceProvider"; 7 | import { Button } from "./ui/button"; 8 | import CardPanel from "./CardPanel"; 9 | 10 | export default function StartCall() { 11 | const { status } = useVoice(); 12 | 13 | return ( 14 | 15 | {status.value !== "connected" ? ( 16 | 27 | 28 | 29 | ) : null} 30 | 31 | ); 32 | } 33 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/AssistantInput.ts: -------------------------------------------------------------------------------- 1 | export interface AssistantInput { 2 | /** The type of message sent through the socket; must be `assistant_input` for our server to correctly identify and process it as an Assistant Input message. */ 3 | type: 'assistant_input'; 4 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 5 | customSessionId?: string; 6 | /** 7 | * Assistant text to synthesize into spoken audio and insert into the conversation. 8 | * 9 | * EVI uses this text to generate spoken audio using our proprietary expressive text-to-speech model. Our model adds appropriate emotional inflections and tones to the text based on the user’s expressions and the context of the conversation. The synthesized audio is streamed back to the user as an [Assistant Message](/reference/empathic-voice-interface-evi/chat/chat#receive.Assistant%20Message.type). 10 | */ 11 | text: string; 12 | } 13 | -------------------------------------------------------------------------------- /backend/tests/test_siliconflow_asr.py: -------------------------------------------------------------------------------- 1 | from threading import Event 2 | import os 3 | import torchaudio 4 | 5 | from s2s_server_pipeline import ASRAPIHandler 6 | 7 | PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 8 | 9 | 10 | def test_asr(): 11 | stop_event = Event() 12 | interruption_event = Event() 13 | cur_conn_end_event = Event() 14 | model = ASRAPIHandler( 15 | stop_event, 16 | cur_conn_end_event, 17 | 0, 18 | 0, # placeholder 19 | interruption_event=interruption_event, 20 | ) 21 | 22 | file_path = os.path.join(PROJECT_ROOT, "backend/ref_audio/ref_wav/ref_audio_2.wav") 23 | data_wav, sample_rate = torchaudio.load(file_path) 24 | data_wav = data_wav.numpy() 25 | response = model.process({"data": data_wav, "sample_rate": sample_rate, "uid": "test_uid"}) 26 | assert isinstance(response, str), "response type is wrong" 27 | print(response) 28 | 29 | 30 | if __name__ == "__main__": 31 | test_asr() 32 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/AudioInput.ts: -------------------------------------------------------------------------------- 1 | export interface AudioInput { 2 | /** 3 | * The type of message sent through the socket; must be `audio_input` for our server to correctly identify and process it as an Audio Input message. 
4 | * 5 | * This message is used for sending audio input data to EVI for processing and expression measurement. Audio data should be sent as a continuous stream, encoded in Base64. 6 | */ 7 | type: 'audio_input'; 8 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 9 | customSessionId?: string; 10 | /** 11 | * Base64 encoded audio input to insert into the conversation. 12 | * 13 | * The content of an Audio Input message is treated as the user’s speech to EVI and must be streamed continuously. Pre-recorded audio files are not supported. 14 | * 15 | * For optimal transcription quality, the audio data should be transmitted in small chunks. 16 | */ 17 | data: string; 18 | } 19 | -------------------------------------------------------------------------------- /backend/tests/test_lm.py: -------------------------------------------------------------------------------- 1 | from threading import Event 2 | import os 3 | import sys 4 | 5 | sys.path.append('..') 6 | from s2s_server_pipeline import LanguageModelHandler 7 | 8 | 9 | def main(): 10 | stop_event = Event() 11 | interruption_event = Event() 12 | cur_conn_end_event = Event() 13 | model_name = os.getenv("LLM_MODEL_NAME") 14 | assert model_name is not None, "Please indicate local LLM model name, like `/home/root/Qwen=7B`" 15 | 16 | lm = LanguageModelHandler( 17 | stop_event, 18 | cur_conn_end_event, 19 | 0, 20 | 0, # placeholder 21 | interruption_event=interruption_event, 22 | model_name=model_name, 23 | ) 24 | inputs = { 25 | 'data': '如何评价马斯克', 26 | 'user_input_count': 1, 27 | 'uid': 'test_uid', 28 | 'audio_input': False, 29 | } 30 | generator = lm.process(inputs) 31 | outputs = "".join([t["answer_text"] for t in generator]) 32 | print(f'end: {outputs}') 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /backend/tests/test_llm_api.py: -------------------------------------------------------------------------------- 1 | from threading import Event 2 | 3 | from s2s_server_pipeline import LanguageModelAPIHandler 4 | 5 | 6 | def test_llm_api(): 7 | stop_event = Event() 8 | interruption_event = Event() 9 | cur_conn_end_event = Event() 10 | model_name = "deepseek-chat" 11 | model_url = "https://api.deepseek.com" 12 | 13 | lm = LanguageModelAPIHandler( 14 | stop_event, 15 | cur_conn_end_event, 16 | 0, 17 | 0, # placeholder 18 | interruption_event=interruption_event, 19 | model_name=model_name, 20 | model_url=model_url, 21 | generate_questions=False 22 | ) 23 | inputs = { 24 | 'data': '如何评价马斯克', 25 | 'user_input_count': 1, 26 | 'uid': 'test_uid', 27 | 'audio_input': False, 28 | } 29 | generator = lm.process(inputs) 30 | outputs = "".join([t["answer_text"] for t in generator]) 31 | assert isinstance(outputs, str), "output type is wrong" 32 | print(outputs) 33 | 34 | 35 | if __name__ == "__main__": 36 | test_llm_api() 37 | -------------------------------------------------------------------------------- /frontend_nextjs/components/types/AssistantMessage.ts: -------------------------------------------------------------------------------- 1 | import { ChatMessage } from './ChatMessage'; 2 | 3 | export interface AssistantMessage { 4 | /** 5 | * The type of message sent through the socket; for an Assistant Message, this must be `assistant_message`. 
6 | * 7 | * This message contains both a transcript of the assistant’s response and the expression measurement predictions of the assistant’s audio output. 8 | */ 9 | type: 'assistant_message' | 'assistant_notend_message'; 10 | /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */ 11 | customSessionId?: string; 12 | /** ID of the assistant message. Allows the Assistant Message to be tracked and referenced. */ 13 | id?: string; 14 | /** Transcript of the message. */ 15 | message: ChatMessage; 16 | /** Indicates if this message was inserted into the conversation as text from an [Assistant Input message](/reference/empathic-voice-interface-evi/chat/chat#send.Assistant%20Input.text). */ 17 | fromText: boolean; 18 | receivedAt?: Date; 19 | end?: boolean; 20 | } 21 | -------------------------------------------------------------------------------- /frontend_nextjs/app/layout.tsx: -------------------------------------------------------------------------------- 1 | import type { Metadata } from "next"; 2 | import { GeistSans } from "geist/font/sans"; 3 | import { GeistMono } from "geist/font/mono"; 4 | import "./globals.css"; 5 | import { Nav } from "@/components/Nav"; 6 | import { cn } from "@/utils"; 7 | 8 | export const metadata: Metadata = { 9 | metadataBase: new URL(process.env.NEXTAUTH_URL || 'http://localhost:3001'), 10 | title: "CleanS2S - Speech to Speech Agent", 11 | description: "CleanS2S next.js Interface", 12 | icons: { 13 | shortcut: '/favicon-16x16.png', 14 | apple: '/apple-touch-icon.png' 15 | } 16 | }; 17 | 18 | export default function RootLayout({ 19 | children, 20 | }: Readonly<{ 21 | children: React.ReactNode; 22 | }>) { 23 | return ( 24 | 25 | 26 | 33 |