├── web-app
│   ├── public
│   │   ├── robots.txt
│   │   ├── favicon.ico
│   │   ├── female.wav
│   │   ├── logo192.png
│   │   ├── logo512.png
│   │   ├── manifest.json
│   │   └── index.html
│   ├── src
│   │   ├── setupTests.js
│   │   ├── App.test.js
│   │   ├── index.css
│   │   ├── reportWebVitals.js
│   │   ├── index.js
│   │   ├── App.css
│   │   ├── logo.svg
│   │   └── App.js
│   ├── .gitignore
│   ├── package.json
│   └── README.md
├── nginx.conf
├── docker-compose.yml
└── readme.md

/web-app/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 | 
--------------------------------------------------------------------------------
/web-app/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/favicon.ico
--------------------------------------------------------------------------------
/web-app/public/female.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/female.wav
--------------------------------------------------------------------------------
/web-app/public/logo192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/logo192.png
--------------------------------------------------------------------------------
/web-app/public/logo512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/logo512.png
--------------------------------------------------------------------------------
/web-app/src/setupTests.js:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 | 
--------------------------------------------------------------------------------
/web-app/src/App.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import App from './App';
3 | 
4 | test('renders learn react link', () => {
5 |   render(<App />);
6 |   const linkElement = screen.getByText(/learn react/i);
7 |   expect(linkElement).toBeInTheDocument();
8 | });
9 | 
--------------------------------------------------------------------------------
/web-app/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 |   margin: 0;
3 |   font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 |     'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 |     sans-serif;
6 |   -webkit-font-smoothing: antialiased;
7 |   -moz-osx-font-smoothing: grayscale;
8 | }
9 | 
10 | code {
11 |   font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 |     monospace;
13 | }
14 | 
--------------------------------------------------------------------------------
/web-app/src/reportWebVitals.js:
--------------------------------------------------------------------------------
1 | const reportWebVitals = onPerfEntry => {
2 |   if (onPerfEntry && onPerfEntry instanceof Function) {
3 |     import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4 |       getCLS(onPerfEntry);
5 |       getFID(onPerfEntry);
6 |       getFCP(onPerfEntry);
7 |       getLCP(onPerfEntry);
8 |       getTTFB(onPerfEntry);
9 |     });
10 |   }
11 | };
12 | 
13 | export default reportWebVitals;
14 | 
--------------------------------------------------------------------------------
/web-app/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 | 
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 | package-lock.json
8 | 
9 | # testing
10 | /coverage
11 | 
12 | # production
13 | /build
14 | # misc
15 | .DS_Store
16 | .env.local
17 | .env.development.local
18 | .env.test.local
19 | .env.production.local
20 | 
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 | 
--------------------------------------------------------------------------------
/web-app/src/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom/client';
3 | import './index.css';
4 | import App from './App';
5 | import reportWebVitals from './reportWebVitals';
6 | 
7 | const root = ReactDOM.createRoot(document.getElementById('root'));
8 | root.render(
9 |   <React.StrictMode>
10 |     <App />
11 |   </React.StrictMode>
12 | );
13 | 
14 | // If you want to start measuring performance in your app, pass a function
15 | // to log results (for example: reportWebVitals(console.log))
16 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
17 | reportWebVitals();
18 | 
--------------------------------------------------------------------------------
/web-app/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 |   "short_name": "React App",
3 |   "name": "Create React App Sample",
4 |   "icons": [
5 |     {
6 |       "src": "favicon.ico",
7 |       "sizes": "64x64 32x32 24x24 16x16",
8 |       "type": "image/x-icon"
9 |     },
10 |     {
11 |       "src": "logo192.png",
12 |       "type": "image/png",
13 |       "sizes": "192x192"
14 |     },
15 |     {
16 |       "src": "logo512.png",
17 |       "type": "image/png",
18 |       "sizes": "512x512"
19 |     }
20 |   ],
21 |   "start_url": ".",
22 |   "display": "standalone",
23 |   "theme_color": "#000000",
24 |   "background_color": "#ffffff"
25 | }
26 | 
--------------------------------------------------------------------------------
/nginx.conf:
--------------------------------------------------------------------------------
1 | events {}
2 | 
3 | http {
4 |     include       mime.types;
5 |     default_type  application/octet-stream;
6 | 
7 |     server {
8 |         listen 80;
9 | 
10 |         location / {
11 |             root /usr/share/nginx/html;
12 |             try_files $uri /index.html;
13 |         }
14 | 
15 |         location /clone_speaker {
16 |             proxy_pass http://tts/clone_speaker;
17 |             client_max_body_size 20M;
18 |         }
19 | 
20 |         location /tts_stream {
21 |             proxy_pass http://tts/tts_stream;
22 |             proxy_buffering off;
23 |         }
24 | 
25 |         location /generate_stream {
26 |             proxy_pass http://llm:8080/generate_stream;
27 |             proxy_buffering off;
28 |         }
29 | 
30 |         location /asr {
31 |             proxy_pass http://asr:9000/asr;
32 |             client_max_body_size 20M;
33 |         }
34 |     }
35 | }
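36 | 
37 | # Note (added for clarity): `try_files $uri /index.html` serves the built React
38 | # single-page app for any path that is not a real file, while the location
39 | # blocks above proxy API calls to the tts, llm and asr services on the compose
40 | # network; `proxy_buffering off` lets token and audio streams arrive incrementally.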
--------------------------------------------------------------------------------
/web-app/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "react-ui",
3 |   "version": "0.1.0",
4 |   "private": true,
5 |   "proxy": "http://localhost:5000",
6 |   "dependencies": {
7 |     "@testing-library/jest-dom": "^5.17.0",
8 |     "@testing-library/react": "^13.4.0",
9 |     "@testing-library/user-event": "^13.5.0",
10 |     "react": "^18.2.0",
11 |     "react-dom": "^18.2.0",
12 |     "react-scripts": "5.0.1",
13 |     "web-vitals": "^2.1.4"
14 |   },
15 |   "scripts": {
16 |     "start": "react-scripts start",
17 |     "build": "react-scripts build",
18 |     "test": "react-scripts test",
19 |     "eject": "react-scripts eject"
20 |   },
21 |   "eslintConfig": {
22 |     "extends": [
23 |       "react-app",
24 |       "react-app/jest"
25 |     ]
26 |   },
27 |   "browserslist": {
28 |     "production": [
29 |       ">0.2%",
30 |       "not dead",
31 |       "not op_mini all"
32 |     ],
33 |     "development": [
34 |       "last 1 chrome version",
35 |       "last 1 firefox version",
36 |       "last 1 safari version"
37 |     ]
38 |   }
39 | }
40 | 
--------------------------------------------------------------------------------
/web-app/src/App.css:
--------------------------------------------------------------------------------
1 | .App {
2 |   text-align: center;
3 |   background-color: #03363e;
4 | }
5 | 
6 | .App-logo {
7 |   height: 40vmin;
8 |   pointer-events: none;
9 | }
10 | 
11 | @media (prefers-reduced-motion: no-preference) {
12 |   .App-logo {
13 |     animation: App-logo-spin infinite 20s linear;
14 |   }
15 | }
16 | 
17 | .App-header {
18 |   background-color: #282c34;
19 |   min-height: 100vh;
20 |   display: flex;
21 |   flex-direction: column;
22 |   align-items: center;
23 |   justify-content: center;
24 |   font-size: calc(10px + 2vmin);
25 |   color: white;
26 | }
27 | 
28 | .App-link {
29 |   color: #61dafb;
30 | }
31 | 
32 | @keyframes App-logo-spin {
33 |   from {
34 |     transform: rotate(0deg);
35 |   }
36 |   to {
37 |     transform: rotate(360deg);
38 |   }
39 | }
40 | 
41 | .chat-window {
42 |   border: 1px solid #ccc;
43 |   padding: 10px;
44 |   height: 300px;
45 |   overflow-y: scroll;
46 | }
47 | 
48 | .message {
49 |   margin: 5px 0;
50 | }
51 | 
52 | .message.user {
53 |   text-align: left;
54 | }
55 | 
56 | .message.bot {
57 |   text-align: right;
58 | }
59 | 
60 | .circle {
61 |   border-radius: 50%;
62 |   transition: width 0.1s ease, height 0.1s ease;
63 | }
64 | 
65 | .settings-tab {
66 |   position: absolute;
67 |   left: 0;
68 |   top: 0;
69 |   padding: 20px;
70 |   /* Other styling... */
71 | }
72 | 
73 | .waveform-container {
74 |   display: flex;
75 |   justify-content: center;
76 |   align-items: center;
77 |   height: 100vh;
78 |   /* Other styling... */
79 | }
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 |   nginx:
4 |     image: nginx:alpine
5 |     ports:
6 |       - "5000:80"
7 |     volumes:
8 |       - ./web-app/build:/usr/share/nginx/html
9 |       - ./nginx.conf:/etc/nginx/nginx.conf:ro
10 |     depends_on:
11 |       - llm
12 |       - tts
13 |       - asr
14 |   llm:
15 |     image: ghcr.io/huggingface/text-generation-inference:1.1.0
16 |     ports:
17 |       - "8080:8080"
18 |     environment:
19 |       - MODEL_ID=TheBloke/openchat_3.5-AWQ
20 |       - PORT=8080
21 |       - QUANTIZE=awq
22 |       - MAX_INPUT_LEN=3696
23 |       - MAX_TOTAL_TOKENS=4096
24 |       - MAX_BATCH_PREFILL_TOKENS=4096
25 |       - CUDA_MEMORY_FRACTION=0.6
26 |     deploy:
27 |       resources:
28 |         reservations:
29 |           devices:
30 |             - driver: nvidia
31 |               count: 1
32 |               capabilities: [gpu]
33 |   tts:
34 |     image: ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-818a108b41be2dd43dada04bd319fdfcdabc5c6a
35 |     ports:
36 |       - "8000:80"
37 |     # Uncomment the following lines to use your own models
38 |     # volumes:
39 |     #   - /media/julian/Workdisk/models/ai_voice_chat:/app/tts_models
40 |     environment:
41 |       - COQUI_TOS_AGREED=1
42 |     deploy:
43 |       resources:
44 |         reservations:
45 |           devices:
46 |             - driver: nvidia
47 |               count: 1
48 |               capabilities: [gpu]
49 |   asr:
50 |     image: onerahmet/openai-whisper-asr-webservice:v1.2.4-gpu
51 |     ports:
52 |       - "9000:9000"
53 |     environment:
54 |       - ASR_ENGINE=faster_whisper
55 |       - ASR_MODEL=large-v3
56 |     deploy:
57 |       resources:
58 |         reservations:
59 |           devices:
60 |             - driver: nvidia
61 |               count: 1
62 |               capabilities: [gpu]
63 | 
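64 | # Note (added for clarity): every service requests one NVIDIA GPU via its
65 | # `deploy` block; on a single-GPU machine (the readme targets one RTX 3090)
66 | # the three containers share that GPU, and CUDA_MEMORY_FRACTION=0.6 caps the
67 | # LLM container's share of VRAM.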
--------------------------------------------------------------------------------
/web-app/public/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 |   <head>
4 |     <meta charset="utf-8" />
5 |     <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
6 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
7 |     <meta name="theme-color" content="#000000" />
8 |     <meta
9 |       name="description"
10 |       content="Web site created using create-react-app"
11 |     />
12 |     <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
13 |     <!--
14 |       manifest.json provides metadata used when your web app is installed on a
15 |       user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
16 |     -->
17 |     <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
18 |     <!--
19 |       Notice the use of %PUBLIC_URL% in the tags above.
20 |       It will be replaced with the URL of the `public` folder during the build.
21 |       Only files inside the `public` folder can be referenced from the HTML.
22 | 
23 |       Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
24 |       work correctly both with client-side routing and a non-root public URL.
25 |       Learn how to configure a non-root public URL by running `npm run build`.
26 |     -->
27 |     <title>React App</title>
28 |   </head>
29 |   <body>
30 |     <noscript>You need to enable JavaScript to run this app.</noscript>
31 |     <div id="root"></div>
32 |     <!--
33 |       This HTML file is a template.
34 |       If you open it directly in the browser, you will see an empty page.
35 | 
36 |       You can add webfonts, meta tags, or analytics to this file.
37 |       The build step will place the bundled scripts into the <body> tag.
38 | 
39 |       To begin the development, run `npm start` or `yarn start`.
40 |       To create a production bundle, use `npm run build` or `yarn build`.
41 |     -->
42 |   </body>
43 | </html>
44 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # AI Voice Chat
2 | 
3 | ## Overview
4 | 
5 | This is a simple React app that lets you chat with an AI assistant using your voice.
6 | 
7 | It uses `Whisper large v3` for transcription, `openchat 3.5 AWQ` for the AI assistant, and `XTTS v2` for text-to-speech.
8 | 
9 | Its main feature is low speech-to-speech latency: it more than halves the latency shown in the ChatGPT-with-voice demo video.
10 | This repo runs on a single RTX 3090 GPU.
11 | 
12 | No concurrency is supported, and the project is neither optimized nor production-ready. It's also probably riddled with bugs, so if you run into some, please open an issue or send a PR.
13 | 
14 | The XTTS v2 model comes from [coqui-TTS](https://github.com/coqui-ai/TTS).
15 | If you have any questions about the model or the project, you can join our [discord server](https://discord.gg/vHgDbMzgfv).
16 | 
17 | ## Demo
18 | 
19 | https://github.com/WeberJulian/AI-voice-chat/assets/17219561/2be20ec1-fa5e-4c26-83ec-c074357f3905
20 | 
21 | ## Installation
22 | 
23 | ### Prerequisites
24 | 1. An NVIDIA GPU with more than 16GB of VRAM and up-to-date drivers
25 | 2. `docker-compose` installed
26 | 
27 | ### Steps
28 | 1. Clone the repo
29 | 
30 | ```bash
31 | git clone https://github.com/WeberJulian/AI-voice-chat.git
32 | ```
33 | 
34 | 2. Build the React app
35 | 
36 | ```bash
37 | cd AI-voice-chat
38 | cd web-app
39 | npm install && npm run build
40 | ```
41 | 
42 | 3. Start everything 🚀
43 | 
44 | ```bash
45 | cd ..
46 | docker-compose up
47 | ```
48 | 
49 | ## Usage
50 | 
51 | 1. Open the app in your browser at `http://localhost:5000`
52 | 
53 | 2. Allow microphone access
54 | 
55 | 3. Push to talk, either with the `Shift` ⇧ key or by holding the circle
56 | 
57 | 4. Enjoy!
58 | 
59 | To reset the conversation, refresh the page.
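60 | 
61 | ### Testing the services (optional)
62 | 
63 | If something misbehaves, you can call the proxied endpoints directly. As a quick sketch (assuming the stack is running and `sample.wav` stands for any short recording of yours), this sends audio straight to the ASR service with the same query parameters the web app uses:
64 | 
65 | ```bash
66 | curl -F "audio_file=@sample.wav" \
67 |   "http://localhost:5000/asr?encode=true&task=transcribe&vad_filter=true&word_timestamps=false&output=json"
68 | ```
69 | 
70 | It should answer with a JSON object containing the transcribed `text` and the detected `language`.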
71 | 
72 | ## Custom models
73 | 
74 | If you fine-tune XTTS and want to use your own model, add the following lines to the `docker-compose.yml` file, under the `tts` service:
75 | 
76 | ```yml
77 | services:
78 |   ...
79 |   tts:
80 |     ...
81 |     volumes:
82 |       - /path/to/your/model:/app/tts_models
83 | ```
84 | 
85 | In the `/path/to/your/model` folder, you must have the following files:
86 | - `config.json`
87 | - `model.pth`
88 | - `vocab.json`
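89 | 
90 | After restarting the stack, you can check that the TTS service accepts your reference audio through the proxy (again a sketch; `ref.wav` stands for a short, clean recording of the target voice):
91 | 
92 | ```bash
93 | curl -F "wav_file=@ref.wav" http://localhost:5000/clone_speaker
94 | ```
95 | 
96 | It should return a JSON object with the `gpt_cond_latent` and `speaker_embedding` values that the web app later passes to `/tts_stream`.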
--------------------------------------------------------------------------------
/web-app/src/logo.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/web-app/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started with Create React App
2 | 
3 | This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app).
4 | 
5 | ## Available Scripts
6 | 
7 | In the project directory, you can run:
8 | 
9 | ### `npm start`
10 | 
11 | Runs the app in the development mode.\
12 | Open [http://localhost:3000](http://localhost:3000) to view it in your browser.
13 | 
14 | The page will reload when you make changes.\
15 | You may also see any lint errors in the console.
16 | 
17 | ### `npm test`
18 | 
19 | Launches the test runner in the interactive watch mode.\
20 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information.
21 | 
22 | ### `npm run build`
23 | 
24 | Builds the app for production to the `build` folder.\
25 | It correctly bundles React in production mode and optimizes the build for the best performance.
26 | 
27 | The build is minified and the filenames include the hashes.\
28 | Your app is ready to be deployed!
29 | 
30 | See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information.
31 | 
32 | ### `npm run eject`
33 | 
34 | **Note: this is a one-way operation. Once you `eject`, you can't go back!**
35 | 
36 | If you aren't satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project.
37 | 
38 | Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you're on your own.
39 | 
40 | You don't have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn't feel obligated to use this feature. However we understand that this tool wouldn't be useful if you couldn't customize it when you are ready for it.
41 | 
42 | ## Learn More
43 | 
44 | You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
45 | 
46 | To learn React, check out the [React documentation](https://reactjs.org/).
47 | 
48 | ### Code Splitting
49 | 
50 | This section has moved here: [https://facebook.github.io/create-react-app/docs/code-splitting](https://facebook.github.io/create-react-app/docs/code-splitting)
51 | 
52 | ### Analyzing the Bundle Size
53 | 
54 | This section has moved here: [https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size](https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size)
55 | 
56 | ### Making a Progressive Web App
57 | 
58 | This section has moved here: [https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app](https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app)
59 | 
60 | ### Advanced Configuration
61 | 
62 | This section has moved here: [https://facebook.github.io/create-react-app/docs/advanced-configuration](https://facebook.github.io/create-react-app/docs/advanced-configuration)
63 | 
64 | ### Deployment
65 | 
66 | This section has moved here: [https://facebook.github.io/create-react-app/docs/deployment](https://facebook.github.io/create-react-app/docs/deployment)
67 | 
68 | ### `npm run build` fails to minify
69 | 
70 | This section has moved here: [https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify](https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify)
71 | 
--------------------------------------------------------------------------------
/web-app/src/App.js:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect, useRef } from 'react';
2 | import './App.css';
3 | 
4 | function App() {
5 |   const [file, setFile] = useState(null);
6 |   const [waveformColor, setWaveformColor] = useState('#819a9d');
7 |   const isRecordingRef = useRef(false);
8 |   const mediaRecorderRef = useRef(null);
9 |   const speakerRef = useRef(null);
10 |   const animationFrameRef = useRef(null);
11 |   const conversationRef = useRef([
12 |     {sender: 'user', message: "You are a large language model known as OpenChat, the open-source counterpart to ChatGPT, equally powerful as its closed-source sibling. You communicate using an advanced deep-learning-based speech synthesis system made by coqui, so feel free to include interjections (such as 'hmm', 'oh', 'right', 'wow'...), but avoid using emojis, symbols, code snippets, or anything else that does not translate well to spoken language. For example, instead of % say percent, instead of = say equal, and instead of * say times, etc. Also, please avoid numbered lists such as 1. 2.; use regular sentences instead."},
13 |     {sender: 'bot', message: "No problem. Anything else?"},
14 |     {sender: 'user', message: "Yeah, please always respond in a sentence or two from now on."},
15 |     {sender: 'bot', message: "Sure, I'll be concise."},
16 |     // {sender: 'bot', message: "I am an advanced emulation of your favourite machine learning youtuber. I'm based on a deep learning system made by coqui. I'm made to explain machine learning to you, I know every paper there is. I say 'hold on to your papers' and 'mindblowing' a lot."},
17 |     // {sender: 'user', message: "Ok, please always respond in a sentence or two from now on."},
18 |     // {sender: 'bot', message: "No problem, I'll be concise."},
19 |   ]);
20 |   let audioChunks = [];
21 |   let isTTSPending = false;
22 | 
23 |   const defaultCircleDiameter = 200;
24 |   const [circleDiameter, setCircleDiameter] = useState(defaultCircleDiameter);
25 | 
26 |   const handleMouseDown = () => {
27 |     if (!isRecordingRef.current) {
28 |       isRecordingRef.current = true;
29 |       startRecording();
30 |     }
31 |   };
32 | 
33 |   const handleMouseUp = () => {
34 |     if (isRecordingRef.current) {
35 |       isRecordingRef.current = false;
36 |       stopRecording();
37 |     }
38 |   };
39 | 
40 |   // Use these for touch devices
41 |   const handleTouchStart = (e) => {
42 |     e.preventDefault(); // Prevents additional mouse events
43 |     handleMouseDown();
44 |   };
45 | 
46 |   const handleTouchEnd = (e) => {
47 |     e.preventDefault(); // Prevents additional mouse events
48 |     handleMouseUp();
49 |   };
50 | 
51 |   const conv2prompt = (conv) => { // Serializes the chat into OpenChat's "GPT4 Correct User/Assistant" format
52 |     let prompt = "";
53 |     for (let i = 0; i < conv.length; i++) {
54 |       if (conv[i].sender === "user") {
55 |         prompt += "GPT4 Correct User: " + conv[i].message + "<|end_of_turn|>GPT4 Correct Assistant:";
56 |       } else {
57 |         prompt += conv[i].message + "<|end_of_turn|>";
58 |       }
59 |     }
60 |     return prompt;
61 |   }
62 | 
63 |   useEffect(() => {
64 |     const fetchDefaultSpeakerEmbedding = async () => {
65 |       // Prompt for microphone permission early, then release the device immediately.
66 |       navigator.mediaDevices.getUserMedia({ audio: true, video: false })
67 |         .then(stream => stream.getTracks().forEach(track => track.stop()), err => console.log(err));
68 |       try {
69 |         const response = await fetch('/female.wav');
70 |         const blob = await response.blob();
71 |         const formData = new FormData();
72 |         formData.append('wav_file', blob, 'ref.wav');
73 | 
74 |         const speakerResponse = await fetch('/clone_speaker', {
75 |           method: 'POST',
76 |           body: formData,
77 |         });
78 |         const speakerData = await speakerResponse.json();
79 |         speakerRef.current = speakerData;
80 |       } catch (error) {
81 |         console.error('Error fetching default speaker embedding:', error);
82 |       }
83 |     };
84 | 
85 |     fetchDefaultSpeakerEmbedding();
86 |   }, []);
87 | 
88 |   useEffect(() => {
89 |     // Setup event listeners for push-to-talk
90 |     const handleKeyDown = (event) => {
91 |       if (event.key === 'Shift' && !isRecordingRef.current) {
92 |         isRecordingRef.current = true;
93 |         startRecording();
94 |       }
95 |     };
96 | 
97 |     const handleKeyUp = (event) => {
98 |       if (event.key === 'Shift' && isRecordingRef.current) {
99 |         isRecordingRef.current = false;
100 |         stopRecording();
101 |       }
102 |     };
103 | 
104 |     window.addEventListener('keydown', handleKeyDown);
105 |     window.addEventListener('keyup', handleKeyUp);
106 | 
107 |     return () => {
108 |       window.removeEventListener('keydown', handleKeyDown);
109 |       window.removeEventListener('keyup', handleKeyUp);
110 |     };
111 |   }, []);
112 | 
113 |   const startRecording = () => {
114 |     setWaveformColor('#ed901b');
115 |     navigator.mediaDevices.getUserMedia({ audio: true })
116 |       .then(stream => {
117 |         const audioContext = new (window.AudioContext || window.webkitAudioContext)();
118 |         const mediaStreamSource = audioContext.createMediaStreamSource(stream);
119 |         const analyser = audioContext.createAnalyser();
120 |         mediaStreamSource.connect(analyser);
121 | 
122 |         let amplitudeSum = 0; // Accumulator for amplitude values
123 |         let sampleCount = 0;  // Counter for number of samples processed
124 | 
125 |         // Setup to periodically analyze the audio stream
126 |         const processAudio = () => {
127 |           const dataArray = new Uint8Array(analyser.frequencyBinCount);
128 |           analyser.getByteTimeDomainData(dataArray);
129 | 
130 |           // Calculate amplitude values and accumulate
131 |           dataArray.forEach(value => {
132 |             amplitudeSum += Math.abs(value - 128); // Subtracting 128 because the range is 0-255
133 |             sampleCount++;
134 |           });
135 | 
136 |           // Every 100 samples, scale the circle from the average amplitude, then reset
137 |           if (sampleCount >= 100) {
138 |             if (isRecordingRef.current) {
139 |               const averageAmplitude = amplitudeSum / sampleCount;
140 |               setCircleDiameter(defaultCircleDiameter + averageAmplitude * defaultCircleDiameter * 0.15);
141 |               amplitudeSum = 0;
142 |               sampleCount = 0;
143 |             }
144 |           }
145 | 
146 |           animationFrameRef.current = requestAnimationFrame(processAudio);
147 |         };
148 |         // Start the amplitude-monitoring loop (processAudio reschedules itself).
149 |         animationFrameRef.current = requestAnimationFrame(processAudio);
150 | 
151 |         mediaRecorderRef.current = new MediaRecorder(stream);
152 |         mediaRecorderRef.current.start();
153 |         console.log('Starting to record:', mediaRecorderRef.current);
154 | 
155 |         mediaRecorderRef.current.ondataavailable = (event) => {
156 |           audioChunks.push(event.data);
157 |           console.log('Audio chunk recorded:', event.data);
158 |         };
159 | 
160 |         mediaRecorderRef.current.onstop = () => {
161 |           const audioBlob = new Blob(audioChunks, { type: 'audio/wav' }); // MediaRecorder usually emits WebM/Opus; the ASR request uses encode=true, so the server converts it
162 |           sendAudioToASR(audioBlob);
163 |           audioChunks = [];
164 |           audioContext.close();
165 |         };
166 |       })
167 |       .catch(err => console.error('Error accessing microphone:', err));
168 |   };
169 | 
170 |   const stopRecording = () => {
171 |     console.log('Stopping recording', mediaRecorderRef.current);
172 |     mediaRecorderRef.current.stop();
173 |     setWaveformColor('#819a9d');
174 |     if (animationFrameRef.current) {
175 |       cancelAnimationFrame(animationFrameRef.current); // Cancel the animation frame request
176 |     }
177 |   };
178 | 
179 |   const sendAudioToASR = (audioBlob) => {
180 |     const formData = new FormData();
181 |     console.log('Sending audio to ASR:', audioBlob);
182 |     formData.append('audio_file', audioBlob);
183 | 
184 |     fetch('/asr?encode=true&task=transcribe&vad_filter=true&word_timestamps=false&output=json', {
185 |       method: 'POST',
186 |       body: formData
187 |     })
188 |     .then(response => response.json())
189 |     .then(transcribedText => {
190 |       console.log('Transcribed text:', transcribedText["text"]);
191 |       sendMessage(transcribedText["text"], transcribedText["language"]);
192 |     })
193 |     .catch(error => console.error('Error sending audio to ASR:', error));
194 |   };
195 | 
196 |   const handleFileChange = (event) => {
197 |     setFile(event.target.files[0]);
198 |   };
199 | 
200 |   const handleUpload = () => {
201 |     const formData = new FormData();
202 |     formData.append('wav_file', file);
203 | 
204 |     fetch('/clone_speaker', {
205 |       method: 'POST',
206 |       body: formData,
207 |     })
208 |     .then(response => response.json())
209 |     .then(data => {
210 |       speakerRef.current = data;
211 |     })
212 |     .catch(error => {
213 |       console.error('Error:', error);
214 |     });
215 |   };
216 | 
217 |   const handleTTS = async (text, lang) => {
218 |     setWaveformColor('#679989');
219 |     isTTSPending = true;
220 | 
221 |     function linearInterpolate(sample1, sample2, fraction) {
222 |       return sample1 * (1 - fraction) + sample2 * fraction;
223 |     }
224 | 
225 |     await fetch('/tts_stream', {
226 |       method: 'POST',
227 |       headers: {
228 |         'Content-Type': 'application/json',
229 |       },
230 |       body: JSON.stringify({
231 |         text: text,
232 |         language: lang,
233 |         gpt_cond_latent: speakerRef.current.gpt_cond_latent,
234 |         speaker_embedding: speakerRef.current.speaker_embedding,
235 |         add_wav_header: false,
236 |       })
237 |     })
238 |     .then(response => {
239 |       if (!response.ok) {
240 |         throw new Error('Network response was not ok');
241 |       }
242 |       const audioContext = new (window.AudioContext || window.webkitAudioContext)();
243 |       const playbackSpeed = 24000 / audioContext.sampleRate; // XTTS streams 24 kHz PCM; rescale to the context's native rate
244 |       const scriptNode = audioContext.createScriptProcessor(4096, 1, 1); // deprecated API, but still widely supported
245 |       scriptNode.connect(audioContext.destination);
246 | 
247 |       const reader = response.body.getReader();
248 |       let audioQueue = [];
249 |       let isStreamingFinished = false;
250 |       let nextSample = 0;
251 |       let amplitudeSum = 0; // Accumulator for amplitude values
252 |       let sampleCount = 0;  // Counter for number of samples processed
253 | 
254 |       scriptNode.onaudioprocess = (audioProcessingEvent) => {
255 |         const outputBuffer = audioProcessingEvent.outputBuffer.getChannelData(0);
256 |         for (let i = 0; i < outputBuffer.length; i++) {
257 |           if (nextSample < audioQueue.length) {
258 |             const sampleIndex = Math.floor(nextSample);
259 |             const nextIndex = Math.min(sampleIndex + 1, audioQueue.length - 1); // clamp to avoid reading past the buffered data
260 |             const sampleFraction = nextSample - sampleIndex;
261 |             const interpolatedSample = linearInterpolate(
262 |               audioQueue[sampleIndex],
263 |               audioQueue[nextIndex],
264 |               sampleFraction
265 |             );
266 |             outputBuffer[i] = interpolatedSample / 32768; // int16 -> float in [-1, 1)
267 |             nextSample += playbackSpeed;
268 | 
269 |             // Calculate amplitude and update accumulator
270 |             amplitudeSum += Math.abs(outputBuffer[i]);
271 |             sampleCount++;
272 | 
273 |             // Every 1000 samples, scale the circle from the average amplitude, then reset
274 |             if (sampleCount === 1000) {
275 |               const averageAmplitude = amplitudeSum / sampleCount;
276 |               amplitudeSum = 0;
277 |               sampleCount = 0;
278 |               setCircleDiameter(defaultCircleDiameter + averageAmplitude * defaultCircleDiameter * 5);
279 |             }
280 |           } else {
281 |             outputBuffer[i] = 0; // Fill with silence if no data available
282 |             if (isStreamingFinished) {
283 |               scriptNode.disconnect();
284 |               audioContext.close();
285 |               isTTSPending = false;
286 |               break;
287 |             }
288 |           }
289 |         }
290 |       };
291 | 
292 |       function processAudioChunk({ done, value }) {
293 |         if (done) {
294 |           isStreamingFinished = true;
295 |           return;
296 |         }
297 | 
298 |         // Convert the incoming data to Int16Array and add it to the queue
299 |         const rawData = new Int16Array(value.buffer, value.byteOffset, value.byteLength / 2);
300 |         audioQueue = audioQueue.concat(Array.from(rawData));
301 | 
302 |         reader.read().then(processAudioChunk);
303 |       }
304 | 
305 |       reader.read().then(processAudioChunk);
306 |     })
307 |     .catch(error => {
308 |       console.error('Error calling TTS service:', error);
309 |     });
310 |   };
311 | 
312 |   const generateBotResponse = async (text, lang) => {
313 |     let generated_text = "";
314 |     let current_sentence = "";
315 |     const response = await fetch('/generate_stream', {
316 |       method: 'POST',
317 |       headers: {
318 |         'Content-Type': 'application/json',
319 |       },
320 |       body: JSON.stringify({
321 |         inputs: text,
322 |         parameters: {
323 |           max_new_tokens: 250,
324 |         }
325 |       })
326 |     });
327 | 
328 |     if (!response.ok || !response.body) {
329 |       throw new Error(response.statusText);
330 |     }
331 | 
332 |     const reader = response.body.getReader();
333 |     const decoder = new TextDecoder();
334 |     let partialData = '';
335 | 
336 |     while (true) {
337 |       const { value, done } = await reader.read();
338 |       if (done) {
339 |         break;
340 |       }
341 | 
342 |       partialData += decoder.decode(value, { stream: true });
343 | 
344 |       // Process each server-sent-event line separately
345 |       let lines = partialData.split('\n');
346 |       for (let i = 0; i < lines.length - 1; i++) {
347 |         const line = lines[i];
348 |         if (line.startsWith('data:')) {
349 |           const jsonString = line.substring(5); // Remove 'data:' prefix
350 | 
351 |           try {
352 |             const jsonObject = JSON.parse(jsonString);
353 |             if (jsonObject && jsonObject.token && jsonObject.token.text) {
354 |               console.log('Received:', jsonObject.token.text);
355 |               generated_text += jsonObject.token.text;
356 |               if (jsonObject.token.text === '<|end_of_turn|>') {
357 |                 reader.cancel();
358 |               } else {
359 |                 current_sentence += jsonObject.token.text;
360 |               }
361 |               if (jsonObject.token.text === '.' || jsonObject.token.text === '?' || jsonObject.token.text === '!') {
362 |                 await handleTTS(current_sentence, lang); // speak each completed sentence right away
363 |                 while (isTTSPending) {
364 |                   await new Promise(resolve => setTimeout(resolve, 100));
365 |                 }
366 |                 current_sentence = "";
367 |               }
368 | 
369 |             }
370 |           } catch (error) {
371 |             console.error('Error parsing JSON:', error);
372 |           }
373 |         }
374 |       }
375 | 
376 |       partialData = lines[lines.length - 1];
377 |     }
378 |     return generated_text;
379 |   };
380 | 
381 |   const sendMessage = async (message, lang) => {
382 |     if (!message) return;
383 |     conversationRef.current.push({ sender: 'user', message });
384 |     const prompt = conv2prompt(conversationRef.current);
385 |     let generated_text = await generateBotResponse(prompt, lang);
386 |     conversationRef.current.push({ sender: 'bot', message: generated_text });
387 |     setWaveformColor('#819a9d');
388 |   };
389 | 
390 |   return (
391 |     <div className="App">
392 |       <div className="settings-tab">
393 |         <input type="file" onChange={handleFileChange} />
394 |         <button onClick={handleUpload}>Upload speaker reference</button>
395 |       </div>
396 |       <div className="waveform-container">
397 |         {/* Push-to-talk circle: hold it (or hold Shift) to record */}
398 |         <div
399 |           className="circle"
400 |           style={{ width: `${circleDiameter}px`, height: `${circleDiameter}px`, backgroundColor: waveformColor }}
401 |           onMouseDown={handleMouseDown}
402 |           onMouseUp={handleMouseUp}
403 |           onTouchStart={handleTouchStart}
404 |           onTouchEnd={handleTouchEnd}
405 |         ></div>
406 |       </div>
407 |     </div>
408 |   );
409 | }
410 | 
411 | export default App;
--------------------------------------------------------------------------------