├── web-app
│   ├── public
│   │   ├── robots.txt
│   │   ├── favicon.ico
│   │   ├── female.wav
│   │   ├── logo192.png
│   │   ├── logo512.png
│   │   ├── manifest.json
│   │   └── index.html
│   ├── src
│   │   ├── setupTests.js
│   │   ├── App.test.js
│   │   ├── index.css
│   │   ├── reportWebVitals.js
│   │   ├── index.js
│   │   ├── App.css
│   │   ├── logo.svg
│   │   └── App.js
│   ├── .gitignore
│   ├── package.json
│   └── README.md
├── nginx.conf
├── docker-compose.yml
└── readme.md
/web-app/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/web-app/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/favicon.ico
--------------------------------------------------------------------------------
/web-app/public/female.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/female.wav
--------------------------------------------------------------------------------
/web-app/public/logo192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/logo192.png
--------------------------------------------------------------------------------
/web-app/public/logo512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeberJulian/AI-voice-chat/HEAD/web-app/public/logo512.png
--------------------------------------------------------------------------------
/web-app/src/setupTests.js:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 |
--------------------------------------------------------------------------------
/web-app/src/App.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import App from './App';
3 |
4 | test('renders learn react link', () => {
5 | render();
6 | const linkElement = screen.getByText(/learn react/i);
7 | expect(linkElement).toBeInTheDocument();
8 | });
9 |
--------------------------------------------------------------------------------
/web-app/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/web-app/src/reportWebVitals.js:
--------------------------------------------------------------------------------
1 | const reportWebVitals = onPerfEntry => {
2 | if (onPerfEntry && onPerfEntry instanceof Function) {
3 | import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4 | getCLS(onPerfEntry);
5 | getFID(onPerfEntry);
6 | getFCP(onPerfEntry);
7 | getLCP(onPerfEntry);
8 | getTTFB(onPerfEntry);
9 | });
10 | }
11 | };
12 |
13 | export default reportWebVitals;
14 |
--------------------------------------------------------------------------------
/web-app/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 | package-lock.json
8 |
9 | # testing
10 | /coverage
11 |
12 | # production
13 | /build
14 | # misc
15 | .DS_Store
16 | .env.local
17 | .env.development.local
18 | .env.test.local
19 | .env.production.local
20 |
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 |
--------------------------------------------------------------------------------
/web-app/src/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom/client';
3 | import './index.css';
4 | import App from './App';
5 | import reportWebVitals from './reportWebVitals';
6 |
7 | const root = ReactDOM.createRoot(document.getElementById('root'));
8 | root.render(
9 |
10 |
11 |
12 | );
13 |
14 | // If you want to start measuring performance in your app, pass a function
15 | // to log results (for example: reportWebVitals(console.log))
16 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
17 | reportWebVitals();
18 |
--------------------------------------------------------------------------------
/web-app/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "React App",
3 | "name": "Create React App Sample",
4 | "icons": [
5 | {
6 | "src": "favicon.ico",
7 | "sizes": "64x64 32x32 24x24 16x16",
8 | "type": "image/x-icon"
9 | },
10 | {
11 | "src": "logo192.png",
12 | "type": "image/png",
13 | "sizes": "192x192"
14 | },
15 | {
16 | "src": "logo512.png",
17 | "type": "image/png",
18 | "sizes": "512x512"
19 | }
20 | ],
21 | "start_url": ".",
22 | "display": "standalone",
23 | "theme_color": "#000000",
24 | "background_color": "#ffffff"
25 | }
26 |
--------------------------------------------------------------------------------
/nginx.conf:
--------------------------------------------------------------------------------
1 | events {}
2 |
3 | http {
4 | include mime.types;
5 | default_type application/octet-stream;
6 |
7 | server {
8 | listen 80;
9 |
10 | location / {
11 | root /usr/share/nginx/html;
12 | try_files $uri /index.html;
13 | }
14 |
15 | location /clone_speaker {
16 | proxy_pass http://tts/clone_speaker;
17 | client_max_body_size 20M;
18 | }
19 |
20 | location /tts_stream {
21 | proxy_pass http://tts/tts_stream;
22 | proxy_buffering off;
23 | }
24 |
25 | location /generate_stream {
26 | proxy_pass http://llm:8080/generate_stream;
27 | proxy_buffering off;
28 | }
29 |
30 | location /asr {
31 | proxy_pass http://asr:9000/asr;
32 | client_max_body_size 20M;
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/web-app/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "react-ui",
3 | "version": "0.1.0",
4 | "private": true,
5 | "proxy": "http://localhost:5000",
6 | "dependencies": {
7 | "@testing-library/jest-dom": "^5.17.0",
8 | "@testing-library/react": "^13.4.0",
9 | "@testing-library/user-event": "^13.5.0",
10 | "react": "^18.2.0",
11 | "react-dom": "^18.2.0",
12 | "react-scripts": "5.0.1",
13 | "web-vitals": "^2.1.4"
14 | },
15 | "scripts": {
16 | "start": "react-scripts start",
17 | "build": "react-scripts build",
18 | "test": "react-scripts test",
19 | "eject": "react-scripts eject"
20 | },
21 | "eslintConfig": {
22 | "extends": [
23 | "react-app",
24 | "react-app/jest"
25 | ]
26 | },
27 | "browserslist": {
28 | "production": [
29 | ">0.2%",
30 | "not dead",
31 | "not op_mini all"
32 | ],
33 | "development": [
34 | "last 1 chrome version",
35 | "last 1 firefox version",
36 | "last 1 safari version"
37 | ]
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/web-app/src/App.css:
--------------------------------------------------------------------------------
1 | .App {
2 | text-align: center;
3 | background-color: #03363e;
4 | }
5 |
6 | .App-logo {
7 | height: 40vmin;
8 | pointer-events: none;
9 | }
10 |
11 | @media (prefers-reduced-motion: no-preference) {
12 | .App-logo {
13 | animation: App-logo-spin infinite 20s linear;
14 | }
15 | }
16 |
17 | .App-header {
18 | background-color: #282c34;
19 | min-height: 100vh;
20 | display: flex;
21 | flex-direction: column;
22 | align-items: center;
23 | justify-content: center;
24 | font-size: calc(10px + 2vmin);
25 | color: white;
26 | }
27 |
28 | .App-link {
29 | color: #61dafb;
30 | }
31 |
32 | @keyframes App-logo-spin {
33 | from {
34 | transform: rotate(0deg);
35 | }
36 | to {
37 | transform: rotate(360deg);
38 | }
39 | }
40 |
41 | .chat-window {
42 | border: 1px solid #ccc;
43 | padding: 10px;
44 | height: 300px;
45 | overflow-y: scroll;
46 | }
47 |
48 | .message {
49 | margin: 5px 0;
50 | }
51 |
52 | .message.user {
53 | text-align: left;
54 | }
55 |
56 | .message.bot {
57 | text-align: right;
58 | }
59 |
60 | .circle {
61 | border-radius: 50%;
62 | transition: width 0.1s ease, height 0.1s ease;
63 | }
64 |
65 | .settings-tab {
66 | position: absolute;
67 | left: 0;
68 | top: 0;
69 | padding: 20px;
70 | /* Other styling... */
71 | }
72 |
73 | .waveform-container {
74 | display: flex;
75 | justify-content: center;
76 | align-items: center;
77 | height: 100vh;
78 | /* Other styling... */
79 | }
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | nginx:
4 | image: nginx:alpine
5 | ports:
6 | - "5000:80"
7 | volumes:
8 | - ./web-app/build:/usr/share/nginx/html
9 | - ./nginx.conf:/etc/nginx/nginx.conf:ro
10 | depends_on:
11 | - llm
12 | - tts
13 | - asr
14 | llm:
15 | image: ghcr.io/huggingface/text-generation-inference:1.1.0
16 | ports:
17 | - "8080:8080"
18 | environment:
19 | - MODEL_ID=TheBloke/openchat_3.5-AWQ
20 | - PORT=8080
21 | - QUANTIZE=awq
22 | - MAX_INPUT_LEN=3696
23 | - MAX_TOTAL_TOKENS=4096
24 | - MAX_BATCH_PREFILL_TOKENS=4096
25 | - CUDA_MEMORY_FRACTION=0.6
26 | deploy:
27 | resources:
28 | reservations:
29 | devices:
30 | - driver: nvidia
31 | count: 1
32 | capabilities: [gpu]
33 | tts:
34 | image: ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-818a108b41be2dd43dada04bd319fdfcdabc5c6a
35 | ports:
36 | - "8000:80"
37 | # Uncomment the following lines to use your own models
38 | # volumes:
39 | # - /media/julian/Workdisk/models/ai_voice_chat:/app/tts_models
40 | environment:
41 | - COQUI_TOS_AGREED=1
42 | deploy:
43 | resources:
44 | reservations:
45 | devices:
46 | - driver: nvidia
47 | count: 1
48 | capabilities: [gpu]
49 | asr:
50 | image: onerahmet/openai-whisper-asr-webservice:v1.2.4-gpu
51 | ports:
52 | - "9000:9000"
53 | environment:
54 | - ASR_ENGINE=faster_whisper
55 | - ASR_MODEL=large-v3
56 | deploy:
57 | resources:
58 | reservations:
59 | devices:
60 | - driver: nvidia
61 | count: 1
62 | capabilities: [gpu]
63 |
64 |
--------------------------------------------------------------------------------
/web-app/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
12 |
13 |
17 |
18 |
27 | React App
28 |
29 |
30 |
31 |
32 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # AI Voice Chat
2 |
3 | ## Overview
4 |
5 | This is a simple react app that allows you to chat with an AI assistant using your voice.
6 |
7 | It uses `Whisper large v3` for transcription, `openchat 3.5 AWQ` for the AI assistant, and `XTTS v2` for text-to-speech.
8 |
9 | Its main feature is low speech-to-speech latency: it more than halves the latency shown in the ChatGPT with voice demo video.
10 | This repo runs on a single RTX 3090 GPU.
11 |
12 | No concurrency is supported and the project is not optimized or production-ready. It's also probably riddled with bugs, so if you experience any, please open an issue or send a PR.
13 |
14 | The XTTS v2 model comes from the [coqui-TTS](https://github.com/coqui-ai/TTS) project.
15 | If you have any questions about the model or the project, you can join our [Discord server](https://discord.gg/vHgDbMzgfv).
16 |
17 | ## Demo
18 |
19 | https://github.com/WeberJulian/AI-voice-chat/assets/17219561/2be20ec1-fa5e-4c26-83ec-c074357f3905
20 |
21 | ## Installation
22 |
23 | ### Prerequisites
24 | 1. Have an NVIDIA GPU with more than 16 GB of VRAM and the latest drivers
25 | 2. Have `docker-compose` installed
26 |
27 | ### Steps
28 | 1. Clone the repo
29 |
30 | ```bash
31 | git clone https://github.com/WeberJulian/AI-voice-chat.git
32 | ```
33 |
34 | 2. Build the react app
35 |
36 | ```bash
37 | cd AI-voice-chat
38 | cd web-app
39 | npm install && npm run build
40 | ```
41 |
42 | 3. Start everything 🚀
43 |
44 | ```bash
45 | cd ..
46 | docker-compose up
47 | ```
48 |
49 | ## Usage
50 |
51 | 1. Open the app in your browser at `http://localhost:5000`
52 |
53 | 2. Allow microphone access
54 |
55 | 3. Push to talk either with the `Shift` ⇧ key or the circle
56 |
57 | 4. Enjoy!
58 |
59 | To reset the conversation, refresh the page.
60 |
61 | ## Custom models
62 |
63 | If you fine-tune XTTS and want to use your own model, add the following lines to the `docker-compose.yml` file, in the `tts` service:
64 |
65 | ```yml
66 | services:
67 | ...
68 | tts:
69 | ...
70 | volumes:
71 | - /path/to/your/model:/app/tts_models
72 | ```
73 |
74 | In the `/path/to/your/model` folder, you must have the following files:
75 | - `config.json`
76 | - `model.pth`
77 | - `vocab.json`
78 |
79 |
--------------------------------------------------------------------------------
/web-app/src/logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/web-app/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started with Create React App
2 |
3 | This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app).
4 |
5 | ## Available Scripts
6 |
7 | In the project directory, you can run:
8 |
9 | ### `npm start`
10 |
11 | Runs the app in the development mode.\
12 | Open [http://localhost:3000](http://localhost:3000) to view it in your browser.
13 |
14 | The page will reload when you make changes.\
15 | You may also see any lint errors in the console.
16 |
17 | ### `npm test`
18 |
19 | Launches the test runner in the interactive watch mode.\
20 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information.
21 |
22 | ### `npm run build`
23 |
24 | Builds the app for production to the `build` folder.\
25 | It correctly bundles React in production mode and optimizes the build for the best performance.
26 |
27 | The build is minified and the filenames include the hashes.\
28 | Your app is ready to be deployed!
29 |
30 | See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information.
31 |
32 | ### `npm run eject`
33 |
34 | **Note: this is a one-way operation. Once you `eject`, you can't go back!**
35 |
36 | If you aren't satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project.
37 |
38 | Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you're on your own.
39 |
40 | You don't have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn't feel obligated to use this feature. However we understand that this tool wouldn't be useful if you couldn't customize it when you are ready for it.
41 |
42 | ## Learn More
43 |
44 | You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
45 |
46 | To learn React, check out the [React documentation](https://reactjs.org/).
47 |
48 | ### Code Splitting
49 |
50 | This section has moved here: [https://facebook.github.io/create-react-app/docs/code-splitting](https://facebook.github.io/create-react-app/docs/code-splitting)
51 |
52 | ### Analyzing the Bundle Size
53 |
54 | This section has moved here: [https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size](https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size)
55 |
56 | ### Making a Progressive Web App
57 |
58 | This section has moved here: [https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app](https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app)
59 |
60 | ### Advanced Configuration
61 |
62 | This section has moved here: [https://facebook.github.io/create-react-app/docs/advanced-configuration](https://facebook.github.io/create-react-app/docs/advanced-configuration)
63 |
64 | ### Deployment
65 |
66 | This section has moved here: [https://facebook.github.io/create-react-app/docs/deployment](https://facebook.github.io/create-react-app/docs/deployment)
67 |
68 | ### `npm run build` fails to minify
69 |
70 | This section has moved here: [https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify](https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify)
71 |
--------------------------------------------------------------------------------
/web-app/src/App.js:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect, useRef} from 'react';
2 | import './App.css';
3 |
4 | function App() {
5 | const [file, setFile] = useState(null);
6 | const [waveformColor, setWaveformColor] = useState('#819a9d');
7 | const isRecordingRef = useRef(false);
8 | const mediaRecorderRef = useRef(null);
9 | const speakerRef = useRef(null);
10 | const animationFrameRef = useRef(null);
11 | const conversationRef = useRef([
12 | {sender: 'user', message: "You are a large language model known as OpenChat, the open-source counterpart to ChatGPT, equally powerful as its closed-source sibling. You communicate using an advanced deep learning based speech synthesis system made by coqui, so feel free to include interjections (such as 'hmm', 'oh', 'right', 'wow'...), but avoid using emojis, symboles, code snippets, or anything else that does not translate well to spoken language. Fox exemple, instead of using % say percent, = say equal and for * say times etc... Also please avoid using lists with numbers as items like so 1. 2. Use regular sentences instead."},
13 | {sender: 'bot', message: "No problem. Anything else?"},
14 | {sender: 'user', message: "Yeah, please always respond in a sentence or two from now on."},
15 | {sender: 'bot', message: "Sure, I'll be concise."},
16 | // {sender: 'bot', message: "I am an advanced emulation of your favourite machine learning youtuber. I'm based on a deep learning system made by coqui. I'm made to explain machine learning to you, I know every paper there is. I say 'hold on to your papers' and 'mindblowing' a lot."},
17 | // {sender: 'user', message: "Ok, please always respond in a sentence or two from now on."},
18 | // {sender: 'bot', message: "No problem, I'll be concise."},
19 | ]);
20 | let audioChunks = [];
21 | let isTTSPending = false;
22 |
23 | const defaultCircleDiameter = 200;
24 | const [circleDiameter, setCircleDiameter] = useState(defaultCircleDiameter);
25 |
26 | const handleMouseDown = () => {
27 | if (!isRecordingRef.current) {
28 | isRecordingRef.current = true;
29 | startRecording();
30 | }
31 | };
32 |
33 | const handleMouseUp = () => {
34 | if (isRecordingRef.current) {
35 | isRecordingRef.current = false;
36 | stopRecording();
37 | }
38 | };
39 |
40 | // Use these for touch devices
41 | const handleTouchStart = (e) => {
42 | e.preventDefault(); // Prevents additional mouse events
43 | handleMouseDown();
44 | };
45 |
46 | const handleTouchEnd = (e) => {
47 | e.preventDefault(); // Prevents additional mouse events
48 | handleMouseUp();
49 | };
50 |
51 | const conv2prompt = (conv) => {
52 | let prompt = "";
53 | for (let i = 0; i < conv.length; i++) {
54 | if (conv[i].sender === "user") {
55 | prompt += "GPT4 Correct User: " + conv[i].message + "<|end_of_turn|>GPT4 Correct Assistant:";
56 | } else {
57 | prompt += conv[i].message + "<|end_of_turn|>";
58 | }
59 | }
60 | return prompt;
61 | }
62 |
63 | useEffect(() => {
64 | const fetchDefaultSpeakerEmbedding = async () => {
65 | navigator.getUserMedia({audio:true,video:false}, function(stream) {
66 | stream.getTracks().forEach(x=>x.stop());
67 | }, err=>console.log(err));
68 | try {
69 | const response = await fetch('/female.wav');
70 | const blob = await response.blob();
71 | const formData = new FormData();
72 | formData.append('wav_file', blob, 'ref.wav');
73 |
74 | const speakerResponse = await fetch('/clone_speaker', {
75 | method: 'POST',
76 | body: formData,
77 | });
78 | const speakerData = await speakerResponse.json();
79 | speakerRef.current = speakerData;
80 | } catch (error) {
81 | console.error('Error fetching default speaker embedding:', error);
82 | }
83 | };
84 |
85 | fetchDefaultSpeakerEmbedding();
86 | }, []);
87 |
88 | useEffect(() => {
89 | // Setup event listeners for push-to-talk
90 | const handleKeyDown = (event) => {
91 | if (event.key === 'Shift' && !isRecordingRef.current) {
92 | isRecordingRef.current = true;
93 | startRecording();
94 | }
95 | };
96 |
97 | const handleKeyUp = (event) => {
98 | if (event.key === 'Shift' && isRecordingRef.current) {
99 | isRecordingRef.current = false;
100 | stopRecording();
101 | }
102 | };
103 |
104 | window.addEventListener('keydown', handleKeyDown);
105 | window.addEventListener('keyup', handleKeyUp);
106 |
107 | return () => {
108 | window.removeEventListener('keydown', handleKeyDown);
109 | window.removeEventListener('keyup', handleKeyUp);
110 | };
111 | }, []);
112 |
113 | const startRecording = () => {
114 | setWaveformColor('#ed901b');
115 | navigator.mediaDevices.getUserMedia({ audio: true })
116 | .then(stream => {
117 | const audioContext = new (window.AudioContext || window.webkitAudioContext)();
118 | const mediaStreamSource = audioContext.createMediaStreamSource(stream);
119 | const analyser = audioContext.createAnalyser();
120 | mediaStreamSource.connect(analyser);
121 |
122 | let amplitudeSum = 0; // Accumulator for amplitude values
123 | let sampleCount = 0; // Counter for number of samples processed
124 |
125 | // Setup to periodically analyze the audio stream
126 | const processAudio = () => {
127 | const dataArray = new Uint8Array(analyser.frequencyBinCount);
128 | analyser.getByteTimeDomainData(dataArray);
129 |
130 | // Calculate amplitude values and accumulate
131 | dataArray.forEach(value => {
132 | amplitudeSum += Math.abs(value - 128); // Subtracting 128 because the range is 0-255
133 | sampleCount++;
134 | });
135 |
136 | // Every 1000 samples, calculate and log the average, then reset
137 | if (sampleCount >= 100) {
138 | if (isRecordingRef.current) {
139 | const averageAmplitude = amplitudeSum / sampleCount;
140 | setCircleDiameter(defaultCircleDiameter + averageAmplitude * defaultCircleDiameter * 0.15);
141 | amplitudeSum = 0;
142 | sampleCount = 0;
143 | }
144 | }
145 |
146 | animationFrameRef.current = requestAnimationFrame(processAudio);
147 | };
148 | animationFrameRef.current = requestAnimationFrame(processAudio);
149 |
150 | processAudio();
151 | mediaRecorderRef.current = new MediaRecorder(stream);
152 | mediaRecorderRef.current.start();
153 | console.log('Starting to record:', mediaRecorderRef.current);
154 |
155 | mediaRecorderRef.current.ondataavailable = (event) => {
156 | audioChunks.push(event.data);
157 | console.log('Audio chunk recorded:', event.data);
158 | };
159 |
160 | mediaRecorderRef.current.onstop = () => {
161 | const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
162 | sendAudioToASR(audioBlob);
163 | audioChunks = [];
164 | audioContext.close();
165 | };
166 | })
167 | .catch(err => console.error('Error accessing microphone:', err));
168 | };
169 |
170 | const stopRecording = () => {
171 | console.log('Stopping recording', mediaRecorderRef.current);
172 | mediaRecorderRef.current.stop();
173 | setWaveformColor('#819a9d');
174 | if (animationFrameRef.current) {
175 | cancelAnimationFrame(animationFrameRef.current); // Cancel the animation frame request
176 | }
177 | };
178 |
179 | const sendAudioToASR = (audioBlob) => {
180 | const formData = new FormData();
181 | console.log('Sending audio to ASR:', audioBlob);
182 | formData.append('audio_file', audioBlob);
183 |
184 | fetch('/asr?encode=true&task=transcribe&vad_filter=true&word_timestamps=false&output=json', {
185 | method: 'POST',
186 | body: formData
187 | })
188 | .then(response => response.json())
189 | .then(transcribedText => {
190 | console.log('Transcribed text:', transcribedText["text"]);
191 | sendMessage(transcribedText["text"], transcribedText["language"]);
192 | })
193 | .catch(error => console.error('Error sending audio to ASR:', error));
194 | };
195 |
196 | const handleFileChange = (event) => {
197 | setFile(event.target.files[0]);
198 | };
199 |
200 | const handleUpload = () => {
201 | const formData = new FormData();
202 | formData.append('wav_file', file);
203 |
204 | fetch('/clone_speaker', {
205 | method: 'POST',
206 | body: formData,
207 | })
208 | .then(response => response.json())
209 | .then(data => {
210 | speakerRef.current = data;
211 | })
212 | .catch(error => {
213 | console.error('Error:', error);
214 | });
215 | };
216 |
217 | const handleTTS = async (text, lang) => {
218 | setWaveformColor('#679989');
219 | isTTSPending = true;
220 |
221 | function linearInterpolate(sample1, sample2, fraction) {
222 | return sample1 * (1 - fraction) + sample2 * fraction;
223 | }
224 |
225 | await fetch('/tts_stream', {
226 | method: 'POST',
227 | headers: {
228 | 'Content-Type': 'application/json',
229 | },
230 | body: JSON.stringify({
231 | text: text,
232 | language: lang,
233 | gpt_cond_latent: speakerRef.current.gpt_cond_latent,
234 | speaker_embedding: speakerRef.current.speaker_embedding,
235 | add_wav_header: false,
236 | })
237 | })
238 | .then(response => {
239 | if (!response.ok) {
240 | throw new Error('Network response was not ok');
241 | }
242 | const audioContext = new (window.AudioContext || window.webkitAudioContext)();
243 | const playbackSpeed = 24000 / audioContext.sampleRate;
244 | const scriptNode = audioContext.createScriptProcessor(4096, 1, 1);
245 | scriptNode.connect(audioContext.destination);
246 |
247 | const reader = response.body.getReader();
248 | let audioQueue = [];
249 | let isStreamingFinished = false;
250 | let nextSample = 0;
251 | let amplitudeSum = 0; // Accumulator for amplitude values
252 | let sampleCount = 0; // Counter for number of samples processed
253 |
254 | scriptNode.onaudioprocess = (audioProcessingEvent) => {
255 | const outputBuffer = audioProcessingEvent.outputBuffer.getChannelData(0);
256 | for (let i = 0; i < outputBuffer.length; i++) {
257 | if (nextSample < audioQueue.length) {
258 | const sampleIndex = Math.floor(nextSample);
259 | const nextIndex = sampleIndex + 1;
260 | const sampleFraction = nextSample - sampleIndex;
261 | const interpolatedSample = linearInterpolate(
262 | audioQueue[sampleIndex],
263 | audioQueue[nextIndex],
264 | sampleFraction
265 | );
266 | outputBuffer[i] = interpolatedSample / 32768;
267 | nextSample += playbackSpeed;
268 |
269 | // Calculate amplitude and update accumulator
270 | amplitudeSum += Math.abs(outputBuffer[i]);
271 | sampleCount++;
272 |
273 | // Every 100 samples, calculate and log the average, then reset
274 | if (sampleCount === 1000) {
275 | const averageAmplitude = amplitudeSum / sampleCount;
276 | amplitudeSum = 0;
277 | sampleCount = 0;
278 | setCircleDiameter(defaultCircleDiameter + averageAmplitude * defaultCircleDiameter * 5);
279 | }
280 | } else {
281 | outputBuffer[i] = 0; // Fill with silence if no data available
282 | if (isStreamingFinished) {
283 | scriptNode.disconnect();
284 | audioContext.close();
285 | isTTSPending = false;
286 | break;
287 | }
288 | }
289 | }
290 | };
291 |
292 | function processAudioChunk({ done, value }) {
293 | if (done) {
294 | isStreamingFinished = true;
295 | return;
296 | }
297 |
298 | // Convert the incoming data to Int16Array and add it to the queue
299 | const rawData = new Int16Array(value.buffer, value.byteOffset, value.byteLength / 2);
300 | audioQueue = audioQueue.concat(Array.from(rawData));
301 |
302 | reader.read().then(processAudioChunk);
303 | }
304 |
305 | reader.read().then(processAudioChunk);
306 | })
307 | .catch(error => {
308 | console.error('Error calling TTS service:', error);
309 | });
310 | };
311 |
312 | const generateBotResponse = async (text, lang) => {
313 | let generated_text = "";
314 | let current_sentence = "";
315 | const response = await fetch('/generate_stream', {
316 | method: 'POST',
317 | headers: {
318 | 'Content-Type': 'application/json',
319 | },
320 | body: JSON.stringify({
321 | inputs: text,
322 | parameters: {
323 | max_new_tokens: 250,
324 | }
325 | })
326 | });
327 |
328 | if (!response.ok || !response.body) {
329 | throw response.statusText;
330 | }
331 |
332 | const reader = response.body.getReader();
333 | const decoder = new TextDecoder();
334 | let partialData = '';
335 |
336 | while (true) {
337 | const { value, done } = await reader.read();
338 | if (done) {
339 | break;
340 | }
341 |
342 | partialData += decoder.decode(value, { stream: true });
343 |
344 | // Process each line separately
345 | let lines = partialData.split('\n');
346 | for (let i = 0; i < lines.length - 1; i++) {
347 | const line = lines[i];
348 | if (line.startsWith('data:')) {
349 | const jsonString = line.substring(5); // Remove 'data:' prefix
350 |
351 | try {
352 | const jsonObject = JSON.parse(jsonString);
353 | if (jsonObject && jsonObject.token && jsonObject.token.text) {
354 | console.log('Received:', jsonObject.token.text);
355 | generated_text += jsonObject.token.text;
356 | if (jsonObject.token.text === '<|end_of_turn|>') {
357 | reader.cancel();
358 | } else {
359 | current_sentence += jsonObject.token.text;
360 | }
361 | if (jsonObject.token.text === '.' || jsonObject.token.text === '?' || jsonObject.token.text === '!') {
362 | await handleTTS(current_sentence, lang);
363 | while (isTTSPending) {
364 | await new Promise(resolve => setTimeout(resolve, 100));
365 | }
366 | current_sentence = "";
367 | }
368 |
369 | }
370 | } catch (error) {
371 | console.error('Error parsing JSON:', error);
372 | }
373 | }
374 | }
375 |
376 | partialData = lines[lines.length - 1];
377 | }
378 | return generated_text;
379 | };
380 |
381 | const sendMessage = async (message, lang) => {
382 | if (!message) return;
383 | conversationRef.current.push({ sender: 'user', message });
384 | const prompt = conv2prompt(conversationRef.current);
385 | let generated_text = await generateBotResponse(prompt, lang);
386 | conversationRef.current.push({ sender: 'bot', message: generated_text });
387 | setWaveformColor('#819a9d');
388 | };
389 |
390 | return (
391 |
392 |
393 |
394 |
395 |
396 |
397 |
406 |
407 |
408 | );
409 | }
410 |
411 | export default App;
--------------------------------------------------------------------------------