├── .dockerignore ├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── SWARM.md ├── bake_deploy_prod.sh ├── bake_deploy_staging.sh ├── docker-compose.yml ├── dockerless ├── .gitignore ├── start_backend.sh ├── start_frontend.sh ├── start_llm.sh ├── start_stt.sh └── start_tts.sh ├── docs └── browser_backend_communication.md ├── frontend ├── .gitignore ├── Dockerfile ├── README.md ├── eslint.config.mjs ├── hot-reloading.Dockerfile ├── next.config.ts ├── package.json ├── pnpm-lock.yaml ├── postcss.config.mjs ├── public │ ├── audio-output-processor.js │ ├── decoderWorker.min.js │ ├── decoderWorker.min.wasm │ └── encoderWorker.min.js ├── src │ ├── app │ │ ├── ConsentModal.tsx │ │ ├── CouldNotConnect.tsx │ │ ├── ErrorMessages.tsx │ │ ├── Modal.tsx │ │ ├── PositionedAudioVisualizer.tsx │ │ ├── SingleRoleSubtitles.tsx │ │ ├── SlantedButton.tsx │ │ ├── SquareButton.tsx │ │ ├── Subtitles.tsx │ │ ├── TrimmedAudioPreview.tsx │ │ ├── Unmute.tsx │ │ ├── UnmuteConfigurator.tsx │ │ ├── UnmuteHeader.tsx │ │ ├── VoiceAttribution.tsx │ │ ├── VoiceRecorder.tsx │ │ ├── VoiceUpload.tsx │ │ ├── audioUtil.ts │ │ ├── chatHistory.ts │ │ ├── cssUtil.ts │ │ ├── favicon.ico │ │ ├── faviconKyutai.ico │ │ ├── faviconKyutai.png │ │ ├── globals.css │ │ ├── layout.tsx │ │ ├── opus-recorder.d.ts │ │ ├── page.tsx │ │ ├── useAudioProcessor.ts │ │ ├── useAudioVisualizerCircle.ts │ │ ├── useBackendServerUrl.ts │ │ ├── useGoogleAnalytics.ts │ │ ├── useKeyboardShortcuts.ts │ │ ├── useLocalStorage.ts │ │ ├── useMicrophoneAccess.ts │ │ ├── useRecordingCanvas.ts │ │ ├── useWakeLock.ts │ │ └── voice-donation │ │ │ ├── DonationConsent.tsx │ │ │ ├── IntroText.mdx │ │ │ ├── page.tsx │ │ │ ├── privacy-policy │ │ │ └── page.tsx │ │ │ └── terms-of-use │ │ │ └── page.tsx │ ├── assets │ │ ├── fonts │ │ │ ├── Satoshi-Variable.eot │ │ │ ├── Satoshi-Variable.ttf │ │ │ ├── Satoshi-Variable.woff │ │ │ 
├── Satoshi-Variable.woff2 │ │ │ ├── Satoshi-VariableItalic.eot │ │ │ ├── Satoshi-VariableItalic.ttf │ │ │ ├── Satoshi-VariableItalic.woff │ │ │ └── Satoshi-VariableItalic.woff2 │ │ └── kyutai-logo-cropped.svg │ └── mdx-components.tsx └── tsconfig.json ├── notebooks ├── .gitignore └── create-voice-donation-sentences.ipynb ├── pyproject.toml ├── services ├── debugger │ └── Dockerfile ├── grafana │ ├── Dockerfile │ ├── dashboards │ │ └── unmute-monitoring-1751624072717.json │ ├── grafana.ini │ └── provisioning │ │ ├── dashboards │ │ └── dashboards.yaml │ │ └── datasources │ │ └── datasources.yaml ├── moshi-server │ ├── configs │ │ ├── stt-prod.toml │ │ ├── stt.toml │ │ ├── tts-prod.toml │ │ ├── tts.toml │ │ └── voice-cloning.toml │ ├── private.Dockerfile │ ├── public.Dockerfile │ ├── start_moshi_server_private.sh │ └── start_moshi_server_public.sh └── prometheus │ ├── Dockerfile │ └── prometheus.yml ├── setup_gpu_swarm_node.py ├── swarm-deploy.yml ├── tests ├── test_exponential_moving_average.py └── test_llm_utils.py ├── unmute ├── audio_input_override.py ├── audio_stream_saver.py ├── cache.py ├── exceptions.py ├── kyutai_constants.py ├── llm │ ├── chatbot.py │ ├── llm_utils.py │ ├── newsapi.py │ ├── quiz_show_questions.py │ └── system_prompt.py ├── loadtest │ ├── dummy_tts_server.py │ ├── generate_dataset_for_vllm.py │ ├── loadtest_client.py │ ├── loadtest_result.py │ └── voices │ │ ├── Bear-or-shark-trim.mp3 │ │ ├── dog-or-cat-3-nowait.mp3 │ │ ├── seine.mp3 │ │ └── vaclav_english_news_trim.mp3 ├── main_gradio.py ├── main_websocket.py ├── metrics.py ├── openai_realtime_api_events.py ├── process_recording.py ├── quest_manager.py ├── recorder.py ├── scripts │ ├── check_hugging_face_token_not_write.py │ ├── copy_voice_to_prod.py │ ├── example_websocket_client.py │ ├── mistral_streaming.py │ ├── output_from_file.py │ ├── output_sine.py │ ├── output_sine_async.py │ ├── output_tts.py │ ├── pitch_detection_handler.py │ ├── stt_from_file_example.py │ ├── 
stt_microphone_example.py │ ├── tts_example.py │ ├── update_voice_list.py │ └── vllm_wrapper_example.py ├── service_discovery.py ├── stt │ ├── dummy_speech_to_text.py │ ├── exponential_moving_average.py │ └── speech_to_text.py ├── timer.py ├── tts │ ├── copy_approved_voice_donations.py │ ├── create_voice_donation_table.py │ ├── freesound_download.py │ ├── realtime_queue.py │ ├── text_to_speech.py │ ├── trim_voice_donation_clip.py │ ├── voice_cloning.py │ ├── voice_donation.py │ ├── voice_donation_sentences.txt │ └── voices.py ├── unmute_handler.py ├── webrtc_utils.py └── websocket_utils.py ├── uv.lock └── voices.yaml /.dockerignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | 4 | debug/ 5 | recordings/ 6 | .venv/ 7 | 8 | Dockerfile 9 | 10 | frontend/node_modules 11 | frontend/.next 12 | volumes/ 13 | notebooks/ 14 | voices/ 15 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Checklist 2 | 3 | - [ ] Read CONTRIBUTING.md, and accept the CLA by including the provided snippet. We will not accept PR without this. 
4 | 5 | ## PR Description 6 | 7 | 8 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - prod 7 | pull_request: 8 | 9 | jobs: 10 | # Enable again when we don't have private dependencies 11 | #build-docker-images: 12 | # runs-on: ubuntu-latest 13 | # steps: 14 | # - name: Checkout code 15 | # uses: actions/checkout@v3 16 | # 17 | # - name: Set up a builder (we don't want to load the images) 18 | # run: docker buildx create --name mybuilder --use 19 | # 20 | # - name: Build all docker images 21 | # run: docker buildx bake --progress=plain -f swarm-deploy.yml workers frontend tts 22 | # env: 23 | # DOMAIN: dummy 24 | 25 | pre-commit: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout code 29 | uses: actions/checkout@v3 30 | 31 | - name: Install uv 32 | uses: astral-sh/setup-uv@v5 33 | with: 34 | version: "0.7.12" 35 | 36 | - name: Install Node.js 37 | uses: actions/setup-node@v4 38 | with: 39 | node-version: 20 40 | 41 | - name: Install pnpm 42 | run: npm install -g pnpm 43 | 44 | - name: Install dependencies 45 | run: cd frontend && pnpm install 46 | 47 | - name: Run pre-commit 48 | run: | 49 | uv run pre-commit run --all-files 50 | # Some redundancy here because some hooks will run in any stage, 51 | # but I don't think there is a cleaner way to make sure they all run 52 | uv run pre-commit run --all-files --hook-stage pre-push 53 | 54 | backend-unit-tests: 55 | runs-on: ubuntu-latest 56 | steps: 57 | - name: Checkout code 58 | uses: actions/checkout@v3 59 | 60 | - name: Install uv 61 | uses: astral-sh/setup-uv@v5 62 | with: 63 | version: "0.7.12" 64 | 65 | - name: Run backend unit tests 66 | run: uv run pytest -v 67 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | .DS_Store 4 | 5 | debug/ 6 | recordings/ 7 | .venv/ 8 | # only ignore voices/ in the root directory 9 | /voices/ 10 | 11 | # env files (can opt-in for committing if needed) 12 | .env* 13 | 14 | # vercel 15 | .vercel 16 | 17 | # typescript 18 | *.tsbuildinfo 19 | next-env.d.ts 20 | 21 | # Traefik/HTTPS 22 | certs/ 23 | 24 | volumes/ 25 | CLAUDE.md 26 | .claude/settings.local.json 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/kynan/nbstripout 3 | rev: 0.8.1 4 | hooks: 5 | - id: nbstripout 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v5.0.0 # Use the ref you want to point at 8 | hooks: 9 | - id: check-added-large-files 10 | args: ["--maxkb=2048"] 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | # Ruff version. 13 | rev: v0.11.7 14 | hooks: 15 | # Run the linter. 16 | - id: ruff 17 | types_or: [python, pyi] # Don't run on `jupyter` files 18 | args: [--fix] 19 | # Run the formatter. 
20 | - id: ruff-format 21 | types_or: [python, pyi] # Don't run on `jupyter` files 22 | - repo: https://github.com/pre-commit/pre-commit-hooks 23 | rev: v3.2.0 24 | hooks: 25 | - id: trailing-whitespace 26 | - repo: local 27 | hooks: 28 | - id: pnpm-run-lint 29 | name: pnpm run lint 30 | language: system 31 | entry: bash -c 'cd frontend && pnpm run lint --max-warnings 0' 32 | files: ^frontend/src/.*$ 33 | pass_filenames: false 34 | stages: [pre-commit] 35 | - id: pnpm-run-build 36 | name: pnpm run build 37 | language: system 38 | entry: bash -c 'cd frontend && pnpm run build' 39 | files: ^frontend/src/.*$ 40 | pass_filenames: false 41 | stages: [pre-push] 42 | - id: pyright 43 | name: Pyright type-checking 44 | language: system 45 | entry: bash -c 'uv run pyright' 46 | files: ^unmute/.*$ 47 | pass_filenames: false 48 | stages: [pre-push] 49 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Unmute 2 | 3 | ## Pull Requests 4 | 5 | 1. Fork the repo and create your branch from `main`. 6 | 2. If you have changed APIs, update the documentation accordingly. 7 | 3. Ensure pre-commit hooks pass properly, in particular the linting and typing. 8 | 4. Accept the Contributor License Agreement (see after). 9 | 10 | ## Contributor License Agreement ("CLA") 11 | 12 | In order to accept your pull request, we need you to submit a Contributor License Agreement. 13 | 14 | If you agree with the full CLA provided in the next paragraph, copy the following statement in your PR, changing your Github Handle: 15 | 16 | > I, {your GitHub handle}, confirm that I have read and understood the terms of the CLA of Kyutai-labs, as outlined in the repository's CONTRIBUTING.md, and I agree to be bound by these terms. 
17 | The full CLA is provided as follows: 18 | 19 | > I, {your GitHub handle}, hereby grant to Kyutai-labs a perpetual, worldwide, non-exclusive, royalty-free, 20 | > irrevocable license to use, modify, distribute, and sublicense my Contributions. 21 | > I understand and accept that Contributions are limited to modifications, improvements, or changes 22 | > to the project’s source code submitted via pull requests. I accept that Kyutai-labs has full discretion to 23 | > review, accept, reject, or request changes to any Contributions I submit, and that submitting 24 | > a pull request does not guarantee its inclusion in the project. 25 | > By submitting a Contribution, I grant Kyutai-labs a perpetual, worldwide license to use, modify, 26 | > reproduce, distribute, and create derivative works based on my Contributions. 27 | > I also agree to assign all patent rights for any inventions or improvements that arise from my Contributions, 28 | > giving the Kyutai-labs full rights to file for and enforce patents. 29 | > I understand that the Kyutai-labs may commercialize, relicense, or exploit the project and my Contributions without further notice or obligation to me. 30 | > I confirm that my Contributions are original and that I have the legal right to grant this license. 31 | > If my Contributions include third-party materials, I will ensure that I have the necessary permissions 32 | > and will disclose this information. I accept that once my Contributions are integrated, they may be altered or removed at the Kyutai-labs’s discretion. 33 | > I acknowledge that I am making these Contributions voluntarily and will not receive any compensation. 34 | > Furthermore, I understand that all Contributions, including mine, are provided on an "as-is" basis, with no warranties. 35 | > By submitting a pull request, I agree to be bound by these terms. 36 | 37 | ## Issues 38 | 39 | Please submit issues on our GitHub repository. 
40 | 41 | ## License 42 | 43 | By contributing to Unmute, you agree that your contributions will be licensed under the MIT license. 44 | See the `LICENSE` file in the root directory of this source tree. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/astral-sh/uv:0.6.17-debian AS build 2 | WORKDIR /app 3 | 4 | ENV UV_COMPILE_BYTECODE=1 UV_LOCKED=1 5 | 6 | RUN --mount=type=bind,source=uv.lock,target=uv.lock \ 7 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 8 | uv run --no-dev echo hello 9 | 10 | COPY . . 11 | ENV HOSTNAME="0.0.0.0" 12 | 13 | HEALTHCHECK --start-period=15s \ 14 | CMD curl --fail http://localhost:80/metrics || exit 1 15 | 16 | FROM build AS prod 17 | # Running through uvicorn directly to be able to deactive the Websocket per message deflate which is slowing 18 | # down the replies by a few ms. 19 | CMD ["uv", "run", "--no-dev", "uvicorn", "unmute.main_websocket:app", "--host", "0.0.0.0", "--port", "80", "--ws-per-message-deflate=false"] 20 | 21 | 22 | FROM build AS hot-reloading 23 | CMD ["uv", "run", "--no-dev", "uvicorn", "unmute.main_websocket:app", "--reload", "--host", "0.0.0.0", "--port", "80", "--ws-per-message-deflate=false"] 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 kyutai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bake_deploy_prod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Exit on error 3 | 4 | uv run unmute/scripts/check_hugging_face_token_not_write.py $HUGGING_FACE_HUB_TOKEN 5 | 6 | expected_branch="prod" 7 | 8 | current_branch=$(git rev-parse --abbrev-ref HEAD) 9 | if [[ "$current_branch" != "$expected_branch" ]]; then 10 | echo "❌ You are on branch '$current_branch'. Please switch to '$expected_branch' before deploying." 11 | exit 1 12 | fi 13 | 14 | if [[ -n $(git status --porcelain) ]]; then 15 | echo "❌ You have uncommitted changes. Please commit or stash them before deploying." 
16 | exit 1 17 | fi 18 | 19 | set -x # Print commands 20 | 21 | export DOMAIN=unmute.sh 22 | # Note that using non-Mistral models also requires changing the vLLM args in ./swarm-deploy.yml 23 | export KYUTAI_LLM_MODEL=mistralai/Mistral-Small-3.2-24B-Instruct-2506 24 | export DOCKER_HOST=ssh://root@${DOMAIN} 25 | 26 | echo "If you get an connection error, do: ssh root@${DOMAIN}" 27 | 28 | docker buildx bake -f ./swarm-deploy.yml --allow=ssh --push 29 | docker stack deploy --with-registry-auth --prune --compose-file ./swarm-deploy.yml llm-wrapper 30 | -------------------------------------------------------------------------------- /bake_deploy_staging.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | uv run unmute/scripts/check_hugging_face_token_not_write.py $HUGGING_FACE_HUB_TOKEN 5 | 6 | export DOMAIN=unmute-staging.kyutai.io 7 | export KYUTAI_LLM_MODEL=google/gemma-3-4b-it 8 | export DOCKER_HOST=ssh://root@${DOMAIN} 9 | 10 | echo "If you get an connection error, do: ssh root@${DOMAIN}" 11 | 12 | docker buildx bake -f ./swarm-deploy.yml --allow=ssh --push 13 | docker stack deploy --with-registry-auth --prune --compose-file ./swarm-deploy.yml llm-wrapper 14 | docker service scale -d llm-wrapper_tts=1 llm-wrapper_llm=1 15 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # See NOTE comments for places to modify. 2 | services: 3 | traefik: 4 | image: traefik:v3.3.1 5 | command: 6 | # Swarm provider configuration 7 | - "--providers.docker=true" 8 | - "--providers.docker.exposedbydefault=false" 9 | 10 | # This is set up for HTTP. If you want HTTPS support for production, use Docker Swarm 11 | # (check out swarm-deploy.yml) or ask ChatGPT to modify this file for you. 
12 | - "--entrypoints.web.address=:80" 13 | ports: 14 | - "80:80" 15 | volumes: 16 | - "/var/run/docker.sock:/var/run/docker.sock:ro" 17 | 18 | frontend: 19 | image: unmute-frontend:latest 20 | build: 21 | context: frontend/ 22 | dockerfile: hot-reloading.Dockerfile 23 | volumes: 24 | - ./frontend/src:/app/src 25 | labels: 26 | - "traefik.enable=true" 27 | - "traefik.http.routers.frontend.rule=PathPrefix(`/`)" 28 | - "traefik.http.routers.frontend.entrypoints=web" 29 | - "traefik.http.services.frontend.loadbalancer.server.port=3000" 30 | - "traefik.http.routers.frontend.priority=10" # lowest priority 31 | 32 | backend: 33 | image: unmute-backend:latest 34 | build: 35 | context: ./ 36 | target: hot-reloading 37 | volumes: 38 | - ./unmute:/app/unmute 39 | environment: 40 | - KYUTAI_STT_URL=ws://stt:8080 41 | - KYUTAI_TTS_URL=ws://tts:8080 42 | - KYUTAI_LLM_URL=http://llm:8000 43 | - NEWSAPI_API_KEY=$NEWSAPI_API_KEY 44 | labels: 45 | - "traefik.enable=true" 46 | - "traefik.http.routers.backend.rule=PathPrefix(`/api`)" 47 | - "traefik.http.routers.backend.middlewares=strip-api" 48 | - "traefik.http.middlewares.strip-api.replacepathregex.regex=^/api/(.*)" 49 | - "traefik.http.middlewares.strip-api.replacepathregex.replacement=/$$1" 50 | - "traefik.http.routers.backend.entrypoints=web" 51 | - "traefik.http.services.backend.loadbalancer.server.port=80" 52 | - "traefik.http.routers.backend.priority=100" # higher priority than frontend 53 | - "prometheus-port=80" 54 | 55 | tts: 56 | image: moshi-server:latest 57 | command: ["worker", "--config", "configs/tts.toml"] 58 | build: 59 | context: services/moshi-server 60 | dockerfile: public.Dockerfile 61 | environment: 62 | - HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN 63 | volumes: 64 | - ./volumes/hf-cache:/root/.cache/huggingface 65 | - ./volumes/cargo-registry-tts:/root/.cargo/registry 66 | - ./volumes/tts-target:/app/target 67 | - ./volumes/uv-cache:/root/.cache/uv 68 | - /tmp/models/:/models 69 | - 
./volumes/tts-logs:/logs 70 | deploy: 71 | resources: 72 | reservations: 73 | devices: 74 | - driver: nvidia 75 | count: all 76 | capabilities: [gpu] 77 | 78 | stt: 79 | image: moshi-server:latest 80 | command: ["worker", "--config", "configs/stt.toml"] 81 | build: 82 | context: services/moshi-server 83 | dockerfile: public.Dockerfile 84 | environment: 85 | - HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN 86 | volumes: 87 | - ./volumes/hf-cache:/root/.cache/huggingface 88 | - ./volumes/cargo-registry-stt:/root/.cargo/registry 89 | - ./volumes/stt-target:/app/target 90 | - ./volumes/uv-cache:/root/.cache/uv 91 | - /tmp/models/:/models 92 | - ./volumes/stt-logs:/logs 93 | deploy: 94 | resources: 95 | reservations: 96 | devices: 97 | - driver: nvidia 98 | count: all 99 | capabilities: [gpu] 100 | 101 | llm: 102 | image: vllm/vllm-openai:v0.9.1 103 | command: 104 | [ 105 | # NOTE: Change the LLM here if you want. 106 | # (caution: gemma-3-1b-it also exists but it's slow on vLLM: https://github.com/vllm-project/vllm/issues/19575) 107 | "--model=meta-llama/Llama-3.2-1B-Instruct", 108 | # NOTE: You can adapt this based on your GPU memory. 109 | # A higher value takes more memory but supports longer conversations. 110 | "--max-model-len=1536", 111 | "--dtype=bfloat16", 112 | # NOTE: Change this based on your GPU memory. 113 | # A higher value can make inference faster. 
114 | "--gpu-memory-utilization=0.4", 115 | ] 116 | volumes: 117 | - ./volumes/hf-cache:/root/.cache/huggingface 118 | - ./volumes/vllm-cache:/root/.cache/vllm 119 | environment: 120 | - HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN 121 | deploy: 122 | resources: 123 | reservations: 124 | devices: 125 | - driver: nvidia 126 | count: all 127 | capabilities: [gpu] 128 | 129 | networks: 130 | default: -------------------------------------------------------------------------------- /dockerless/.gitignore: -------------------------------------------------------------------------------- 1 | # This is part of a hack to get dependencies needed for the TTS Rust server, because it integrates a Python component. 2 | pyproject.toml 3 | uv.lock -------------------------------------------------------------------------------- /dockerless/start_backend.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "$0")/.." 4 | 5 | uv run uvicorn unmute.main_websocket:app --reload --host 0.0.0.0 --port 8000 --ws-per-message-deflate=false 6 | -------------------------------------------------------------------------------- /dockerless/start_frontend.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "$0")/.." 4 | 5 | cd frontend 6 | pnpm install 7 | pnpm env use --global lts 8 | pnpm dev 9 | -------------------------------------------------------------------------------- /dockerless/start_llm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "$0")/.." 
4 | 5 | uv tool run vllm@v0.9.1 serve \ 6 | --model=google/gemma-3-1b-it \ 7 | --max-model-len=8192 \ 8 | --dtype=bfloat16 \ 9 | --gpu-memory-utilization=0.3 \ 10 | --port=8091 11 | -------------------------------------------------------------------------------- /dockerless/start_stt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "$0")/.." 4 | 5 | # A fix for building Sentencepiece on GCC 15, see: https://github.com/google/sentencepiece/issues/1108 6 | export CXXFLAGS="-include cstdint" 7 | 8 | cargo install --features cuda moshi-server@0.6.4 9 | moshi-server worker --config services/moshi-server/configs/stt.toml --port 8090 10 | -------------------------------------------------------------------------------- /dockerless/start_tts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd "$(dirname "$0")/" 4 | 5 | # This is part of a hack to get dependencies needed for the TTS Rust server, because it integrates a Python component 6 | [ -f pyproject.toml ] || wget https://raw.githubusercontent.com/kyutai-labs/moshi/9837ca328d58deef5d7a4fe95a0fb49c902ec0ae/rust/moshi-server/pyproject.toml 7 | [ -f uv.lock ] || wget https://raw.githubusercontent.com/kyutai-labs/moshi/9837ca328d58deef5d7a4fe95a0fb49c902ec0ae/rust/moshi-server/uv.lock 8 | 9 | uv venv 10 | source .venv/bin/activate 11 | 12 | cd .. 13 | 14 | # This env var must be set to get the correct environment for the Rust build. 15 | # Must be set before running `cargo install`! 16 | # If you don't have it, you'll see an error like `no module named 'huggingface_hub'` 17 | # or similar, which means you don't have the necessary Python packages installed. 
18 | export LD_LIBRARY_PATH=$(python -c 'import sysconfig; print(sysconfig.get_config_var("LIBDIR"))') 19 | 20 | # A fix for building Sentencepiece on GCC 15, see: https://github.com/google/sentencepiece/issues/1108 21 | export CXXFLAGS="-include cstdint" 22 | 23 | # If you already have moshi-server installed and things are not working because of the LD_LIBRARY_PATH issue, 24 | # you might have to force a rebuild with --force. 25 | cargo install --features cuda moshi-server@0.6.4 26 | 27 | # If you're getting `moshi-server: error: unrecognized arguments: worker`, it means you're 28 | # using the binary from the `moshi` Python package rather than from the Rust package. 29 | # Use `pip install moshi --upgrade` to update the Python package to >=0.2.8. 30 | uv run --locked --project ./dockerless moshi-server worker --config services/moshi-server/configs/tts.toml --port 8089 31 | -------------------------------------------------------------------------------- /docs/browser_backend_communication.md: -------------------------------------------------------------------------------- 1 | # Browser-backend communication protocol 2 | 3 | This document explains how the browser frontend and backend service communicate through WebSocket connections in the Unmute system. 4 | 5 | ## Overview 6 | 7 | Unmute uses a WebSocket-based protocol inspired by the [OpenAI Realtime API](https://platform.openai.com/docs/api-reference/realtime) for real-time voice conversations. The protocol handles: 8 | 9 | - Real-time audio streaming (bidirectional) 10 | - Voice conversation transcription 11 | - Session configuration 12 | - Error handling and debugging 13 | 14 | ## WebSocket connection 15 | 16 | ### Endpoint 17 | - **URL**: `/v1/realtime` 18 | - **Protocol**: `realtime` (specified in WebSocket subprotocol) 19 | - **Port**: 8000 (development), routed through Traefik in Docker Swarm and Compose. Traefik uses http (port 80) and https (port 443). 
20 | 21 | ### Connection setup 22 | 23 | The WebSocket connection is established using the `realtime` subprotocol. See implementation details in: 24 | - **Frontend**: [`frontend/src/app/Unmute.tsx`](../frontend/src/app/Unmute.tsx) 25 | - **Backend**: [`unmute/main_websocket.py`](../unmute/main_websocket.py) 26 | 27 | ## Message protocol 28 | 29 | All messages are JSON-encoded with a common structure defined in [`unmute/openai_realtime_api_events.py`](../unmute/openai_realtime_api_events.py). 30 | 31 | ### Base message structure 32 | 33 | All messages inherit from [`BaseEvent`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L32-L50) which provides a common type and event_id structure. 34 | 35 | ## Client → server messages 36 | 37 | ### 1. Audio input streaming 38 | 39 | **Message Type**: `input_audio_buffer.append` 40 | 41 | **Purpose**: Stream real-time audio data from microphone to backend 42 | 43 | **Model**: [`InputAudioBufferAppend`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L80-L81) 44 | 45 | **Audio Format**: 46 | - **Codec**: Opus 47 | - **Sample Rate**: 24kHz 48 | - **Channels**: Mono 49 | - **Encoding**: Base64-encoded bytes 50 | 51 | ### 2. Session configuration 52 | 53 | **Message Type**: `session.update` 54 | 55 | **Purpose**: Configure voice character and conversation instructions. The backend will not start sending messages until it gets a session.update message that sets its instructions. 56 | 57 | **Models**: 58 | - [`SessionUpdate`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L72-L73) 59 | - [`SessionConfig`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L66-L69) 60 | 61 | ## Server → client messages 62 | 63 | ### 1. 
Audio response streaming 64 | 65 | **Message Type**: `response.audio.delta` 66 | 67 | **Purpose**: Stream generated speech audio to frontend 68 | 69 | **Model**: [`ResponseAudioDelta`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L133-L134) 70 | 71 | ### 2. Speech transcription 72 | 73 | **Message Type**: `conversation.item.input_audio_transcription.delta` 74 | 75 | **Purpose**: Real-time transcription of user speech 76 | 77 | **Model**: [`ConversationItemInputAudioTranscriptionDelta`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L147-L151) 78 | 79 | ### 3. Text response streaming 80 | 81 | **Message Type**: `response.text.delta` 82 | 83 | **Purpose**: Stream generated text responses (for display/debugging) 84 | 85 | **Model**: [`ResponseTextDelta`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L125-L126) 86 | 87 | ### 4. Speech detection events 88 | 89 | **Message Types**: 90 | - `input_audio_buffer.speech_started` 91 | - `input_audio_buffer.speech_stopped` 92 | 93 | **Purpose**: Indicate when user starts/stops speaking (for UI feedback). In Unmute we actually just ignore these events at the moment, even though we report them. 94 | 95 | **Models**: 96 | - [`InputAudioBufferSpeechStarted`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L95-L105) 97 | - [`InputAudioBufferSpeechStopped`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L108-L111) 98 | 99 | ### 5. 
Response status updates 100 | 101 | **Message Type**: `response.created` 102 | 103 | **Purpose**: Indicate when assistant starts generating a response 104 | 105 | **Models**: 106 | - [`ResponseCreated`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L121-L122) 107 | - [`Response`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L114-L118) 108 | 109 | ### 6. Error handling 110 | 111 | **Message Type**: `error` 112 | 113 | **Purpose**: Communicate errors and warnings 114 | 115 | **Models**: 116 | - [`Error`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L62-L63) 117 | - [`ErrorDetails`](https://github.com/kyutai-labs/unmute/blob/main/unmute/openai_realtime_api_events.py#L53-L59) 118 | 119 | ## Connection lifecycle 120 | 121 | 1. **Health Check**: Frontend checks `/v1/health` endpoint 122 | 2. **WebSocket Connection**: Establish connection with `realtime` protocol 123 | 3. **Session Setup**: Send `session.update` with voice and instructions 124 | 4. **Audio Streaming**: Bidirectional real-time audio communication 125 | 5. **Graceful Shutdown**: Handle disconnection and cleanup 126 | 127 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.* 7 | .yarn/* 8 | !.yarn/patches 9 | !.yarn/plugins 10 | !.yarn/releases 11 | !.yarn/versions 12 | 13 | # testing 14 | /coverage 15 | 16 | # next.js 17 | /.next/ 18 | /out/ 19 | 20 | # production 21 | /build 22 | 23 | # misc 24 | .DS_Store 25 | *.pem 26 | 27 | # debug 28 | npm-debug.log* 29 | yarn-debug.log* 30 | yarn-error.log* 31 | .pnpm-debug.log* 32 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker.io/docker/dockerfile:1 2 | # Taken from: https://github.com/vercel/next.js/tree/aebf26f7923c7c4da7734798048bf48d2e57b521/examples/with-docker 3 | 4 | FROM node:18-alpine AS base 5 | 6 | # Install dependencies only when needed 7 | FROM base AS deps 8 | # Check https://github.com/nodejs/docker-node/tree/b4117f9333da4138b03a546ec926ef50a31506c3#nodealpine to understand why libc6-compat might be needed. 9 | RUN apk add --no-cache libc6-compat 10 | WORKDIR /app 11 | 12 | # Install dependencies based on the preferred package manager 13 | COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* .npmrc* ./ 14 | RUN corepack enable pnpm && pnpm i --frozen-lockfile 15 | 16 | 17 | # Rebuild the source code only when needed 18 | FROM base AS builder 19 | WORKDIR /app 20 | COPY --from=deps /app/node_modules ./node_modules 21 | COPY . . 22 | 23 | # Next.js collects completely anonymous telemetry data about general usage. 24 | # Learn more here: https://nextjs.org/telemetry 25 | # Uncomment the following line in case you want to disable telemetry during the build. 
26 | ENV NEXT_TELEMETRY_DISABLED=1 27 | 28 | ENV NEXT_PUBLIC_IN_DOCKER=true 29 | 30 | RUN corepack enable pnpm && pnpm run build 31 | 32 | # Production image, copy all the files and run next 33 | FROM base AS runner 34 | WORKDIR /app 35 | 36 | ENV NODE_ENV=production 37 | # Telemetry is disabled during runtime; remove the following line to re-enable it. 38 | ENV NEXT_TELEMETRY_DISABLED=1 39 | 40 | RUN apk add --no-cache curl 41 | 42 | RUN addgroup --system --gid 1001 nodejs 43 | RUN adduser --system --uid 1001 nextjs 44 | 45 | COPY --from=builder /app/public ./public 46 | 47 | # Automatically leverage output traces to reduce image size 48 | # https://nextjs.org/docs/advanced-features/output-file-tracing 49 | COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./ 50 | COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static 51 | 52 | USER nextjs 53 | 54 | EXPOSE 3000 55 | 56 | ENV PORT=3000 57 | 58 | HEALTHCHECK --start-period=15s \ 59 | CMD curl --fail http://localhost:3000/ || exit 1 60 | 61 | # server.js is created by next build from the standalone output 62 | # https://nextjs.org/docs/pages/api-reference/config/next-config-js/output 63 | ENV HOSTNAME="0.0.0.0" 64 | CMD ["node", "server.js"] -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | # Unmute frontend 2 | 3 | This is the frontend for Unmute, written in Next.js. 
4 | 5 | Use `pnpm` to install: 6 | 7 | ```bash 8 | pnpm install 9 | # if you don't have Node: 10 | pnpm env use --global lts 11 | ``` 12 | 13 | Then run: 14 | 15 | ```bash 16 | pnpm run dev 17 | ``` 18 | -------------------------------------------------------------------------------- /frontend/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import { dirname } from "path"; 2 | import { fileURLToPath } from "url"; 3 | import { FlatCompat } from "@eslint/eslintrc"; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = dirname(__filename); 7 | 8 | const compat = new FlatCompat({ 9 | baseDirectory: __dirname, 10 | }); 11 | 12 | const eslintConfig = [ 13 | ...compat.extends("next/core-web-vitals", "next/typescript"), 14 | { 15 | rules: { 16 | '@next/next/no-img-element': 'off', 17 | }, 18 | }, 19 | ]; 20 | 21 | export default eslintConfig; 22 | -------------------------------------------------------------------------------- /frontend/hot-reloading.Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker.io/docker/dockerfile:1 2 | 3 | FROM node:18-alpine AS dev 4 | 5 | # Install required dependencies 6 | RUN apk add --no-cache libc6-compat curl 7 | 8 | # Set working directory 9 | WORKDIR /app 10 | 11 | # Install dependencies using the package manager (detected automatically via lockfile) 12 | COPY package.json tsconfig.json yarn.lock* package-lock.json* pnpm-lock.yaml* .npmrc* postcss.config.mjs ./ 13 | COPY public/ ./public/ 14 | RUN corepack enable pnpm && pnpm i --frozen-lockfile 15 | 16 | # Expose the port the dev server runs on 17 | EXPOSE 3000 18 | 19 | # Set environment variables 20 | ENV NODE_ENV=development 21 | ENV NEXT_TELEMETRY_DISABLED=1 22 | ENV HOSTNAME=0.0.0.0 23 | ENV PORT=3000 24 | ENV NEXT_PUBLIC_IN_DOCKER=true 25 | 26 | HEALTHCHECK --start-period=15s \ 27 | CMD curl --fail http://localhost:3000/ || exit 1 28 | 29 | 
# The source code will be mounted as a volume, so no need to copy it here 30 | # Default command to run the development server with hot reloading 31 | CMD ["pnpm", "dev"] 32 | -------------------------------------------------------------------------------- /frontend/next.config.ts: -------------------------------------------------------------------------------- 1 | import createMDX from "@next/mdx"; 2 | import type { NextConfig } from "next"; 3 | 4 | const nextConfig: NextConfig = { 5 | output: "standalone", // For Docker 6 | // Configure `pageExtensions` to include markdown and MDX files 7 | pageExtensions: ["js", "jsx", "md", "mdx", "ts", "tsx"], 8 | }; 9 | 10 | const withMDX = createMDX({ 11 | // markdown plugins go here 12 | }); 13 | 14 | // Merge MDX config with Next.js config 15 | export default withMDX(nextConfig); 16 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tts-demo", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@mdx-js/loader": "^3.1.0", 13 | "@mdx-js/react": "^3.1.0", 14 | "@next/mdx": "^15.3.4", 15 | "@next/third-parties": "^15.3.2", 16 | "@types/http-proxy": "^1.17.16", 17 | "@types/mdx": "^2.0.13", 18 | "bcryptjs": "^3.0.2", 19 | "clsx": "^2.1.1", 20 | "http-proxy": "^1.18.1", 21 | "lucide-react": "^0.503.0", 22 | "next": "15.2.2", 23 | "opus-recorder": "^8.0.5", 24 | "pretty-print-json": "^3.0.4", 25 | "react": "^19.0.0", 26 | "react-dom": "^19.0.0", 27 | "react-use-websocket": "^4.13.0" 28 | }, 29 | "devDependencies": { 30 | "@eslint/eslintrc": "^3", 31 | "@next/eslint-plugin-next": "^15.3.2", 32 | "@tailwindcss/postcss": "^4", 33 | "@types/bcrypt": "^5.0.2", 34 | "@types/node": "^20", 35 | "@types/react": "^19", 36 | 
"@types/react-dom": "^19", 37 | "eslint": "^9.24.0", 38 | "eslint-config-next": "15.2.2", 39 | "eslint-plugin-react-hooks": "^5.2.0", 40 | "tailwindcss": "^4", 41 | "typescript": "^5" 42 | }, 43 | "packageManager": "pnpm@10.7.1+sha512.2d92c86b7928dc8284f53494fb4201f983da65f0fb4f0d40baafa5cf628fa31dae3e5968f12466f17df7e97310e30f343a648baea1b9b350685dafafffdf5808" 44 | } 45 | -------------------------------------------------------------------------------- /frontend/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | const config = { 2 | plugins: ["@tailwindcss/postcss"], 3 | }; 4 | 5 | export default config; 6 | -------------------------------------------------------------------------------- /frontend/public/decoderWorker.min.wasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyutai-labs/unmute/d85fbf05a90d2f019a10bf521c1c029d3208b280/frontend/public/decoderWorker.min.wasm -------------------------------------------------------------------------------- /frontend/src/app/CouldNotConnect.tsx: -------------------------------------------------------------------------------- 1 | import clsx from "clsx"; 2 | import UnmuteHeader from "./UnmuteHeader"; 3 | 4 | export type HealthStatus = { 5 | connected: "no" | "yes_request_ok" | "yes_request_fail"; 6 | ok: boolean; 7 | tts_up?: boolean; 8 | stt_up?: boolean; 9 | llm_up?: boolean; 10 | voice_cloning_up?: boolean; 11 | }; 12 | 13 | const renderServiceStatus = ( 14 | name: string, 15 | status: string | boolean | undefined, 16 | necessary: boolean = true 17 | ) => { 18 | if (status === undefined) { 19 | status = "Unknown"; 20 | } else if (status === true) { 21 | status = "Up"; 22 | } else if (status === false) { 23 | status = "Down"; 24 | } 25 | 26 | return ( 27 |

28 | {name}:{" "} 29 | 38 | {status} 39 | 40 |

41 | ); 42 | }; 43 | 44 | const humanReadableStatus = { 45 | no: "Down", 46 | yes_request_ok: "Up", 47 | yes_request_fail: "Up, but with errors", 48 | }; 49 | 50 | const CouldNotConnect = ({ healthStatus }: { healthStatus: HealthStatus }) => { 51 | if (healthStatus.ok) { 52 | return null; 53 | } 54 | 55 | return ( 56 |
57 | 58 |
59 |

{"Couldn't connect :("}

60 |

Service status:

61 | {renderServiceStatus( 62 | "Backend", 63 | humanReadableStatus[healthStatus.connected] 64 | )} 65 | {renderServiceStatus("STT", healthStatus.stt_up)} 66 | {renderServiceStatus("LLM", healthStatus.llm_up)} 67 | {renderServiceStatus("TTS", healthStatus.tts_up)} 68 | {renderServiceStatus( 69 | "Voice cloning", 70 | healthStatus.voice_cloning_up, 71 | false 72 | )} 73 |
74 |
75 | ); 76 | }; 77 | 78 | export default CouldNotConnect; 79 | -------------------------------------------------------------------------------- /frontend/src/app/ErrorMessages.tsx: -------------------------------------------------------------------------------- 1 | import React, { useEffect } from "react"; 2 | import { X } from "lucide-react"; 3 | 4 | export interface ErrorItem { 5 | id: string; 6 | message: string; 7 | timestamp: number; 8 | } 9 | 10 | export const makeErrorItem = (message: string): ErrorItem => { 11 | const timestamp = Date.now(); 12 | return { 13 | id: `${timestamp}-${Math.random()}`, 14 | message, 15 | timestamp, 16 | }; 17 | }; 18 | 19 | const ERROR_TIMEOUT_SEC = 10; 20 | 21 | export default function ErrorMessages({ 22 | errors, 23 | setErrors, 24 | }: { 25 | errors: ErrorItem[]; 26 | setErrors: React.Dispatch>; 27 | }) { 28 | // Auto-dismiss errors after 10 seconds 29 | useEffect(() => { 30 | const interval = setInterval(() => { 31 | setErrors((prev) => { 32 | const now = Date.now(); 33 | const filtered = prev.filter( 34 | (error) => now - error.timestamp < ERROR_TIMEOUT_SEC * 1000 35 | ); 36 | return filtered; 37 | }); 38 | }, 1000); 39 | 40 | return () => clearInterval(interval); 41 | }, [setErrors]); 42 | 43 | const handleDismiss = (index: number, errorId: string) => { 44 | setErrors((prev) => prev.filter((error) => error.id !== errorId)); 45 | }; 46 | 47 | if (errors.length === 0) { 48 | return null; 49 | } 50 | 51 | return ( 52 |
53 | {errors.map((error, index) => ( 54 |
59 |
60 |
61 |

62 | {error.message} 63 |

64 |
65 | 72 |
73 |
74 | ))} 75 |
76 | ); 77 | } 78 | -------------------------------------------------------------------------------- /frontend/src/app/PositionedAudioVisualizer.tsx: -------------------------------------------------------------------------------- 1 | import clsx from "clsx"; 2 | import { ChatMessage } from "./chatHistory"; 3 | import { useAudioVisualizerCircle } from "./useAudioVisualizerCircle"; 4 | import { useEffect, useRef } from "react"; 5 | 6 | const PositionedAudioVisualizer = ({ 7 | chatHistory, 8 | role, 9 | analyserNode, 10 | isConnected, 11 | onCircleClick, 12 | }: { 13 | chatHistory: ChatMessage[]; 14 | role: "user" | "assistant"; 15 | analyserNode: AnalyserNode | null; 16 | isConnected: boolean; 17 | onCircleClick?: () => void; 18 | }) => { 19 | const canvasRef = useRef(null); 20 | const isAssistant = role === "assistant"; 21 | 22 | useAudioVisualizerCircle(canvasRef, { 23 | chatHistory, 24 | role, 25 | analyserNode, 26 | isConnected, 27 | showPlayButton: !!onCircleClick, 28 | clearCanvas: true, 29 | }); 30 | 31 | // Resize the canvas to fit its parent element 32 | useEffect(() => { 33 | const canvas = canvasRef.current; 34 | if (!canvas) return; 35 | 36 | const parent = canvas.parentElement; 37 | if (!parent) return; 38 | 39 | const size = Math.min(parent.clientWidth, parent.clientHeight); 40 | 41 | // If we don't do this `if` check, the recording ends up with flickering 42 | if (canvas.width !== size || canvas.height !== size) { 43 | canvas.width = size; 44 | canvas.height = size; 45 | } 46 | }); 47 | 48 | return ( 49 |
57 |
62 | 69 |
70 |
71 | ); 72 | }; 73 | 74 | export default PositionedAudioVisualizer; 75 | -------------------------------------------------------------------------------- /frontend/src/app/SingleRoleSubtitles.tsx: -------------------------------------------------------------------------------- 1 | import clsx from "clsx"; 2 | import React, { useCallback, useEffect, useRef, useState } from "react"; 3 | 4 | const SingleRoleSubtitles = ({ 5 | text, 6 | role, 7 | nLines = 3, 8 | }: { 9 | text: string; 10 | role: "user" | "assistant"; 11 | nLines?: number; 12 | }) => { 13 | const containerRef = useRef(null); 14 | const [displayText, setDisplayText] = useState([]); 15 | const [previousText, setPreviousText] = useState(""); 16 | 17 | const updateDisplayText = useCallback(() => { 18 | if (!containerRef.current) return; 19 | 20 | const container = containerRef.current; 21 | const containerWidth = container.clientWidth; 22 | 23 | // Create a temporary span to measure text width 24 | const tempSpan = document.createElement("span"); 25 | tempSpan.style.visibility = "hidden"; 26 | tempSpan.style.position = "absolute"; 27 | tempSpan.style.whiteSpace = "nowrap"; 28 | tempSpan.style.font = window.getComputedStyle(container).font; 29 | document.body.appendChild(tempSpan); 30 | 31 | const words = text.split(" "); 32 | const lines: string[] = []; 33 | let currentLine = ""; 34 | 35 | // Build lines word by word 36 | for (const word of words) { 37 | const testLine = currentLine ? 
`${currentLine} ${word}` : word; 38 | tempSpan.textContent = testLine; 39 | 40 | if (tempSpan.offsetWidth <= containerWidth) { 41 | currentLine = testLine; 42 | } else { 43 | if (currentLine) { 44 | lines.push(currentLine); 45 | currentLine = word; 46 | } else { 47 | // Word is too long for one line 48 | lines.push(word); 49 | currentLine = ""; 50 | } 51 | } 52 | } 53 | 54 | // Add the last line if it's not empty 55 | if (currentLine) { 56 | lines.push(currentLine); 57 | } 58 | 59 | // Remove the temporary span 60 | document.body.removeChild(tempSpan); 61 | 62 | const lastLines = lines.slice(-nLines); 63 | setDisplayText(lastLines); 64 | }, [nLines, text]); 65 | 66 | useEffect(() => { 67 | // If the new text is not a prefix of the old text, reset 68 | if (!text.startsWith(previousText)) { 69 | setDisplayText([]); 70 | } 71 | 72 | setPreviousText(text); 73 | 74 | updateDisplayText(); 75 | }, [previousText, text, updateDisplayText]); 76 | 77 | // Re-calculate when the window resizes 78 | useEffect(() => { 79 | const handleResize = () => { 80 | updateDisplayText(); 81 | }; 82 | 83 | window.addEventListener("resize", handleResize); 84 | return () => { 85 | window.removeEventListener("resize", handleResize); 86 | }; 87 | }, [text, updateDisplayText]); 88 | 89 | return ( 90 | // Apply padding from the outside because otherwise we have to take it into 91 | // account when deciding how to break lines 92 |
98 |
99 | {displayText.map((line, index) => ( 100 |
101 | {line} 102 |
103 | ))} 104 |
105 |
106 | ); 107 | }; 108 | 109 | export default SingleRoleSubtitles; 110 | -------------------------------------------------------------------------------- /frontend/src/app/SlantedButton.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import clsx from "clsx"; 3 | 4 | const SlantedButton = ({ 5 | onClick = () => {}, 6 | children, 7 | kind = "primary", 8 | style, 9 | extraClasses, 10 | }: { 11 | onClick?: () => void; 12 | children: React.ReactNode; 13 | kind?: "primary" | "secondary" | "disabled"; 14 | style?: React.CSSProperties; 15 | extraClasses?: string; 16 | }) => { 17 | const kindToClass = { 18 | primary: "cursor-pointer after:bg-green text-black after:border-green", 19 | secondary: 20 | "cursor-pointer after:bg-darkgray text-white after:border-white after:border-dashed", 21 | disabled: 22 | "cursor-not-allowed after:bg-darkgray text-lightgray after:border-lightgray after:border-dashed", 23 | }; 24 | 25 | return ( 26 | 47 | ); 48 | }; 49 | 50 | export default SlantedButton; 51 | -------------------------------------------------------------------------------- /frontend/src/app/SquareButton.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import clsx from "clsx"; 3 | 4 | const SquareButton = ({ 5 | onClick = () => {}, 6 | children, 7 | kind = "primary", 8 | extraClasses, 9 | }: { 10 | onClick?: () => void; 11 | children: React.ReactNode; 12 | kind?: "primary" | "primaryOff" | "secondary"; 13 | extraClasses?: string; 14 | }) => { 15 | const kindToClass = { 16 | primary: "text-green border-green", 17 | primaryOff: "text-white border-white", 18 | secondary: "text-white border-transparent", 19 | }; 20 | 21 | return ( 22 | 38 | ); 39 | }; 40 | 41 | export default SquareButton; 42 | -------------------------------------------------------------------------------- /frontend/src/app/Subtitles.tsx: 
-------------------------------------------------------------------------------- 1 | import { ChatMessage } from "./chatHistory"; 2 | import SingleRoleSubtitles from "./SingleRoleSubtitles"; 3 | 4 | const Subtitles = ({ chatHistory }: { chatHistory: ChatMessage[] }) => { 5 | const lastAssistantMessage = chatHistory.findLast( 6 | (message) => message.role === "assistant" && message.content !== "" 7 | ); 8 | const lastUserMessage = chatHistory.findLast( 9 | (message) => message.role === "user" && message.content !== "" 10 | ); 11 | 12 | return ( 13 |
14 | 18 | 19 |
20 | ); 21 | }; 22 | 23 | export default Subtitles; 24 | -------------------------------------------------------------------------------- /frontend/src/app/TrimmedAudioPreview.tsx: -------------------------------------------------------------------------------- 1 | import { memo, useRef, useState } from "react"; 2 | import { MIC_RECORDING_FILENAME } from "./VoiceRecorder"; 3 | 4 | const TrimmedAudioPreviewUnmemoized = ({ file }: { file: File }) => { 5 | const audioRef = useRef(null); 6 | const [duration, setDuration] = useState(null); 7 | const maxDurationSec = 10; 8 | 9 | const handleTimeUpdate = () => { 10 | if (audioRef.current && audioRef.current.currentTime >= maxDurationSec) { 11 | // If playing, restart the playhead to 0 so that you can just press the play 12 | // button to play again. If paused, max duration to indicate trimming 13 | audioRef.current.currentTime = audioRef.current.paused 14 | ? maxDurationSec 15 | : 0; 16 | 17 | audioRef.current.pause(); 18 | } 19 | }; 20 | 21 | const handleDurationChange = () => { 22 | setDuration(audioRef.current?.duration || null); 23 | }; 24 | 25 | return ( 26 |
27 | {file.name !== MIC_RECORDING_FILENAME && ( 28 |
29 | Selected file: {file.name} 30 |
31 | )} 32 | {duration && duration > maxDurationSec + 1 && ( 33 |
34 | Note that only the first {maxDurationSec} seconds{" "} 35 | will be used. 36 |
37 | )} 38 |
47 | ); 48 | }; 49 | 50 | // We memoize because otherwise the