├── .gitignore ├── .parcelrc ├── .prettierrc ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── examples └── directToLLMTransports │ ├── README.md │ ├── README.md.bak │ ├── env.example │ ├── index.html │ ├── package-lock.json │ ├── package.json │ ├── src │ ├── app.ts │ ├── styles.css │ └── vite-env.d.ts │ └── tsconfig.json ├── lib ├── media-mgmt │ ├── dailyMediaManager.ts │ └── mediaManager.ts ├── wavtools │ ├── index.js │ └── lib │ │ ├── analysis │ │ ├── audio_analysis.js │ │ └── constants.js │ │ ├── mediastream_recorder.js │ │ ├── wav_packer.js │ │ ├── wav_recorder.js │ │ ├── wav_stream_player.js │ │ └── worklets │ │ ├── audio_processor.js │ │ └── stream_processor.js └── websocket-utils │ └── reconnectingWebSocket.ts ├── package-lock.json ├── package.json └── transports ├── daily ├── CHANGELOG.md ├── LICENSE ├── README.md ├── package.json ├── src │ ├── index.ts │ └── transport.ts └── tsconfig.json ├── gemini-live-websocket-transport ├── LICENSE ├── README.md ├── package.json ├── src │ ├── directToLLMBaseWebSocketTransport.ts │ ├── geminiLiveWebSocketTransport.ts │ └── index.ts └── tsconfig.json ├── openai-realtime-webrtc-transport ├── LICENSE ├── README.md ├── package.json ├── src │ ├── OpenAIRealTimeWebRTCTransport.ts │ └── index.ts └── tsconfig.json ├── small-webrtc-transport ├── CHANGELOG.md ├── LICENSE ├── README.md ├── package.json ├── src │ ├── index.ts │ └── smallWebRTCTransport.ts └── tsconfig.json └── websocket-transport ├── LICENSE ├── README.md ├── package.json ├── proto ├── frames.proto └── generate_typescript.sh ├── src ├── generated │ └── proto │ │ └── frames.ts ├── index.ts └── webSocketTransport.ts └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | pnpm-debug.log* 8 | lerna-debug.log* 9 | 10 | node_modules 11 | dist 12 | dist-ssr 13 | *.local 14 | .parcel-cache 15 | 16 | .env 17 | 18 | # Editor directories and files 19 | .vscode/* 20 | !.vscode/extensions.json 21 | .idea 22 | .DS_Store 23 | *.suo 24 | *.ntvs* 25 | *.njsproj 26 | *.sln 27 | *.sw? 28 | -------------------------------------------------------------------------------- /.parcelrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@parcel/config-default", 3 | "transformers": { 4 | "*.{ts,tsx}": [ 5 | "@parcel/transformer-typescript-tsc" 6 | ] 7 | } 8 | } -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "tabWidth": 2, 4 | "useTabs": false, 5 | "singleQuote": false 6 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing to Pipecat Client Web Trnsports 2 | 3 | We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved: 4 | 5 | 1. **Fork this repository**: Start by forking the Pipecat Client Web Transports repository to your GitHub account. 6 | 7 | 2. **Clone the repository**: Clone your forked repository to your local machine. 8 | ```bash 9 | git clone https://github.com/your-username/pipecat-client-web-transports 10 | ``` 11 | 3. **Create a branch**: For your contribution, create a new branch. 12 | ```bash 13 | git checkout -b your-branch-name 14 | ``` 15 | 4. 
**Make your changes**: Edit or add files as necessary. 16 | 5. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase. 17 | 6. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message. 18 | 19 | ```bash 20 | git commit -m "Description of your changes" 21 | ``` 22 | 23 | 7. **Push your changes**: Push your branch to your forked repository. 24 | 25 | ```bash 26 | git push origin your-branch-name 27 | ``` 28 | 29 | 9. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo. 30 | > Important: Describe the changes you've made clearly! 31 | 32 | Our maintainers will review your PR, and once everything is good, your contributions will be merged! 33 | 34 | # Contributor Covenant Code of Conduct 35 | 36 | ## Our Pledge 37 | 38 | We as members, contributors, and leaders pledge to make participation in our 39 | community a harassment-free experience for everyone, regardless of age, body 40 | size, visible or invisible disability, ethnicity, sex characteristics, gender 41 | identity and expression, level of experience, education, socio-economic status, 42 | nationality, personal appearance, race, caste, color, religion, or sexual 43 | identity and orientation. 44 | 45 | We pledge to act and interact in ways that contribute to an open, welcoming, 46 | diverse, inclusive, and healthy community. 47 | 48 | ## Our Standards 49 | 50 | Examples of behavior that contributes to a positive environment for our 51 | community include: 52 | 53 | - Demonstrating empathy and kindness toward other people 54 | - Being respectful of differing opinions, viewpoints, and experiences 55 | - Giving and gracefully accepting constructive feedback 56 | - Accepting responsibility and apologizing to those affected by our mistakes, 57 | and learning from the experience 58 | - Focusing on what is best not just for us as individuals, but for the overall 59 | community 60 | 61 | Examples of unacceptable behavior include: 62 | 63 | - The use of sexualized language or imagery, and sexual attention or advances of 64 | any kind 65 | - Trolling, insulting or derogatory comments, and personal or political attacks 66 | - Public or private harassment 67 | - Publishing others' private information, such as a physical or email address, 68 | without their explicit permission 69 | - Other conduct which could reasonably be considered inappropriate in a 70 | professional setting 71 | 72 | ## Enforcement Responsibilities 73 | 74 | Community leaders are responsible for clarifying and enforcing our standards of 75 | acceptable behavior and will take appropriate and fair corrective action in 76 | response to any behavior that they deem inappropriate, threatening, offensive, 77 | or harmful. 78 | 79 | Community leaders have the right and responsibility to remove, edit, or reject 80 | comments, commits, code, wiki edits, issues, and other contributions that are 81 | not aligned to this Code of Conduct, and will communicate reasons for moderation 82 | decisions when appropriate. 83 | 84 | ## Scope 85 | 86 | This Code of Conduct applies within all community spaces, and also applies when 87 | an individual is officially representing the community in public spaces. 88 | Examples of representing our community include using an official email address, 89 | posting via an official social media account, or acting as an appointed 90 | representative at an online or offline event. 
91 | 92 | ## Enforcement 93 | 94 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 95 | reported to the community leaders responsible for enforcement at pipecat-ai@daily.co. 96 | All complaints will be reviewed and investigated promptly and fairly. 97 | 98 | All community leaders are obligated to respect the privacy and security of the 99 | reporter of any incident. 100 | 101 | ## Enforcement Guidelines 102 | 103 | Community leaders will follow these Community Impact Guidelines in determining 104 | the consequences for any action they deem in violation of this Code of Conduct: 105 | 106 | ### 1. Correction 107 | 108 | **Community Impact**: Use of inappropriate language or other behavior deemed 109 | unprofessional or unwelcome in the community. 110 | 111 | **Consequence**: A private, written warning from community leaders, providing 112 | clarity around the nature of the violation and an explanation of why the 113 | behavior was inappropriate. A public apology may be requested. 114 | 115 | ### 2. Warning 116 | 117 | **Community Impact**: A violation through a single incident or series of 118 | actions. 119 | 120 | **Consequence**: A warning with consequences for continued behavior. No 121 | interaction with the people involved, including unsolicited interaction with 122 | those enforcing the Code of Conduct, for a specified period of time. This 123 | includes avoiding interactions in community spaces as well as external channels 124 | like social media. Violating these terms may lead to a temporary or permanent 125 | ban. 126 | 127 | ### 3. Temporary Ban 128 | 129 | **Community Impact**: A serious violation of community standards, including 130 | sustained inappropriate behavior. 131 | 132 | **Consequence**: A temporary ban from any sort of interaction or public 133 | communication with the community for a specified period of time. No public or 134 | private interaction with the people involved, including unsolicited interaction 135 | with those enforcing the Code of Conduct, is allowed during this period. 136 | Violating these terms may lead to a permanent ban. 137 | 138 | ### 4. Permanent Ban 139 | 140 | **Community Impact**: Demonstrating a pattern of violation of community 141 | standards, including sustained inappropriate behavior, harassment of an 142 | individual, or aggression toward or disparagement of classes of individuals. 143 | 144 | **Consequence**: A permanent ban from any sort of public interaction within the 145 | community. 146 | 147 | ## Attribution 148 | 149 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 150 | version 2.1, available at 151 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 152 | 153 | Community Impact Guidelines were inspired by 154 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 155 | 156 | For answers to common questions about this code of conduct, see the FAQ at 157 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 158 | [https://www.contributor-covenant.org/translations][translations]. 
159 | 160 | [homepage]: https://www.contributor-covenant.org 161 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 162 | [Mozilla CoC]: https://github.com/mozilla/diversity 163 | [FAQ]: https://www.contributor-covenant.org/faq 164 | [translations]: https://www.contributor-covenant.org/translations 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2024, Daily 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pipecat Client Web Transports 2 | 3 | [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai/client/js/transports/transport) 4 | [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat) 5 | 6 | A mono-repo to house the various supported Transport options to be used with the pipecat-client-web library. Currently, there are four transports: `small-webrtc-transport`, `daily-transport`, `gemini-live-websocket-transport`, and `openai-realtime-webrtc-transport`. 7 | 8 | ## Documentation 9 | 10 | Pipecat Transports are intended to be used in conjunction with a Pipecat web client. 
Please refer to the full Pipecat client documentation [here](https://docs.pipecat.ai/client/introduction) and an overview of the [Transport API here](https://docs.pipecat.ai/client/js/transports/transport) 11 | 12 | ## Current Transports 13 | 14 | ### [SmallWebRTCTransport](/transports/small-webrtc-transport/README.md) 15 | 16 | [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai/client/js/transports/small-webrtc) 17 | [![README](https://img.shields.io/badge/README-goldenrod)](/transports/small-webrtc-transport/README.md) 18 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](https://github.com/pipecat-ai/pipecat/tree/main/examples/p2p-webrtc) 19 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/small-webrtc-transport) 20 | 21 | This Transport creates a peer-to-peer WebRTC connection between the client and the bot process. This Transport is the client-side counterpart to the Pipecat [SmallWebRTCTransport component](https://docs.pipecat.ai/server/services/transport/small-webrtc). 22 | 23 | This is the simplest low-latency audio/video transport for Pipecat. This transport is recommended for local development and demos. Things to be aware of: 24 | - This transport is a direct connection between the client and the bot process. If you need multiple clients to connect to the same bot, you will need to use a different transport. 25 | - For production usage at scale, a distributed WebRTC network that can do edge/mesh routing, has session-level observability and metrics, and can offload recording and other auxiliary services is often useful. 26 | 27 | Typical media flow using a SmallWebRTCTransport: 28 | ``` 29 | ┌──────────────────────────────────────────────────┐ 30 | │ │ 31 | ┌─────────────────────────┐ │ Server ┌─────────┐ │ 32 | │ │ │ │Pipecat │ │ 33 | │ Client │ RTVI Messages │ │Pipeline │ │ 34 | │ │ & │ │ │ 35 | │ ┌────────────────────┐ │ WebRTC Media │ ┌────────────────────┐ media │ ┌─────┐ │ │ 36 | │ │SmallWebRTCTransport│◄─┼────────────────┼─►│SmallWebRTCTransport┼────────────┼─► STT │ │ │ 37 | │ └────────────────────┘ │ │ └───────▲────────────┘ in │ └──┬──┘ │ │ 38 | │ │ │ │ │ │ │ │ 39 | └─────────────────────────┘ │ │ │ ┌──▼──┐ │ │ 40 | │ │ │ │ LLM │ │ │ 41 | │ │ │ └──┬──┘ │ │ 42 | │ │ │ │ │ │ 43 | │ │ │ ┌──▼──┐ │ │ 44 | │ │ media │ │ TTS │ │ │ 45 | │ └─────────────────────────┼─┴─────┘ │ │ 46 | │ out └─────────┘ │ 47 | │ │ 48 | └──────────────────────────────────────────────────┘ 49 | ``` 50 | 51 | ### [DailyTransport](/transports/daily/README.md) 52 | 53 | [![Docs](https://img.shields.io/badge/Documention-blue)](https://docs.pipecat.ai/client/js/transports/daily) 54 | [![README](https://img.shields.io/badge/README-goldenrod)](/transports/daily/README.md) 55 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot) 56 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/daily-transport) 57 | 58 | This Transport uses the [Daily](https://daily.co) audio and video calling service to connect to a bot and stream media over a WebRTC connection. This Transport is the client-side counterpart to the Pipecat [DailyTransport component](https://docs.pipecat.ai/server/services/transport/daily). 
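
As a rough usage sketch only (the zero-argument `DailyTransport` constructor and the placeholder connect endpoint below are assumptions; see the Docs and README badges above for the authoritative API), a DailyTransport is handed to the Pipecat RTVI client like any other transport:

```typescript
import { RTVIClient } from "@pipecat-ai/client-js";
import { DailyTransport } from "@pipecat-ai/daily-transport";

// The client drives connection state, devices, and events; the transport
// moves RTVI messages and WebRTC media over Daily.
const transport = new DailyTransport();

const rtviClient = new RTVIClient({
  transport,
  params: {
    // Hypothetical endpoint: your server route that starts the Pipecat bot
    // and returns the Daily room/token for this session.
    baseUrl: "https://example.com/api",
  },
  enableMic: true,
  enableCam: false,
});

// Top-level await works in an ES module; otherwise wrap in an async function.
await rtviClient.connect();
```
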
59 | 60 | Typical media flow using a DailyTransport: 61 | ``` 62 | 63 | ┌────────────────────────────────────────────┐ 64 | │ │ 65 | ┌───────────────────┐ │ Server ┌─────────┐ │ 66 | │ │ │ │Pipecat │ │ 67 | │ Client │ RTVI Messages │ │Pipeline │ │ 68 | │ │ & │ │ │ │ 69 | │ ┌──────────────┐ │ WebRTC Media │ ┌──────────────┐ media │ ┌─────┐ │ │ 70 | │ │DailyTransport│◄─┼────────────────┼─►│DailyTransport┼────────────┼─► STT │ │ │ 71 | │ └──────────────┘ │ │ └───────▲──────┘ in │ └──┬──┘ │ │ 72 | │ │ │ │ │ │ │ │ 73 | └───────────────────┘ │ │ │ ┌──▼──┐ │ │ 74 | │ │ │ │ LLM │ │ │ 75 | │ │ │ └──┬──┘ │ │ 76 | │ │ │ │ │ │ 77 | │ │ │ ┌──▼──┐ │ │ 78 | │ │ media │ │ TTS │ │ │ 79 | │ └───────────────────┼─┴─────┘ │ │ 80 | │ out └─────────┘ │ 81 | │ │ 82 | └────────────────────────────────────────────┘ 83 | 84 | ``` 85 | 86 | ### [GeminiLiveWebSocketTransport](transports/gemini-live-websocket-transport/README.md) 87 | [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai/client/js/transports/gemini) 88 | [![README](https://img.shields.io/badge/README-goldenrod)](transports/gemini-live-websocket-transport/README.md) 89 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](examples/directToLLMTransports/README.md) 90 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/gemini-live-websocket-transport) 91 | 92 | This Transport extends the [directToLLMBaseWebSocketTransport](transports/gemini-live-websocket-transport/src/directToLLMBaseWebSocketTransport.ts) base class and connects directly to Gemini over a WebSocket connection using the Multimodal Live API. This type of transport is great for trying out a service without needing to build a server component. Just be aware that it is insecure: your Gemini API key must be available client-side, so it is probably not something you want to use in a production app. 93 | 94 | Media flow using a GeminiLiveWebSocketTransport: 95 | ``` 96 | Client Server 97 | ┌────────────────────────────────────┐ 98 | │ │ 99 | │ RTVIClient │ ┌──────────────┐ 100 | │ │ Media over │ │ 101 | │ ┌──────────────────────────────┐ │ WebSocket │ Gemini │ 102 | │ │ GeminiLiveWebSocketTransport │◄─┼────────────────┼─► Server │ 103 | │ └──────────────────────────────┘ │ │ │ 104 | │ │ └──────────────┘ 105 | └────────────────────────────────────┘ 106 | ``` 107 | 108 | ### [OpenAIRealTimeWebRTCTransport](transports/openai-realtime-webrtc-transport/README.md) 109 | [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai/client/js/transports/openai-webrtc) 110 | [![README](https://img.shields.io/badge/README-goldenrod)](transports/openai-realtime-webrtc-transport/README.md) 111 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](examples/directToLLMTransports/README.md) 112 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/openai-realtime-webrtc-transport) 113 | 114 | This Transport connects directly to OpenAI over a WebRTC connection using the RealTime API. This type of transport is great for trying out a service without needing to build a server component. Just be aware that it is insecure: your OpenAI API key must be available client-side, so it is probably not something you want to use in a production app. It does not implement the Ephemeral Token process.
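
For a concrete picture of how these direct-to-LLM transports are driven, here is a condensed sketch based on this repo's `examples/directToLLMTransports` app (options are trimmed, so treat it as illustrative rather than a complete setup). The Gemini transport follows the same pattern, swapping in `GeminiLiveWebsocketTransport` and `GeminiLLMServiceOptions`:

```typescript
import { RTVIClient, RTVIClientOptions } from "@pipecat-ai/client-js";
import {
  OpenAIRealTimeWebRTCTransport,
  OpenAIServiceOptions,
} from "@pipecat-ai/openai-realtime-webrtc-transport";

// The API key is read from a Vite env var and shipped to the browser,
// which is exactly why this setup is only suitable for prototyping.
const service_options: OpenAIServiceOptions = {
  api_key: import.meta.env.VITE_DANGEROUS_OPENAI_API_KEY,
  // settings (instructions, voice, tools, turn_detection, ...) trimmed here;
  // see examples/directToLLMTransports/src/app.ts for a complete object.
  initial_messages: [{ role: "user", content: "Hello" }],
};

const transport = new OpenAIRealTimeWebRTCTransport(service_options);

const opts: RTVIClientOptions = {
  transport,
  params: { baseUrl: "api", requestData: { service_options } },
  enableMic: true,
  enableCam: false,
};
// There is no server to call at connect time, so the example stubs the handler.
opts.customConnectHandler = () => Promise.resolve();

const rtviClient = new RTVIClient(opts);
await rtviClient.connect();
```
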
115 | 116 | Media flow using a OpenAIRealTimeWebRTCTransport: 117 | ``` 118 | Client Server 119 | ┌─────────────────────────────────────┐ 120 | │ │ 121 | │ RTVIClient │ ┌──────────────┐ 122 | │ │ Media over │ │ 123 | │ ┌───────────────────────────────┐ │ WebRTC │ OpenAI │ 124 | │ │ OpenAIRealTimeWebRTCTransport │◄─┼────────────────┼─► Server │ 125 | │ └───────────────────────────────┘ │ │ │ 126 | │ │ └──────────────┘ 127 | └─────────────────────────────────────┘ 128 | ``` 129 | 130 | ## Local Development 131 | 132 | ### Build the transport libraries 133 | 134 | ```bash 135 | $ npm i 136 | $ npm run build 137 | ``` 138 | 139 | ## License 140 | BSD-2 Clause 141 | 142 | ## Contributing 143 | We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help: 144 | 145 | - **Found a bug?** Open an [issue](https://github.com/pipecat-ai/pipecat-client-web-transports/issues) 146 | - **Have a feature idea?** Start a [discussion](https://discord.gg/pipecat) 147 | - **Want to contribute code?** Check our [CONTRIBUTING.md](CONTRIBUTING.md) guide 148 | - **Documentation improvements?** [Docs](https://github.com/pipecat-ai/docs) PRs are always welcome 149 | 150 | Before submitting a pull request, please check existing issues and PRs to avoid duplicates. 151 | 152 | We aim to review all contributions promptly and provide constructive feedback to help get your changes merged. -------------------------------------------------------------------------------- /examples/directToLLMTransports/README.md: -------------------------------------------------------------------------------- 1 | # Pipecat (RTVI) Client Demo for Direct Communication with LLMs 2 | 3 | ## Overview 4 | This application demonstrates a real-time voice interaction system using the RTVI Client library with both the Gemini Multimodal Live and OpenAI RealTime WebRTC integrations. It enables two-way communication between users and the LLM, featuring voice input/output, text messaging, and various audio controls. 5 | 6 | ## Features 7 | - Real-time voice interaction with a Gemini Multimodal Live bot 8 | - Real-time voice interaction with an OpenAI RealTime bot 9 | - Microphone input control and device selection 10 | - Text-based message prompting 11 | - Audio visualization through dynamic speech bubbles 12 | - Comprehensive event handling system 13 | - Connection state management 14 | 15 | ## Prerequisites 16 | - Gemini API key (set as environment variable `VITE_DANGEROUS_GEMINI_API_KEY`) 17 | - OpenAI API key (set as environment variable `VITE_DANGEROUS_OPENAI_API_KEY`) 18 | - Optional [OpenWeather API](https://openweathermap.org/api) key for fetching weather. If none is provided, the app will generate something random. 
19 | - Modern web browser with WebSocket support 20 | - Access to microphone 21 | 22 | ## Dependencies 23 | ``` 24 | # from base folder 25 | $ npm i 26 | $ npm run build 27 | ``` 28 | 29 | 30 | ## Setup and Installation 31 | ``` 32 | npm i 33 | cp env.example .env 34 | # update .env with API keys 35 | 36 | npm run dev 37 | ``` 38 | 39 | ### To run the example with Gemini Multimodal Live: 40 | 41 | Open [http://localhost:5173/](http://localhost:5173/) 42 | 43 | ### To run the example with OpenAI RealTime: 44 | 45 | Open [http://localhost:5173?service=openai](http://localhost:5173?service=openai) 46 | 47 | ## Documentation Reference 48 | - [RTVI Client Documentation](https://docs.pipecat.ai/client/introduction) 49 | - [Gemini Multimodal Live Documentation](https://ai.google.dev/api/multimodal-live) 50 | - [OpenAI RealTime WebRTC Documentation](https://platform.openai.com/docs/guides/realtime-webrtc) 51 | 52 | ## Usage 53 | 54 | ### Initialization 55 | The application automatically initializes when the DOM content is loaded. It sets up: 56 | - Audio device selection 57 | - Microphone controls 58 | - Bot connection management 59 | - Event handlers 60 | 61 | ### Controls 62 | - **Toggle Bot**: Connect/disconnect the AI assistant 63 | - **Mute/Unmute**: Control microphone input 64 | - **Microphone Selection**: Choose input device 65 | - **Text Input**: Send text messages to the bot 66 | 67 | ### Event Handling 68 | The application handles various events including: 69 | - Transport state changes 70 | - Bot connection status 71 | - Audio track management 72 | - Speech detection 73 | - Error handling 74 | - Audio level visualization 75 | 76 | ## Key Components 77 | 78 | ### RTVIClient Configuration 79 | ```typescript 80 | let RTVIConfig: RTVIClientOptions = { 81 | transport, 82 | params: { 83 | baseUrl: "api", 84 | requestData: { }, 85 | }, 86 | enableMic: true, 87 | enableCam: false, 88 | timeout: 30 * 1000, 89 | }; 90 | ``` 91 | 92 | ### Gemini Multimodal Live Service Configuration 93 | ```typescript 94 | const llm_service_options: GeminiLLMServiceOptions = { 95 | api_key: import.meta.env.VITE_DANGEROUS_GEMINI_API_KEY, 96 | model: "models/gemini-2.0-flash-exp", 97 | // ... additional configuration 98 | }; 99 | ``` 100 | 101 | For all service options and their defaults, see [GeminiLLMServiceOptions](../../transports/gemini-live-websocket-transport/src/geminiLiveWebSocketTransport.ts#21) 102 | 103 | ### OpenAI Realtime API Service Configuration 104 | ```typescript 105 | const llm_service_options: OpenAIServiceOptions = { 106 | api_key: import.meta.env.VITE_DANGEROUS_OPENAI_API_KEY, 107 | // ...
additional configuration 108 | }; 109 | ``` 110 | 111 | For all service options and their defaults, see [OpenAIServiceOptions](../../transports/openai-realtime-webrtc-transport/src/OpenAIRealTimeWebRTCTransport.ts#28) 112 | 113 | ## Notes 114 | - Gemini integration currently does not support transcripts 115 | 116 | ## License 117 | BSD-2 Clause 118 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/README.md.bak: -------------------------------------------------------------------------------- 1 | 2 | # Pipecat JavaScript Client SDK Example using a Gemini MultiModal Live Transport 3 | 4 | ## Setup 5 | 6 | Build transport dependencies 7 | 8 | ``` 9 | # from base folder 10 | $ yarn 11 | $ yarn workspace @pipecat-ai/realtime-websocket-transport build 12 | $ yarn workspace @pipecat-ai/gemini-live-websocket-transport 13 | ``` 14 | 15 | ## Install and run locally 16 | 17 | ``` 18 | npm i 19 | npm run dev 20 | 21 | cp env.example .env 22 | # update .env with API keys 23 | ``` 24 | 25 | Open [http://localhost:5173/](http://localhost:5173/) 26 | 27 | ## Demo code 28 | 29 | This is bare-bones LLM voice chat app that sets up an [RTVI Client](https://github.com/pipecat-ai/pipecat-client-web)(Pipecat's client-side JS component) with a [GeminiLiveWebsocketTransport](../../transports/gemini-live-websocket-transport/src/geminiLiveWebSocketTransport.ts) to communicate directly with Google's Multimodal Live API over a websocket connection. 30 | 31 | The application code is all in two files: 32 | 33 | - [index.html](./index.html) 34 | - [src/app.ts](./src/app.ts) 35 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/env.example: -------------------------------------------------------------------------------- 1 | VITE_DANGEROUS_GEMINI_API_KEY= 2 | VITE_DANGEROUS_OPENAI_API_KEY= 3 | VITE_DANGEROUS_OPENWEATHER_API_KEY= -------------------------------------------------------------------------------- /examples/directToLLMTransports/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | My Web App 5 | 6 | 7 | 8 | 9 | 10 |
11 |
Transport state: disconnected
12 |
13 | 14 | 15 |
16 | 17 | 18 |
19 |
20 |
21 | 22 | 28 | 29 |
30 |
31 |
32 |
33 |
34 |
35 | 36 |
37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1116", 3 | "version": "1.0.0", 4 | "main": "server/server.ts", 5 | "scripts": { 6 | "dev": "vite", 7 | "build": "vite build", 8 | "preview": "vite preview" 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "BSD-2-Clause", 13 | "description": "", 14 | "dependencies": { 15 | "@pipecat-ai/client-js": "^0.3.5", 16 | "@pipecat-ai/gemini-live-websocket-transport": "file:../../transports/gemini-live-websocket-transport", 17 | "@pipecat-ai/openai-realtime-webrtc-transport": "file:../../transports/openai-realtime-webrtc-transport", 18 | "dotenv": "^16.4.5", 19 | "express": "^4.21.1", 20 | "morgan": "^1.10.0" 21 | }, 22 | "devDependencies": { 23 | "@types/express": "^5.0.0", 24 | "@types/morgan": "^1.9.9", 25 | "@types/node": "^22.9.0", 26 | "concurrently": "^9.1.0", 27 | "eslint": "^9.15.0", 28 | "nodemon": "^3.1.7", 29 | "ts-node": "^10.9.2", 30 | "ts-node-dev": "^2.0.0", 31 | "typescript": "^5.6.3", 32 | "vite": "^5.4.11" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/src/app.ts: -------------------------------------------------------------------------------- 1 | // Import Types for Gemini Transport 2 | import { 3 | GeminiLiveWebsocketTransport, 4 | GeminiLLMServiceOptions, 5 | } from "@pipecat-ai/gemini-live-websocket-transport"; 6 | 7 | import { 8 | OpenAIRealTimeWebRTCTransport, 9 | OpenAIServiceOptions, 10 | } from "@pipecat-ai/openai-realtime-webrtc-transport"; 11 | 12 | // Import core Pipecat RTVI Client and types 13 | import { 14 | LLMHelper, 15 | FunctionCallParams, 16 | Transport, 17 | RTVIClient, 18 | RTVIEvent, 19 | RTVIMessage, 20 | Participant, 21 | TranscriptData, 22 | BotTTSTextData, 23 | RTVIClientOptions, 24 | } from "@pipecat-ai/client-js"; 25 | 26 | // Global variables for DOM elements and client state 27 | let statusDiv: HTMLElement; 28 | let audioDiv: HTMLDivElement; 29 | let toggleBotButton: HTMLButtonElement; 30 | let submitBtn: HTMLButtonElement; 31 | let rtviClient: RTVIClient; 32 | let llmHelper: LLMHelper; 33 | let botRunning = false; 34 | 35 | // Initialize the application when DOM is fully loaded 36 | document.addEventListener("DOMContentLoaded", () => { 37 | statusDiv = document.getElementById("status")!; 38 | toggleBotButton = document.getElementById("toggleBot")! as HTMLButtonElement; 39 | toggleBotButton.addEventListener("click", () => toggleBot()); 40 | 41 | // Handle microphone device selection 42 | document.getElementById("mic-picker")!.onchange = (e) => { 43 | const target = e.target as HTMLSelectElement; 44 | console.log("user changed device", target, target.value); 45 | rtviClient.updateMic(target.value); 46 | }; 47 | 48 | // Set up mute button functionality 49 | const muteBtn = document.getElementById("toggleMute")!; 50 | muteBtn.addEventListener("click", () => { 51 | muteBtn.textContent = rtviClient.isMicEnabled ? "Unmute Mic" : "Mute Mic"; 52 | rtviClient.enableMic(!rtviClient.isMicEnabled); 53 | }); 54 | 55 | // Set up text submission button 56 | submitBtn = document.getElementById("submit-text")! 
as HTMLButtonElement; 57 | submitBtn.addEventListener("click", () => { 58 | sendUserMessage(); 59 | }); 60 | submitBtn.disabled = true; 61 | 62 | // Initialize the bot 63 | initBot(); 64 | }); 65 | 66 | // Connect / Disconnect from bot 67 | async function toggleBot() { 68 | toggleBotButton.disabled = true; 69 | if (botRunning) { 70 | console.log("disconnecting bot"); 71 | await disconnectBot(); 72 | } else { 73 | console.log("connecting bot"); 74 | await connectBot(); 75 | } 76 | toggleBotButton.textContent = botRunning ? "Disconnect" : "Connect"; 77 | } 78 | 79 | // Initialize the bot with configuration 80 | async function initBot() { 81 | const urlParams = new URLSearchParams(window.location.search); 82 | const service = urlParams.get("service") || "gemini"; 83 | const { transport, service_options } = 84 | service === "gemini" ? initGeminiTransport() : initOpenAITransport(); 85 | 86 | // Configure RTVI client options 87 | let RTVIConfig: RTVIClientOptions = { 88 | transport, 89 | params: { 90 | baseUrl: "api", 91 | requestData: { service_options }, 92 | }, 93 | enableMic: true, 94 | enableCam: false, 95 | timeout: 30 * 1000, 96 | }; 97 | RTVIConfig.customConnectHandler = () => Promise.resolve(); 98 | 99 | // Create new RTVI client instance 100 | rtviClient = new RTVIClient(RTVIConfig); 101 | llmHelper = new LLMHelper({}); 102 | llmHelper.handleFunctionCall(async (fn: FunctionCallParams) => { 103 | return await handleFunctionCall(fn.functionName, fn.arguments); 104 | }); 105 | rtviClient.registerHelper(service, llmHelper); 106 | 107 | // Make RTVI client and transport available globally for debugging 108 | (window as any).client = rtviClient; 109 | 110 | // Set up RTVI event handlers and initialize devices 111 | setupEventHandlers(rtviClient); 112 | await setupDevices(); 113 | } 114 | 115 | // Initialize the Gemini LLM and its service options 116 | function initGeminiTransport() { 117 | // Configure Gemini LLM service options 118 | const llm_service_options: GeminiLLMServiceOptions = { 119 | api_key: import.meta.env.VITE_DANGEROUS_GEMINI_API_KEY, 120 | model: "models/gemini-2.0-flash-exp", 121 | initial_messages: [ 122 | // Set up initial system and user messages. 123 | // Without the user message, the bot will not respond immediately 124 | // and wait for the user to speak first. 125 | { 126 | role: "model", 127 | content: "You are a pencil salesman...", 128 | }, 129 | { role: "user", content: "Hello!" }, 130 | ], 131 | settings: { 132 | speech_config: { 133 | voice_config: { 134 | prebuilt_voice_config: { 135 | // Options are: "Puck" | "Charon" | "Kore" | "Fenrir" | "Aoede" 136 | voice_name: "Charon", 137 | }, 138 | }, 139 | }, 140 | }, 141 | }; 142 | 143 | // Initialize transport 144 | let transport: Transport = new GeminiLiveWebsocketTransport( 145 | llm_service_options 146 | ); 147 | 148 | return { transport, service_options: llm_service_options }; 149 | } 150 | 151 | function initOpenAITransport() { 152 | // Configure OpenAI LLM service options 153 | const llm_service_options: OpenAIServiceOptions = { 154 | api_key: import.meta.env.VITE_DANGEROUS_OPENAI_API_KEY, 155 | settings: { 156 | instructions: "You are a pirate. 
You are looking for buried treasure.", 157 | voice: "echo", 158 | input_audio_noise_reduction: { type: "near_field" }, 159 | turn_detection: { type: "semantic_vad" }, 160 | tools: [ 161 | { 162 | type: "function", 163 | name: "changeBackgroundColor", 164 | description: "Change the background color of the page", 165 | parameters: { 166 | type: "object", 167 | properties: { 168 | color: { 169 | type: "string", 170 | description: "A hex value of the color", 171 | }, 172 | }, 173 | }, 174 | }, 175 | { 176 | type: "function", 177 | name: "getWeather", 178 | description: "Gets the current weather for a given location", 179 | parameters: { 180 | type: "object", 181 | properties: { 182 | location: { 183 | type: "string", 184 | description: "A city or location", 185 | }, 186 | }, 187 | }, 188 | }, 189 | ], 190 | }, 191 | initial_messages: [{ role: "user", content: "Hello" }], 192 | }; 193 | 194 | // Initialize transport 195 | let transport: Transport = new OpenAIRealTimeWebRTCTransport( 196 | llm_service_options 197 | ); 198 | 199 | return { transport, service_options: llm_service_options }; 200 | } 201 | 202 | // Initialize and update available audio devices 203 | async function setupDevices() { 204 | await rtviClient.initDevices(); 205 | const mics = await rtviClient.getAllMics(); 206 | updateMicList(mics); 207 | } 208 | 209 | // Updates the microphone selection dropdown 210 | function updateMicList(mics: MediaDeviceInfo[]) { 211 | const micPicker = document.getElementById("mic-picker")!; 212 | micPicker.replaceChildren(); 213 | const curMic = rtviClient.selectedMic?.deviceId; 214 | mics.forEach((mic) => { 215 | let el = document.createElement("option"); 216 | el.textContent = mic.label; 217 | el.value = mic.deviceId; 218 | micPicker.appendChild(el); 219 | if (mic.deviceId === curMic) { 220 | el.selected = true; 221 | } 222 | }); 223 | } 224 | 225 | // Connect client to Gemini Multimodal Live bot 226 | async function connectBot() { 227 | statusDiv.textContent = "Joining..."; 228 | try { 229 | await rtviClient.connect(); 230 | console.log("READY! 
Let's GO!"); 231 | } catch (e) { 232 | console.error("Error connecting", e); 233 | toggleBotButton.disabled = false; 234 | return; 235 | } 236 | toggleBotButton.disabled = false; 237 | submitBtn.disabled = false; 238 | botRunning = true; 239 | } 240 | 241 | // Disconnect client from Gemini Multimodal Live bot 242 | async function disconnectBot() { 243 | try { 244 | await rtviClient.disconnect(); 245 | } catch (e) { 246 | console.error("Error disconnecting", e); 247 | } 248 | toggleBotButton.disabled = false; 249 | submitBtn.disabled = true; 250 | botRunning = false; 251 | } 252 | 253 | // Set up event handlers for RTVI client 254 | // https://docs.pipecat.ai/client/js/api-reference/callbacks#2-event-listeners 255 | export async function setupEventHandlers(rtviClient: RTVIClient) { 256 | audioDiv = document.getElementById("audio") as HTMLDivElement; 257 | 258 | rtviClient.on(RTVIEvent.TransportStateChanged, (state: string) => { 259 | console.log(`-- transport state change: ${state} --`); 260 | statusDiv.textContent = `Transport state: ${state}`; 261 | if (state === "disconnected") { 262 | botRunning = false; 263 | toggleBotButton.textContent = "Connect"; 264 | } 265 | }); 266 | 267 | rtviClient.on(RTVIEvent.Connected, () => { 268 | console.log("-- user connected --"); 269 | }); 270 | 271 | rtviClient.on(RTVIEvent.Disconnected, () => { 272 | console.log("-- user disconnected --"); 273 | }); 274 | 275 | rtviClient.on(RTVIEvent.BotConnected, () => { 276 | console.log("-- bot connected --"); 277 | }); 278 | 279 | rtviClient.on(RTVIEvent.BotDisconnected, () => { 280 | console.log("--bot disconnected --"); 281 | }); 282 | 283 | rtviClient.on(RTVIEvent.BotReady, () => { 284 | console.log("-- bot ready to chat! --"); 285 | }); 286 | 287 | // For realtime v2v transports, this event will only fire for the 288 | // local participant. 289 | rtviClient.on( 290 | RTVIEvent.TrackStarted, 291 | (track: MediaStreamTrack, participant?: Participant) => { 292 | console.log(" --> track started", participant, track); 293 | if (participant?.local) { 294 | return; 295 | } 296 | let audio = document.createElement("audio"); 297 | audio.srcObject = new MediaStream([track]); 298 | audio.autoplay = true; 299 | audioDiv.appendChild(audio); 300 | } 301 | ); 302 | 303 | // For realtime v2v transports, this event will only fire for the 304 | // local participant. 
305 | rtviClient.on( 306 | RTVIEvent.TrackStopped, 307 | (track: MediaStreamTrack, participant?: Participant) => { 308 | console.log(" --> track stopped", participant, track); 309 | } 310 | ); 311 | 312 | rtviClient.on(RTVIEvent.UserStartedSpeaking, () => { 313 | console.log("-- user started speaking -- "); 314 | }); 315 | 316 | rtviClient.on(RTVIEvent.UserStoppedSpeaking, () => { 317 | console.log("-- user stopped speaking -- "); 318 | }); 319 | 320 | rtviClient.on(RTVIEvent.BotStartedSpeaking, () => { 321 | console.log("-- bot started speaking -- "); 322 | }); 323 | 324 | rtviClient.on(RTVIEvent.BotStoppedSpeaking, () => { 325 | console.log("-- bot stopped speaking -- "); 326 | }); 327 | 328 | // multimodal live does not currently provide transcripts so this will not fire 329 | rtviClient.on(RTVIEvent.UserTranscript, (transcript: TranscriptData) => { 330 | console.log("[EVENT] UserTranscript", transcript); 331 | }); 332 | 333 | // multimodal live does not currently provide transcripts so this will not fire 334 | rtviClient.on(RTVIEvent.BotTtsText, (data: BotTTSTextData) => { 335 | console.log("[EVENT] BotTtsText", data); 336 | }); 337 | 338 | // multimodal live does not currently provide transcripts so this will not fire 339 | rtviClient.on(RTVIEvent.BotTranscript, (data: BotTTSTextData) => { 340 | console.log("[EVENT] BotTranscript", data); 341 | }); 342 | 343 | rtviClient.on(RTVIEvent.Error, (message: RTVIMessage) => { 344 | console.log("[EVENT] RTVI Error!", message); 345 | }); 346 | 347 | rtviClient.on(RTVIEvent.MessageError, (message: RTVIMessage) => { 348 | console.log("[EVENT] RTVI ErrorMessage error!", message); 349 | }); 350 | 351 | // multimodal live does not currently provide metrics so this will not fire 352 | rtviClient.on(RTVIEvent.Metrics, (data) => { 353 | // let's only print out ttfb for now 354 | if (!data.ttfb) { 355 | return; 356 | } 357 | data.ttfb.map((metric) => { 358 | console.log(`[METRICS] ${metric.processor} ttfb: ${metric.value}`); 359 | }); 360 | }); 361 | 362 | rtviClient.on(RTVIEvent.MicUpdated, (mic: MediaDeviceInfo) => { 363 | const micPicker = document.getElementById("mic-picker")!; 364 | for (let i = 0; i < micPicker.children.length; i++) { 365 | let el = micPicker.children[i] as HTMLOptionElement; 366 | el.selected = el.value === mic.deviceId; 367 | } 368 | }); 369 | 370 | rtviClient.on(RTVIEvent.AvailableMicsUpdated, (mics: MediaDeviceInfo[]) => { 371 | updateMicList(mics); 372 | }); 373 | 374 | rtviClient.on(RTVIEvent.LocalAudioLevel, (level: number) => { 375 | updateSpeakerBubble(level, "user"); 376 | }); 377 | rtviClient.on(RTVIEvent.RemoteAudioLevel, (level: number) => { 378 | updateSpeakerBubble(level, "bot"); 379 | }); 380 | } 381 | 382 | // Send user message to bot. 383 | function sendUserMessage() { 384 | const textInput = document.getElementById("text-input")! as HTMLInputElement; 385 | llmHelper.appendToMessages({ role: "user", content: textInput.value }, true); 386 | textInput.value = ""; 387 | } 388 | 389 | // Update the speaker bubble size based on the audio level 390 | function updateSpeakerBubble(level: number, whom: string) { 391 | const volume = level * 100; 392 | const userBubble = document.getElementById( 393 | whom === "user" ? 
"user-bubble" : "bot-bubble" 394 | )!; 395 | // Scale the bubble size based on the volume value 396 | const scale = 1 + volume / 50; // Adjust the divisor to control the scaling effect 397 | userBubble.style.transform = `scale(${scale})`; 398 | } 399 | 400 | function _generateRandomWeather() { 401 | const temperature = Math.random() * 200 - 80; 402 | const humidity = Math.random() * 100; 403 | const conditions = ["sunny", "cloudy", "rainy", "snowy"]; 404 | const condition = conditions[Math.floor(Math.random() * conditions.length)]; 405 | const windSpeed = Math.random() * 50; 406 | const windGusts = windSpeed + Math.random() * 20; 407 | return { 408 | temperature, 409 | humidity, 410 | condition, 411 | windSpeed, 412 | windGusts, 413 | }; 414 | } 415 | 416 | async function handleFunctionCall(functionName: string, args: unknown) { 417 | console.log("[EVENT] LLMFunctionCall", functionName); 418 | const toolFunctions: { [key: string]: any } = { 419 | changeBackgroundColor: ({ color }: { [key: string]: string }) => { 420 | console.log("changing background color to", color); 421 | document.body.style.backgroundColor = color; 422 | return { success: true, color }; 423 | }, 424 | getWeather: async ({ location }: { [key: string]: string }) => { 425 | console.log("getting weather for", location); 426 | const key = import.meta.env.VITE_DANGEROUS_OPENWEATHER_API_KEY; 427 | if (!key) { 428 | const ret = { success: true, weather: _generateRandomWeather() }; 429 | console.log("returning weather", ret); 430 | return ret; 431 | } 432 | const locationReq = await fetch( 433 | `http://api.openweathermap.org/geo/1.0/direct?q=${location}&limit=1&appid=${key}` 434 | ); 435 | const locJson = await locationReq.json(); 436 | const loc = { lat: locJson[0].lat, lon: locJson[0].lon }; 437 | const exclude = ["minutely", "hourly", "daily"].join(","); 438 | const weatherRec = await fetch( 439 | `https://api.openweathermap.org/data/3.0/onecall?lat=${loc.lat}&lon=${loc.lon}&exclude=${exclude}&appid=${key}` 440 | ); 441 | const weather = await weatherRec.json(); 442 | return { success: true, weather: weather.current }; 443 | }, 444 | }; 445 | const toolFunction = toolFunctions[functionName]; 446 | if (toolFunction) { 447 | let result = await toolFunction(args); 448 | console.debug("returning result", result); 449 | return result; 450 | } 451 | } 452 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/src/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | max-width: 800px; 3 | margin: 20px 20px; 4 | font-family: system-ui, -apple-system, sans-serif; 5 | } 6 | 7 | #join-div a { 8 | color: #2563eb; 9 | cursor: pointer; 10 | } 11 | 12 | #connected-div { 13 | width: 90vw; 14 | margin: 20px 2vw; 15 | display: flex; 16 | flex-direction: column; 17 | justify-content: space-between; 18 | } 19 | 20 | #controls { 21 | display: flex; 22 | justify-content: left; 23 | margin: 10px 0; 24 | } 25 | 26 | #controls div { 27 | padding: 5px; 28 | margin: 0 5px; 29 | } 30 | 31 | button { 32 | padding: 5px; 33 | margin: 0 5px; 34 | width: 8em; 35 | border-radius: 10px; 36 | background-color: aliceblue; 37 | } 38 | 39 | button:active { 40 | transform: translateY(1px); /* Move the button down slightly */ 41 | box-shadow: 2px 2px #666; /* Add a shadow to create a pressed effect */ 42 | } 43 | 44 | #text-div { 45 | display: flex; 46 | flex-direction: column; 47 | margin: 10px 0; 48 | } 49 | 50 | #text-div label { 51 | margin: 5px 0; 
52 | } 53 | 54 | #text-div textarea { 55 | margin: 5px 0; 56 | padding: 5px; 57 | width: 50%; 58 | border-radius: 10px; 59 | } 60 | 61 | #bubbles { 62 | margin: 20px 0px; 63 | border-radius: 16px; 64 | display: flex; 65 | flex-direction: row; 66 | } 67 | 68 | .bubble { 69 | width: 50px; 70 | height: 50px; 71 | border-radius: 50%; 72 | transition: transform 0.1s ease; 73 | margin: 15px; 74 | } 75 | 76 | #user-bubble { 77 | background-color: #4caf50; 78 | } 79 | 80 | #bot-bubble { 81 | background-color: #2196f3; 82 | } 83 | 84 | #content h2 { 85 | padding-left: 20px; 86 | } 87 | 88 | #chat-text { 89 | display: flex; 90 | flex-direction: column; 91 | align-items: left; 92 | overflow-y: auto; 93 | padding: 20px; 94 | flex: 1; 95 | } 96 | 97 | .user-message { 98 | display: flex; 99 | flex-direction: column; 100 | background-color: #f0f0f0; 101 | padding: 16px; 102 | margin: 12px 6px; 103 | border-radius: 8px; 104 | line-height: 1.5; 105 | } 106 | 107 | .user-message .interim { 108 | color: #707070; 109 | } 110 | 111 | .user-message::before { 112 | content: "User\A"; 113 | font-size: 14px; 114 | color: #666; 115 | font-weight: 500; 116 | display: block; 117 | padding-bottom: 0.4em; 118 | } 119 | 120 | .assistant-message { 121 | display: flex; 122 | flex-direction: column; 123 | background-color: #ffffff; 124 | border: 1px solid #e0e0e0; 125 | margin: 12px 6px; 126 | padding: 16px; 127 | border-radius: 8px; 128 | line-height: 1.5; 129 | } 130 | 131 | .assistant-message::before { 132 | content: "Assistant"; 133 | font-size: 14px; 134 | color: #666; 135 | font-weight: 500; 136 | display: block; 137 | padding-bottom: 0.4em; 138 | } 139 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/src/vite-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | -------------------------------------------------------------------------------- /examples/directToLLMTransports/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "skipLibCheck": true, 7 | "jsx": "preserve", 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "allowJs": true, 13 | "noEmit": true, 14 | "resolveJsonModule": true, 15 | "isolatedModules": true, 16 | "moduleDetection": "force", 17 | 18 | /* Linting */ 19 | "strict": true, 20 | "noUnusedLocals": true, 21 | "noUnusedParameters": false, 22 | "noFallthroughCasesInSwitch": true 23 | }, 24 | "include": ["src", "lib"] 25 | } 26 | -------------------------------------------------------------------------------- /lib/media-mgmt/dailyMediaManager.ts: -------------------------------------------------------------------------------- 1 | import { MediaManager } from "./mediaManager"; 2 | import { MediaStreamRecorder, WavStreamPlayer } from "../wavtools"; 3 | 4 | import Daily, { 5 | DailyCall, 6 | DailyEventObjectAvailableDevicesUpdated, 7 | DailyEventObjectLocalAudioLevel, 8 | DailyEventObjectSelectedDevicesUpdated, 9 | DailyEventObjectTrack, 10 | DailyParticipant, 11 | DailyParticipantsObject, 12 | } from "@daily-co/daily-js"; 13 | import { Participant, Tracks } from "@pipecat-ai/client-js"; 14 | 15 | export class DailyMediaManager extends MediaManager { 16 | private _daily: DailyCall; 17 | private _mediaStreamRecorder: MediaStreamRecorder | undefined; 18 | private 
_wavStreamPlayer: WavStreamPlayer | undefined; 19 | 20 | private _initialized: boolean; 21 | private _connected: boolean; 22 | private _connectResolve: ((value: void | PromiseLike) => void) | null; 23 | 24 | private _currentAudioTrack: MediaStreamTrack | null; 25 | private _selectedCam: MediaDeviceInfo | Record = {}; 26 | private _selectedMic: MediaDeviceInfo | Record = {}; 27 | private _selectedSpeaker: MediaDeviceInfo | Record = {}; 28 | 29 | private _remoteAudioLevelInterval: NodeJS.Timeout | null = null; 30 | 31 | private onTrackStartedCallback?: (event: DailyEventObjectTrack) => void; 32 | private onTrackStoppedCallback?: (event: DailyEventObjectTrack) => void; 33 | 34 | private _recorderChunkSize: number | undefined = undefined; 35 | 36 | constructor( 37 | enablePlayer: boolean = true, 38 | enableRecording: boolean = true, 39 | onTrackStartedCallback?: (event: DailyEventObjectTrack) => void, 40 | onTrackStoppedCallback?: (event: DailyEventObjectTrack) => void, 41 | recorderChunkSize: number | undefined = undefined, 42 | recorderSampleRate: number = 24000, 43 | ) { 44 | super(); 45 | this._initialized = false; 46 | this._connected = false; 47 | this._currentAudioTrack = null; 48 | this._connectResolve = null; 49 | this.onTrackStartedCallback = onTrackStartedCallback; 50 | this.onTrackStoppedCallback = onTrackStoppedCallback; 51 | this._recorderChunkSize = recorderChunkSize; 52 | 53 | this._daily = Daily.getCallInstance() ?? Daily.createCallObject(); 54 | 55 | if (enableRecording) { 56 | this._mediaStreamRecorder = new MediaStreamRecorder({ 57 | sampleRate: recorderSampleRate, 58 | }); 59 | } 60 | if (enablePlayer) { 61 | this._wavStreamPlayer = new WavStreamPlayer({ sampleRate: 24000 }); 62 | } 63 | 64 | this._daily.on("track-started", this.handleTrackStarted.bind(this)); 65 | this._daily.on("track-stopped", this.handleTrackStopped.bind(this)); 66 | this._daily.on( 67 | "available-devices-updated", 68 | this._handleAvailableDevicesUpdated.bind(this), 69 | ); 70 | this._daily.on( 71 | "selected-devices-updated", 72 | this._handleSelectedDevicesUpdated.bind(this), 73 | ); 74 | this._daily.on("local-audio-level", this._handleLocalAudioLevel.bind(this)); 75 | } 76 | 77 | async initialize(): Promise { 78 | if (this._initialized) { 79 | console.warn("DailyMediaManager already initialized"); 80 | return; 81 | } 82 | const infos = await this._daily.startCamera({ 83 | startVideoOff: !this._camEnabled, 84 | startAudioOff: !this._micEnabled, 85 | }); 86 | const { devices } = await this._daily.enumerateDevices(); 87 | const cams = devices.filter((d) => d.kind === "videoinput"); 88 | const mics = devices.filter((d) => d.kind === "audioinput"); 89 | const speakers = devices.filter((d) => d.kind === "audiooutput"); 90 | this._callbacks.onAvailableCamsUpdated?.(cams); 91 | this._callbacks.onAvailableMicsUpdated?.(mics); 92 | this._callbacks.onAvailableSpeakersUpdated?.(speakers); 93 | this._selectedCam = infos.camera; 94 | this._callbacks.onCamUpdated?.(infos.camera as MediaDeviceInfo); 95 | this._selectedMic = infos.mic; 96 | this._callbacks.onMicUpdated?.(infos.mic as MediaDeviceInfo); 97 | this._selectedSpeaker = infos.speaker; 98 | this._callbacks.onSpeakerUpdated?.(infos.speaker as MediaDeviceInfo); 99 | 100 | // Instantiate audio observers 101 | if (!this._daily.isLocalAudioLevelObserverRunning()) 102 | await this._daily.startLocalAudioLevelObserver(100); 103 | 104 | if (this._wavStreamPlayer) { 105 | await this._wavStreamPlayer.connect(); 106 | if (!this._remoteAudioLevelInterval) { 107 | 
this._remoteAudioLevelInterval = setInterval(() => { 108 | const frequencies = this._wavStreamPlayer!.getFrequencies(); 109 | let aveVal = 0; 110 | if (frequencies.values?.length) { 111 | aveVal = 112 | frequencies.values.reduce((a, c) => a + c, 0) / 113 | frequencies.values.length; 114 | } 115 | this._handleRemoteAudioLevel(aveVal); 116 | }, 100); 117 | } 118 | } 119 | this._initialized = true; 120 | } 121 | 122 | async connect(): Promise { 123 | if (this._connected) { 124 | console.warn("DailyMediaManager already connected"); 125 | return; 126 | } 127 | this._connected = true; 128 | if (!this._initialized) { 129 | return new Promise((resolve) => { 130 | (async () => { 131 | this._connectResolve = resolve; 132 | await this.initialize(); 133 | })(); 134 | }); 135 | } 136 | if (this._micEnabled) { 137 | this._startRecording(); 138 | } 139 | } 140 | 141 | async disconnect(): Promise { 142 | if (this._remoteAudioLevelInterval) { 143 | clearInterval(this._remoteAudioLevelInterval); 144 | } 145 | this._remoteAudioLevelInterval = null; 146 | this._daily.leave(); 147 | this._currentAudioTrack = null; 148 | await this._mediaStreamRecorder?.end(); 149 | this._wavStreamPlayer?.interrupt(); 150 | this._initialized = false; 151 | this._connected = false; 152 | } 153 | 154 | async userStartedSpeaking(): Promise { 155 | return this._wavStreamPlayer?.interrupt(); 156 | } 157 | 158 | bufferBotAudio( 159 | data: ArrayBuffer | Int16Array, 160 | id?: string, 161 | ): Int16Array | undefined { 162 | return this._wavStreamPlayer?.add16BitPCM(data, id); 163 | } 164 | 165 | async getAllMics(): Promise { 166 | let devices = (await this._daily.enumerateDevices()).devices; 167 | return devices.filter((device) => device.kind === "audioinput"); 168 | } 169 | async getAllCams(): Promise { 170 | let devices = (await this._daily.enumerateDevices()).devices; 171 | return devices.filter((device) => device.kind === "videoinput"); 172 | } 173 | async getAllSpeakers(): Promise { 174 | let devices = (await this._daily.enumerateDevices()).devices; 175 | return devices.filter((device) => device.kind === "audiooutput"); 176 | } 177 | 178 | updateMic(micId: string) { 179 | this._daily 180 | .setInputDevicesAsync({ audioDeviceId: micId }) 181 | .then((deviceInfo) => { 182 | this._selectedMic = deviceInfo.mic; 183 | }); 184 | } 185 | updateCam(camId: string) { 186 | this._daily 187 | .setInputDevicesAsync({ videoDeviceId: camId }) 188 | .then((deviceInfo) => { 189 | this._selectedCam = deviceInfo.camera; 190 | }); 191 | } 192 | async updateSpeaker(speakerId: string): Promise { 193 | if (speakerId !== "default" && this._selectedSpeaker.deviceId === speakerId) 194 | return; 195 | let sID = speakerId; 196 | if (sID === "default") { 197 | const speakers = await this.getAllSpeakers(); 198 | const defaultSpeaker = speakers.find((s) => s.deviceId === "default"); 199 | if (!defaultSpeaker) { 200 | console.warn("No default speaker found"); 201 | return; 202 | } 203 | speakers.splice(speakers.indexOf(defaultSpeaker), 1); 204 | const defaultSpeakerCp = speakers.find((s) => 205 | defaultSpeaker.label.includes(s.label), 206 | ); 207 | sID = defaultSpeakerCp?.deviceId ?? 
speakerId; 208 | } 209 | this._wavStreamPlayer?.updateSpeaker(sID).then(() => { 210 | this._selectedSpeaker = { deviceId: speakerId } as MediaDeviceInfo; 211 | this._callbacks.onSpeakerUpdated?.(this._selectedSpeaker); 212 | }); 213 | } 214 | 215 | get selectedMic(): MediaDeviceInfo | Record { 216 | return this._selectedMic; 217 | } 218 | get selectedCam(): MediaDeviceInfo | Record { 219 | return this._selectedCam; 220 | } 221 | get selectedSpeaker(): MediaDeviceInfo | Record { 222 | return this._selectedSpeaker; 223 | } 224 | 225 | async enableMic(enable: boolean): Promise { 226 | this._micEnabled = enable; 227 | if (!this._daily.participants()?.local) return; 228 | this._daily.setLocalAudio(enable); 229 | if (this._mediaStreamRecorder) { 230 | if (enable) { 231 | if (this._mediaStreamRecorder.getStatus() === "paused") { 232 | this._startRecording(); 233 | } // else, we'll record on the track-started event 234 | } else { 235 | if (this._mediaStreamRecorder.getStatus() === "recording") { 236 | this._mediaStreamRecorder.pause(); 237 | } 238 | } 239 | } 240 | } 241 | enableCam(enable: boolean): void { 242 | this._camEnabled = enable; 243 | this._daily.setLocalVideo(enable); 244 | } 245 | 246 | get isCamEnabled(): boolean { 247 | return this._daily.localVideo(); 248 | } 249 | get isMicEnabled(): boolean { 250 | return this._daily.localAudio(); 251 | } 252 | 253 | tracks(): Tracks { 254 | const participants: DailyParticipantsObject = this._daily.participants(); 255 | return { 256 | local: { 257 | audio: participants?.local?.tracks?.audio?.persistentTrack, 258 | video: participants?.local?.tracks?.video?.persistentTrack, 259 | }, 260 | }; 261 | } 262 | 263 | private _startRecording(): void { 264 | if (!this._connected || !this._mediaStreamRecorder) return; 265 | try { 266 | this._mediaStreamRecorder.record((data) => { 267 | this._userAudioCallback(data.mono); 268 | }, this._recorderChunkSize); 269 | } catch (e) { 270 | const err = e as Error; 271 | if (!err.message.includes("Already recording")) { 272 | console.error("Error starting recording", e); 273 | } 274 | } 275 | } 276 | 277 | private _handleAvailableDevicesUpdated( 278 | event: DailyEventObjectAvailableDevicesUpdated, 279 | ) { 280 | this._callbacks.onAvailableCamsUpdated?.( 281 | event.availableDevices.filter((d) => d.kind === "videoinput"), 282 | ); 283 | this._callbacks.onAvailableMicsUpdated?.( 284 | event.availableDevices.filter((d) => d.kind === "audioinput"), 285 | ); 286 | this._callbacks.onAvailableSpeakersUpdated?.( 287 | event.availableDevices.filter((d) => d.kind === "audiooutput"), 288 | ); 289 | if (this._selectedSpeaker.deviceId === "default") { 290 | this.updateSpeaker("default"); 291 | } 292 | } 293 | 294 | private _handleSelectedDevicesUpdated( 295 | event: DailyEventObjectSelectedDevicesUpdated, 296 | ) { 297 | if (this._selectedCam?.deviceId !== event.devices.camera) { 298 | this._selectedCam = event.devices.camera; 299 | this._callbacks.onCamUpdated?.(event.devices.camera as MediaDeviceInfo); 300 | } 301 | if (this._selectedMic?.deviceId !== event.devices.mic) { 302 | this._selectedMic = event.devices.mic; 303 | this._callbacks.onMicUpdated?.(event.devices.mic as MediaDeviceInfo); 304 | } 305 | } 306 | 307 | private _handleLocalAudioLevel(ev: DailyEventObjectLocalAudioLevel) { 308 | this._callbacks.onLocalAudioLevel?.(ev.audioLevel); 309 | } 310 | 311 | private _handleRemoteAudioLevel(audioLevel: number) { 312 | this._callbacks.onRemoteAudioLevel?.(audioLevel, botParticipant()); 313 | } 314 | 315 | protected async 
handleTrackStarted(event: DailyEventObjectTrack) { 316 | if (!event.participant?.local) return; 317 | if (event.track.kind === "audio") { 318 | if (this._mediaStreamRecorder) { 319 | const status = this._mediaStreamRecorder.getStatus(); 320 | switch (status) { 321 | case "ended": 322 | await this._mediaStreamRecorder.begin(event.track); 323 | if (this._connected) { 324 | this._startRecording(); 325 | if (this._connectResolve) { 326 | this._connectResolve(); 327 | this._connectResolve = null; 328 | } 329 | } 330 | break; 331 | case "paused": 332 | this._startRecording(); 333 | break; 334 | case "recording": 335 | default: 336 | if (this._currentAudioTrack !== event.track) { 337 | await this._mediaStreamRecorder.end(); 338 | await this._mediaStreamRecorder.begin(event.track); 339 | this._startRecording(); 340 | } else { 341 | console.warn( 342 | "track-started event received for current track and already recording", 343 | ); 344 | } 345 | break; 346 | } 347 | } 348 | this._currentAudioTrack = event.track; 349 | } 350 | this._callbacks.onTrackStarted?.( 351 | event.track, 352 | event.participant 353 | ? dailyParticipantToParticipant(event.participant) 354 | : undefined, 355 | ); 356 | this.onTrackStartedCallback?.(event); 357 | } 358 | 359 | protected handleTrackStopped(event: DailyEventObjectTrack) { 360 | if (!event.participant?.local) return; 361 | if (event.track.kind === "audio") { 362 | if ( 363 | this._mediaStreamRecorder && 364 | this._mediaStreamRecorder.getStatus() === "recording" 365 | ) { 366 | this._mediaStreamRecorder.pause(); 367 | } 368 | } 369 | this._callbacks.onTrackStopped?.( 370 | event.track, 371 | event.participant 372 | ? dailyParticipantToParticipant(event.participant) 373 | : undefined, 374 | ); 375 | this.onTrackStoppedCallback?.(event); 376 | } 377 | } 378 | 379 | const dailyParticipantToParticipant = (p: DailyParticipant): Participant => ({ 380 | id: p.user_id, 381 | local: p.local, 382 | name: p.user_name, 383 | }); 384 | 385 | const botParticipant = () => ({ 386 | id: "bot", 387 | local: false, 388 | name: "Bot", 389 | }); 390 | -------------------------------------------------------------------------------- /lib/media-mgmt/mediaManager.ts: -------------------------------------------------------------------------------- 1 | import { WavRecorder, WavStreamPlayer } from "../wavtools"; 2 | 3 | import { 4 | RTVIClientOptions, 5 | RTVIEventCallbacks, 6 | Tracks, 7 | } from "@pipecat-ai/client-js"; 8 | 9 | export abstract class MediaManager { 10 | declare protected _userAudioCallback: (data: ArrayBuffer) => void; 11 | declare protected _options: RTVIClientOptions; 12 | protected _callbacks: RTVIEventCallbacks = {}; 13 | 14 | protected _micEnabled: boolean; 15 | protected _camEnabled: boolean; 16 | 17 | constructor() { 18 | this._micEnabled = true; 19 | this._camEnabled = false; 20 | } 21 | 22 | setUserAudioCallback(userAudioCallback: (data: ArrayBuffer) => void) { 23 | this._userAudioCallback = userAudioCallback; 24 | } 25 | setRTVIOptions(options: RTVIClientOptions, override: boolean = false) { 26 | if (this._options && !override) return; 27 | this._options = options; 28 | this._callbacks = options.callbacks ?? {}; 29 | this._micEnabled = options.enableMic ?? true; 30 | this._camEnabled = options.enableCam ?? 
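// Note: enableMic defaults to true and enableCam to false when the RTVIClientOptions omit
// them. Illustrative options object (sketch only; other RTVIClientOptions fields omitted):
//   mediaManager.setRTVIOptions({
//     enableMic: true,
//     enableCam: false,
//     callbacks: { onMicUpdated: (mic) => console.log("mic:", mic.label) },
//   } as RTVIClientOptions);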
false; 31 | } 32 | 33 | abstract initialize(): Promise; 34 | abstract connect(): Promise; 35 | abstract disconnect(): Promise; 36 | 37 | abstract userStartedSpeaking(): Promise; 38 | abstract bufferBotAudio( 39 | data: ArrayBuffer | Int16Array, 40 | id?: string, 41 | ): Int16Array | undefined; 42 | 43 | abstract getAllMics(): Promise; 44 | abstract getAllCams(): Promise; 45 | abstract getAllSpeakers(): Promise; 46 | 47 | abstract updateMic(micId: string): void; 48 | abstract updateCam(camId: string): void; 49 | abstract updateSpeaker(speakerId: string): void; 50 | 51 | abstract get selectedMic(): MediaDeviceInfo | Record; 52 | abstract get selectedCam(): MediaDeviceInfo | Record; 53 | abstract get selectedSpeaker(): MediaDeviceInfo | Record; 54 | 55 | abstract enableMic(enable: boolean): void; 56 | abstract enableCam(enable: boolean): void; 57 | 58 | abstract get isCamEnabled(): boolean; 59 | abstract get isMicEnabled(): boolean; 60 | 61 | abstract tracks(): Tracks; 62 | } 63 | 64 | export class WavMediaManager extends MediaManager { 65 | private _wavRecorder; 66 | private _wavStreamPlayer; 67 | 68 | private _initialized = false; 69 | private _recorderChunkSize: number | undefined = undefined; 70 | 71 | constructor( 72 | recorderChunkSize: number | undefined = undefined, 73 | recorderSampleRate: number | undefined = 24000, 74 | ) { 75 | super(); 76 | this._recorderChunkSize = recorderChunkSize; 77 | this._wavRecorder = new WavRecorder({ sampleRate: recorderSampleRate }); 78 | this._wavStreamPlayer = new WavStreamPlayer({ sampleRate: 24000 }); 79 | } 80 | 81 | async initialize(): Promise { 82 | await this._wavRecorder.begin(); 83 | this._wavRecorder.listenForDeviceChange(null); 84 | this._wavRecorder.listenForDeviceChange( 85 | this._handleAvailableDevicesUpdated.bind(this), 86 | ); 87 | await this._wavStreamPlayer.connect(); 88 | this._initialized = true; 89 | } 90 | 91 | async connect(): Promise { 92 | if (!this._initialized) { 93 | await this.initialize(); 94 | } 95 | const isAlreadyRecording = this._wavRecorder.getStatus() == "recording"; 96 | if (this._micEnabled && !isAlreadyRecording) { 97 | await this._startRecording(); 98 | } 99 | } 100 | 101 | async disconnect(): Promise { 102 | if (!this._initialized) { 103 | return; 104 | } 105 | await this._wavRecorder.end(); 106 | await this._wavStreamPlayer.interrupt(); 107 | this._initialized = false; 108 | } 109 | 110 | async userStartedSpeaking(): Promise { 111 | return this._wavStreamPlayer.interrupt(); 112 | } 113 | 114 | bufferBotAudio(data: ArrayBuffer | Int16Array, id?: string): Int16Array { 115 | return this._wavStreamPlayer.add16BitPCM(data, id); 116 | } 117 | 118 | getAllMics(): Promise { 119 | return this._wavRecorder.listDevices(); 120 | } 121 | getAllCams(): Promise { 122 | // TODO: Video not supported yet 123 | return Promise.resolve([]); 124 | } 125 | getAllSpeakers(): Promise { 126 | // TODO: Implement speaker support 127 | return Promise.resolve([]); 128 | } 129 | 130 | async updateMic(micId: string): Promise { 131 | const prevMic = this._wavRecorder.deviceSelection; 132 | await this._wavRecorder.end(); 133 | await this._wavRecorder.begin(micId); 134 | if (this._micEnabled) { 135 | await this._startRecording(); 136 | } 137 | const curMic = this._wavRecorder.deviceSelection; 138 | if (curMic && prevMic && prevMic.label !== curMic.label) { 139 | this._callbacks.onMicUpdated?.(curMic); 140 | } 141 | } 142 | 143 | updateCam(camId: string): void { 144 | // TODO: Video not supported yet 145 | } 146 | updateSpeaker(speakerId: 
string): void { 147 | // TODO: Implement speaker support 148 | } 149 | 150 | get selectedMic(): MediaDeviceInfo | Record { 151 | return this._wavRecorder.deviceSelection ?? {}; 152 | } 153 | get selectedCam(): MediaDeviceInfo | Record { 154 | // TODO: Video not supported yet 155 | return {}; 156 | } 157 | get selectedSpeaker(): MediaDeviceInfo | Record { 158 | // TODO: Implement speaker support 159 | return {}; 160 | } 161 | 162 | async enableMic(enable: boolean): Promise { 163 | this._micEnabled = enable; 164 | if (!this._wavRecorder.stream) return; 165 | this._wavRecorder.stream 166 | .getAudioTracks() 167 | .forEach((track: MediaStreamTrack) => { 168 | track.enabled = enable; 169 | if (!enable) { 170 | this._callbacks.onTrackStopped?.(track, localParticipant()); 171 | } 172 | }); 173 | if (enable) { 174 | await this._startRecording(); 175 | } else { 176 | await this._wavRecorder.pause(); 177 | } 178 | } 179 | enableCam(enable: boolean): void { 180 | // TODO: Video not supported yet 181 | } 182 | 183 | get isCamEnabled(): boolean { 184 | // TODO: Video not supported yet 185 | return false; 186 | } 187 | get isMicEnabled(): boolean { 188 | return this._micEnabled; 189 | } 190 | 191 | tracks(): Tracks { 192 | const tracks = this._wavRecorder.stream?.getTracks()[0]; 193 | return { local: tracks ? { audio: tracks } : {} }; 194 | } 195 | 196 | private async _startRecording() { 197 | await this._wavRecorder.record((data) => { 198 | this._userAudioCallback(data.mono); 199 | }, this._recorderChunkSize); 200 | const track = this._wavRecorder.stream?.getAudioTracks()[0]; 201 | if (track) { 202 | this._callbacks.onTrackStarted?.(track, localParticipant()); 203 | } 204 | } 205 | 206 | private _handleAvailableDevicesUpdated(devices: MediaDeviceInfo[]) { 207 | this._callbacks.onAvailableCamsUpdated?.( 208 | devices.filter((d) => d.kind === "videoinput"), 209 | ); 210 | this._callbacks.onAvailableMicsUpdated?.( 211 | devices.filter((d) => d.kind === "audioinput"), 212 | ); 213 | // if the current device went away or we're using the default and 214 | // the default changed, reset the mic. 
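// Worked example of the reset condition below (hypothetical labels; Chrome prefixes the
// synthetic default entry with "Default - "): suppose the selected mic is
// { deviceId: "default", label: "Default - Built-in Microphone" } and a USB headset is
// plugged in, changing the OS default. The next devicechange callback reports
// { deviceId: "default", label: "Default - USB Headset" }; the labels no longer match, so
// updateMic("") re-opens the recorder on the new system default.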
215 | const defaultDevice = devices.find((d) => d.deviceId === "default"); 216 | const currentDevice = this._wavRecorder.deviceSelection; 217 | if ( 218 | currentDevice && 219 | (!devices.some((d) => d.deviceId === currentDevice.deviceId) || 220 | (currentDevice.deviceId === "default" && 221 | currentDevice.label !== defaultDevice?.label)) 222 | ) { 223 | this.updateMic(""); 224 | } 225 | } 226 | } 227 | 228 | const localParticipant = () => { 229 | return { 230 | id: "local", 231 | name: "", 232 | local: true, 233 | }; 234 | }; 235 | -------------------------------------------------------------------------------- /lib/wavtools/index.js: -------------------------------------------------------------------------------- 1 | import { WavPacker } from './lib/wav_packer.js'; 2 | import { AudioAnalysis } from './lib/analysis/audio_analysis.js'; 3 | import { WavStreamPlayer } from './lib/wav_stream_player.js'; 4 | import { WavRecorder } from './lib/wav_recorder.js'; 5 | import { MediaStreamRecorder } from './lib/mediastream_recorder.js'; 6 | 7 | export { 8 | AudioAnalysis, 9 | MediaStreamRecorder, 10 | WavPacker, 11 | WavStreamPlayer, 12 | WavRecorder, 13 | }; 14 | -------------------------------------------------------------------------------- /lib/wavtools/lib/analysis/audio_analysis.js: -------------------------------------------------------------------------------- 1 | import { 2 | noteFrequencies, 3 | noteFrequencyLabels, 4 | voiceFrequencies, 5 | voiceFrequencyLabels, 6 | } from './constants.js'; 7 | 8 | /** 9 | * Output of AudioAnalysis for the frequency domain of the audio 10 | * @typedef {Object} AudioAnalysisOutputType 11 | * @property {Float32Array} values Amplitude of this frequency between {0, 1} inclusive 12 | * @property {number[]} frequencies Raw frequency bucket values 13 | * @property {string[]} labels Labels for the frequency bucket values 14 | */ 15 | 16 | /** 17 | * Analyzes audio for visual output 18 | * @class 19 | */ 20 | export class AudioAnalysis { 21 | /** 22 | * Retrieves frequency domain data from an AnalyserNode adjusted to a decibel range 23 | * returns human-readable formatting and labels 24 | * @param {AnalyserNode} analyser 25 | * @param {number} sampleRate 26 | * @param {Float32Array} [fftResult] 27 | * @param {"frequency"|"music"|"voice"} [analysisType] 28 | * @param {number} [minDecibels] default -100 29 | * @param {number} [maxDecibels] default -30 30 | * @returns {AudioAnalysisOutputType} 31 | */ 32 | static getFrequencies( 33 | analyser, 34 | sampleRate, 35 | fftResult, 36 | analysisType = 'frequency', 37 | minDecibels = -100, 38 | maxDecibels = -30, 39 | ) { 40 | if (!fftResult) { 41 | fftResult = new Float32Array(analyser.frequencyBinCount); 42 | analyser.getFloatFrequencyData(fftResult); 43 | } 44 | const nyquistFrequency = sampleRate / 2; 45 | const frequencyStep = (1 / fftResult.length) * nyquistFrequency; 46 | let outputValues; 47 | let frequencies; 48 | let labels; 49 | if (analysisType === 'music' || analysisType === 'voice') { 50 | const useFrequencies = 51 | analysisType === 'voice' ? 
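// Note on this ternary: 'voice' aggregates FFT bins into the C1 through B6 note buckets
// from constants.js, 'music' uses the full 8-octave note table, and plain 'frequency'
// (the else branch below) keeps raw evenly spaced bins. Illustrative call:
//   const { values, labels } = AudioAnalysis.getFrequencies(analyser, 24000, null, 'voice');
//   // values[i] is a 0..1 level for the note named in labels[i], e.g. "A4"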
voiceFrequencies : noteFrequencies; 52 | const aggregateOutput = Array(useFrequencies.length).fill(minDecibels); 53 | for (let i = 0; i < fftResult.length; i++) { 54 | const frequency = i * frequencyStep; 55 | const amplitude = fftResult[i]; 56 | for (let n = useFrequencies.length - 1; n >= 0; n--) { 57 | if (frequency > useFrequencies[n]) { 58 | aggregateOutput[n] = Math.max(aggregateOutput[n], amplitude); 59 | break; 60 | } 61 | } 62 | } 63 | outputValues = aggregateOutput; 64 | frequencies = 65 | analysisType === 'voice' ? voiceFrequencies : noteFrequencies; 66 | labels = 67 | analysisType === 'voice' ? voiceFrequencyLabels : noteFrequencyLabels; 68 | } else { 69 | outputValues = Array.from(fftResult); 70 | frequencies = outputValues.map((_, i) => frequencyStep * i); 71 | labels = frequencies.map((f) => `${f.toFixed(2)} Hz`); 72 | } 73 | // We normalize to {0, 1} 74 | const normalizedOutput = outputValues.map((v) => { 75 | return Math.max( 76 | 0, 77 | Math.min((v - minDecibels) / (maxDecibels - minDecibels), 1), 78 | ); 79 | }); 80 | const values = new Float32Array(normalizedOutput); 81 | return { 82 | values, 83 | frequencies, 84 | labels, 85 | }; 86 | } 87 | 88 | /** 89 | * Creates a new AudioAnalysis instance for an HTMLAudioElement 90 | * @param {HTMLAudioElement} audioElement 91 | * @param {AudioBuffer|null} [audioBuffer] If provided, will cache all frequency domain data from the buffer 92 | * @returns {AudioAnalysis} 93 | */ 94 | constructor(audioElement, audioBuffer = null) { 95 | this.fftResults = []; 96 | if (audioBuffer) { 97 | /** 98 | * Modified from 99 | * https://stackoverflow.com/questions/75063715/using-the-web-audio-api-to-analyze-a-song-without-playing 100 | * 101 | * We do this to populate FFT values for the audio if provided an `audioBuffer` 102 | * The reason to do this is that Safari fails when using `createMediaElementSource` 103 | * This has a non-zero RAM cost so we only opt-in to run it on Safari, Chrome is better 104 | */ 105 | const { length, sampleRate } = audioBuffer; 106 | const offlineAudioContext = new OfflineAudioContext({ 107 | length, 108 | sampleRate, 109 | }); 110 | const source = offlineAudioContext.createBufferSource(); 111 | source.buffer = audioBuffer; 112 | const analyser = offlineAudioContext.createAnalyser(); 113 | analyser.fftSize = 8192; 114 | analyser.smoothingTimeConstant = 0.1; 115 | source.connect(analyser); 116 | // limit is :: 128 / sampleRate; 117 | // but we just want 60fps - cuts ~1s from 6MB to 1MB of RAM 118 | const renderQuantumInSeconds = 1 / 60; 119 | const durationInSeconds = length / sampleRate; 120 | const analyze = (index) => { 121 | const suspendTime = renderQuantumInSeconds * index; 122 | if (suspendTime < durationInSeconds) { 123 | offlineAudioContext.suspend(suspendTime).then(() => { 124 | const fftResult = new Float32Array(analyser.frequencyBinCount); 125 | analyser.getFloatFrequencyData(fftResult); 126 | this.fftResults.push(fftResult); 127 | analyze(index + 1); 128 | }); 129 | } 130 | if (index === 1) { 131 | offlineAudioContext.startRendering(); 132 | } else { 133 | offlineAudioContext.resume(); 134 | } 135 | }; 136 | source.start(0); 137 | analyze(1); 138 | this.audio = audioElement; 139 | this.context = offlineAudioContext; 140 | this.analyser = analyser; 141 | this.sampleRate = sampleRate; 142 | this.audioBuffer = audioBuffer; 143 | } else { 144 | const audioContext = new AudioContext(); 145 | const track = audioContext.createMediaElementSource(audioElement); 146 | const analyser = 
audioContext.createAnalyser(); 147 | analyser.fftSize = 8192; 148 | analyser.smoothingTimeConstant = 0.1; 149 | track.connect(analyser); 150 | analyser.connect(audioContext.destination); 151 | this.audio = audioElement; 152 | this.context = audioContext; 153 | this.analyser = analyser; 154 | this.sampleRate = this.context.sampleRate; 155 | this.audioBuffer = null; 156 | } 157 | } 158 | 159 | /** 160 | * Gets the current frequency domain data from the playing audio track 161 | * @param {"frequency"|"music"|"voice"} [analysisType] 162 | * @param {number} [minDecibels] default -100 163 | * @param {number} [maxDecibels] default -30 164 | * @returns {AudioAnalysisOutputType} 165 | */ 166 | getFrequencies( 167 | analysisType = 'frequency', 168 | minDecibels = -100, 169 | maxDecibels = -30, 170 | ) { 171 | let fftResult = null; 172 | if (this.audioBuffer && this.fftResults.length) { 173 | const pct = this.audio.currentTime / this.audio.duration; 174 | const index = Math.min( 175 | (pct * this.fftResults.length) | 0, 176 | this.fftResults.length - 1, 177 | ); 178 | fftResult = this.fftResults[index]; 179 | } 180 | return AudioAnalysis.getFrequencies( 181 | this.analyser, 182 | this.sampleRate, 183 | fftResult, 184 | analysisType, 185 | minDecibels, 186 | maxDecibels, 187 | ); 188 | } 189 | 190 | /** 191 | * Resume the internal AudioContext if it was suspended due to the lack of 192 | * user interaction when the AudioAnalysis was instantiated. 193 | * @returns {Promise} 194 | */ 195 | async resumeIfSuspended() { 196 | if (this.context.state === 'suspended') { 197 | await this.context.resume(); 198 | } 199 | return true; 200 | } 201 | } 202 | 203 | globalThis.AudioAnalysis = AudioAnalysis; 204 | -------------------------------------------------------------------------------- /lib/wavtools/lib/analysis/constants.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Constants for help with visualization 3 | * Helps map frequency ranges from Fast Fourier Transform 4 | * to human-interpretable ranges, notably music ranges and 5 | * human vocal ranges. 
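 * Worked example: every lower octave is derived from the 8th-octave table by dividing by
 * a power of two, freq(n, octave) = octave8Frequencies[n] / 2^(8 - octave). For instance
 * A8 = 7040.0 Hz gives A4 = 7040 / 2^4 = 440 Hz, and C8 = 4186.01 Hz gives
 * C1 = 4186.01 / 2^7 ≈ 32.7 Hz, which is why the voice range below starts at 32 Hz.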
6 | */ 7 | 8 | // Eighth octave frequencies 9 | const octave8Frequencies = [ 10 | 4186.01, 4434.92, 4698.63, 4978.03, 5274.04, 5587.65, 5919.91, 6271.93, 11 | 6644.88, 7040.0, 7458.62, 7902.13, 12 | ]; 13 | 14 | // Labels for each of the above frequencies 15 | const octave8FrequencyLabels = [ 16 | 'C', 17 | 'C#', 18 | 'D', 19 | 'D#', 20 | 'E', 21 | 'F', 22 | 'F#', 23 | 'G', 24 | 'G#', 25 | 'A', 26 | 'A#', 27 | 'B', 28 | ]; 29 | 30 | /** 31 | * All note frequencies from 1st to 8th octave 32 | * in format "A#8" (A#, 8th octave) 33 | */ 34 | export const noteFrequencies = []; 35 | export const noteFrequencyLabels = []; 36 | for (let i = 1; i <= 8; i++) { 37 | for (let f = 0; f < octave8Frequencies.length; f++) { 38 | const freq = octave8Frequencies[f]; 39 | noteFrequencies.push(freq / Math.pow(2, 8 - i)); 40 | noteFrequencyLabels.push(octave8FrequencyLabels[f] + i); 41 | } 42 | } 43 | 44 | /** 45 | * Subset of the note frequencies between 32 and 2000 Hz 46 | * 6 octave range: C1 to B6 47 | */ 48 | const voiceFrequencyRange = [32.0, 2000.0]; 49 | export const voiceFrequencies = noteFrequencies.filter((_, i) => { 50 | return ( 51 | noteFrequencies[i] > voiceFrequencyRange[0] && 52 | noteFrequencies[i] < voiceFrequencyRange[1] 53 | ); 54 | }); 55 | export const voiceFrequencyLabels = noteFrequencyLabels.filter((_, i) => { 56 | return ( 57 | noteFrequencies[i] > voiceFrequencyRange[0] && 58 | noteFrequencies[i] < voiceFrequencyRange[1] 59 | ); 60 | }); 61 | -------------------------------------------------------------------------------- /lib/wavtools/lib/mediastream_recorder.js: -------------------------------------------------------------------------------- 1 | import { AudioProcessorSrc } from "./worklets/audio_processor.js"; 2 | import { AudioAnalysis } from "./analysis/audio_analysis.js"; 3 | import { WavPacker } from "./wav_packer.js"; 4 | 5 | /** 6 | * Decodes audio into a wav file 7 | * @typedef {Object} DecodedAudioType 8 | * @property {Blob} blob 9 | * @property {string} url 10 | * @property {Float32Array} values 11 | * @property {AudioBuffer} audioBuffer 12 | */ 13 | 14 | /** 15 | * Records live stream of user audio as PCM16 "audio/wav" data 16 | * @class 17 | */ 18 | export class MediaStreamRecorder { 19 | /** 20 | * Create a new MediaStreamRecorder instance 21 | * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} [options] 22 | * @returns {MediaStreamRecorder} 23 | */ 24 | constructor({ 25 | sampleRate = 44100, 26 | outputToSpeakers = false, 27 | debug = false, 28 | } = {}) { 29 | // Script source 30 | this.scriptSrc = AudioProcessorSrc; 31 | // Config 32 | this.sampleRate = sampleRate; 33 | this.outputToSpeakers = outputToSpeakers; 34 | this.debug = !!debug; 35 | // State variables 36 | this.stream = null; 37 | this.processor = null; 38 | this.source = null; 39 | this.node = null; 40 | this.recording = false; 41 | // Event handling with AudioWorklet 42 | this._lastEventId = 0; 43 | this.eventReceipts = {}; 44 | this.eventTimeout = 5000; 45 | // Process chunks of audio 46 | this._chunkProcessor = () => {}; 47 | this._chunkProcessorSize = void 0; 48 | this._chunkProcessorBuffer = { 49 | raw: new ArrayBuffer(0), 50 | mono: new ArrayBuffer(0), 51 | }; 52 | } 53 | 54 | /** 55 | * Logs data in debug mode 56 | * @param {...any} arguments 57 | * @returns {true} 58 | */ 59 | log() { 60 | if (this.debug) { 61 | this.log(...arguments); 62 | } 63 | return true; 64 | } 65 | 66 | /** 67 | * Retrieves the current sampleRate for the recorder 68 | * @returns {number} 69 | */ 
70 | getSampleRate() { 71 | return this.sampleRate; 72 | } 73 | 74 | /** 75 | * Retrieves the current status of the recording 76 | * @returns {"ended"|"paused"|"recording"} 77 | */ 78 | getStatus() { 79 | if (!this.processor) { 80 | return "ended"; 81 | } else if (!this.recording) { 82 | return "paused"; 83 | } else { 84 | return "recording"; 85 | } 86 | } 87 | 88 | /** 89 | * Sends an event to the AudioWorklet 90 | * @private 91 | * @param {string} name 92 | * @param {{[key: string]: any}} data 93 | * @param {AudioWorkletNode} [_processor] 94 | * @returns {Promise<{[key: string]: any}>} 95 | */ 96 | async _event(name, data = {}, _processor = null) { 97 | _processor = _processor || this.processor; 98 | if (!_processor) { 99 | throw new Error("Can not send events without recording first"); 100 | } 101 | const message = { 102 | event: name, 103 | id: this._lastEventId++, 104 | data, 105 | }; 106 | _processor.port.postMessage(message); 107 | const t0 = new Date().valueOf(); 108 | while (!this.eventReceipts[message.id]) { 109 | if (new Date().valueOf() - t0 > this.eventTimeout) { 110 | throw new Error(`Timeout waiting for "${name}" event`); 111 | } 112 | await new Promise((res) => setTimeout(() => res(true), 1)); 113 | } 114 | const payload = this.eventReceipts[message.id]; 115 | delete this.eventReceipts[message.id]; 116 | return payload; 117 | } 118 | 119 | /** 120 | * Begins a recording session for the given audioTrack 121 | * Microphone recording indicator will appear on browser tab but status will be "paused" 122 | * @param {MediaStreamTrack} [audioTrack] if no device provided, default device will be used 123 | * @returns {Promise} 124 | */ 125 | async begin(audioTrack) { 126 | if (this.processor) { 127 | throw new Error( 128 | `Already connected: please call .end() to start a new session` 129 | ); 130 | } 131 | 132 | if (!audioTrack || audioTrack.kind !== "audio") { 133 | throw new Error("No audio track provided"); 134 | } 135 | 136 | this.stream = new MediaStream([audioTrack]); 137 | 138 | const context = new AudioContext({ sampleRate: this.sampleRate }); 139 | const source = context.createMediaStreamSource(this.stream); 140 | // Load and execute the module script. 
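// Sketch of the same worklet bootstrap in isolation (assuming AudioProcessorSrc resolves
// to a loadable script URL for the "audio_processor" worklet):
//   const ctx = new AudioContext({ sampleRate: 44100 });
//   await ctx.audioWorklet.addModule(AudioProcessorSrc);
//   const node = new AudioWorkletNode(ctx, "audio_processor");
//   node.port.onmessage = (e) => console.log(e.data);   // receipts and audio chunks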
141 | try { 142 | await context.audioWorklet.addModule(this.scriptSrc); 143 | } catch (e) { 144 | console.error(e); 145 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); 146 | } 147 | const processor = new AudioWorkletNode(context, "audio_processor"); 148 | processor.port.onmessage = (e) => { 149 | const { event, id, data } = e.data; 150 | if (event === "receipt") { 151 | this.eventReceipts[id] = data; 152 | } else if (event === "chunk") { 153 | if (this._chunkProcessorSize) { 154 | const buffer = this._chunkProcessorBuffer; 155 | this._chunkProcessorBuffer = { 156 | raw: WavPacker.mergeBuffers(buffer.raw, data.raw), 157 | mono: WavPacker.mergeBuffers(buffer.mono, data.mono), 158 | }; 159 | if ( 160 | this._chunkProcessorBuffer.mono.byteLength >= 161 | this._chunkProcessorSize 162 | ) { 163 | this._chunkProcessor(this._chunkProcessorBuffer); 164 | this._chunkProcessorBuffer = { 165 | raw: new ArrayBuffer(0), 166 | mono: new ArrayBuffer(0), 167 | }; 168 | } 169 | } else { 170 | this._chunkProcessor(data); 171 | } 172 | } 173 | }; 174 | 175 | const node = source.connect(processor); 176 | const analyser = context.createAnalyser(); 177 | analyser.fftSize = 8192; 178 | analyser.smoothingTimeConstant = 0.1; 179 | node.connect(analyser); 180 | if (this.outputToSpeakers) { 181 | // eslint-disable-next-line no-console 182 | console.warn( 183 | "Warning: Output to speakers may affect sound quality,\n" + 184 | "especially due to system audio feedback preventative measures.\n" + 185 | "use only for debugging" 186 | ); 187 | analyser.connect(context.destination); 188 | } 189 | 190 | this.source = source; 191 | this.node = node; 192 | this.analyser = analyser; 193 | this.processor = processor; 194 | return true; 195 | } 196 | 197 | /** 198 | * Gets the current frequency domain data from the recording track 199 | * @param {"frequency"|"music"|"voice"} [analysisType] 200 | * @param {number} [minDecibels] default -100 201 | * @param {number} [maxDecibels] default -30 202 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} 203 | */ 204 | getFrequencies( 205 | analysisType = "frequency", 206 | minDecibels = -100, 207 | maxDecibels = -30 208 | ) { 209 | if (!this.processor) { 210 | throw new Error("Session ended: please call .begin() first"); 211 | } 212 | return AudioAnalysis.getFrequencies( 213 | this.analyser, 214 | this.sampleRate, 215 | null, 216 | analysisType, 217 | minDecibels, 218 | maxDecibels 219 | ); 220 | } 221 | 222 | /** 223 | * Pauses the recording 224 | * Keeps microphone stream open but halts storage of audio 225 | * @returns {Promise} 226 | */ 227 | async pause() { 228 | if (!this.processor) { 229 | throw new Error("Session ended: please call .begin() first"); 230 | } else if (!this.recording) { 231 | throw new Error("Already paused: please call .record() first"); 232 | } 233 | if (this._chunkProcessorBuffer.raw.byteLength) { 234 | this._chunkProcessor(this._chunkProcessorBuffer); 235 | } 236 | this.log("Pausing ..."); 237 | await this._event("stop"); 238 | this.recording = false; 239 | return true; 240 | } 241 | 242 | /** 243 | * Start recording stream and storing to memory from the connected audio source 244 | * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor] 245 | * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold met in mono audio 246 | * @returns {Promise} 247 | */ 248 | async record(chunkProcessor = () => {}, chunkSize = 8192) { 249 | if (!this.processor) { 250 | 
throw new Error("Session ended: please call .begin() first"); 251 | } else if (this.recording) { 252 | throw new Error("Already recording: please call .pause() first"); 253 | } else if (typeof chunkProcessor !== "function") { 254 | throw new Error(`chunkProcessor must be a function`); 255 | } 256 | this._chunkProcessor = chunkProcessor; 257 | this._chunkProcessorSize = chunkSize; 258 | this._chunkProcessorBuffer = { 259 | raw: new ArrayBuffer(0), 260 | mono: new ArrayBuffer(0), 261 | }; 262 | this.log("Recording ..."); 263 | await this._event("start"); 264 | this.recording = true; 265 | return true; 266 | } 267 | 268 | /** 269 | * Clears the audio buffer, empties stored recording 270 | * @returns {Promise} 271 | */ 272 | async clear() { 273 | if (!this.processor) { 274 | throw new Error("Session ended: please call .begin() first"); 275 | } 276 | await this._event("clear"); 277 | return true; 278 | } 279 | 280 | /** 281 | * Reads the current audio stream data 282 | * @returns {Promise<{meanValues: Float32Array, channels: Array}>} 283 | */ 284 | async read() { 285 | if (!this.processor) { 286 | throw new Error("Session ended: please call .begin() first"); 287 | } 288 | this.log("Reading ..."); 289 | const result = await this._event("read"); 290 | return result; 291 | } 292 | 293 | /** 294 | * Saves the current audio stream to a file 295 | * @param {boolean} [force] Force saving while still recording 296 | * @returns {Promise} 297 | */ 298 | async save(force = false) { 299 | if (!this.processor) { 300 | throw new Error("Session ended: please call .begin() first"); 301 | } 302 | if (!force && this.recording) { 303 | throw new Error( 304 | "Currently recording: please call .pause() first, or call .save(true) to force" 305 | ); 306 | } 307 | this.log("Exporting ..."); 308 | const exportData = await this._event("export"); 309 | const packer = new WavPacker(); 310 | const result = packer.pack(this.sampleRate, exportData.audio); 311 | return result; 312 | } 313 | 314 | /** 315 | * Ends the current recording session and saves the result 316 | * @returns {Promise} 317 | */ 318 | async end() { 319 | if (!this.processor) { 320 | throw new Error("Session ended: please call .begin() first"); 321 | } 322 | 323 | const _processor = this.processor; 324 | 325 | this.log("Stopping ..."); 326 | await this._event("stop"); 327 | this.recording = false; 328 | 329 | this.log("Exporting ..."); 330 | const exportData = await this._event("export", {}, _processor); 331 | 332 | this.processor.disconnect(); 333 | this.source.disconnect(); 334 | this.node.disconnect(); 335 | this.analyser.disconnect(); 336 | this.stream = null; 337 | this.processor = null; 338 | this.source = null; 339 | this.node = null; 340 | 341 | const packer = new WavPacker(); 342 | const result = packer.pack(this.sampleRate, exportData.audio); 343 | return result; 344 | } 345 | 346 | /** 347 | * Performs a full cleanup of the MediaStreamRecorder instance 348 | * Ends any active recording session and disconnects the audio graph 349 | * @returns {Promise} 350 | */ 351 | async quit() { 352 | // Unlike WavRecorder, there is no device-change listener to remove here. 353 | if (this.processor) { 354 | await this.end(); 355 | } 356 | return true; 357 | } 358 | } 359 | 360 | globalThis.MediaStreamRecorder = MediaStreamRecorder; 361 | -------------------------------------------------------------------------------- /lib/wavtools/lib/wav_packer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Raw wav audio file contents 3 | * @typedef {Object} WavPackerAudioType 4 |
* @property {Blob} blob 5 | * @property {string} url 6 | * @property {number} channelCount 7 | * @property {number} sampleRate 8 | * @property {number} duration 9 | */ 10 | 11 | /** 12 | * Utility class for assembling PCM16 "audio/wav" data 13 | * @class 14 | */ 15 | export class WavPacker { 16 | /** 17 | * Converts Float32Array of amplitude data to ArrayBuffer in Int16Array format 18 | * @param {Float32Array} float32Array 19 | * @returns {ArrayBuffer} 20 | */ 21 | static floatTo16BitPCM(float32Array) { 22 | const buffer = new ArrayBuffer(float32Array.length * 2); 23 | const view = new DataView(buffer); 24 | let offset = 0; 25 | for (let i = 0; i < float32Array.length; i++, offset += 2) { 26 | let s = Math.max(-1, Math.min(1, float32Array[i])); 27 | view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); 28 | } 29 | return buffer; 30 | } 31 | 32 | /** 33 | * Concatenates two ArrayBuffers 34 | * @param {ArrayBuffer} leftBuffer 35 | * @param {ArrayBuffer} rightBuffer 36 | * @returns {ArrayBuffer} 37 | */ 38 | static mergeBuffers(leftBuffer, rightBuffer) { 39 | const tmpArray = new Uint8Array( 40 | leftBuffer.byteLength + rightBuffer.byteLength 41 | ); 42 | tmpArray.set(new Uint8Array(leftBuffer), 0); 43 | tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength); 44 | return tmpArray.buffer; 45 | } 46 | 47 | /** 48 | * Packs data into an Int16 format 49 | * @private 50 | * @param {number} size 0 = 1x Int16, 1 = 2x Int16 51 | * @param {number} arg value to pack 52 | * @returns 53 | */ 54 | _packData(size, arg) { 55 | return [ 56 | new Uint8Array([arg, arg >> 8]), 57 | new Uint8Array([arg, arg >> 8, arg >> 16, arg >> 24]), 58 | ][size]; 59 | } 60 | 61 | /** 62 | * Packs audio into "audio/wav" Blob 63 | * @param {number} sampleRate 64 | * @param {{bitsPerSample: number, channels: Array, data: Int16Array}} audio 65 | * @returns {WavPackerAudioType} 66 | */ 67 | pack(sampleRate, audio) { 68 | if (!audio?.bitsPerSample) { 69 | throw new Error(`Missing "bitsPerSample"`); 70 | } else if (!audio?.channels) { 71 | throw new Error(`Missing "channels"`); 72 | } else if (!audio?.data) { 73 | throw new Error(`Missing "data"`); 74 | } 75 | const { bitsPerSample, channels, data } = audio; 76 | const output = [ 77 | // Header 78 | 'RIFF', 79 | this._packData( 80 | 1, 81 | 4 + (8 + 24) /* chunk 1 length */ + (8 + 8) /* chunk 2 length */ 82 | ), // Length 83 | 'WAVE', 84 | // chunk 1 85 | 'fmt ', // Sub-chunk identifier 86 | this._packData(1, 16), // Chunk length 87 | this._packData(0, 1), // Audio format (1 is linear quantization) 88 | this._packData(0, channels.length), 89 | this._packData(1, sampleRate), 90 | this._packData(1, (sampleRate * channels.length * bitsPerSample) / 8), // Byte rate 91 | this._packData(0, (channels.length * bitsPerSample) / 8), 92 | this._packData(0, bitsPerSample), 93 | // chunk 2 94 | 'data', // Sub-chunk identifier 95 | this._packData( 96 | 1, 97 | (channels[0].length * channels.length * bitsPerSample) / 8 98 | ), // Chunk length 99 | data, 100 | ]; 101 | const blob = new Blob(output, { type: 'audio/mpeg' }); 102 | const url = URL.createObjectURL(blob); 103 | return { 104 | blob, 105 | url, 106 | channelCount: channels.length, 107 | sampleRate, 108 | duration: data.byteLength / (channels.length * sampleRate * 2), 109 | }; 110 | } 111 | } 112 | 113 | globalThis.WavPacker = WavPacker; 114 | -------------------------------------------------------------------------------- /lib/wavtools/lib/wav_recorder.js: 
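// Illustrative usage of the WavRecorder class defined in the file below (a sketch that
// mirrors how WavMediaManager in lib/media-mgmt drives it; sendAudio is hypothetical):
//   const recorder = new WavRecorder({ sampleRate: 24000 });
//   await recorder.begin();                                          // opens the default microphone
//   await recorder.record((chunk) => sendAudio(chunk.mono), 8192);   // chunked PCM16 callbacks
//   await recorder.pause();                                          // keep the mic open, stop chunks
//   const wav = await recorder.end();                                // stop tracks and export the audio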
-------------------------------------------------------------------------------- 1 | import { AudioProcessorSrc } from './worklets/audio_processor.js'; 2 | import { AudioAnalysis } from './analysis/audio_analysis.js'; 3 | import { WavPacker } from './wav_packer.js'; 4 | 5 | /** 6 | * Decodes audio into a wav file 7 | * @typedef {Object} DecodedAudioType 8 | * @property {Blob} blob 9 | * @property {string} url 10 | * @property {Float32Array} values 11 | * @property {AudioBuffer} audioBuffer 12 | */ 13 | 14 | /** 15 | * Records live stream of user audio as PCM16 "audio/wav" data 16 | * @class 17 | */ 18 | export class WavRecorder { 19 | /** 20 | * Create a new WavRecorder instance 21 | * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} [options] 22 | * @returns {WavRecorder} 23 | */ 24 | constructor({ 25 | sampleRate = 44100, 26 | outputToSpeakers = false, 27 | debug = false, 28 | } = {}) { 29 | // Script source 30 | this.scriptSrc = AudioProcessorSrc; 31 | // Config 32 | this.sampleRate = sampleRate; 33 | this.outputToSpeakers = outputToSpeakers; 34 | this.debug = !!debug; 35 | this._deviceChangeCallback = null; 36 | this._devices = []; 37 | this.deviceSelection = null; 38 | // State variables 39 | this.stream = null; 40 | this.processor = null; 41 | this.source = null; 42 | this.node = null; 43 | this.recording = false; 44 | // Event handling with AudioWorklet 45 | this._lastEventId = 0; 46 | this.eventReceipts = {}; 47 | this.eventTimeout = 5000; 48 | // Process chunks of audio 49 | this._chunkProcessor = () => {}; 50 | this._chunkProcessorSize = void 0; 51 | this._chunkProcessorBuffer = { 52 | raw: new ArrayBuffer(0), 53 | mono: new ArrayBuffer(0), 54 | }; 55 | } 56 | 57 | /** 58 | * Decodes audio data from multiple formats to a Blob, url, Float32Array and AudioBuffer 59 | * @param {Blob|Float32Array|Int16Array|ArrayBuffer|number[]} audioData 60 | * @param {number} sampleRate 61 | * @param {number} fromSampleRate 62 | * @returns {Promise} 63 | */ 64 | static async decode(audioData, sampleRate = 44100, fromSampleRate = -1) { 65 | const context = new AudioContext({ sampleRate }); 66 | let arrayBuffer; 67 | let blob; 68 | if (audioData instanceof Blob) { 69 | if (fromSampleRate !== -1) { 70 | throw new Error( 71 | `Can not specify "fromSampleRate" when reading from Blob` 72 | ); 73 | } 74 | blob = audioData; 75 | arrayBuffer = await blob.arrayBuffer(); 76 | } else if (audioData instanceof ArrayBuffer) { 77 | if (fromSampleRate !== -1) { 78 | throw new Error( 79 | `Can not specify "fromSampleRate" when reading from ArrayBuffer` 80 | ); 81 | } 82 | arrayBuffer = audioData; 83 | blob = new Blob([arrayBuffer], { type: 'audio/wav' }); 84 | } else { 85 | let float32Array; 86 | let data; 87 | if (audioData instanceof Int16Array) { 88 | data = audioData; 89 | float32Array = new Float32Array(audioData.length); 90 | for (let i = 0; i < audioData.length; i++) { 91 | float32Array[i] = audioData[i] / 0x8000; 92 | } 93 | } else if (audioData instanceof Float32Array) { 94 | float32Array = audioData; 95 | } else if (audioData instanceof Array) { 96 | float32Array = new Float32Array(audioData); 97 | } else { 98 | throw new Error( 99 | `"audioData" must be one of: Blob, Float32Arrray, Int16Array, ArrayBuffer, Array` 100 | ); 101 | } 102 | if (fromSampleRate === -1) { 103 | throw new Error( 104 | `Must specify "fromSampleRate" when reading from Float32Array, In16Array or Array` 105 | ); 106 | } else if (fromSampleRate < 3000) { 107 | throw new Error(`Minimum "fromSampleRate" is 3000 
(3kHz)`); 108 | } 109 | if (!data) { 110 | data = WavPacker.floatTo16BitPCM(float32Array); 111 | } 112 | const audio = { 113 | bitsPerSample: 16, 114 | channels: [float32Array], 115 | data, 116 | }; 117 | const packer = new WavPacker(); 118 | const result = packer.pack(fromSampleRate, audio); 119 | blob = result.blob; 120 | arrayBuffer = await blob.arrayBuffer(); 121 | } 122 | const audioBuffer = await context.decodeAudioData(arrayBuffer); 123 | const values = audioBuffer.getChannelData(0); 124 | const url = URL.createObjectURL(blob); 125 | return { 126 | blob, 127 | url, 128 | values, 129 | audioBuffer, 130 | }; 131 | } 132 | 133 | /** 134 | * Logs data in debug mode 135 | * @param {...any} arguments 136 | * @returns {true} 137 | */ 138 | log() { 139 | if (this.debug) { 140 | this.log(...arguments); 141 | } 142 | return true; 143 | } 144 | 145 | /** 146 | * Retrieves the current sampleRate for the recorder 147 | * @returns {number} 148 | */ 149 | getSampleRate() { 150 | return this.sampleRate; 151 | } 152 | 153 | /** 154 | * Retrieves the current status of the recording 155 | * @returns {"ended"|"paused"|"recording"} 156 | */ 157 | getStatus() { 158 | if (!this.processor) { 159 | return 'ended'; 160 | } else if (!this.recording) { 161 | return 'paused'; 162 | } else { 163 | return 'recording'; 164 | } 165 | } 166 | 167 | /** 168 | * Sends an event to the AudioWorklet 169 | * @private 170 | * @param {string} name 171 | * @param {{[key: string]: any}} data 172 | * @param {AudioWorkletNode} [_processor] 173 | * @returns {Promise<{[key: string]: any}>} 174 | */ 175 | async _event(name, data = {}, _processor = null) { 176 | _processor = _processor || this.processor; 177 | if (!_processor) { 178 | throw new Error('Can not send events without recording first'); 179 | } 180 | const message = { 181 | event: name, 182 | id: this._lastEventId++, 183 | data, 184 | }; 185 | _processor.port.postMessage(message); 186 | const t0 = new Date().valueOf(); 187 | while (!this.eventReceipts[message.id]) { 188 | if (new Date().valueOf() - t0 > this.eventTimeout) { 189 | throw new Error(`Timeout waiting for "${name}" event`); 190 | } 191 | await new Promise((res) => setTimeout(() => res(true), 1)); 192 | } 193 | const payload = this.eventReceipts[message.id]; 194 | delete this.eventReceipts[message.id]; 195 | return payload; 196 | } 197 | 198 | /** 199 | * Sets device change callback, remove if callback provided is `null` 200 | * @param {(Array): void|null} callback 201 | * @returns {true} 202 | */ 203 | listenForDeviceChange(callback) { 204 | if (callback === null && this._deviceChangeCallback) { 205 | navigator.mediaDevices.removeEventListener( 206 | 'devicechange', 207 | this._deviceChangeCallback 208 | ); 209 | this._deviceChangeCallback = null; 210 | } else if (callback !== null) { 211 | // Basically a debounce; we only want this called once when devices change 212 | // And we only want the most recent callback() to be executed 213 | // if a few are operating at the same time 214 | let lastId = 0; 215 | let lastDevices = []; 216 | const serializeDevices = (devices) => 217 | devices 218 | .map((d) => d.deviceId) 219 | .sort() 220 | .join(','); 221 | const cb = async () => { 222 | let id = ++lastId; 223 | const devices = await this.listDevices(); 224 | if (id === lastId) { 225 | if (serializeDevices(lastDevices) !== serializeDevices(devices)) { 226 | lastDevices = devices; 227 | callback(devices.slice()); 228 | } 229 | } 230 | }; 231 | navigator.mediaDevices.addEventListener('devicechange', cb); 232 | cb(); 
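// Note: calling cb() immediately seeds lastDevices and fires the callback once with the
// current device list; later 'devicechange' events only re-fire it when the serialized
// device ids actually differ. Illustrative use:
//   recorder.listenForDeviceChange((mics) => console.log(mics.map((d) => d.label)));
//   recorder.listenForDeviceChange(null);   // detach the listener again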
233 | this._deviceChangeCallback = cb; 234 | } 235 | return true; 236 | } 237 | 238 | /** 239 | * Manually request permission to use the microphone 240 | * @returns {Promise} 241 | */ 242 | async requestPermission() { 243 | const permissionStatus = await navigator.permissions.query({ 244 | name: 'microphone', 245 | }); 246 | if (permissionStatus.state === 'denied') { 247 | window.alert('You must grant microphone access to use this feature.'); 248 | } else if (permissionStatus.state === 'prompt') { 249 | try { 250 | const stream = await navigator.mediaDevices.getUserMedia({ 251 | audio: true, 252 | }); 253 | const tracks = stream.getTracks(); 254 | tracks.forEach((track) => track.stop()); 255 | } catch (e) { 256 | window.alert('You must grant microphone access to use this feature.'); 257 | } 258 | } 259 | return true; 260 | } 261 | 262 | /** 263 | * List all eligible devices for recording, will request permission to use microphone 264 | * @returns {Promise>} 265 | */ 266 | async listDevices() { 267 | if ( 268 | !navigator.mediaDevices || 269 | !('enumerateDevices' in navigator.mediaDevices) 270 | ) { 271 | throw new Error('Could not request user devices'); 272 | } 273 | await this.requestPermission(); 274 | const devices = await navigator.mediaDevices.enumerateDevices(); 275 | const audioDevices = devices.filter( 276 | (device) => device.kind === 'audioinput' 277 | ); 278 | return audioDevices; 279 | // const defaultDeviceIndex = audioDevices.findIndex( 280 | // (device) => device.deviceId === 'default' 281 | // ); 282 | // const deviceList = []; 283 | // if (defaultDeviceIndex !== -1) { 284 | // let defaultDevice = audioDevices.splice(defaultDeviceIndex, 1)[0]; 285 | // let existingIndex = audioDevices.findIndex( 286 | // (device) => device.groupId === defaultDevice.groupId 287 | // ); 288 | // if (existingIndex !== -1) { 289 | // defaultDevice = audioDevices.splice(existingIndex, 1)[0]; 290 | // } 291 | // defaultDevice.default = true; 292 | // deviceList.push(defaultDevice); 293 | // } 294 | // return deviceList.concat(audioDevices); 295 | } 296 | 297 | /** 298 | * Begins a recording session and requests microphone permissions if not already granted 299 | * Microphone recording indicator will appear on browser tab but status will be "paused" 300 | * @param {string} [deviceId] if no device provided, default device will be used 301 | * @returns {Promise} 302 | */ 303 | async begin(deviceId) { 304 | if (this.processor) { 305 | throw new Error( 306 | `Already connected: please call .end() to start a new session` 307 | ); 308 | } 309 | 310 | if ( 311 | !navigator.mediaDevices || 312 | !('getUserMedia' in navigator.mediaDevices) 313 | ) { 314 | throw new Error('Could not request user media'); 315 | } 316 | deviceId = deviceId ?? 
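// Note: when begin() is called without an explicit id, it falls back to the last selected
// device, so end()/begin() cycles keep the same mic; deviceSelection is only cleared in
// quit(). Illustrative (deviceId is hypothetical):
//   await recorder.begin("usb-headset-id");
//   await recorder.end();
//   await recorder.begin();                 // re-opens "usb-headset-id"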
this.deviceSelection?.deviceId; 317 | try { 318 | const config = { audio: true }; 319 | if (deviceId) { 320 | config.audio = { deviceId: { exact: deviceId } }; 321 | } 322 | this.stream = await navigator.mediaDevices.getUserMedia(config); 323 | } catch (err) { 324 | throw new Error('Could not start media stream'); 325 | } 326 | 327 | this.listDevices().then((devices) => { 328 | deviceId = this.stream.getAudioTracks()[0].getSettings().deviceId; 329 | console.log( 330 | 'find current device', 331 | devices, 332 | deviceId, 333 | this.stream.getAudioTracks()[0].getSettings() 334 | ); 335 | this.deviceSelection = devices.find((d) => d.deviceId === deviceId); 336 | console.log('current device', this.deviceSelection); 337 | }); 338 | const context = new AudioContext({ sampleRate: this.sampleRate }); 339 | const source = context.createMediaStreamSource(this.stream); 340 | // Load and execute the module script. 341 | try { 342 | await context.audioWorklet.addModule(this.scriptSrc); 343 | } catch (e) { 344 | console.error(e); 345 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); 346 | } 347 | const processor = new AudioWorkletNode(context, 'audio_processor'); 348 | processor.port.onmessage = (e) => { 349 | const { event, id, data } = e.data; 350 | if (event === 'receipt') { 351 | this.eventReceipts[id] = data; 352 | } else if (event === 'chunk') { 353 | if (this._chunkProcessorSize) { 354 | const buffer = this._chunkProcessorBuffer; 355 | this._chunkProcessorBuffer = { 356 | raw: WavPacker.mergeBuffers(buffer.raw, data.raw), 357 | mono: WavPacker.mergeBuffers(buffer.mono, data.mono), 358 | }; 359 | if ( 360 | this._chunkProcessorBuffer.mono.byteLength >= 361 | this._chunkProcessorSize 362 | ) { 363 | this._chunkProcessor(this._chunkProcessorBuffer); 364 | this._chunkProcessorBuffer = { 365 | raw: new ArrayBuffer(0), 366 | mono: new ArrayBuffer(0), 367 | }; 368 | } 369 | } else { 370 | this._chunkProcessor(data); 371 | } 372 | } 373 | }; 374 | 375 | const node = source.connect(processor); 376 | const analyser = context.createAnalyser(); 377 | analyser.fftSize = 8192; 378 | analyser.smoothingTimeConstant = 0.1; 379 | node.connect(analyser); 380 | if (this.outputToSpeakers) { 381 | // eslint-disable-next-line no-console 382 | console.warn( 383 | 'Warning: Output to speakers may affect sound quality,\n' + 384 | 'especially due to system audio feedback preventative measures.\n' + 385 | 'use only for debugging' 386 | ); 387 | analyser.connect(context.destination); 388 | } 389 | 390 | this.source = source; 391 | this.node = node; 392 | this.analyser = analyser; 393 | this.processor = processor; 394 | console.log('begin completed'); 395 | return true; 396 | } 397 | 398 | /** 399 | * Gets the current frequency domain data from the recording track 400 | * @param {"frequency"|"music"|"voice"} [analysisType] 401 | * @param {number} [minDecibels] default -100 402 | * @param {number} [maxDecibels] default -30 403 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} 404 | */ 405 | getFrequencies( 406 | analysisType = 'frequency', 407 | minDecibels = -100, 408 | maxDecibels = -30 409 | ) { 410 | if (!this.processor) { 411 | throw new Error('Session ended: please call .begin() first'); 412 | } 413 | return AudioAnalysis.getFrequencies( 414 | this.analyser, 415 | this.sampleRate, 416 | null, 417 | analysisType, 418 | minDecibels, 419 | maxDecibels 420 | ); 421 | } 422 | 423 | /** 424 | * Pauses the recording 425 | * Keeps microphone stream open but halts storage of 
audio 426 | * @returns {Promise} 427 | */ 428 | async pause() { 429 | if (!this.processor) { 430 | throw new Error('Session ended: please call .begin() first'); 431 | } else if (!this.recording) { 432 | throw new Error('Already paused: please call .record() first'); 433 | } 434 | if (this._chunkProcessorBuffer.raw.byteLength) { 435 | this._chunkProcessor(this._chunkProcessorBuffer); 436 | } 437 | this.log('Pausing ...'); 438 | await this._event('stop'); 439 | this.recording = false; 440 | return true; 441 | } 442 | 443 | /** 444 | * Start recording stream and storing to memory from the connected audio source 445 | * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor] 446 | * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold met in mono audio 447 | * @returns {Promise} 448 | */ 449 | async record(chunkProcessor = () => {}, chunkSize = 8192) { 450 | if (!this.processor) { 451 | throw new Error('Session ended: please call .begin() first'); 452 | } else if (this.recording) { 453 | throw new Error('Already recording: please call .pause() first'); 454 | } else if (typeof chunkProcessor !== 'function') { 455 | throw new Error(`chunkProcessor must be a function`); 456 | } 457 | this._chunkProcessor = chunkProcessor; 458 | this._chunkProcessorSize = chunkSize; 459 | this._chunkProcessorBuffer = { 460 | raw: new ArrayBuffer(0), 461 | mono: new ArrayBuffer(0), 462 | }; 463 | this.log('Recording ...'); 464 | await this._event('start'); 465 | this.recording = true; 466 | return true; 467 | } 468 | 469 | /** 470 | * Clears the audio buffer, empties stored recording 471 | * @returns {Promise} 472 | */ 473 | async clear() { 474 | if (!this.processor) { 475 | throw new Error('Session ended: please call .begin() first'); 476 | } 477 | await this._event('clear'); 478 | return true; 479 | } 480 | 481 | /** 482 | * Reads the current audio stream data 483 | * @returns {Promise<{meanValues: Float32Array, channels: Array}>} 484 | */ 485 | async read() { 486 | if (!this.processor) { 487 | throw new Error('Session ended: please call .begin() first'); 488 | } 489 | this.log('Reading ...'); 490 | const result = await this._event('read'); 491 | return result; 492 | } 493 | 494 | /** 495 | * Saves the current audio stream to a file 496 | * @param {boolean} [force] Force saving while still recording 497 | * @returns {Promise} 498 | */ 499 | async save(force = false) { 500 | if (!this.processor) { 501 | throw new Error('Session ended: please call .begin() first'); 502 | } 503 | if (!force && this.recording) { 504 | throw new Error( 505 | 'Currently recording: please call .pause() first, or call .save(true) to force' 506 | ); 507 | } 508 | this.log('Exporting ...'); 509 | const exportData = await this._event('export'); 510 | const packer = new WavPacker(); 511 | const result = packer.pack(this.sampleRate, exportData.audio); 512 | return result; 513 | } 514 | 515 | /** 516 | * Ends the current recording session and saves the result 517 | * @returns {Promise} 518 | */ 519 | async end() { 520 | if (!this.processor) { 521 | throw new Error('Session ended: please call .begin() first'); 522 | } 523 | 524 | const _processor = this.processor; 525 | 526 | this.log('Stopping ...'); 527 | await this._event('stop'); 528 | this.recording = false; 529 | const tracks = this.stream.getTracks(); 530 | tracks.forEach((track) => track.stop()); 531 | 532 | this.log('Exporting ...'); 533 | const exportData = await this._event('export', {}, _processor); 534 | 535 | 
this.processor.disconnect(); 536 | this.source.disconnect(); 537 | this.node.disconnect(); 538 | this.analyser.disconnect(); 539 | this.stream = null; 540 | this.processor = null; 541 | this.source = null; 542 | this.node = null; 543 | 544 | const packer = new WavPacker(); 545 | const result = packer.pack(this.sampleRate, exportData.audio); 546 | return result; 547 | } 548 | 549 | /** 550 | * Performs a full cleanup of WavRecorder instance 551 | * Stops actively listening via microphone and removes existing listeners 552 | * @returns {Promise} 553 | */ 554 | async quit() { 555 | this.listenForDeviceChange(null); 556 | // we do not reset this on end so that selections persist across starts 557 | this.deviceSelection = null; 558 | if (this.processor) { 559 | await this.end(); 560 | } 561 | return true; 562 | } 563 | } 564 | 565 | globalThis.WavRecorder = WavRecorder; 566 | -------------------------------------------------------------------------------- /lib/wavtools/lib/wav_stream_player.js: -------------------------------------------------------------------------------- 1 | import { StreamProcessorSrc } from "./worklets/stream_processor.js"; 2 | import { AudioAnalysis } from "./analysis/audio_analysis.js"; 3 | 4 | /** 5 | * Plays audio streams received in raw PCM16 chunks from the browser 6 | * @class 7 | */ 8 | export class WavStreamPlayer { 9 | /** 10 | * Creates a new WavStreamPlayer instance 11 | * @param {{sampleRate?: number}} options 12 | * @returns {WavStreamPlayer} 13 | */ 14 | constructor({ sampleRate = 44100 } = {}) { 15 | this.scriptSrc = StreamProcessorSrc; 16 | this.sampleRate = sampleRate; 17 | this.context = null; 18 | this.stream = null; 19 | this.analyser = null; 20 | this.trackSampleOffsets = {}; 21 | this.interruptedTrackIds = {}; 22 | } 23 | 24 | /** 25 | * Connects the audio context and enables output to speakers 26 | * @returns {Promise} 27 | */ 28 | async connect() { 29 | this.context = new AudioContext({ sampleRate: this.sampleRate }); 30 | if (this._speakerID) { 31 | this.context.setSinkId(this._speakerID); 32 | } 33 | if (this.context.state === "suspended") { 34 | await this.context.resume(); 35 | } 36 | try { 37 | await this.context.audioWorklet.addModule(this.scriptSrc); 38 | } catch (e) { 39 | console.error(e); 40 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`); 41 | } 42 | const analyser = this.context.createAnalyser(); 43 | analyser.fftSize = 8192; 44 | analyser.smoothingTimeConstant = 0.1; 45 | this.analyser = analyser; 46 | return true; 47 | } 48 | 49 | /** 50 | * Gets the current frequency domain data from the playing track 51 | * @param {"frequency"|"music"|"voice"} [analysisType] 52 | * @param {number} [minDecibels] default -100 53 | * @param {number} [maxDecibels] default -30 54 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType} 55 | */ 56 | getFrequencies( 57 | analysisType = "frequency", 58 | minDecibels = -100, 59 | maxDecibels = -30 60 | ) { 61 | if (!this.analyser) { 62 | throw new Error("Not connected, please call .connect() first"); 63 | } 64 | return AudioAnalysis.getFrequencies( 65 | this.analyser, 66 | this.sampleRate, 67 | null, 68 | analysisType, 69 | minDecibels, 70 | maxDecibels 71 | ); 72 | } 73 | 74 | /** 75 | * @param {string} speaker deviceId 76 | */ 77 | async updateSpeaker(speaker) { 78 | const _prevSpeaker = this._speakerID; 79 | this._speakerID = speaker; 80 | if (this.context) { 81 | try { 82 | if (speaker === "default") { 83 | await this.context.setSinkId(); 84 | } else { 85 | 
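// Note: AudioContext.setSinkId() may be unavailable in some browsers; when the call below
// throws, the catch block restores the previously stored speaker id. Illustrative use
// (someSpeakerId would come from enumerateDevices()):
//   await player.updateSpeaker("default");        // route back to the system default
//   await player.updateSpeaker(someSpeakerId);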
await this.context.setSinkId(speaker); 86 | } 87 | } catch (e) { 88 | console.error(`Could not set sinkId to ${speaker}: ${e}`); 89 | this._speakerID = _prevSpeaker; 90 | } 91 | } 92 | } 93 | 94 | /** 95 | * Starts audio streaming 96 | * @private 97 | * @returns {Promise} 98 | */ 99 | _start() { 100 | const streamNode = new AudioWorkletNode(this.context, "stream_processor"); 101 | streamNode.connect(this.context.destination); 102 | streamNode.port.onmessage = (e) => { 103 | const { event } = e.data; 104 | if (event === "stop") { 105 | streamNode.disconnect(); 106 | this.stream = null; 107 | } else if (event === "offset") { 108 | const { requestId, trackId, offset } = e.data; 109 | const currentTime = offset / this.sampleRate; 110 | this.trackSampleOffsets[requestId] = { trackId, offset, currentTime }; 111 | } 112 | }; 113 | this.analyser.disconnect(); 114 | streamNode.connect(this.analyser); 115 | this.stream = streamNode; 116 | return true; 117 | } 118 | 119 | /** 120 | * Adds 16BitPCM data to the currently playing audio stream 121 | * You can add chunks beyond the current play point and they will be queued for play 122 | * @param {ArrayBuffer|Int16Array} arrayBuffer 123 | * @param {string} [trackId] 124 | * @returns {Int16Array} 125 | */ 126 | add16BitPCM(arrayBuffer, trackId = "default") { 127 | if (typeof trackId !== "string") { 128 | throw new Error(`trackId must be a string`); 129 | } else if (this.interruptedTrackIds[trackId]) { 130 | return; 131 | } 132 | if (!this.stream) { 133 | this._start(); 134 | } 135 | let buffer; 136 | if (arrayBuffer instanceof Int16Array) { 137 | buffer = arrayBuffer; 138 | } else if (arrayBuffer instanceof ArrayBuffer) { 139 | buffer = new Int16Array(arrayBuffer); 140 | } else { 141 | throw new Error(`argument must be Int16Array or ArrayBuffer`); 142 | } 143 | this.stream.port.postMessage({ event: "write", buffer, trackId }); 144 | return buffer; 145 | } 146 | 147 | /** 148 | * Gets the offset (sample count) of the currently playing stream 149 | * @param {boolean} [interrupt] 150 | * @returns {{trackId: string|null, offset: number, currentTime: number}} 151 | */ 152 | async getTrackSampleOffset(interrupt = false) { 153 | if (!this.stream) { 154 | return null; 155 | } 156 | const requestId = crypto.randomUUID(); 157 | this.stream.port.postMessage({ 158 | event: interrupt ? 
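// Note: this posts either an "interrupt" or an "offset" request to the stream_processor
// worklet; the worklet answers on the same port with an "offset" message carrying this
// requestId, which the polling loop below waits for. When interrupt is true the returned
// trackId is recorded in interruptedTrackIds, so later add16BitPCM() calls for that track
// are dropped. Illustrative barge-in handling:
//   const { trackId, currentTime } = await player.interrupt();   // stop playback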
"interrupt" : "offset", 159 | requestId, 160 | }); 161 | let trackSampleOffset; 162 | while (!trackSampleOffset) { 163 | trackSampleOffset = this.trackSampleOffsets[requestId]; 164 | await new Promise((r) => setTimeout(() => r(), 1)); 165 | } 166 | const { trackId } = trackSampleOffset; 167 | if (interrupt && trackId) { 168 | this.interruptedTrackIds[trackId] = true; 169 | } 170 | return trackSampleOffset; 171 | } 172 | 173 | /** 174 | * Strips the current stream and returns the sample offset of the audio 175 | * @param {boolean} [interrupt] 176 | * @returns {{trackId: string|null, offset: number, currentTime: number}} 177 | */ 178 | async interrupt() { 179 | return this.getTrackSampleOffset(true); 180 | } 181 | } 182 | 183 | globalThis.WavStreamPlayer = WavStreamPlayer; 184 | -------------------------------------------------------------------------------- /lib/wavtools/lib/worklets/audio_processor.js: -------------------------------------------------------------------------------- 1 | const AudioProcessorWorklet = ` 2 | class AudioProcessor extends AudioWorkletProcessor { 3 | 4 | constructor() { 5 | super(); 6 | this.port.onmessage = this.receive.bind(this); 7 | this.initialize(); 8 | } 9 | 10 | initialize() { 11 | this.foundAudio = false; 12 | this.recording = false; 13 | this.chunks = []; 14 | } 15 | 16 | /** 17 | * Concatenates sampled chunks into channels 18 | * Format is chunk[Left[], Right[]] 19 | */ 20 | readChannelData(chunks, channel = -1, maxChannels = 9) { 21 | let channelLimit; 22 | if (channel !== -1) { 23 | if (chunks[0] && chunks[0].length - 1 < channel) { 24 | throw new Error( 25 | \`Channel \${channel} out of range: max \${chunks[0].length}\` 26 | ); 27 | } 28 | channelLimit = channel + 1; 29 | } else { 30 | channel = 0; 31 | channelLimit = Math.min(chunks[0] ? chunks[0].length : 1, maxChannels); 32 | } 33 | const channels = []; 34 | for (let n = channel; n < channelLimit; n++) { 35 | const length = chunks.reduce((sum, chunk) => { 36 | return sum + chunk[n].length; 37 | }, 0); 38 | const buffers = chunks.map((chunk) => chunk[n]); 39 | const result = new Float32Array(length); 40 | let offset = 0; 41 | for (let i = 0; i < buffers.length; i++) { 42 | result.set(buffers[i], offset); 43 | offset += buffers[i].length; 44 | } 45 | channels[n] = result; 46 | } 47 | return channels; 48 | } 49 | 50 | /** 51 | * Combines parallel audio data into correct format, 52 | * channels[Left[], Right[]] to float32Array[LRLRLRLR...] 
53 | */ 54 | formatAudioData(channels) { 55 | if (channels.length === 1) { 56 | // Simple case is only one channel 57 | const float32Array = channels[0].slice(); 58 | const meanValues = channels[0].slice(); 59 | return { float32Array, meanValues }; 60 | } else { 61 | const float32Array = new Float32Array( 62 | channels[0].length * channels.length 63 | ); 64 | const meanValues = new Float32Array(channels[0].length); 65 | for (let i = 0; i < channels[0].length; i++) { 66 | const offset = i * channels.length; 67 | let meanValue = 0; 68 | for (let n = 0; n < channels.length; n++) { 69 | float32Array[offset + n] = channels[n][i]; 70 | meanValue += channels[n][i]; 71 | } 72 | meanValues[i] = meanValue / channels.length; 73 | } 74 | return { float32Array, meanValues }; 75 | } 76 | } 77 | 78 | /** 79 | * Converts 32-bit float data to 16-bit integers 80 | */ 81 | floatTo16BitPCM(float32Array) { 82 | const buffer = new ArrayBuffer(float32Array.length * 2); 83 | const view = new DataView(buffer); 84 | let offset = 0; 85 | for (let i = 0; i < float32Array.length; i++, offset += 2) { 86 | let s = Math.max(-1, Math.min(1, float32Array[i])); 87 | view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true); 88 | } 89 | return buffer; 90 | } 91 | 92 | /** 93 | * Retrieves the most recent amplitude values from the audio stream 94 | * @param {number} channel 95 | */ 96 | getValues(channel = -1) { 97 | const channels = this.readChannelData(this.chunks, channel); 98 | const { meanValues } = this.formatAudioData(channels); 99 | return { meanValues, channels }; 100 | } 101 | 102 | /** 103 | * Exports chunks as an audio/wav file 104 | */ 105 | export() { 106 | const channels = this.readChannelData(this.chunks); 107 | const { float32Array, meanValues } = this.formatAudioData(channels); 108 | const audioData = this.floatTo16BitPCM(float32Array); 109 | return { 110 | meanValues: meanValues, 111 | audio: { 112 | bitsPerSample: 16, 113 | channels: channels, 114 | data: audioData, 115 | }, 116 | }; 117 | } 118 | 119 | receive(e) { 120 | const { event, id } = e.data; 121 | let receiptData = {}; 122 | switch (event) { 123 | case 'start': 124 | this.recording = true; 125 | break; 126 | case 'stop': 127 | this.recording = false; 128 | break; 129 | case 'clear': 130 | this.initialize(); 131 | break; 132 | case 'export': 133 | receiptData = this.export(); 134 | break; 135 | case 'read': 136 | receiptData = this.getValues(); 137 | break; 138 | default: 139 | break; 140 | } 141 | // Always send back receipt 142 | this.port.postMessage({ event: 'receipt', id, data: receiptData }); 143 | } 144 | 145 | sendChunk(chunk) { 146 | const channels = this.readChannelData([chunk]); 147 | const { float32Array, meanValues } = this.formatAudioData(channels); 148 | const rawAudioData = this.floatTo16BitPCM(float32Array); 149 | const monoAudioData = this.floatTo16BitPCM(meanValues); 150 | this.port.postMessage({ 151 | event: 'chunk', 152 | data: { 153 | mono: monoAudioData, 154 | raw: rawAudioData, 155 | }, 156 | }); 157 | } 158 | 159 | process(inputList, outputList, parameters) { 160 | // Copy input to output (e.g. 
speakers) 161 | // Note that this creates choppy sounds with Mac products 162 | const sourceLimit = Math.min(inputList.length, outputList.length); 163 | for (let inputNum = 0; inputNum < sourceLimit; inputNum++) { 164 | const input = inputList[inputNum]; 165 | const output = outputList[inputNum]; 166 | const channelCount = Math.min(input.length, output.length); 167 | for (let channelNum = 0; channelNum < channelCount; channelNum++) { 168 | input[channelNum].forEach((sample, i) => { 169 | output[channelNum][i] = sample; 170 | }); 171 | } 172 | } 173 | const inputs = inputList[0]; 174 | // There's latency at the beginning of a stream before recording starts 175 | // Make sure we actually receive audio data before we start storing chunks 176 | let sliceIndex = 0; 177 | if (!this.foundAudio) { 178 | for (const channel of inputs) { 179 | sliceIndex = 0; // reset for each channel 180 | if (this.foundAudio) { 181 | break; 182 | } 183 | if (channel) { 184 | for (const value of channel) { 185 | if (value !== 0) { 186 | // find only one non-zero entry in any channel 187 | this.foundAudio = true; 188 | break; 189 | } else { 190 | sliceIndex++; 191 | } 192 | } 193 | } 194 | } 195 | } 196 | if (inputs && inputs[0] && this.foundAudio && this.recording) { 197 | // We need to copy the TypedArray, because the \`process\` 198 | // internals will reuse the same buffer to hold each input 199 | const chunk = inputs.map((input) => input.slice(sliceIndex)); 200 | this.chunks.push(chunk); 201 | this.sendChunk(chunk); 202 | } 203 | return true; 204 | } 205 | } 206 | 207 | registerProcessor('audio_processor', AudioProcessor); 208 | `; 209 | 210 | const script = new Blob([AudioProcessorWorklet], { 211 | type: 'application/javascript', 212 | }); 213 | const src = URL.createObjectURL(script); 214 | export const AudioProcessorSrc = src; 215 | -------------------------------------------------------------------------------- /lib/wavtools/lib/worklets/stream_processor.js: -------------------------------------------------------------------------------- 1 | export const StreamProcessorWorklet = ` 2 | class StreamProcessor extends AudioWorkletProcessor { 3 | constructor() { 4 | super(); 5 | this.hasStarted = false; 6 | this.hasInterrupted = false; 7 | this.outputBuffers = []; 8 | this.bufferLength = 128; 9 | this.write = { buffer: new Float32Array(this.bufferLength), trackId: null }; 10 | this.writeOffset = 0; 11 | this.trackSampleOffsets = {}; 12 | this.port.onmessage = (event) => { 13 | if (event.data) { 14 | const payload = event.data; 15 | if (payload.event === 'write') { 16 | const int16Array = payload.buffer; 17 | const float32Array = new Float32Array(int16Array.length); 18 | for (let i = 0; i < int16Array.length; i++) { 19 | float32Array[i] = int16Array[i] / 0x8000; // Convert Int16 to Float32 20 | } 21 | this.writeData(float32Array, payload.trackId); 22 | } else if ( 23 | payload.event === 'offset' || 24 | payload.event === 'interrupt' 25 | ) { 26 | const requestId = payload.requestId; 27 | const trackId = this.write.trackId; 28 | const offset = this.trackSampleOffsets[trackId] || 0; 29 | this.port.postMessage({ 30 | event: 'offset', 31 | requestId, 32 | trackId, 33 | offset, 34 | }); 35 | if (payload.event === 'interrupt') { 36 | this.hasInterrupted = true; 37 | } 38 | } else { 39 | throw new Error(\`Unhandled event "\${payload.event}"\`); 40 | } 41 | } 42 | }; 43 | } 44 | 45 | writeData(float32Array, trackId = null) { 46 | let { buffer } = this.write; 47 | let offset = this.writeOffset; 48 | for (let i = 0; i < 
float32Array.length; i++) { 49 | buffer[offset++] = float32Array[i]; 50 | if (offset >= buffer.length) { 51 | this.outputBuffers.push(this.write); 52 | this.write = { buffer: new Float32Array(this.bufferLength), trackId }; 53 | buffer = this.write.buffer; 54 | offset = 0; 55 | } 56 | } 57 | this.writeOffset = offset; 58 | return true; 59 | } 60 | 61 | process(inputs, outputs, parameters) { 62 | const output = outputs[0]; 63 | const outputChannelData = output[0]; 64 | const outputBuffers = this.outputBuffers; 65 | if (this.hasInterrupted) { 66 | this.port.postMessage({ event: 'stop' }); 67 | return false; 68 | } else if (outputBuffers.length) { 69 | this.hasStarted = true; 70 | const { buffer, trackId } = outputBuffers.shift(); 71 | for (let i = 0; i < outputChannelData.length; i++) { 72 | outputChannelData[i] = buffer[i] || 0; 73 | } 74 | if (trackId) { 75 | this.trackSampleOffsets[trackId] = 76 | this.trackSampleOffsets[trackId] || 0; 77 | this.trackSampleOffsets[trackId] += buffer.length; 78 | } 79 | return true; 80 | } else if (this.hasStarted) { 81 | this.port.postMessage({ event: 'stop' }); 82 | return false; 83 | } else { 84 | return true; 85 | } 86 | } 87 | } 88 | 89 | registerProcessor('stream_processor', StreamProcessor); 90 | `; 91 | 92 | const script = new Blob([StreamProcessorWorklet], { 93 | type: 'application/javascript', 94 | }); 95 | const src = URL.createObjectURL(script); 96 | export const StreamProcessorSrc = src; 97 | -------------------------------------------------------------------------------- /lib/websocket-utils/reconnectingWebSocket.ts: -------------------------------------------------------------------------------- 1 | import { EventEmitter } from "events"; 2 | 3 | const readyStates = ["CONNECTING", "OPEN", "CLOSING", "CLOSED"]; 4 | const KEEP_ALIVE_INTERVAL = 5000; 5 | const KEEP_ALIVE_TIMEOUT = 15000; 6 | // client side code in soupSFU has a timeout of 15 seconds for command response 7 | // 5 seconds seems reasonable that it provides roughly 3 retry attempts 8 | const WEBSOCKET_CONNECTION_TIMEOUT = 150 * 1000; 9 | const DEFAULT_RECONNECT_ATTEMPTS = 2; 10 | const MAX_RECONNECT_ATTEMPTS = 10; 11 | const DEFAULT_RECONNECT_INTERVAL = 1000; 12 | const MAX_RECONNECT_INTERVAL = 30 * 1000; 13 | const DEFAULT_RECONNECT_DECAY = 1.5; 14 | 15 | const WEBSOCKET_TIMEOUT_CODE = 4100; 16 | 17 | const SIG_CONNECTION_CANCELED = "SIG_CONNECTION_CANCELED"; 18 | const WEBSOCKET_ERROR = "WEBSOCKET_ERROR"; 19 | 20 | enum LOG_LEVEL { 21 | DEBUG, 22 | ERROR, 23 | INFO, 24 | WARN, 25 | } 26 | 27 | class rWebSocket { 28 | private _ws: WebSocket; 29 | _closedManually: boolean = false; 30 | _errored: boolean = false; 31 | _rejected: boolean = false; 32 | _timed_out: boolean = false; 33 | _initialConnectionOk: string | boolean = false; 34 | 35 | constructor(url: string, protocols?: string | string[]) { 36 | this._ws = new WebSocket(url, protocols); 37 | } 38 | 39 | addEventListener( 40 | type: string, 41 | listener: (this: WebSocket, ev: Event) => any, 42 | ) { 43 | this._ws.addEventListener(type, listener); 44 | } 45 | 46 | // Add other WebSocket methods as needed 47 | close(code?: number, reason?: string) { 48 | this._ws.close(code, reason); 49 | } 50 | 51 | send(data: string | ArrayBuffer | Blob | ArrayBufferView) { 52 | this._ws.send(data); 53 | } 54 | 55 | // Add getters for WebSocket properties 56 | get url() { 57 | return this._ws.url; 58 | } 59 | 60 | get readyState() { 61 | return this._ws.readyState; 62 | } 63 | } 64 | 65 | interface WebSocketOptions { 66 | parseBlobToJson?: 
boolean; 67 | } 68 | 69 | /** 70 | * Builds on top of Javascript Websockets 71 | * 72 | * This behaves like the Websocket library in every way, except if it fails to 73 | * connect or if it gets disconnected, it will try to reconnect depending on 74 | * the maximum number of reconnect attempts set. retry is not enabled for initial 75 | * connection. When initial connection fails it is best to check yourself before 76 | * you keep wreckin' yourself. 77 | * 78 | * It is API compatible, so when you have: 79 | * ws = new WebSocket('ws://....'); 80 | * you can replace with: 81 | * ws = new ReconnectingWebSocket('ws://....'); 82 | * 83 | * While it is API compatible with the NodeJS ws library, we provide the 84 | * following additional properties and events on the ReconnectingWebSocket. 85 | * 86 | * Events: 87 | * 88 | * connection-timeout 89 | * - Emitted when the web socket connection times out. 90 | * 91 | * reconnecting 92 | * - Emitted after a manual close of the web socket is done and before retrying 93 | * the connection. 94 | * 95 | * reconnect-failed 96 | * - Emitted when the number of connection attempts exceeds the set number of 97 | * reconnection attempts. 98 | * 99 | * keep-alive 100 | * - Emitted when the set keep alive interval elapses. This event may be used 101 | * to have ping pong keep-alive mechanism for web socket health. 102 | * 103 | * Properties: 104 | * 105 | * keepAliveTimeout 106 | * - The timeout for keep-alive. Default: 15000 107 | * 108 | * keepAliveInterval 109 | * - The interval at which to emit keep-alive event. Default: 5000 110 | * 111 | * shouldRetryFn 112 | * - A callback function which should return boolean to determine if a web 113 | * socket reconnection attempt should be made. When not set, connection is 114 | * always retried. 115 | * 116 | * connectionTimeout 117 | * - The timeout interval for considering whether the connection timed out. 118 | * Default: 20000 ms 119 | * 120 | * maxReconnectAttempts 121 | * - The maximum number of attempts to be made for reconnection. Default: 2 122 | * 123 | * reconnectInterval 124 | * - The interval to wait before attempting a reconnection. Default: 1000 ms 125 | */ 126 | export class ReconnectingWebSocket extends EventEmitter { 127 | /** The connection is not yet open. */ 128 | static readonly CONNECTING: 0; 129 | /** The connection is open and ready to communicate. */ 130 | static readonly OPEN: 1; 131 | /** The connection is in the process of closing. */ 132 | static readonly CLOSING: 2; 133 | /** The connection is closed. 
*/ 134 | static readonly CLOSED: 3; 135 | 136 | private _ws: rWebSocket | null; 137 | 138 | _url: string; 139 | _protocols: string | string[] | undefined; 140 | 141 | declare private _keepAliveTimeout: number; 142 | declare private _keepAliveInterval: number; 143 | declare private _lastMsgRecvTime: number; 144 | declare private _lastMsgSendTime: number; 145 | declare private _disconnected: boolean; 146 | declare private _keepIntervalID: NodeJS.Timeout | null; 147 | declare private _connectionTimeout: number; 148 | declare private _connectionTimeoutID: NodeJS.Timeout | undefined; 149 | declare private _reconnectTimeoutID: NodeJS.Timeout | undefined; 150 | declare private _shouldRetryFn: (() => boolean) | null; 151 | declare private _reconnectAttempts: number; 152 | declare private _allowedReconnectAttempts: number; 153 | declare private _reconnectInterval: number; 154 | declare private _maxReconnectInterval: number; 155 | declare private _reconnectDecay: number; 156 | declare private _parseBlobToJson: boolean; 157 | 158 | constructor( 159 | address: string, 160 | protocols?: string | string[], 161 | options: WebSocketOptions = {}, 162 | ) { 163 | super(); 164 | 165 | if (!address) { 166 | throw new Error("Need a valid WebSocket URL"); 167 | } 168 | 169 | this._ws = null; 170 | 171 | this._url = address; 172 | this._protocols = protocols; 173 | this._parseBlobToJson = options?.parseBlobToJson ?? true; 174 | 175 | this.init(); 176 | } 177 | 178 | private init() { 179 | this._keepAliveTimeout = KEEP_ALIVE_TIMEOUT; 180 | this._keepAliveInterval = KEEP_ALIVE_INTERVAL; 181 | this._disconnected = false; 182 | this._keepIntervalID = null; 183 | this._shouldRetryFn = null; 184 | this._connectionTimeout = WEBSOCKET_CONNECTION_TIMEOUT; 185 | this._reconnectAttempts = 0; 186 | this._allowedReconnectAttempts = DEFAULT_RECONNECT_ATTEMPTS; 187 | this._reconnectInterval = DEFAULT_RECONNECT_INTERVAL; 188 | this._maxReconnectInterval = MAX_RECONNECT_INTERVAL; 189 | this._reconnectDecay = DEFAULT_RECONNECT_DECAY; 190 | } 191 | 192 | public async connect() { 193 | return new Promise((resolve, reject) => { 194 | this._disconnected = false; 195 | this.clearReconnectTimeout(); 196 | 197 | let ws: rWebSocket = new rWebSocket(this._url, this._protocols); 198 | this.setConnectionTimeout(); 199 | 200 | ws.addEventListener("close", (evt) => { 201 | const closeEvent = evt as CloseEvent; 202 | let code = ws._timed_out ? WEBSOCKET_TIMEOUT_CODE : closeEvent.code; 203 | let reason = ws._timed_out 204 | ? "websocket connection timed out" 205 | : closeEvent.reason; 206 | ws._timed_out = false; 207 | if (!ws._closedManually && ws._initialConnectionOk) { 208 | console.warn( 209 | `signaling socket closed unexpectedly: ${code}${ 210 | reason ? " " + reason : "" 211 | }`, 212 | ); 213 | this._closeSocket(); 214 | this.emit("close", code, reason); 215 | } else { 216 | this.log("signaling socket closed"); 217 | } 218 | if (!ws._closedManually && (ws._errored || ws._timed_out)) { 219 | console.warn( 220 | `signaling socket closed on error: ${code}${ 221 | reason ? 
" " + reason : "" 222 | }`, 223 | ); 224 | if (!ws._rejected) { 225 | ws._rejected = true; 226 | const err = new Error( 227 | `WebSocket connection error (${code}): ${reason}`, 228 | ); 229 | err.name = WEBSOCKET_ERROR; 230 | reject(err); 231 | } 232 | } 233 | }); 234 | ws.addEventListener("open", (evt) => { 235 | this.log("wss connection opened to", LOG_LEVEL.DEBUG, this._url); 236 | this.clearConnectionTimeout(); 237 | // now that the timeout closes the socket, in theory this onopen 238 | // callback should never happen in the first place, but seems 239 | // harmless to leave these safeguards in 240 | if (ws._rejected || ws._timed_out) { 241 | return; 242 | } 243 | if (ws._closedManually || (this._ws && this._ws !== ws)) { 244 | ws._rejected = true; 245 | ws.close(); 246 | let err = Error( 247 | "wss connection interrupted by disconnect or newer connection", 248 | ); 249 | err.name = SIG_CONNECTION_CANCELED; 250 | reject(err); 251 | return; 252 | } 253 | ws._initialConnectionOk = this._url; 254 | this._lastMsgRecvTime = Date.now(); 255 | if (this._keepAliveInterval) { 256 | this._keepIntervalID = setInterval( 257 | () => this.checkSocketHealthAndSendKeepAlive(), 258 | this._keepAliveInterval, 259 | ); 260 | } 261 | this._ws = ws; 262 | this.emit("open"); 263 | resolve(ws); 264 | }); 265 | ws.addEventListener("error", (evt) => { 266 | // fyi: evt is an Event here, with 0 amount of helpful info. If there 267 | // happens to be info about the error, it's included in the 268 | // accompanying close event (because that make sense. shakes head) 269 | // SO. We do not reject here. Instead, we just set the _errored 270 | // flag on the socket so when the close event occurs, it knows to 271 | // reject the promise 272 | if (!ws._closedManually) { 273 | const wsTarget = evt.currentTarget as WebSocket; 274 | this.log(`websocket error event: ${wsTarget?.url}`); 275 | } 276 | ws._errored = true; 277 | }); 278 | ws.addEventListener("message", (msg) => { 279 | void this._handleMessage(msg as MessageEvent); 280 | }); 281 | }); 282 | } 283 | 284 | private setConnectionTimeout() { 285 | this._connectionTimeoutID = setTimeout(async () => { 286 | this.log("Connection reconnect attempt timed out."); 287 | this.emit("connection-timeout"); 288 | this.clearConnectionTimeout(); 289 | await this._closeSocket(); 290 | }, this._connectionTimeout); 291 | } 292 | 293 | private clearConnectionTimeout() { 294 | clearTimeout(this._connectionTimeoutID); 295 | this._connectionTimeoutID = undefined; 296 | } 297 | 298 | private clearReconnectTimeout() { 299 | clearTimeout(this._reconnectTimeoutID); 300 | this._reconnectTimeoutID = undefined; 301 | } 302 | 303 | private clearKeepAliveInterval() { 304 | if (this._keepIntervalID) { 305 | clearInterval(this._keepIntervalID); 306 | this._keepIntervalID = null; 307 | } 308 | } 309 | 310 | private async checkSocketHealthAndSendKeepAlive() { 311 | if (!(this._ws && this._ws.readyState === WebSocket.OPEN)) { 312 | return; 313 | } 314 | 315 | if (!this._keepAliveTimeout || !this._keepAliveInterval) { 316 | return; 317 | } 318 | 319 | // See if we haven't gotten a message back recently, and if we 320 | // haven't, close the socket. the os timeouts to detect if a socket 321 | // has gone stale are longer than we want. 
322 | if (Date.now() - this._lastMsgRecvTime > this._keepAliveTimeout) { 323 | this.log("Connection is stale, need to reconnect", LOG_LEVEL.WARN); 324 | await this._closeSocket(); 325 | return; 326 | } 327 | 328 | // Only emit the keep-alive event if we haven't sent anything else recently 329 | if (Date.now() - this._lastMsgSendTime < this._keepAliveInterval) { 330 | return; 331 | } 332 | 333 | this.log("Emitting keep-alive", LOG_LEVEL.DEBUG); 334 | this.emit("keep-alive"); 335 | } 336 | 337 | // We use the word manually here to imply the application using this code 338 | // or this code itself will decide to close the socket. 339 | private async _closeSocket() { 340 | this.log("Closing"); 341 | try { 342 | this.clearKeepAliveInterval(); 343 | this._lastMsgRecvTime = 0; 344 | 345 | if (this._ws) { 346 | this._ws._closedManually = true; 347 | this._ws.close(); 348 | } 349 | 350 | // query retry function if we want to retry. 351 | const shouldRetry = 352 | this._ws?._initialConnectionOk && 353 | this._shouldRetryFn && 354 | this._shouldRetryFn(); 355 | 356 | this._ws = null; 357 | 358 | if (shouldRetry) { 359 | this.log("Emitting reconnect", LOG_LEVEL.DEBUG); 360 | this.emit("reconnecting"); 361 | await this.retryFailedConnection(); 362 | } 363 | } catch (error) { 364 | this.log(`Error while closing and retrying: ${error}`, LOG_LEVEL.ERROR); 365 | } 366 | } 367 | 368 | private async retryFailedConnection() { 369 | if (this._reconnectAttempts < this._allowedReconnectAttempts) { 370 | if (this._reconnectTimeoutID) { 371 | this.log("Retry already scheduled"); 372 | return; 373 | } 374 | this.log("Retrying failed connection"); 375 | let timeout = 376 | // The timeout logic is taken from 377 | // https://github.com/joewalnes/reconnecting-websocket 378 | this._reconnectInterval * 379 | Math.pow(this._reconnectDecay, this._reconnectAttempts); 380 | timeout = 381 | timeout > this._maxReconnectInterval 382 | ? this._maxReconnectInterval 383 | : timeout; 384 | this.log(`Reconnecting in ${timeout / 1000} seconds`); 385 | 386 | this._reconnectAttempts += 1; 387 | this._reconnectTimeoutID = setTimeout(() => this.connect(), timeout); 388 | } else { 389 | this.log("Maximum connection retry attempts exceeded", LOG_LEVEL.ERROR); 390 | this.emit("reconnect-failed"); 391 | } 392 | } 393 | 394 | private log( 395 | msg: string, 396 | log_level: LOG_LEVEL = LOG_LEVEL.DEBUG, 397 | ...args: any 398 | ) { 399 | switch (log_level) { 400 | case LOG_LEVEL.DEBUG: 401 | console.debug(`websocket: ${msg}`, ...args); 402 | break; 403 | case LOG_LEVEL.ERROR: 404 | console.error(`websocket: ${msg}`, ...args); 405 | break; 406 | case LOG_LEVEL.WARN: 407 | console.warn(`websocket: ${msg}`, ...args); 408 | break; 409 | case LOG_LEVEL.INFO: 410 | default: 411 | console.log(`websocket: ${msg}`, ...args); 412 | break; 413 | } 414 | } 415 | 416 | async send(data: any) { 417 | try { 418 | if (this._ws && this._ws.readyState === WebSocket.OPEN) { 419 | this._lastMsgSendTime = Date.now(); 420 | this._ws.send(data); 421 | } else { 422 | this.log(`Failed to send data, web socket not open.`, LOG_LEVEL.ERROR); 423 | } 424 | } catch (error) { 425 | this.log(`Failed to send data. ${error}`, LOG_LEVEL.ERROR); 426 | } 427 | } 428 | 429 | async close() { 430 | try { 431 | this.log("Closing websocket"); 432 | this._disconnected = true; 433 | this.clearReconnectTimeout(); 434 | this._closeSocket(); 435 | } catch (error) { 436 | this.log(`Failed to close websocket. 
${error}`); 437 | } 438 | } 439 | 440 | get readyState(): number { 441 | return this._ws?.readyState ?? WebSocket.CLOSED; 442 | } 443 | 444 | get url(): string { 445 | return this._url; 446 | } 447 | 448 | get keepAliveTimeout(): number { 449 | return this._keepAliveTimeout; 450 | } 451 | 452 | set keepAliveTimeout(keepAliveTimeout: number) { 453 | if (typeof keepAliveTimeout === "number") { 454 | this.log(`Setting ACK freshness timeout to ${keepAliveTimeout}`); 455 | this._keepAliveTimeout = keepAliveTimeout; 456 | } 457 | } 458 | 459 | get keepAliveInterval(): number { 460 | return this._keepAliveInterval; 461 | } 462 | 463 | set keepAliveInterval(keepAliveInterval: number) { 464 | if (typeof keepAliveInterval === "number") { 465 | this.log(`Setting keep-alive interval to ${keepAliveInterval}`); 466 | this._keepAliveInterval = keepAliveInterval; 467 | } 468 | } 469 | 470 | set shouldRetryFn(cb: () => boolean) { 471 | if (typeof cb === "function") { 472 | this._shouldRetryFn = cb; 473 | } 474 | } 475 | 476 | get connectionTimeout(): number { 477 | return this._connectionTimeout; 478 | } 479 | 480 | set connectionTimeout(timeout: number) { 481 | if (typeof timeout === "number") { 482 | this._connectionTimeout = timeout; 483 | } 484 | } 485 | 486 | get maxReconnectAttempts(): number { 487 | return this._allowedReconnectAttempts; 488 | } 489 | 490 | set maxReconnectAttempts(attempts: number) { 491 | if (attempts > 0 && attempts < MAX_RECONNECT_ATTEMPTS) { 492 | this.log(`Setting maximum connection retry attempts to ${attempts}`); 493 | this._allowedReconnectAttempts = attempts; 494 | } else { 495 | this._allowedReconnectAttempts = DEFAULT_RECONNECT_ATTEMPTS; 496 | } 497 | } 498 | 499 | get reconnectInterval(): number { 500 | return this._reconnectInterval; 501 | } 502 | 503 | set reconnectInterval(interval: number) { 504 | if (typeof interval === "number") { 505 | this._reconnectInterval = 506 | interval < this._maxReconnectInterval 507 | ? 
interval 508 | : this._maxReconnectInterval; 509 | } 510 | } 511 | 512 | async _handleMessage(event: MessageEvent) { 513 | this._lastMsgRecvTime = Date.now(); 514 | const data = event.data; 515 | 516 | const _parsePromise = new Promise((resolve, reject) => { 517 | if (typeof data === "string") { 518 | // Handle text message 519 | resolve(data); 520 | } else if (data instanceof ArrayBuffer) { 521 | // Handle binary message 522 | const arrayBuffer = data; 523 | // Parse the ArrayBuffer as needed 524 | // Example: Convert ArrayBuffer to Uint8Array 525 | resolve(new Uint8Array(arrayBuffer)); 526 | // Process the Uint8Array as needed 527 | } else if (data instanceof Blob) { 528 | if (!this._parseBlobToJson) { 529 | resolve(data); 530 | return; 531 | } 532 | // Handle Blob message 533 | const blob = data; 534 | // Convert Blob to ArrayBuffer 535 | const reader = new FileReader(); 536 | reader.onload = () => { 537 | const text = reader.result as string; 538 | try { 539 | const json = JSON.parse(text); 540 | resolve(json); 541 | } catch (e) { 542 | console.error("Failed to parse JSON from Blob:", e); 543 | } 544 | }; 545 | reader.readAsText(blob); 546 | } 547 | }); 548 | 549 | let msg = await _parsePromise; 550 | 551 | this.emit("message", msg); 552 | } 553 | } 554 | 555 | [ 556 | "binaryType", 557 | "bufferedAmount", 558 | "extensions", 559 | "protocol", 560 | "readyState", 561 | "url", 562 | "keepAliveTimeout", 563 | "keepAliveInterval", 564 | "shouldRetryFn", 565 | "connectionTimeout", 566 | "maxReconnectAttempts", 567 | "reconnectInterval", 568 | ].forEach((property) => { 569 | Object.defineProperty(ReconnectingWebSocket.prototype, property, { 570 | enumerable: true, 571 | }); 572 | }); 573 | 574 | ["CONNECTING", "OPEN", "CLOSING", "CLOSED"].forEach((property) => { 575 | Object.defineProperty(ReconnectingWebSocket.prototype, property, { 576 | enumerable: true, 577 | value: readyStates.indexOf(property), 578 | }); 579 | }); 580 | 581 | ["CONNECTING", "OPEN", "CLOSING", "CLOSED"].forEach((property) => { 582 | Object.defineProperty(ReconnectingWebSocket, property, { 583 | enumerable: true, 584 | value: readyStates.indexOf(property), 585 | }); 586 | }); 587 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "private": true, 3 | "name": "pipecat-client-web-transports", 4 | "version": "0.0.0", 5 | "workspaces": [ 6 | "transports/*" 7 | ], 8 | "scripts": { 9 | "build": "npm run build --workspaces" 10 | }, 11 | "devDependencies": { 12 | "@parcel/packager-ts": "^2.13.2", 13 | "@parcel/transformer-typescript-tsc": "^2.13.2", 14 | "@parcel/transformer-typescript-types": "^2.13.2", 15 | "@parcel/validator-typescript": "^2.12.0", 16 | "@swc/helpers": "^0.5.13", 17 | "parcel": "^2.13.2", 18 | "prettier": "^3.5.3", 19 | "typescript": "^5.5.4" 20 | }, 21 | "peerDependencies": { 22 | "@daily-co/daily-js": "^0.77.0" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /transports/daily/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to **Pipecat Daily WebRTC Transport** will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [0.4.0] 9 | 10 | - Bumped dependency to @pipecat-ai/client-js@~0.4.0 11 | 12 | ## [0.3.10] 13 | 14 | - Fix an issue where iOS devices have ~500ms of audio cut off after declaring 15 | that the track state is playable. 16 | 17 | ## [0.3.9] 18 | 19 | DO NOT USE 20 | 21 | ## [0.3.8] 22 | 23 | - Fix issue resulting in the camera starting despite enableCam setting. 24 | 25 | ## [0.3.7] 26 | 27 | - Added support for disconnecting the client if the Daily call errors out. 28 | 29 | ## [0.3.6] 30 | 31 | ### Fixed 32 | 33 | - Fixed an issue where the transport could call `clientReady()` multiple times, 34 | once for each `track-started` event. Now, `clientReady()` is called for the 35 | first track only. 36 | 37 | - Added support for buffering audio until the bot is ready using the 38 | `bufferLocalAudioUntilBotReady` property. Once the bot is ready, the buffered 39 | audio will be sent, allowing the user to begin speaking before the bot has 40 | joined the call. 41 | 42 | ## [0.3.4] - 2024-12-16 43 | 44 | ### Added 45 | 46 | - Screen sharing support 47 | - Added `startScreenShare` and `stopScreenShare` methods 48 | - Added `isSharingScreen` getter property 49 | 50 | ## [0.3.3] - 2024-12-11 51 | 52 | - Fixed READMEs 53 | 54 | ## [0.3.2] - 2024-12-11 55 | 56 | - Added new abstract `RealtimeWebsocketTransport` class for direct 57 | voice-to-voice transports 58 | 59 | - Added new `GeminiLiveWebsocketTransport` 60 | 61 | - Added [basic example](./examples/geminiMultiModalLive) for using 62 | `GeminiLiveWebsocketTransport` 63 | 64 | ## [0.2.3] - 2024-12-06 65 | 66 | ### Fixed 67 | 68 | - Added missing event support for managing audio speakers 69 | 70 | ## [0.2.2] - 2024-11-12 71 | 72 | ### Added 73 | 74 | - Implemented log levels as part of `realtime-ai` package. 75 | 76 | ## [0.2.1] - 2024-10-28 77 | 78 | - Version bump to align with core `realtime-ai` package. 79 | -------------------------------------------------------------------------------- /transports/daily/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2024, Daily 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /transports/daily/README.md: -------------------------------------------------------------------------------- 1 | # Pipecat's Real-Time Voice Inference - Daily Transport 2 | 3 | [![Docs](https://img.shields.io/badge/documentation-blue)](https://docs.pipecat.ai/client/js/transports/daily) 4 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/daily-transport) 5 | [![Demo](https://img.shields.io/badge/Demo-coral)](https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot) 6 | 7 | Daily transport package for use with `@pipecat-ai/client-js`. 8 | 9 | ## Installation 10 | 11 | ```bash copy 12 | npm install \ 13 | @pipecat-ai/client-js \ 14 | @pipecat-ai/daily-transport 15 | ``` 16 | 17 | ## Overview 18 | 19 | The DailyTransport class provides a WebRTC transport layer using [Daily.co's](https://daily.co) infrastructure. It handles audio/video device management, WebRTC connections, and real-time communication between clients and bots. 20 | 21 | ## Features 22 | 23 | - 🎥 Complete camera device management 24 | - 🎤 Microphone input handling 25 | - 🔊 Speaker output control 26 | - 📡 WebRTC connection management 27 | - 🤖 Bot participant tracking 28 | - 📊 Audio level monitoring 29 | - 💬 Real-time messaging 30 | 31 | ## Usage 32 | 33 | ### Basic Setup 34 | 35 | ```javascript 36 | import { RTVIClient } from "@pipecat-ai/client-js"; 37 | import { DailyTransport } from "@pipecat-ai/daily-transport"; 38 | 39 | const transport = new DailyTransport({ 40 | dailyFactoryOptions: { 41 | // Daily.co specific configuration 42 | } 43 | }); 44 | 45 | const rtviClient = new RTVIClient({ 46 | transport, 47 | enableCam: false, // Default camera off 48 | enableMic: true, // Default microphone on 49 | callbacks: { 50 | // Event handlers 51 | }, 52 | params: { 53 | baseUrl, 54 | endpoints 55 | } 56 | // ... 57 | }); 58 | 59 | await rtviClient.connect(); 60 | ``` 61 | 62 | ## API Reference 63 | 64 | ### Constructor Options 65 | 66 | ```typescript 67 | interface DailyTransportConstructorOptions { 68 | dailyFactoryOptions?: DailyFactoryOptions; // Daily.co specific configuration 69 | } 70 | ``` 71 | 72 | ### States 73 | 74 | The transport can be in one of these states: 75 | - "initializing" 76 | - "initialized" 77 | - "connecting" 78 | - "connected" 79 | - "ready" 80 | - "disconnecting" 81 | - "error" 82 | 83 | ## Events 84 | 85 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info. 
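For example, here is a minimal sketch of registering a few of these callbacks when constructing the client. It reuses the `baseUrl` and `endpoints` values from the setup above; callback names follow the RTVI callback reference linked above, so treat this as a starting point rather than a complete list:

```javascript
import { RTVIClient } from "@pipecat-ai/client-js";
import { DailyTransport } from "@pipecat-ai/daily-transport";

const rtviClient = new RTVIClient({
  transport: new DailyTransport(),
  enableMic: true,
  params: { baseUrl, endpoints },
  callbacks: {
    // Fires whenever the transport moves through the states listed above
    onTransportStateChanged: (state) => console.log("transport state:", state),
    onConnected: () => console.log("connected to the session"),
    onDisconnected: () => console.log("disconnected from the session"),
    onBotStartedSpeaking: () => console.log("bot started speaking"),
    onBotStoppedSpeaking: () => console.log("bot stopped speaking"),
    onUserTranscript: (transcript) => console.log("user said:", transcript.text),
  },
});

await rtviClient.connect();
```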
86 | 87 | ## Error Handling 88 | 89 | The transport includes error handling for: 90 | - Connection failures 91 | - Device errors 92 | - Authentication issues 93 | - Message transmission problems 94 | 95 | ## License 96 | BSD-2 Clause 97 | -------------------------------------------------------------------------------- /transports/daily/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@pipecat-ai/daily-transport", 3 | "version": "0.4.0", 4 | "license": "BSD-2-Clause", 5 | "main": "dist/index.js", 6 | "module": "dist/index.module.js", 7 | "types": "dist/index.d.ts", 8 | "source": "src/index.ts", 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git" 12 | }, 13 | "files": [ 14 | "dist", 15 | "package.json", 16 | "README.md" 17 | ], 18 | "scripts": { 19 | "build": "parcel build --no-cache", 20 | "dev": "parcel watch", 21 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0" 22 | }, 23 | "devDependencies": { 24 | "@pipecat-ai/client-js": "^0.4.0", 25 | "eslint": "9.11.1", 26 | "eslint-config-prettier": "^9.1.0", 27 | "eslint-plugin-simple-import-sort": "^12.1.1" 28 | }, 29 | "peerDependencies": { 30 | "@pipecat-ai/client-js": "~0.4.0" 31 | }, 32 | "dependencies": { 33 | "@daily-co/daily-js": "^0.77.0" 34 | }, 35 | "description": "Pipecat Daily Transport Package", 36 | "author": "Daily.co", 37 | "bugs": { 38 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues" 39 | }, 40 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/daily-webrtc/README.md" 41 | } 42 | -------------------------------------------------------------------------------- /transports/daily/src/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./transport"; 2 | -------------------------------------------------------------------------------- /transports/daily/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "skipLibCheck": true, 7 | "jsx": "preserve", 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "allowJs": true, 13 | "noEmit": true, 14 | "resolveJsonModule": true, 15 | "isolatedModules": true, 16 | "moduleDetection": "force", 17 | 18 | /* Linting */ 19 | "strict": true, 20 | "noUnusedLocals": true, 21 | "noUnusedParameters": false, 22 | "noFallthroughCasesInSwitch": true 23 | }, 24 | "include": ["src"] 25 | } 26 | -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2024, Daily 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 
14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/README.md: -------------------------------------------------------------------------------- 1 | # Gemini Live Websocket Transport 2 | 3 | [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai/client/js/transports/gemini) 4 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](examples/directToLLMTransports/README.md) 5 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/gemini-live-websocket-transport) 6 | 7 | A real-time websocket transport implementation for interacting with Google's Gemini Multimodal Live API, supporting bidirectional audio and unidirectional text communication. 8 | 9 | ## Installation 10 | 11 | ```bash copy 12 | npm install \ 13 | @pipecat-ai/client-js \ 14 | @pipecat-ai/real-time-websocket-transport \ 15 | @pipecat-ai/gemini-live-websocket-transport 16 | ``` 17 | 18 | ## Overview 19 | 20 | The `GeminiLiveWebsocketTransport` class extends the `DirectToLLMBaseWebSocketTransport` to implement a fully functional [RTVI `Transport`](https://docs.pipecat.ai/client/js/transports/transport). It provides a framework for implementing real-time communication directly with the [Gemini Multimodal Live](https://ai.google.dev/api/multimodal-live) voice-to-voice service. It handles media device management, audio/video streams, and state management for the connection. 21 | 22 | ## Features 23 | 24 | - Real-time bidirectional communication with Gemini Multimodal Live 25 | - Input device management 26 | - Audio streaming support 27 | - Text message support 28 | - Automatic reconnection handling 29 | - Configurable generation parameters 30 | - Support for initial conversation context 31 | 32 | ## Usage 33 | 34 | ### Basic Setup 35 | 36 | ```javascript 37 | import { GeminiLiveWebsocketTransport, GeminiLLMServiceOptions } from '@pipecat-ai/gemini-live-websocket-transport'; 38 | 39 | const options: GeminiLLMServiceOptions = { 40 | api_key: 'YOUR_API_KEY', 41 | generation_config: { 42 | temperature: 0.7, 43 | maxOutput_tokens: 1000 44 | } 45 | }; 46 | 47 | const transport = new GeminiLiveWebsocketTransport(options); 48 | let RTVIConfig: RTVIClientOptions = { 49 | transport, 50 | ... 
51 | }; 52 | 53 | ``` 54 | 55 | ### Configuration Options 56 | 57 | ```typescript 58 | interface GeminiLLMServiceOptions { 59 | api_key: string; // Required: Your Gemini API key 60 | initial_messages?: Array<{ // Optional: Initial conversation context 61 | content: string; 62 | role: string; 63 | }>; 64 | generation_config?: { // Optional: Generation parameters 65 | candidate_count?: number; 66 | maxOutput_tokens?: number; 67 | temperature?: number; 68 | top_p?: number; 69 | top_k?: number; 70 | presence_penalty?: number; 71 | frequency_penalty?: number; 72 | response_modalities?: string; 73 | speech_config?: { 74 | voice_config?: { 75 | prebuilt_voice_config?: { 76 | voice_name: "Puck" | "Charon" | "Kore" | "Fenrir" | "Aoede"; 77 | }; 78 | }; 79 | }; 80 | }; 81 | } 82 | ``` 83 | 84 | ### Sending Messages 85 | 86 | ```javascript 87 | // at setup time... 88 | llmHelper = new LLMHelper({}); 89 | rtviClient.registerHelper("llm", llmHelper); 90 | // the 'llm' name in this call above isn't used. 91 | // that value is specific to working with a pipecat pipeline 92 | 93 | // at time of sending message... 94 | // Send text prompt message 95 | llmHelper.appendToMessages({ role: "user", content: 'Hello Gemini!' }); 96 | ``` 97 | 98 | ### Handling Events 99 | 100 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info. 101 | 102 | ## API Reference 103 | 104 | ### Methods 105 | 106 | - `initialize()`: Set up the transport and establish connection 107 | - `sendMessage(message)`: Send a text message 108 | - `handleUserAudioStream(data)`: Stream audio data to the model 109 | - `disconnectLLM()`: Close the connection 110 | - `sendReadyMessage()`: Signal ready state 111 | 112 | ### States 113 | 114 | The transport can be in one of the following states: 115 | - "disconnected" 116 | - "initializing" 117 | - "initialized" 118 | - "connecting" 119 | - "connected" 120 | - "ready" 121 | - "disconnecting" 122 | - "error" 123 | 124 | ## Error Handling 125 | 126 | The transport includes comprehensive error handling for: 127 | - Connection failures 128 | - Websocket errors 129 | - API key validation 130 | - Message transmission errors 131 | 132 | ## License 133 | BSD-2 Clause 134 | -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@pipecat-ai/gemini-live-websocket-transport", 3 | "version": "0.4.0", 4 | "license": "BSD-2-Clause", 5 | "main": "dist/index.js", 6 | "module": "dist/index.module.js", 7 | "types": "dist/index.d.ts", 8 | "source": "src/index.ts", 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git" 12 | }, 13 | "files": [ 14 | "dist", 15 | "package.json", 16 | "README.md" 17 | ], 18 | "scripts": { 19 | "build": "parcel build --no-cache", 20 | "dev": "parcel watch", 21 | "lint": "eslint .
--ext ts --report-unused-disable-directives --max-warnings 0" 22 | }, 23 | "dependencies": { 24 | "@daily-co/daily-js": "^0.79.0" 25 | }, 26 | "devDependencies": { 27 | "@pipecat-ai/client-js": "^0.4.0", 28 | "@types/node": "^22.9.0", 29 | "eslint": "9.11.1", 30 | "eslint-config-prettier": "^9.1.0", 31 | "eslint-plugin-simple-import-sort": "^12.1.1" 32 | }, 33 | "peerDependencies": { 34 | "@pipecat-ai/client-js": "~0.4.0" 35 | }, 36 | "description": "Pipecat Gemini Multimodal Live Transport Package", 37 | "author": "Daily.co", 38 | "bugs": { 39 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues" 40 | }, 41 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/gemini-live-websocket-transport/README.md" 42 | } 43 | -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/src/directToLLMBaseWebSocketTransport.ts: -------------------------------------------------------------------------------- 1 | import { 2 | BotTTSTextData, 3 | RTVIClientOptions, 4 | RTVIMessage, 5 | Tracks, 6 | TranscriptData, 7 | Transport, 8 | TransportState, 9 | } from "@pipecat-ai/client-js"; 10 | 11 | import { MediaManager } from "../../../lib/media-mgmt/mediaManager"; 12 | 13 | export interface LLMServiceOptions { 14 | api_key?: string; 15 | initial_messages?: Array; 16 | model?: string; 17 | settings?: Record; 18 | } 19 | 20 | /** 21 | * DirectToLLMBaseWebSocketTransport is an abstract class that provides a client-side 22 | * interface for connecting to a real-time AI service. It is intended to 23 | * connect directly to the service. (No Pipecat server is involved.) 24 | */ 25 | export abstract class DirectToLLMBaseWebSocketTransport extends Transport { 26 | // Utilities for audio. 27 | private _mediaManager; 28 | protected _service_options: LLMServiceOptions; 29 | 30 | protected _botIsSpeaking = false; 31 | 32 | constructor(service_options: LLMServiceOptions, manager: MediaManager) { 33 | super(); 34 | this._service_options = service_options; 35 | this._mediaManager = manager; 36 | this._mediaManager.setUserAudioCallback( 37 | this.handleUserAudioStream.bind(this), 38 | ); 39 | } 40 | 41 | /** 42 | * This method will be called from initialize() 43 | * Subclasses should initialize the LLM client and media player/recorder 44 | * and call initializeAudio() from within this method. 
45 | */ 46 | abstract initializeLLM(): void; 47 | /** 48 | * This method will be called from initialize() 49 | * Subclasses should set up listeners for LLM events from within this method 50 | */ 51 | abstract attachLLMListeners(): void; 52 | /** 53 | * This method will be called from connect() 54 | * Subclasses should connect to the LLM and pass along the initial messages 55 | * (initial messages are read from the service options) 56 | */ 57 | abstract connectLLM(): Promise<void>; 58 | /** 59 | * This method will be called from disconnect() 60 | * Subclasses should disconnect from the LLM 61 | */ 62 | abstract disconnectLLM(): Promise<void>; 63 | /** 64 | * This method will be called regularly with audio data from the user 65 | * Subclasses should handle this data and pass it along to the LLM 66 | * @param data ArrayBuffer of audio data 67 | */ 68 | abstract handleUserAudioStream(data: ArrayBuffer): void; 69 | 70 | // subclasses should implement this method to initialize the LLM 71 | // client and call super() on this method 72 | initialize( 73 | options: RTVIClientOptions, 74 | messageHandler: (ev: RTVIMessage) => void, 75 | ): void { 76 | this._options = options; 77 | this._callbacks = options.callbacks ?? {}; 78 | this._onMessage = messageHandler; 79 | 80 | this._mediaManager.setRTVIOptions(options); 81 | 82 | this.initializeLLM(); 83 | 84 | this.attachDeviceListeners(); 85 | this.attachLLMListeners(); 86 | 87 | this.state = "disconnected"; 88 | } 89 | 90 | async initDevices(): Promise<void> { 91 | this.state = "initializing"; 92 | await this._mediaManager.initialize(); 93 | this.state = "initialized"; 94 | } 95 | 96 | async connect( 97 | authBundle: unknown, 98 | abortController: AbortController, 99 | ): Promise<void> { 100 | this.state = "connecting"; 101 | 102 | await this.connectLLM(); 103 | 104 | // connect user audio to llm 105 | this._mediaManager.connect(); 106 | this.state = "connected"; 107 | this._callbacks.onConnected?.(); 108 | } 109 | 110 | async disconnect(): Promise<void> { 111 | this.state = "disconnecting"; 112 | await this._mediaManager.disconnect(); 113 | await this.disconnectLLM(); 114 | this.state = "disconnected"; 115 | this._callbacks.onDisconnected?.(); 116 | } 117 | 118 | getAllMics(): Promise<MediaDeviceInfo[]> { 119 | return this._mediaManager.getAllMics(); 120 | } 121 | getAllCams(): Promise<MediaDeviceInfo[]> { 122 | return this._mediaManager.getAllCams(); 123 | } 124 | getAllSpeakers(): Promise<MediaDeviceInfo[]> { 125 | return this._mediaManager.getAllSpeakers(); 126 | } 127 | 128 | async updateMic(micId: string): Promise<void> { 129 | return this._mediaManager.updateMic(micId); 130 | } 131 | updateCam(camId: string): void { 132 | return this._mediaManager.updateCam(camId); 133 | } 134 | updateSpeaker(speakerId: string): void { 135 | return this._mediaManager.updateSpeaker(speakerId); 136 | } 137 | 138 | get selectedMic(): MediaDeviceInfo | Record<string, never> { 139 | return this._mediaManager.selectedMic; 140 | } 141 | get selectedCam(): MediaDeviceInfo | Record<string, never> { 142 | return this._mediaManager.selectedCam; 143 | } 144 | get selectedSpeaker(): MediaDeviceInfo | Record<string, never> { 145 | return this._mediaManager.selectedSpeaker; 146 | } 147 | 148 | enableMic(enable: boolean): void { 149 | this._mediaManager.enableMic(enable); 150 | } 151 | enableCam(enable: boolean): void { 152 | this._mediaManager.enableCam(enable); 153 | } 154 | 155 | get isCamEnabled(): boolean { 156 | return this._mediaManager.isCamEnabled; 157 | } 158 | get isMicEnabled(): boolean { 159 | return this._mediaManager.isMicEnabled; 160 | } 161 | 162 | get state(): TransportState { 163 | return this._state; 164 | } 165 | 166 | set
state(state: TransportState) { 167 | if (this._state === state) return; 168 | 169 | this._state = state; 170 | this._callbacks.onTransportStateChanged?.(state); 171 | } 172 | 173 | get expiry(): number | undefined { 174 | return this._expiry; 175 | } 176 | 177 | tracks(): Tracks { 178 | return this._mediaManager.tracks(); 179 | } 180 | 181 | // Realtime event handlers 182 | async userStartedSpeaking(): Promise { 183 | // Handle interruption 184 | const trackSampleOffset = await this._mediaManager.userStartedSpeaking(); 185 | this._callbacks.onUserStartedSpeaking?.(); 186 | return trackSampleOffset; 187 | } 188 | 189 | userStoppedSpeaking(): void { 190 | this._callbacks.onUserStoppedSpeaking?.(); 191 | } 192 | 193 | userTranscript(transcript: TranscriptData): void { 194 | this._callbacks.onUserTranscript?.(transcript); 195 | } 196 | 197 | botStartedSpeaking(): void { 198 | if (!this._botIsSpeaking) { 199 | this._botIsSpeaking = true; 200 | this._callbacks.onBotStartedSpeaking?.(); 201 | } 202 | } 203 | 204 | botStoppedSpeaking(): void { 205 | if (this._botIsSpeaking) { 206 | this._botIsSpeaking = false; 207 | this._callbacks.onBotStoppedSpeaking?.(); 208 | } 209 | } 210 | 211 | botTtsText(data: BotTTSTextData): void { 212 | this._callbacks.onBotTtsText?.(data); 213 | } 214 | 215 | bufferBotAudio(audio: ArrayBuffer, id?: string): void { 216 | this._mediaManager.bufferBotAudio(audio, id); 217 | } 218 | 219 | connectionError(errorMsg: string): void { 220 | console.error(errorMsg); 221 | this.state = "error"; 222 | this.disconnect(); 223 | } 224 | 225 | private attachDeviceListeners(): void {} 226 | } 227 | -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/src/geminiLiveWebSocketTransport.ts: -------------------------------------------------------------------------------- 1 | import { MediaManager } from "../../../lib/media-mgmt/mediaManager"; 2 | import { DailyMediaManager } from "../../../lib/media-mgmt/dailyMediaManager"; 3 | 4 | import { 5 | logger, 6 | RTVIActionRequestData, 7 | RTVIMessage, 8 | RTVIMessageType, 9 | TransportStartError, 10 | } from "@pipecat-ai/client-js"; 11 | import { ReconnectingWebSocket } from "../../../lib/websocket-utils/reconnectingWebSocket"; 12 | import { 13 | DirectToLLMBaseWebSocketTransport, 14 | LLMServiceOptions, 15 | } from "./directToLLMBaseWebSocketTransport"; 16 | 17 | const HOST = `generativelanguage.googleapis.com`; 18 | const BIDI_PATH = `google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent`; 19 | const MODEL = "models/gemini-2.0-flash-exp"; 20 | 21 | export interface GeminiLLMServiceOptions extends LLMServiceOptions { 22 | initial_messages?: Array<{ content: string; role: string }>; 23 | api_key: string; 24 | settings?: { 25 | candidate_count?: number; 26 | maxOutput_tokens?: number; 27 | temperature?: number; 28 | top_p?: number; 29 | top_k?: number; 30 | presence_penalty?: number; 31 | frequency_penalty?: number; 32 | response_modalities?: string; 33 | speech_config?: { 34 | voice_config?: { 35 | prebuilt_voice_config?: { 36 | voice_name: "Puck" | "Charon" | "Kore" | "Fenrir" | "Aoede"; 37 | // | "Voice O"; 38 | }; 39 | }; 40 | }; 41 | }; 42 | } 43 | 44 | export class GeminiLiveWebsocketTransport extends DirectToLLMBaseWebSocketTransport { 45 | declare private _ws: ReconnectingWebSocket | null; 46 | declare private _botResponseID: number; 47 | declare private _botIsReadyResolve: 48 | | ((value: void | PromiseLike) => void) 49 | | null; 50 | 51 | 
constructor( 52 | service_options: GeminiLLMServiceOptions, 53 | manager?: MediaManager, 54 | ) { 55 | if (!manager) { 56 | manager = new DailyMediaManager(); 57 | } 58 | super(service_options, manager); 59 | 60 | this._ws = null; 61 | 62 | this._botResponseID = 0; 63 | } 64 | 65 | initializeLLM(): void { 66 | const service_options = this._service_options as GeminiLLMServiceOptions; 67 | const apiKey = service_options.api_key; 68 | if (!apiKey) { 69 | console.error("!!! No API key provided in llm_service_options"); 70 | return; 71 | } 72 | const base_url = `wss://${HOST}/ws/${BIDI_PATH}`; 73 | this._ws = new ReconnectingWebSocket(`${base_url}?key=${apiKey}`); 74 | // don't run the keep alive interval until we determine if there's an api for it 75 | this._ws.keepAliveInterval = 0; 76 | } 77 | 78 | // This is called from super.initialize() 79 | attachLLMListeners(): void { 80 | if (!this._ws) { 81 | console.error( 82 | "attachLLMListeners called before the websocket is initialized. Be sure to call initializeLLM() first.", 83 | ); 84 | return; 85 | } 86 | this._ws.on("open", () => {}); 87 | this._ws.on("message", async (msg: any) => { 88 | const content = msg.serverContent; 89 | if (!content) { 90 | if ("setupComplete" in msg) { 91 | this.state = "ready"; 92 | if (this._botIsReadyResolve) { 93 | this._botIsReadyResolve(); 94 | this._botIsReadyResolve = null; 95 | } 96 | } else { 97 | console.log("received unknown message", msg); 98 | } 99 | return; 100 | } 101 | if (content.modelTurn) { 102 | let result: ArrayBuffer | null = null; 103 | content.modelTurn.parts?.forEach((part: { inlineData: any }) => { 104 | if (part.inlineData?.data) { 105 | if (result) { 106 | mergeBuffers(result, base64ToArrayBuffer(part.inlineData.data)); 107 | } else { 108 | result = base64ToArrayBuffer(part.inlineData.data); 109 | } 110 | } 111 | }); 112 | if (result) { 113 | if (!this._botIsSpeaking) { 114 | this._botResponseID++; 115 | this.botStartedSpeaking(); 116 | } 117 | this.bufferBotAudio(result, this._botResponseID.toString()); 118 | } 119 | } else if (content.interrupted) { 120 | await this.userStartedSpeaking(); 121 | } else if (content.turnComplete) { 122 | this.botStoppedSpeaking(); 123 | } else { 124 | // console.log('unhandled message', content); 125 | } 126 | }); 127 | this._ws.on("error", (error: Error) => { 128 | this.connectionError(`websocket error: ${error}`); 129 | }); 130 | this._ws.on("connection-timeout", () => { 131 | this.connectionError("websocket connection timed out"); 132 | }); 133 | this._ws.on("close", (code: number) => { 134 | this.connectionError(`websocket connection closed. Code: ${code}`); 135 | }); 136 | this._ws.on("reconnect-failed", () => { 137 | this.connectionError(`websocket reconnect failed`); 138 | }); 139 | } 140 | 141 | async connectLLM(): Promise { 142 | if (!this._ws) { 143 | console.error( 144 | "connectLLM called before the websocket is initialized. Be sure to call initializeLLM() first.", 145 | ); 146 | return; 147 | } 148 | try { 149 | await this._ws.connect(); 150 | } catch (error) { 151 | const msg = `Failed to connect to LLM: ${error}`; 152 | console.error(msg); 153 | this.state = "error"; 154 | throw new TransportStartError(msg); 155 | } 156 | 157 | const service_options = this._service_options as GeminiLLMServiceOptions; 158 | const model = service_options?.model ?? MODEL; 159 | const generation_config = service_options?.settings ?? 
{}; 160 | let config = { setup: { model, generation_config } }; 161 | await this._sendMsg(config); 162 | 163 | // For this bare-bones prototype, let's just see if we have any initial_messages in the params 164 | // we were constructed with. 165 | if (service_options?.initial_messages) { 166 | service_options.initial_messages.forEach( 167 | (msg: { content: string; role: string }) => { 168 | this._sendTextInput(msg.content, msg.role); 169 | }, 170 | ); 171 | } 172 | } 173 | 174 | async disconnectLLM(): Promise { 175 | await this._ws?.close(); 176 | } 177 | 178 | async sendReadyMessage(): Promise { 179 | const p = new Promise((resolve) => { 180 | if (this.state === "ready") { 181 | resolve(); 182 | } else { 183 | this._botIsReadyResolve = resolve; 184 | } 185 | }); 186 | await p; 187 | this._onMessage({ 188 | type: RTVIMessageType.BOT_READY, 189 | data: {}, 190 | } as RTVIMessage); 191 | } 192 | 193 | handleUserAudioStream(data: ArrayBuffer): void { 194 | if (this.state === "ready") { 195 | try { 196 | void this._sendAudioInput(data); 197 | } catch (error) { 198 | console.error("Error adding audio to stream player", error); 199 | this.state = "error"; 200 | // todo: should check this error more carefully, implement disconnect, implement 201 | // ping/ack connection monitoring and reconnection logic, etc. 202 | } 203 | } 204 | } 205 | 206 | sendMessage(message: RTVIMessage): void { 207 | switch (message.type) { 208 | case "action": 209 | { 210 | const data = message.data as RTVIActionRequestData; 211 | switch (data.action) { 212 | case "append_to_messages": 213 | if (data.arguments) { 214 | for (const a of data.arguments) { 215 | if (a.name === "messages") { 216 | const value = a.value as Array<{ 217 | content: string; 218 | role: string; 219 | }>; 220 | for (const m of value) { 221 | this._sendTextInput(m.content, m.role); 222 | } 223 | } 224 | } 225 | } 226 | break; 227 | case "get_context": 228 | case "set_context": 229 | console.warn("get_context and set_context are not implemented"); 230 | break; 231 | } 232 | } 233 | break; 234 | } 235 | } 236 | 237 | async _sendAudioInput(data: ArrayBuffer): Promise { 238 | // TODO: pull this number from the media manager 239 | const sampleRate = 24000; 240 | const msg = { 241 | realtimeInput: { 242 | mediaChunks: [ 243 | { 244 | mimeType: `audio/pcm;rate=${sampleRate}`, 245 | data: arrayBufferToBase64(data), 246 | }, 247 | ], 248 | }, 249 | }; 250 | await this._sendMsg(msg); 251 | } 252 | 253 | async _sendTextInput(text: string, role: string): Promise { 254 | const msg = { 255 | clientContent: { 256 | turns: [ 257 | { 258 | role, 259 | parts: [{ text }], 260 | }, 261 | ], 262 | turnComplete: role === "user" ? 
true : false, 263 | }, 264 | }; 265 | await this._sendMsg(msg); 266 | } 267 | 268 | async _sendMsg(msg: unknown): Promise { 269 | if (!this._ws) { 270 | console.error("sendMsg called but WS is null"); 271 | return; 272 | } 273 | if (this._ws.readyState !== WebSocket.OPEN) { 274 | console.error("attempt to send to closed socket"); 275 | return; 276 | } 277 | if (!msg) { 278 | console.error("need a msg to send a msg"); 279 | return; 280 | } 281 | try { 282 | await this._ws.send(JSON.stringify(msg)); 283 | } catch (e) { 284 | console.error("sendMsg error", e); 285 | } 286 | } 287 | 288 | // Not implemented 289 | enableScreenShare(enable: boolean): void { 290 | logger.error( 291 | "startScreenShare not implemented for GeminiLiveWebsocketTransport", 292 | ); 293 | throw new Error("Not implemented"); 294 | } 295 | 296 | public get isSharingScreen(): boolean { 297 | logger.error( 298 | "isSharingScreen not implemented for GeminiLiveWebsocketTransport", 299 | ); 300 | return false; 301 | } 302 | } 303 | 304 | function base64ToArrayBuffer(base64: string): ArrayBuffer { 305 | const binaryString = atob(base64); 306 | const len = binaryString.length; 307 | const bytes = new Uint8Array(len); 308 | for (let i = 0; i < len; i++) { 309 | bytes[i] = binaryString.charCodeAt(i); 310 | } 311 | return bytes.buffer; 312 | } 313 | 314 | function arrayBufferToBase64(buffer: ArrayBuffer): string { 315 | const bytes = new Uint8Array(buffer); 316 | let binary = ""; 317 | for (let i = 0; i < bytes.byteLength; i++) { 318 | binary += String.fromCharCode(bytes[i]); 319 | } 320 | return btoa(binary); 321 | } 322 | 323 | function mergeBuffers( 324 | leftBuffer: ArrayBuffer, 325 | rightBuffer: ArrayBuffer, 326 | ): ArrayBuffer { 327 | const tmpArray = new Uint8Array( 328 | leftBuffer.byteLength + rightBuffer.byteLength, 329 | ); 330 | tmpArray.set(new Uint8Array(leftBuffer), 0); 331 | tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength); 332 | return tmpArray.buffer; 333 | } 334 | -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/src/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./geminiLiveWebSocketTransport"; 2 | -------------------------------------------------------------------------------- /transports/gemini-live-websocket-transport/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "types": ["node"], 7 | "skipLibCheck": true, 8 | "jsx": "preserve", 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "allowJs": true, 14 | "noEmit": true, 15 | "resolveJsonModule": true, 16 | "isolatedModules": true, 17 | "moduleDetection": "force", 18 | 19 | /* Linting */ 20 | "strict": true, 21 | "noUnusedLocals": true, 22 | "noUnusedParameters": false, 23 | "noFallthroughCasesInSwitch": true 24 | }, 25 | "include": ["src"] 26 | } 27 | -------------------------------------------------------------------------------- /transports/openai-realtime-webrtc-transport/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2024, Daily 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 
| 8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/README.md:
--------------------------------------------------------------------------------
1 | # OpenAI RealTime WebRTC Transport
2 |
3 | [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai/client/js/transports/openai-webrtc)
4 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](examples/directToLLMTransports/README.md)
5 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/openai-realtime-webrtc-transport)
6 |
7 | A real-time WebRTC transport implementation for interacting with the OpenAI Realtime API, supporting bidirectional audio and text communication.
8 |
9 | ## Installation
10 |
11 | ```bash copy
12 | npm install \
13 | @pipecat-ai/client-js \
14 | @pipecat-ai/openai-realtime-webrtc-transport
15 | ```
16 |
17 | ## Overview
18 |
19 | The `OpenAIRealTimeWebRTCTransport` is a fully functional [RTVI `Transport`](https://docs.pipecat.ai/client/js/transports/transport). It provides a framework for implementing real-time communication directly with the [OpenAI Realtime API using WebRTC](https://platform.openai.com/docs/guides/realtime-webrtc) voice-to-voice service. It handles media device management, audio/video streams, and state management for the connection.
20 |
21 | ## Features
22 |
23 | - Real-time bidirectional communication with the OpenAI Realtime API
24 | - Input device management
25 | - Audio streaming support
26 | - Text message support
27 | - Automatic reconnection handling
28 | - Configurable generation parameters
29 | - Support for initial conversation context
30 |
31 | ## Usage
32 |
33 | ### Basic Setup
34 |
35 | ```javascript
36 | import { OpenAIRealTimeWebRTCTransport, OpenAIServiceOptions } from '@pipecat-ai/openai-realtime-webrtc-transport';
37 |
38 | const options: OpenAIServiceOptions = {
39 |   api_key: 'YOUR_API_KEY',
40 |   session_config: {
41 |     instructions: 'you are a confused jellyfish',
42 |   }
43 | };
44 |
45 | const transport = new OpenAIRealTimeWebRTCTransport(options);
46 | let RTVIConfig: RTVIClientOptions = {
47 |   transport,
48 |   ...
49 | }; 50 | 51 | ``` 52 | 53 | ### Configuration Options 54 | 55 | ```typescript 56 | /********************************** 57 | * OpenAI-specific types 58 | * types and comments below are based on: 59 | * gpt-4o-realtime-preview-2024-12-17 60 | **********************************/ 61 | type JSONSchema = { [key: string]: any }; 62 | export type OpenAIFunctionTool = { 63 | type: "function"; 64 | name: string; 65 | description: string; 66 | parameters: JSONSchema; 67 | }; 68 | 69 | export type OpenAIServerVad = { 70 | type: "server_vad"; 71 | create_response?: boolean; // defaults to true 72 | interrupt_response?: boolean; // defaults to true 73 | prefix_padding_ms?: number; // defaults to 300ms 74 | silence_duration_ms?: number; // defaults to 500ms 75 | threshold?: number; // range (0.0, 1.0); defaults to 0.5 76 | }; 77 | 78 | export type OpenAISemanticVAD = { 79 | type: "semantic_vad"; 80 | eagerness?: "low" | "medium" | "high" | "auto"; // defaults to "auto", equivalent to "medium" 81 | create_response?: boolean; // defaults to true 82 | interrupt_response?: boolean; // defaults to true 83 | }; 84 | 85 | export type OpenAISessionConfig = Partial<{ 86 | modalities?: string; 87 | instructions?: string; 88 | voice?: 89 | | "alloy" 90 | | "ash" 91 | | "ballad" 92 | | "coral" 93 | | "echo" 94 | | "sage" 95 | | "shimmer" 96 | | "verse"; 97 | input_audio_noise_reduction?: { 98 | type: "near_field" | "far_field"; 99 | } | null; // defaults to null/off 100 | input_audio_transcription?: { 101 | model: "whisper-1" | "gpt-4o-transcribe" | "gpt-4o-mini-transcribe"; 102 | language?: string; 103 | prompt?: string[] | string; // gpt-4o models take a string 104 | } | null; // we default this to gpt-4o-transcribe 105 | turn_detection?: OpenAIServerVad | OpenAISemanticVAD | null; // defaults to server_vad 106 | temperature?: number; 107 | max_tokens?: number | "inf"; 108 | tools?: Array; 109 | }>; 110 | 111 | export interface OpenAIServiceOptions { 112 | api_key: string; 113 | model?: string; 114 | initial_messages?: LLMContextMessage[]; 115 | settings?: OpenAISessionConfig; 116 | } 117 | ``` 118 | 119 | ### Sending Messages 120 | 121 | ```javascript 122 | // at setup time... 123 | llmHelper = new LLMHelper({}); 124 | rtviClient.registerHelper("llm", llmHelper); 125 | // the 'llm' name in this call above isn't used. 126 | //that value is specific to working with a pipecat pipeline 127 | 128 | // at time of sending message... 129 | // Send text prompt message 130 | llmHelper.appendToMessages({ role: "user", content: 'Hello OpenAI!' }); 131 | ``` 132 | 133 | ### Handling Events 134 | 135 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info. 
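For example, here is a minimal sketch of registering a few of these callbacks when constructing the client (the callback names follow the RTVI callback API; other client options are omitted for brevity):

```javascript
// Illustrative only: log a few of the events this transport surfaces.
const rtviClient = new RTVIClient({
  transport,
  callbacks: {
    onTransportStateChanged: (state) => console.log("transport state:", state),
    onBotStartedSpeaking: () => console.log("bot started speaking"),
    onBotStoppedSpeaking: () => console.log("bot stopped speaking"),
    onUserTranscript: (transcript) => console.log("user said:", transcript.text),
  },
  // ...remaining RTVIClientOptions
});
```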
136 |
137 | ### Updating Session Configuration
138 |
139 | ```javascript
140 | transport.updateSessionConfig({
141 |   instructions: 'you are an over-sharing neighbor',
142 |   input_audio_noise_reduction: {
143 |     type: 'near_field'
144 |   }
145 | });
146 | ```
147 |
148 | ## API Reference
149 |
150 | ### Methods
151 |
152 | - `initialize()`: Set up the transport and establish connection
153 | - `sendMessage(message)`: Send a text message
154 | - `handleUserAudioStream(data)`: Stream audio data to the model
155 | - `disconnectLLM()`: Close the connection
156 | - `sendReadyMessage()`: Signal ready state
157 |
158 | ### States
159 |
160 | The transport can be in one of the following states:
161 | - "disconnected"
162 | - "initializing"
163 | - "initialized"
164 | - "connecting"
165 | - "connected"
166 | - "ready"
167 | - "disconnecting"
168 | - "error"
169 |
170 | ## Error Handling
171 |
172 | The transport includes comprehensive error handling for:
173 | - Connection failures
174 | - WebRTC connection errors
175 | - API key validation
176 | - Message transmission errors
177 |
178 | ## License
179 | BSD-2 Clause
180 |
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "@pipecat-ai/openai-realtime-webrtc-transport",
3 |   "version": "0.4.0",
4 |   "license": "BSD-2-Clause",
5 |   "main": "dist/index.js",
6 |   "module": "dist/index.module.js",
7 |   "types": "dist/index.d.ts",
8 |   "source": "src/index.ts",
9 |   "repository": {
10 |     "type": "git",
11 |     "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git"
12 |   },
13 |   "files": [
14 |     "dist",
15 |     "package.json",
16 |     "README.md"
17 |   ],
18 |   "scripts": {
19 |     "build": "parcel build --no-cache",
20 |     "dev": "parcel watch",
21 |     "lint": "eslint . 
--ext ts --report-unused-disable-directives --max-warnings 0" 22 | }, 23 | "devDependencies": { 24 | "@pipecat-ai/client-js": "^0.4.0", 25 | "@types/node": "^22.9.0", 26 | "eslint": "9.11.1", 27 | "eslint-config-prettier": "^9.1.0", 28 | "eslint-plugin-simple-import-sort": "^12.1.1" 29 | }, 30 | "peerDependencies": { 31 | "@pipecat-ai/client-js": "~0.4.0" 32 | }, 33 | "dependencies": { 34 | "@daily-co/daily-js": "^0.77.0", 35 | "dequal": "^2.0.3" 36 | }, 37 | "description": "Pipecat OpenAI RealTime Transport Package", 38 | "author": "Daily.co", 39 | "bugs": { 40 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues" 41 | }, 42 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/openai-llm-direct-transport/README.md" 43 | } 44 | -------------------------------------------------------------------------------- /transports/openai-realtime-webrtc-transport/src/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./OpenAIRealTimeWebRTCTransport"; 2 | -------------------------------------------------------------------------------- /transports/openai-realtime-webrtc-transport/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "types": ["node"], 7 | "skipLibCheck": true, 8 | "jsx": "preserve", 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "noEmit": true, 14 | "resolveJsonModule": true, 15 | "isolatedModules": true, 16 | "moduleDetection": "force", 17 | 18 | /* Linting */ 19 | "strict": true, 20 | "noUnusedLocals": true, 21 | "noUnusedParameters": false, 22 | "noFallthroughCasesInSwitch": true 23 | }, 24 | "include": ["src"] 25 | } 26 | -------------------------------------------------------------------------------- /transports/small-webrtc-transport/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to **Pipecat Small WebRTC Transport** will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [0.4.0] 9 | 10 | - Bumped dependency to @pipecat-ai/client-js@~0.4.0 11 | 12 | ## [0.0.5] - 2025-05-19 13 | 14 | ### Fixed 15 | 16 | - `SmallWebRTCTransport` updates transport state to 'ready' when client ready message is sent. 17 | 18 | ## [0.0.4] - 2025-04-29 19 | 20 | ### Added 21 | 22 | - Added `waitForICEGathering` property: this allows users to configure whether the transport should 23 | explicitly wait for the iceGatheringState to become complete during the negotiation phase. 24 | 25 | ### Fixed 26 | 27 | - `SmallWebRTCTransport` class now accepts `RTCIceServer`[] instead of just the `String`[] of urls. 28 | 29 | ## [0.0.3] - 2025-04-11 30 | 31 | ### Added 32 | 33 | - Handling a new incoming `peerLeft` signalling messages from Pipecat. 34 | 35 | ## [0.0.2] - 2025-04-10 36 | 37 | ### Added 38 | 39 | - Send a signalling message whenever a track is enabled or disabled. 40 | - Handle incoming `renegotiate` signalling messages from Pipecat in a new format. 41 | 42 | ## [0.0.1] - 2025-04-09 43 | 44 | ### Added 45 | 46 | - Web client transport for the Pipecat **SmallWebRTCTransport**. 
47 | -------------------------------------------------------------------------------- /transports/small-webrtc-transport/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2024, Daily 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /transports/small-webrtc-transport/README.md: -------------------------------------------------------------------------------- 1 | # Pipecat's Real-Time Voice Inference - Small WebRTC Transport 2 | 3 | [![Docs](https://img.shields.io/badge/documentation-blue)](https://docs.pipecat.ai/client/js/transports/small-webrtc) 4 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/small-webrtc-transport) 5 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](https://github.com/pipecat-ai/pipecat/tree/main/examples/p2p-webrtc) 6 | 7 | Small WebRTC transport package for use with `@pipecat-ai/client-js`. 8 | 9 | ## Installation 10 | 11 | ```bash copy 12 | npm install \ 13 | @pipecat-ai/client-js \ 14 | @pipecat-ai/small-webrtc-transport 15 | ``` 16 | 17 | ## Overview 18 | 19 | The SmallWebRTCTransport class provides a WebRTC transport layer establishing a PeerConnection with Pipecat SmallWebRTCTransport. It handles audio/video device management, WebRTC connections, and real-time communication between client and bot. 20 | 21 | ## Features 22 | 23 | - 🎥 Complete camera device management 24 | - 🎤 Microphone input handling 25 | - 📡 WebRTC connection management 26 | - 🤖 Bot participant tracking 27 | - 💬 Real-time messaging 28 | 29 | ## Usage 30 | 31 | ### Basic Setup 32 | 33 | ```javascript 34 | import { RTVIClient } from "@pipecat-ai/client-js"; 35 | import { SmallWebRTCTransport } from "@pipecat-ai/small-webrtc-transport"; 36 | 37 | const transport = new SmallWebRTCTransport(); 38 | 39 | const rtviClient = new RTVIClient({ 40 | transport, 41 | enableCam: false, // Default camera off 42 | enableMic: true, // Default microphone on 43 | callbacks: { 44 | // Event handlers 45 | }, 46 | params: { 47 | baseUrl, 48 | endpoints 49 | } 50 | // ... 
51 | }); 52 | 53 | await rtviClient.connect(); 54 | ``` 55 | 56 | ## API Reference 57 | 58 | ### States 59 | 60 | The transport can be in one of these states: 61 | - "initializing" 62 | - "initialized" 63 | - "connecting" 64 | - "connected" 65 | - "ready" 66 | - "disconnecting" 67 | - "error" 68 | 69 | ## Events 70 | 71 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info. 72 | 73 | ## Error Handling 74 | 75 | The transport includes error handling for: 76 | - Connection failures 77 | - Device errors 78 | 79 | ## License 80 | BSD-2 Clause 81 | -------------------------------------------------------------------------------- /transports/small-webrtc-transport/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@pipecat-ai/small-webrtc-transport", 3 | "version": "0.4.0", 4 | "license": "BSD-2-Clause", 5 | "main": "dist/index.js", 6 | "module": "dist/index.module.js", 7 | "types": "dist/index.d.ts", 8 | "source": "src/index.ts", 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git" 12 | }, 13 | "files": [ 14 | "dist", 15 | "package.json", 16 | "README.md" 17 | ], 18 | "scripts": { 19 | "build": "parcel build --no-cache", 20 | "dev": "parcel watch", 21 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0", 22 | "prepare": "npm run build" 23 | }, 24 | "devDependencies": { 25 | "@pipecat-ai/client-js": "^0.4.0", 26 | "@types/node": "^22.9.0", 27 | "eslint": "9.11.1", 28 | "eslint-config-prettier": "^9.1.0", 29 | "eslint-plugin-simple-import-sort": "^12.1.1" 30 | }, 31 | "peerDependencies": { 32 | "@pipecat-ai/client-js": "~0.4.0" 33 | }, 34 | "dependencies": { 35 | "@daily-co/daily-js": "^0.77.0", 36 | "dequal": "^2.0.3" 37 | }, 38 | "description": "Pipecat Small WebRTC Transport Package", 39 | "author": "Daily.co", 40 | "bugs": { 41 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /transports/small-webrtc-transport/src/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./smallWebRTCTransport"; 2 | -------------------------------------------------------------------------------- /transports/small-webrtc-transport/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "types": ["node"], 7 | "skipLibCheck": true, 8 | "jsx": "preserve", 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "allowJs": true, 14 | "noEmit": true, 15 | "resolveJsonModule": true, 16 | "isolatedModules": true, 17 | "moduleDetection": "force", 18 | 19 | /* Linting */ 20 | "strict": true, 21 | "noUnusedLocals": true, 22 | "noUnusedParameters": false, 23 | "noFallthroughCasesInSwitch": true 24 | }, 25 | "include": ["src"] 26 | } 27 | -------------------------------------------------------------------------------- /transports/websocket-transport/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2024, Daily 4 | 5 | Redistribution and use in source and binary forms, with or 
without 6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/websocket-transport/README.md:
--------------------------------------------------------------------------------
1 | # Websocket Transport
2 |
3 | [![Demo](https://img.shields.io/badge/Demo-forestgreen)](https://github.com/pipecat-ai/pipecat/tree/main/examples/websocket/README.md)
4 | ![NPM Version](https://img.shields.io/npm/v/@pipecat-ai/websocket-transport)
5 |
6 | Websocket transport package for use with `@pipecat-ai/client-js`.
7 |
8 | ## Installation
9 |
10 | ```bash copy
11 | npm install \
12 | @pipecat-ai/client-js \
13 | @pipecat-ai/websocket-transport
14 | ```
15 |
16 | ## Overview
17 |
18 | The WebSocketTransport class provides a WebSocket transport layer that establishes a connection with a Pipecat websocket transport on the server. It handles audio device management and real-time communication between client and bot.
19 |
20 | ## Features
21 |
22 | - 🎤 Microphone input handling
23 | - 🤖 Bot participant tracking
24 | - 💬 Real-time messaging
25 |
26 | ## Usage
27 |
28 | ### Basic Setup
29 |
30 | ```javascript
31 | import { RTVIClient } from "@pipecat-ai/client-js";
32 | import { WebSocketTransport } from "@pipecat-ai/websocket-transport";
33 |
34 | const transport = new WebSocketTransport();
35 |
36 | const rtviClient = new RTVIClient({
37 |   transport,
38 |   enableMic: true, // Default microphone on
39 |   callbacks: {
40 |     // Event handlers
41 |   },
42 |   params: {
43 |     baseUrl,
44 |     endpoints
45 |   }
46 |   // ...
47 | });
48 |
49 | await rtviClient.connect();
50 | ```
51 |
52 | ## API Reference
53 |
54 | ### States
55 |
56 | The transport can be in one of these states:
57 | - "initializing"
58 | - "initialized"
59 | - "connecting"
60 | - "connected"
61 | - "ready"
62 | - "disconnecting"
63 | - "error"
64 |
65 | ## Events
66 |
67 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info.
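Connection-related callbacks (`onConnected`, `onDisconnected`, `onTransportStateChanged`) fire as the underlying websocket opens and closes. The websocket URL itself comes from the connect endpoint configured in `params`: the endpoint is expected to return an auth bundle with a `ws_url` field, which the transport dials when `rtviClient.connect()` is called. A minimal sketch (the URL is only a placeholder):

```javascript
// Placeholder sketch: the auth bundle returned by the connect endpoint
// must include the websocket address the transport should dial.
const authBundle = { ws_url: "ws://localhost:8765/ws" };
```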
68 | 69 | ## Error Handling 70 | 71 | The transport includes error handling for: 72 | - Connection failures 73 | - Device errors 74 | 75 | ## License 76 | BSD-2 Clause 77 | 78 | -------------------------------------------------------------------------------- /transports/websocket-transport/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@pipecat-ai/websocket-transport", 3 | "version": "0.4.1", 4 | "license": "BSD-2-Clause", 5 | "main": "dist/index.js", 6 | "module": "dist/index.module.js", 7 | "types": "dist/index.d.ts", 8 | "source": "src/index.ts", 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git" 12 | }, 13 | "files": [ 14 | "dist", 15 | "package.json", 16 | "README.md" 17 | ], 18 | "scripts": { 19 | "build_proto": "bash ./proto/generate_typescript.sh", 20 | "build": "parcel build --no-cache", 21 | "dev": "parcel watch", 22 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0" 23 | }, 24 | "devDependencies": { 25 | "@pipecat-ai/client-js": "^0.4.0", 26 | "@types/node": "^22.9.0", 27 | "eslint": "9.11.1", 28 | "eslint-config-prettier": "^9.1.0", 29 | "eslint-plugin-simple-import-sort": "^12.1.1" 30 | }, 31 | "peerDependencies": { 32 | "@pipecat-ai/client-js": "~0.4.0" 33 | }, 34 | "dependencies": { 35 | "@daily-co/daily-js": "^0.79.0", 36 | "@protobuf-ts/plugin": "^2.11.0", 37 | "@protobuf-ts/runtime": "^2.11.0" 38 | }, 39 | "description": "Pipecat Base Transport for RealTime WebSocket APIs Package", 40 | "author": "Daily.co", 41 | "bugs": { 42 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues" 43 | }, 44 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/realtime-websocket-transport/README.md" 45 | } 46 | -------------------------------------------------------------------------------- /transports/websocket-transport/proto/frames.proto: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2024–2025, Daily 3 | // 4 | // SPDX-License-Identifier: BSD 2-Clause License 5 | // 6 | 7 | // Generate frames_pb2.py with: 8 | // 9 | // python -m grpc_tools.protoc --proto_path=./ --python_out=./protobufs frames.proto 10 | 11 | syntax = "proto3"; 12 | 13 | package pipecat; 14 | 15 | // Represents a basic unit of text data. 16 | message TextFrame { 17 | uint64 id = 1; 18 | string name = 2; 19 | string text = 3; 20 | } 21 | 22 | // Represents a raw chunk of audio data, 23 | // either generated by Pipecat for playback 24 | // or to be sent to Pipecat for processing. 25 | message AudioRawFrame { 26 | uint64 id = 1; 27 | string name = 2; 28 | bytes audio = 3; 29 | uint32 sample_rate = 4; 30 | uint32 num_channels = 5; 31 | optional uint64 pts = 6; 32 | } 33 | 34 | // Represents a transcribed text frame with speaker metadata. 35 | // Typically created when a participant speaks. 36 | message TranscriptionFrame { 37 | uint64 id = 1; 38 | string name = 2; 39 | string text = 3; 40 | string user_id = 4; 41 | string timestamp = 5; 42 | } 43 | 44 | // Wrapper for a generic message sent to or received from the transport layer. 45 | // Commonly used for RTVI protocol messages. 
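// The data field carries the RTVI message serialized as a JSON string.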
46 | message MessageFrame { 47 | string data = 1; 48 | } 49 | 50 | message Frame { 51 | oneof frame { 52 | TextFrame text = 1; 53 | AudioRawFrame audio = 2; 54 | TranscriptionFrame transcription = 3; 55 | MessageFrame message = 4; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /transports/websocket-transport/proto/generate_typescript.sh: -------------------------------------------------------------------------------- 1 | # Use this script to generate the typescript each time we change the frames.proto file 2 | rm -rf ./src/generated/* 3 | protoc \ 4 | --ts_out=generate_dependencies:./src/generated \ 5 | proto/frames.proto 6 | -------------------------------------------------------------------------------- /transports/websocket-transport/src/index.ts: -------------------------------------------------------------------------------- 1 | // export * from "./realTimeWebSocketTransport"; 2 | // export * from "../../../lib/wavtools/dist/index.d.ts"; 3 | 4 | import { WavMediaManager } from "../../../lib/media-mgmt/mediaManager"; 5 | import { DailyMediaManager } from "../../../lib/media-mgmt/dailyMediaManager"; 6 | import { WebSocketTransport } from "./webSocketTransport.ts"; 7 | 8 | export { WavMediaManager, DailyMediaManager, WebSocketTransport }; 9 | -------------------------------------------------------------------------------- /transports/websocket-transport/src/webSocketTransport.ts: -------------------------------------------------------------------------------- 1 | import { 2 | logger, 3 | RTVIClientOptions, 4 | RTVIMessage, 5 | Tracks, 6 | Transport, 7 | TransportStartError, 8 | TransportState, 9 | } from "@pipecat-ai/client-js"; 10 | 11 | import { ReconnectingWebSocket } from "../../../lib/websocket-utils/reconnectingWebSocket"; 12 | import { DailyMediaManager } from "../../../lib/media-mgmt/dailyMediaManager"; 13 | 14 | import { Frame } from "./generated/proto/frames"; 15 | import { MediaManager } from "../../../lib/media-mgmt/mediaManager"; 16 | 17 | export class WebSocketTransport extends Transport { 18 | declare private _ws: ReconnectingWebSocket | null; 19 | private static RECORDER_SAMPLE_RATE = 16_000; 20 | private audioQueue: ArrayBuffer[] = []; 21 | private _mediaManager: MediaManager; 22 | 23 | constructor() { 24 | super(); 25 | this._mediaManager = new DailyMediaManager( 26 | true, 27 | true, 28 | undefined, 29 | undefined, 30 | 512, 31 | WebSocketTransport.RECORDER_SAMPLE_RATE, 32 | ); 33 | this._mediaManager.setUserAudioCallback( 34 | this.handleUserAudioStream.bind(this), 35 | ); 36 | this._ws = null; 37 | } 38 | 39 | initialize( 40 | options: RTVIClientOptions, 41 | messageHandler: (ev: RTVIMessage) => void, 42 | ): void { 43 | this._options = options; 44 | this._callbacks = options.callbacks ?? 
{}; 45 | this._onMessage = messageHandler; 46 | this._mediaManager.setRTVIOptions(options); 47 | this.state = "disconnected"; 48 | } 49 | 50 | async initDevices(): Promise { 51 | this.state = "initializing"; 52 | await this._mediaManager.initialize(); 53 | this.state = "initialized"; 54 | } 55 | 56 | async connect( 57 | authBundle: unknown, 58 | abortController: AbortController, 59 | ): Promise { 60 | this.state = "connecting"; 61 | try { 62 | this._ws = this.initializeWebsocket(authBundle); 63 | await this._ws.connect(); 64 | await this._mediaManager.connect(); 65 | this.state = "connected"; 66 | this._callbacks.onConnected?.(); 67 | } catch (error) { 68 | const msg = `Failed to connect to websocket: ${error}`; 69 | logger.error(msg); 70 | this.state = "error"; 71 | throw new TransportStartError(msg); 72 | } 73 | } 74 | 75 | async disconnect(): Promise { 76 | this.state = "disconnecting"; 77 | await this._mediaManager.disconnect(); 78 | await this._ws?.close(); 79 | this.state = "disconnected"; 80 | this._callbacks.onDisconnected?.(); 81 | } 82 | 83 | getAllMics(): Promise { 84 | return this._mediaManager.getAllMics(); 85 | } 86 | getAllCams(): Promise { 87 | return this._mediaManager.getAllCams(); 88 | } 89 | getAllSpeakers(): Promise { 90 | return this._mediaManager.getAllSpeakers(); 91 | } 92 | 93 | async updateMic(micId: string): Promise { 94 | return this._mediaManager.updateMic(micId); 95 | } 96 | updateCam(camId: string): void { 97 | return this._mediaManager.updateCam(camId); 98 | } 99 | updateSpeaker(speakerId: string): void { 100 | return this._mediaManager.updateSpeaker(speakerId); 101 | } 102 | 103 | get selectedMic(): MediaDeviceInfo | Record { 104 | return this._mediaManager.selectedMic; 105 | } 106 | get selectedSpeaker(): MediaDeviceInfo | Record { 107 | return this._mediaManager.selectedSpeaker; 108 | } 109 | 110 | enableMic(enable: boolean): void { 111 | this._mediaManager.enableMic(enable); 112 | } 113 | get isMicEnabled(): boolean { 114 | return this._mediaManager.isMicEnabled; 115 | } 116 | 117 | get state(): TransportState { 118 | return this._state; 119 | } 120 | 121 | set state(state: TransportState) { 122 | if (this._state === state) return; 123 | 124 | this._state = state; 125 | this._callbacks.onTransportStateChanged?.(state); 126 | } 127 | 128 | get expiry(): number | undefined { 129 | return this._expiry; 130 | } 131 | 132 | tracks(): Tracks { 133 | return this._mediaManager.tracks(); 134 | } 135 | 136 | initializeWebsocket(authBundle: any): ReconnectingWebSocket { 137 | const ws = new ReconnectingWebSocket(`${authBundle.ws_url}`, undefined, { 138 | parseBlobToJson: false, 139 | }); 140 | // disabling the keep alive, there is no API for it inside Pipecat 141 | ws.keepAliveInterval = 0; 142 | ws.on("open", () => { 143 | logger.debug("Websocket connection opened"); 144 | }); 145 | ws.on("message", async (data: Blob) => { 146 | let arrayBuffer: ArrayBuffer = await data.arrayBuffer(); 147 | const parsedFrame = Frame.fromBinary(new Uint8Array(arrayBuffer)).frame; 148 | if (parsedFrame.oneofKind === "audio") { 149 | // We should be able to use parsedFrame.audio.audio.buffer but for 150 | // some reason that contains all the bytes from the protobuf message. 
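// Copying the bytes into a fresh Uint8Array gives us a buffer containing only
// the audio payload, which is then reinterpreted as 16-bit PCM samples below.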
151 | const audioVector = Array.from(parsedFrame.audio.audio); 152 | const uint8Array = new Uint8Array(audioVector); 153 | const int16Array = new Int16Array(uint8Array.buffer); 154 | this._mediaManager.bufferBotAudio(int16Array); 155 | } else if (parsedFrame.oneofKind === "message") { 156 | let jsonText = parsedFrame.message.data; 157 | try { 158 | let jsonMessage = JSON.parse(jsonText); 159 | if (jsonMessage.label === "rtvi-ai") { 160 | this._onMessage(jsonMessage as RTVIMessage); 161 | } 162 | } catch { 163 | logger.warn("Failed to parse message", jsonText); 164 | } 165 | } 166 | }); 167 | ws.on("error", (error: Error) => { 168 | this.connectionError(`websocket error: ${error}`); 169 | }); 170 | ws.on("connection-timeout", () => { 171 | this.connectionError("websocket connection timed out"); 172 | }); 173 | ws.on("close", (code: number) => { 174 | this.connectionError(`websocket connection closed. Code: ${code}`); 175 | }); 176 | ws.on("reconnect-failed", () => { 177 | this.connectionError(`websocket reconnect failed`); 178 | }); 179 | return ws; 180 | } 181 | 182 | sendReadyMessage(): void { 183 | this.state = "ready"; 184 | this.sendMessage(RTVIMessage.clientReady()); 185 | } 186 | 187 | handleUserAudioStream(data: ArrayBuffer): void { 188 | if (this.state === "ready") { 189 | try { 190 | void this.flushAudioQueue(); 191 | void this._sendAudioInput(data); 192 | } catch (error) { 193 | logger.error("Error sending audio stream to websocket:", error); 194 | this.state = "error"; 195 | } 196 | } else { 197 | this.audioQueue.push(data); 198 | } 199 | } 200 | 201 | private flushAudioQueue(): void { 202 | if (this.audioQueue.length <= 0) { 203 | return; 204 | } 205 | logger.info("Will flush audio queue", this.audioQueue.length); 206 | while (this.audioQueue.length > 0) { 207 | const queuedData = this.audioQueue.shift(); 208 | if (queuedData) void this._sendAudioInput(queuedData); 209 | } 210 | } 211 | 212 | sendMessage(message: RTVIMessage): void { 213 | logger.debug("Received message to send to Web Socket", message); 214 | const frame = Frame.create({ 215 | frame: { 216 | oneofKind: "message", 217 | message: { 218 | data: JSON.stringify(message), 219 | }, 220 | }, 221 | }); 222 | void this._sendMsg(frame); 223 | } 224 | 225 | async _sendAudioInput(data: ArrayBuffer): Promise { 226 | const pcmByteArray = new Uint8Array(data); 227 | const frame = Frame.create({ 228 | frame: { 229 | oneofKind: "audio", 230 | audio: { 231 | id: 0n, 232 | name: "audio", 233 | audio: pcmByteArray, 234 | sampleRate: WebSocketTransport.RECORDER_SAMPLE_RATE, 235 | numChannels: 1, 236 | }, 237 | }, 238 | }); 239 | await this._sendMsg(frame); 240 | } 241 | 242 | async _sendMsg(msg: Frame): Promise { 243 | if (!this._ws) { 244 | logger.error("sendMsg called but WS is null"); 245 | return; 246 | } 247 | if (this._ws.readyState !== WebSocket.OPEN) { 248 | logger.error("attempt to send to closed socket"); 249 | return; 250 | } 251 | if (!msg) { 252 | logger.error("need a msg to send a msg"); 253 | return; 254 | } 255 | try { 256 | const encodedFrame = new Uint8Array(Frame.toBinary(msg)); 257 | await this._ws.send(encodedFrame); 258 | } catch (e) { 259 | logger.error("sendMsg error", e); 260 | } 261 | } 262 | 263 | connectionError(errorMsg: string): void { 264 | console.error(errorMsg); 265 | this.state = "error"; 266 | void this.disconnect(); 267 | } 268 | 269 | // Not implemented 270 | enableScreenShare(enable: boolean): void { 271 | logger.error("startScreenShare not implemented for WebSocketTransport"); 272 | throw new 
Error("Not implemented"); 273 | } 274 | 275 | public get isSharingScreen(): boolean { 276 | logger.error("isSharingScreen not implemented for WebSocketTransport"); 277 | return false; 278 | } 279 | 280 | enableCam(enable: boolean) { 281 | logger.error("enableCam not implemented for WebSocketTransport"); 282 | throw new Error("Not implemented"); 283 | } 284 | 285 | get isCamEnabled(): boolean { 286 | logger.error("isCamEnabled not implemented for WebSocketTransport"); 287 | return false; 288 | } 289 | 290 | get selectedCam(): MediaDeviceInfo | Record { 291 | logger.error("selectedCam not implemented for WebSocketTransport"); 292 | throw new Error("Not implemented"); 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /transports/websocket-transport/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "types": ["node"], 7 | "skipLibCheck": true, 8 | "jsx": "preserve", 9 | 10 | /* Bundler mode */ 11 | "moduleResolution": "bundler", 12 | "allowImportingTsExtensions": true, 13 | "allowJs": true, 14 | "noEmit": true, 15 | "resolveJsonModule": true, 16 | "isolatedModules": true, 17 | "moduleDetection": "force", 18 | 19 | /* Linting */ 20 | "strict": true, 21 | "noUnusedLocals": true, 22 | "noUnusedParameters": false, 23 | "noFallthroughCasesInSwitch": true 24 | }, 25 | "include": ["src"] 26 | } 27 | --------------------------------------------------------------------------------