├── .gitignore
├── .parcelrc
├── .prettierrc
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── examples
│   └── directToLLMTransports
│       ├── README.md
│       ├── README.md.bak
│       ├── env.example
│       ├── index.html
│       ├── package-lock.json
│       ├── package.json
│       ├── src
│       │   ├── app.ts
│       │   ├── styles.css
│       │   └── vite-env.d.ts
│       └── tsconfig.json
├── lib
│   ├── media-mgmt
│   │   ├── dailyMediaManager.ts
│   │   └── mediaManager.ts
│   ├── wavtools
│   │   ├── index.js
│   │   └── lib
│   │       ├── analysis
│   │       │   ├── audio_analysis.js
│   │       │   └── constants.js
│   │       ├── mediastream_recorder.js
│   │       ├── wav_packer.js
│   │       ├── wav_recorder.js
│   │       ├── wav_stream_player.js
│   │       └── worklets
│   │           ├── audio_processor.js
│   │           └── stream_processor.js
│   └── websocket-utils
│       └── reconnectingWebSocket.ts
├── package-lock.json
├── package.json
└── transports
    ├── daily
    │   ├── CHANGELOG.md
    │   ├── LICENSE
    │   ├── README.md
    │   ├── package.json
    │   ├── src
    │   │   ├── index.ts
    │   │   └── transport.ts
    │   └── tsconfig.json
    ├── gemini-live-websocket-transport
    │   ├── LICENSE
    │   ├── README.md
    │   ├── package.json
    │   ├── src
    │   │   ├── directToLLMBaseWebSocketTransport.ts
    │   │   ├── geminiLiveWebSocketTransport.ts
    │   │   └── index.ts
    │   └── tsconfig.json
    ├── openai-realtime-webrtc-transport
    │   ├── LICENSE
    │   ├── README.md
    │   ├── package.json
    │   ├── src
    │   │   ├── OpenAIRealTimeWebRTCTransport.ts
    │   │   └── index.ts
    │   └── tsconfig.json
    ├── small-webrtc-transport
    │   ├── CHANGELOG.md
    │   ├── LICENSE
    │   ├── README.md
    │   ├── package.json
    │   ├── src
    │   │   ├── index.ts
    │   │   └── smallWebRTCTransport.ts
    │   └── tsconfig.json
    └── websocket-transport
        ├── LICENSE
        ├── README.md
        ├── package.json
        ├── proto
        │   ├── frames.proto
        │   └── generate_typescript.sh
        ├── src
        │   ├── generated
        │   │   └── proto
        │   │       └── frames.ts
        │   ├── index.ts
        │   └── webSocketTransport.ts
        └── tsconfig.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | pnpm-debug.log*
8 | lerna-debug.log*
9 |
10 | node_modules
11 | dist
12 | dist-ssr
13 | *.local
14 | .parcel-cache
15 |
16 | .env
17 |
18 | # Editor directories and files
19 | .vscode/*
20 | !.vscode/extensions.json
21 | .idea
22 | .DS_Store
23 | *.suo
24 | *.ntvs*
25 | *.njsproj
26 | *.sln
27 | *.sw?
28 |
--------------------------------------------------------------------------------
/.parcelrc:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "@parcel/config-default",
3 | "transformers": {
4 | "*.{ts,tsx}": [
5 | "@parcel/transformer-typescript-tsc"
6 | ]
7 | }
8 | }
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "semi": true,
3 | "tabWidth": 2,
4 | "useTabs": false,
5 | "singleQuote": false
6 | }
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Contributing to Pipecat Client Web Transports
2 |
3 | We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
4 |
5 | 1. **Fork this repository**: Start by forking the Pipecat Client Web Transports repository to your GitHub account.
6 |
7 | 2. **Clone the repository**: Clone your forked repository to your local machine.
8 | ```bash
9 | git clone https://github.com/your-username/pipecat-client-web-transports
10 | ```
11 | 3. **Create a branch**: For your contribution, create a new branch.
12 | ```bash
13 | git checkout -b your-branch-name
14 | ```
15 | 4. **Make your changes**: Edit or add files as necessary.
16 | 5. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase.
17 | 6. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message.
18 |
19 | ```bash
20 | git commit -m "Description of your changes"
21 | ```
22 |
23 | 7. **Push your changes**: Push your branch to your forked repository.
24 |
25 | ```bash
26 | git push origin your-branch-name
27 | ```
28 |
29 | 8. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
30 | > Important: Describe the changes you've made clearly!
31 |
32 | Our maintainers will review your PR, and once everything is good, your contributions will be merged!
33 |
34 | # Contributor Covenant Code of Conduct
35 |
36 | ## Our Pledge
37 |
38 | We as members, contributors, and leaders pledge to make participation in our
39 | community a harassment-free experience for everyone, regardless of age, body
40 | size, visible or invisible disability, ethnicity, sex characteristics, gender
41 | identity and expression, level of experience, education, socio-economic status,
42 | nationality, personal appearance, race, caste, color, religion, or sexual
43 | identity and orientation.
44 |
45 | We pledge to act and interact in ways that contribute to an open, welcoming,
46 | diverse, inclusive, and healthy community.
47 |
48 | ## Our Standards
49 |
50 | Examples of behavior that contributes to a positive environment for our
51 | community include:
52 |
53 | - Demonstrating empathy and kindness toward other people
54 | - Being respectful of differing opinions, viewpoints, and experiences
55 | - Giving and gracefully accepting constructive feedback
56 | - Accepting responsibility and apologizing to those affected by our mistakes,
57 | and learning from the experience
58 | - Focusing on what is best not just for us as individuals, but for the overall
59 | community
60 |
61 | Examples of unacceptable behavior include:
62 |
63 | - The use of sexualized language or imagery, and sexual attention or advances of
64 | any kind
65 | - Trolling, insulting or derogatory comments, and personal or political attacks
66 | - Public or private harassment
67 | - Publishing others' private information, such as a physical or email address,
68 | without their explicit permission
69 | - Other conduct which could reasonably be considered inappropriate in a
70 | professional setting
71 |
72 | ## Enforcement Responsibilities
73 |
74 | Community leaders are responsible for clarifying and enforcing our standards of
75 | acceptable behavior and will take appropriate and fair corrective action in
76 | response to any behavior that they deem inappropriate, threatening, offensive,
77 | or harmful.
78 |
79 | Community leaders have the right and responsibility to remove, edit, or reject
80 | comments, commits, code, wiki edits, issues, and other contributions that are
81 | not aligned to this Code of Conduct, and will communicate reasons for moderation
82 | decisions when appropriate.
83 |
84 | ## Scope
85 |
86 | This Code of Conduct applies within all community spaces, and also applies when
87 | an individual is officially representing the community in public spaces.
88 | Examples of representing our community include using an official email address,
89 | posting via an official social media account, or acting as an appointed
90 | representative at an online or offline event.
91 |
92 | ## Enforcement
93 |
94 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
95 | reported to the community leaders responsible for enforcement at pipecat-ai@daily.co.
96 | All complaints will be reviewed and investigated promptly and fairly.
97 |
98 | All community leaders are obligated to respect the privacy and security of the
99 | reporter of any incident.
100 |
101 | ## Enforcement Guidelines
102 |
103 | Community leaders will follow these Community Impact Guidelines in determining
104 | the consequences for any action they deem in violation of this Code of Conduct:
105 |
106 | ### 1. Correction
107 |
108 | **Community Impact**: Use of inappropriate language or other behavior deemed
109 | unprofessional or unwelcome in the community.
110 |
111 | **Consequence**: A private, written warning from community leaders, providing
112 | clarity around the nature of the violation and an explanation of why the
113 | behavior was inappropriate. A public apology may be requested.
114 |
115 | ### 2. Warning
116 |
117 | **Community Impact**: A violation through a single incident or series of
118 | actions.
119 |
120 | **Consequence**: A warning with consequences for continued behavior. No
121 | interaction with the people involved, including unsolicited interaction with
122 | those enforcing the Code of Conduct, for a specified period of time. This
123 | includes avoiding interactions in community spaces as well as external channels
124 | like social media. Violating these terms may lead to a temporary or permanent
125 | ban.
126 |
127 | ### 3. Temporary Ban
128 |
129 | **Community Impact**: A serious violation of community standards, including
130 | sustained inappropriate behavior.
131 |
132 | **Consequence**: A temporary ban from any sort of interaction or public
133 | communication with the community for a specified period of time. No public or
134 | private interaction with the people involved, including unsolicited interaction
135 | with those enforcing the Code of Conduct, is allowed during this period.
136 | Violating these terms may lead to a permanent ban.
137 |
138 | ### 4. Permanent Ban
139 |
140 | **Community Impact**: Demonstrating a pattern of violation of community
141 | standards, including sustained inappropriate behavior, harassment of an
142 | individual, or aggression toward or disparagement of classes of individuals.
143 |
144 | **Consequence**: A permanent ban from any sort of public interaction within the
145 | community.
146 |
147 | ## Attribution
148 |
149 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
150 | version 2.1, available at
151 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
152 |
153 | Community Impact Guidelines were inspired by
154 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
155 |
156 | For answers to common questions about this code of conduct, see the FAQ at
157 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
158 | [https://www.contributor-covenant.org/translations][translations].
159 |
160 | [homepage]: https://www.contributor-covenant.org
161 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
162 | [Mozilla CoC]: https://github.com/mozilla/diversity
163 | [FAQ]: https://www.contributor-covenant.org/faq
164 | [translations]: https://www.contributor-covenant.org/translations
165 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2024, Daily
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pipecat Client Web Transports
2 |
3 | [](https://docs.pipecat.ai/client/js/transports/transport)
4 | [](https://discord.gg/pipecat)
5 |
6 | A mono-repo housing the supported Transport options for use with the pipecat-client-web library. Currently, there are four transports: `small-webrtc-transport`, `daily-transport`, `gemini-live-websocket-transport`, and `openai-realtime-webrtc-transport`.
7 |
8 | ## Documentation
9 |
10 | Pipecat Transports are intended to be used in conjunction with a Pipecat web client. Please refer to the full Pipecat client documentation [here](https://docs.pipecat.ai/client/introduction) and an overview of the [Transport API here](https://docs.pipecat.ai/client/js/transports/transport).
11 |
12 | ## Current Transports
13 |
14 | ### [SmallWebRTCTransport](/transports/small-webrtc-transport/README.md)
15 |
16 | [](https://docs.pipecat.ai/client/js/transports/small-webrtc)
17 | [](/transports/small-webrtc-transport/README.md)
18 | [](https://github.com/pipecat-ai/pipecat/tree/main/examples/p2p-webrtc)
19 | 
20 |
21 | This Transport creates a peer-to-peer WebRTC connection between the client and the bot process. This Transport is the client-side counterpart to the Pipecat [SmallWebRTCTransport component](https://docs.pipecat.ai/server/services/transport/small-webrtc).
22 |
23 | This is the simplest low-latency audio/video transport for Pipecat. This transport is recommended for local development and demos. Things to be aware of:
24 | - This transport is a direct connection between the client and the bot process. If you need multiple clients to connect to the same bot, you will need to use a different transport.
25 | - For production usage at scale, a distributed WebRTC network that can do edge/mesh routing, has session-level observability and metrics, and can offload recording and other auxiliary services is often useful.
26 |
27 | Typical media flow using a SmallWebRTCTransport:
28 | ```
29 | ┌──────────────────────────────────────────────────┐
30 | │ │
31 | ┌─────────────────────────┐ │ Server ┌─────────┐ │
32 | │ │ │ │Pipecat │ │
33 | │ Client │ RTVI Messages │ │Pipeline │ │
34 | │ │ & │ │ │
35 | │ ┌────────────────────┐ │ WebRTC Media │ ┌────────────────────┐ media │ ┌─────┐ │ │
36 | │ │SmallWebRTCTransport│◄─┼────────────────┼─►│SmallWebRTCTransport┼────────────┼─► STT │ │ │
37 | │ └────────────────────┘ │ │ └───────▲────────────┘ in │ └──┬──┘ │ │
38 | │ │ │ │ │ │ │ │
39 | └─────────────────────────┘ │ │ │ ┌──▼──┐ │ │
40 | │ │ │ │ LLM │ │ │
41 | │ │ │ └──┬──┘ │ │
42 | │ │ │ │ │ │
43 | │ │ │ ┌──▼──┐ │ │
44 | │ │ media │ │ TTS │ │ │
45 | │ └─────────────────────────┼─┴─────┘ │ │
46 | │ out └─────────┘ │
47 | │ │
48 | └──────────────────────────────────────────────────┘
49 | ```
50 |
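A minimal connection sketch is shown below. It assumes the package exports a `SmallWebRTCTransport` class from `@pipecat-ai/small-webrtc-transport` and that your bot exposes a signaling endpoint; the constructor options and the `/api/offer` path are illustrative only, so check the transport README for the actual API.

```typescript
import { RTVIClient } from "@pipecat-ai/client-js";
import { SmallWebRTCTransport } from "@pipecat-ai/small-webrtc-transport";

// Illustrative wiring only: baseUrl should point at your bot's
// SmallWebRTCTransport signaling endpoint.
const client = new RTVIClient({
  transport: new SmallWebRTCTransport(),
  params: { baseUrl: "/api/offer" },
  enableMic: true,
  enableCam: false,
});

await client.connect();
```
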
51 | ### [DailyTransport](/transports/daily/README.md)
52 |
53 | [](https://docs.pipecat.ai/client/js/transports/daily)
54 | [](/transports/daily/README.md)
55 | [](https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot)
56 | 
57 |
58 | This Transport uses the [Daily](https://daily.co) audio and video calling service to connect to a bot and stream media over a WebRTC connection. This Transport is the client-side counterpart to the Pipecat [DailyTransport component](https://docs.pipecat.ai/server/services/transport/daily).
59 |
60 | Typical media flow using a DailyTransport:
61 | ```
62 |
63 | ┌────────────────────────────────────────────┐
64 | │ │
65 | ┌───────────────────┐ │ Server ┌─────────┐ │
66 | │ │ │ │Pipecat │ │
67 | │ Client │ RTVI Messages │ │Pipeline │ │
68 | │ │ & │ │ │ │
69 | │ ┌──────────────┐ │ WebRTC Media │ ┌──────────────┐ media │ ┌─────┐ │ │
70 | │ │DailyTransport│◄─┼────────────────┼─►│DailyTransport┼────────────┼─► STT │ │ │
71 | │ └──────────────┘ │ │ └───────▲──────┘ in │ └──┬──┘ │ │
72 | │ │ │ │ │ │ │ │
73 | └───────────────────┘ │ │ │ ┌──▼──┐ │ │
74 | │ │ │ │ LLM │ │ │
75 | │ │ │ └──┬──┘ │ │
76 | │ │ │ │ │ │
77 | │ │ │ ┌──▼──┐ │ │
78 | │ │ media │ │ TTS │ │ │
79 | │ └───────────────────┼─┴─────┘ │ │
80 | │ out └─────────┘ │
81 | │ │
82 | └────────────────────────────────────────────┘
83 |
84 | ```
85 |
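A minimal sketch of pairing this transport with a Pipecat client, assuming `DailyTransport` is exported from `@pipecat-ai/daily-transport` and that `baseUrl` points at an endpoint that returns a Daily room URL and token (both assumptions; see the transport README for the actual options):

```typescript
import { RTVIClient } from "@pipecat-ai/client-js";
import { DailyTransport } from "@pipecat-ai/daily-transport";

// Assumed example endpoint that returns the Daily room details.
const client = new RTVIClient({
  transport: new DailyTransport(),
  params: { baseUrl: "/api/connect" },
  enableMic: true,
});

await client.connect();
```
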
86 | ### [GeminiLiveWebSocketTransport](transports/gemini-live-websocket-transport/README.md)
87 | [](https://docs.pipecat.ai/client/js/transports/gemini)
88 | [](transports/gemini-live-websocket-transport/README.md)
89 | [](examples/directToLLMTransports/README.md)
90 | 
91 |
92 | This Transport extends the direct-to-LLM base WebSocket transport ([directToLLMBaseWebSocketTransport.ts](transports/gemini-live-websocket-transport/src/directToLLMBaseWebSocketTransport.ts)) and connects directly to Gemini over a WebSocket connection using the Multimodal Live API. This type of transport is great for trying out different services without needing to build a server component. Be aware that it is insecure: your Gemini API key must be available client-side, so it is not something you should use in a production app.
93 |
94 | Media flow using a GeminiLiveWebSocketTransport:
95 | ```
96 | Client Server
97 | ┌────────────────────────────────────┐
98 | │ │
99 | │ RTVIClient │ ┌──────────────┐
100 | │ │ Media over │ │
101 | │ ┌──────────────────────────────┐ │ WebSocket │ Gemini │
102 | │ │ GeminiLiveWebSocketTransport │◄─┼────────────────┼─► Server │
103 | │ └──────────────────────────────┘ │ │ │
104 | │ │ └──────────────┘
105 | └────────────────────────────────────┘
106 | ```
107 |
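The example app in `examples/directToLLMTransports` constructs this transport roughly as follows (service options abridged):

```typescript
import { RTVIClient } from "@pipecat-ai/client-js";
import {
  GeminiLiveWebsocketTransport,
  GeminiLLMServiceOptions,
} from "@pipecat-ai/gemini-live-websocket-transport";

const options: GeminiLLMServiceOptions = {
  api_key: import.meta.env.VITE_DANGEROUS_GEMINI_API_KEY, // client-side key: dev/testing only
  model: "models/gemini-2.0-flash-exp",
  initial_messages: [{ role: "user", content: "Hello!" }],
};

const client = new RTVIClient({
  transport: new GeminiLiveWebsocketTransport(options),
  params: { baseUrl: "api", requestData: { service_options: options } },
  enableMic: true,
});

await client.connect();
```
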
108 | ### [OpenAIRealTimeWebRTCTransport](transports/openai-realtime-webrtc-transport/README.md)
109 | [](https://docs.pipecat.ai/client/js/transports/openai-webrtc)
110 | [](transports/openai-realtime-webrtc-transport/README.md)
111 | [](examples/directToLLMTransports/README.md)
112 | 
113 |
114 | This Transport connects directly to OpenAI over a WebRTC connection using the Realtime API. This type of transport is great for trying out different services without needing to build a server component. Be aware that it is insecure: your OpenAI API key must be available client-side, so it is not something you should use in a production app. It does not implement the ephemeral token process.
115 |
116 | Media flow using an OpenAIRealTimeWebRTCTransport:
117 | ```
118 | Client Server
119 | ┌─────────────────────────────────────┐
120 | │ │
121 | │ RTVIClient │ ┌──────────────┐
122 | │ │ Media over │ │
123 | │ ┌───────────────────────────────┐ │ WebRTC │ OpenAI │
124 | │ │ OpenAIRealTimeWebRTCTransport │◄─┼────────────────┼─► Server │
125 | │ └───────────────────────────────┘ │ │ │
126 | │ │ └──────────────┘
127 | └─────────────────────────────────────┘
128 | ```
129 |
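The same example app configures this transport roughly like so (settings abridged):

```typescript
import { RTVIClient } from "@pipecat-ai/client-js";
import {
  OpenAIRealTimeWebRTCTransport,
  OpenAIServiceOptions,
} from "@pipecat-ai/openai-realtime-webrtc-transport";

const options: OpenAIServiceOptions = {
  api_key: import.meta.env.VITE_DANGEROUS_OPENAI_API_KEY, // client-side key: dev/testing only
  settings: { voice: "echo", turn_detection: { type: "semantic_vad" } },
  initial_messages: [{ role: "user", content: "Hello" }],
};

const client = new RTVIClient({
  transport: new OpenAIRealTimeWebRTCTransport(options),
  params: { baseUrl: "api", requestData: { service_options: options } },
  enableMic: true,
});

await client.connect();
```
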
130 | ## Local Development
131 |
132 | ### Build the transport libraries
133 |
134 | ```bash
135 | $ npm i
136 | $ npm run build
137 | ```
138 |
139 | ## License
140 | BSD 2-Clause
141 |
142 | ## Contributing
143 | We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
144 |
145 | - **Found a bug?** Open an [issue](https://github.com/pipecat-ai/pipecat-client-web-transports/issues)
146 | - **Have a feature idea?** Start a [discussion](https://discord.gg/pipecat)
147 | - **Want to contribute code?** Check our [CONTRIBUTING.md](CONTRIBUTING.md) guide
148 | - **Documentation improvements?** [Docs](https://github.com/pipecat-ai/docs) PRs are always welcome
149 |
150 | Before submitting a pull request, please check existing issues and PRs to avoid duplicates.
151 |
152 | We aim to review all contributions promptly and provide constructive feedback to help get your changes merged.
--------------------------------------------------------------------------------
/examples/directToLLMTransports/README.md:
--------------------------------------------------------------------------------
1 | # Pipecat (RTVI) Client Demo for Direct Communication with LLMs
2 |
3 | ## Overview
4 | This application demonstrates a real-time voice interaction system using the RTVI Client library with both the Gemini Multimodal Live and OpenAI RealTime WebRTC integrations. It enables two-way communication between users and the LLM, featuring voice input/output, text messaging, and various audio controls.
5 |
6 | ## Features
7 | - Real-time voice interaction with a Gemini Multimodal Live bot
8 | - Real-time voice interaction with an OpenAI RealTime bot
9 | - Microphone input control and device selection
10 | - Text-based message prompting
11 | - Audio visualization through dynamic speech bubbles
12 | - Comprehensive event handling system
13 | - Connection state management
14 |
15 | ## Prerequisites
16 | - Gemini API key (set as environment variable `VITE_DANGEROUS_GEMINI_API_KEY`)
17 | - OpenAI API key (set as environment variable `VITE_DANGEROUS_OPENAI_API_KEY`)
18 | - Optional [OpenWeather API](https://openweathermap.org/api) key for fetching weather. If none is provided, the app returns randomly generated weather data.
19 | - Modern web browser with WebSocket support
20 | - Access to microphone
21 |
22 | ## Dependencies
23 | ```
24 | # from base folder
25 | $ npm i
26 | $ npm run build
27 | ```
28 |
29 |
30 | ## Setup and Installation
31 | ```
32 | cp env.example .env
33 | # update .env with API keys
34 |
35 | npm i
36 | npm run dev
37 | ```
38 |
39 | ### To run the example with Gemini MultiModal Live:
40 |
41 | Open [http://localhost:5173/](http://localhost:5173/)
42 |
43 | ### To run the example with OpenAI RealTime:
44 |
45 | Open [http://localhost:5173?service=openai](http://localhost:5173?service=openai)
46 |
47 | ## Documentation Reference
48 | - [RTVI Client Documentation](https://docs.pipecat.ai/client/introduction)
49 | - [Gemini Multimodal Live Documentation](https://ai.google.dev/api/multimodal-live)
50 | - [OpenAI RealTime WebRTC Documentation](https://platform.openai.com/docs/guides/realtime-webrtc)
51 |
52 | ## Usage
53 |
54 | ### Initialization
55 | The application automatically initializes when the DOM content is loaded. It sets up:
56 | - Audio device selection
57 | - Microphone controls
58 | - Bot connection management
59 | - Event handlers
60 |
61 | ### Controls
62 | - **Toggle Bot**: Connect/disconnect the AI assistant
63 | - **Mute/Unmute**: Control microphone input
64 | - **Microphone Selection**: Choose input device
65 | - **Text Input**: Send text messages to the bot
66 |
67 | ### Event Handling
68 | The application handles various events (see the registration example below), including:
69 | - Transport state changes
70 | - Bot connection status
71 | - Audio track management
72 | - Speech detection
73 | - Error handling
74 | - Audio level visualization
75 |
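Handlers are registered on the client, as in `src/app.ts`. For example:

```typescript
rtviClient.on(RTVIEvent.TransportStateChanged, (state: string) => {
  console.log(`-- transport state change: ${state} --`);
});

rtviClient.on(RTVIEvent.BotReady, () => {
  console.log("-- bot ready to chat! --");
});
```
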
76 | ## Key Components
77 |
78 | ### RTVIClient Configuration
79 | ```typescript
80 | let RTVIConfig: RTVIClientOptions = {
81 | transport,
82 | params: {
83 | baseUrl: "api",
84 | requestData: { },
85 | },
86 | enableMic: true,
87 | enableCam: false,
88 | timeout: 30 * 1000,
89 | };
90 | ```
91 |
92 | ### Gemini Multimodal Live Service Configuration
93 | ```typescript
94 | const llm_service_options: GeminiLLMServiceOptions = {
95 | api_key: import.meta.env.VITE_DANGEROUS_GEMINI_API_KEY,
96 | model: "models/gemini-2.0-flash-exp",
97 | // ... additional configuration
98 | };
99 | ```
100 |
101 | For all service options and their defaults, see [GeminiLLMServiceOptions](../../transports/gemini-live-websocket-transport/src/geminiLiveWebSocketTransport.ts#21)
102 |
103 | ### OpenAI Realtime API Service Configuration
104 | ```typescript
105 | const llm_service_options: OpenAIServiceOptions = {
106 | api_key: import.meta.env.VITE_DANGEROUS_OPENAI_API_KEY,
107 | // ... additional configuration
108 | };
109 | ```
110 |
111 | For all service options and their defaults, see [OpenAIServiceOptions](../../transports/openai-realtime-webrtc-transport/src/OpenAIRealTimeWebRTCTransport.ts#28)
112 |
113 | ## Notes
114 | - Gemini integration currently does not support transcripts
115 |
116 | ## License
117 | BSD 2-Clause
118 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/README.md.bak:
--------------------------------------------------------------------------------
1 |
2 | # Pipecat JavaScript Client SDK Example using a Gemini MultiModal Live Transport
3 |
4 | ## Setup
5 |
6 | Build transport dependencies
7 |
8 | ```
9 | # from base folder
10 | $ yarn
11 | $ yarn workspace @pipecat-ai/realtime-websocket-transport build
12 | $ yarn workspace @pipecat-ai/gemini-live-websocket-transport build
13 | ```
14 |
15 | ## Install and run locally
16 |
17 | ```
18 | npm i
19 | npm run dev
20 |
21 | cp env.example .env
22 | # update .env with API keys
23 | ```
24 |
25 | Open [http://localhost:5173/](http://localhost:5173/)
26 |
27 | ## Demo code
28 |
29 | This is a bare-bones LLM voice chat app that sets up an [RTVI Client](https://github.com/pipecat-ai/pipecat-client-web) (Pipecat's client-side JS component) with a [GeminiLiveWebsocketTransport](../../transports/gemini-live-websocket-transport/src/geminiLiveWebSocketTransport.ts) to communicate directly with Google's Multimodal Live API over a WebSocket connection.
30 |
31 | The application code is all in two files:
32 |
33 | - [index.html](./index.html)
34 | - [src/app.ts](./src/app.ts)
35 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/env.example:
--------------------------------------------------------------------------------
1 | VITE_DANGEROUS_GEMINI_API_KEY=
2 | VITE_DANGEROUS_OPENAI_API_KEY=
3 | VITE_DANGEROUS_OPENWEATHER_API_KEY=
--------------------------------------------------------------------------------
/examples/directToLLMTransports/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | My Web App
5 |
6 |
7 |
8 |
9 |
10 |
11 |
Transport state: disconnected
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "1116",
3 | "version": "1.0.0",
4 | "main": "server/server.ts",
5 | "scripts": {
6 | "dev": "vite",
7 | "build": "vite build",
8 | "preview": "vite preview"
9 | },
10 | "keywords": [],
11 | "author": "",
12 | "license": "BSD-2-Clause",
13 | "description": "",
14 | "dependencies": {
15 | "@pipecat-ai/client-js": "^0.3.5",
16 | "@pipecat-ai/gemini-live-websocket-transport": "file:../../transports/gemini-live-websocket-transport",
17 | "@pipecat-ai/openai-realtime-webrtc-transport": "file:../../transports/openai-realtime-webrtc-transport",
18 | "dotenv": "^16.4.5",
19 | "express": "^4.21.1",
20 | "morgan": "^1.10.0"
21 | },
22 | "devDependencies": {
23 | "@types/express": "^5.0.0",
24 | "@types/morgan": "^1.9.9",
25 | "@types/node": "^22.9.0",
26 | "concurrently": "^9.1.0",
27 | "eslint": "^9.15.0",
28 | "nodemon": "^3.1.7",
29 | "ts-node": "^10.9.2",
30 | "ts-node-dev": "^2.0.0",
31 | "typescript": "^5.6.3",
32 | "vite": "^5.4.11"
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/src/app.ts:
--------------------------------------------------------------------------------
1 | // Import Types for Gemini Transport
2 | import {
3 | GeminiLiveWebsocketTransport,
4 | GeminiLLMServiceOptions,
5 | } from "@pipecat-ai/gemini-live-websocket-transport";
6 |
7 | import {
8 | OpenAIRealTimeWebRTCTransport,
9 | OpenAIServiceOptions,
10 | } from "@pipecat-ai/openai-realtime-webrtc-transport";
11 |
12 | // Import core Pipecat RTVI Client and types
13 | import {
14 | LLMHelper,
15 | FunctionCallParams,
16 | Transport,
17 | RTVIClient,
18 | RTVIEvent,
19 | RTVIMessage,
20 | Participant,
21 | TranscriptData,
22 | BotTTSTextData,
23 | RTVIClientOptions,
24 | } from "@pipecat-ai/client-js";
25 |
26 | // Global variables for DOM elements and client state
27 | let statusDiv: HTMLElement;
28 | let audioDiv: HTMLDivElement;
29 | let toggleBotButton: HTMLButtonElement;
30 | let submitBtn: HTMLButtonElement;
31 | let rtviClient: RTVIClient;
32 | let llmHelper: LLMHelper;
33 | let botRunning = false;
34 |
35 | // Initialize the application when DOM is fully loaded
36 | document.addEventListener("DOMContentLoaded", () => {
37 | statusDiv = document.getElementById("status")!;
38 | toggleBotButton = document.getElementById("toggleBot")! as HTMLButtonElement;
39 | toggleBotButton.addEventListener("click", () => toggleBot());
40 |
41 | // Handle microphone device selection
42 | document.getElementById("mic-picker")!.onchange = (e) => {
43 | const target = e.target as HTMLSelectElement;
44 | console.log("user changed device", target, target.value);
45 | rtviClient.updateMic(target.value);
46 | };
47 |
48 | // Set up mute button functionality
49 | const muteBtn = document.getElementById("toggleMute")!;
50 | muteBtn.addEventListener("click", () => {
51 | muteBtn.textContent = rtviClient.isMicEnabled ? "Unmute Mic" : "Mute Mic";
52 | rtviClient.enableMic(!rtviClient.isMicEnabled);
53 | });
54 |
55 | // Set up text submission button
56 | submitBtn = document.getElementById("submit-text")! as HTMLButtonElement;
57 | submitBtn.addEventListener("click", () => {
58 | sendUserMessage();
59 | });
60 | submitBtn.disabled = true;
61 |
62 | // Initialize the bot
63 | initBot();
64 | });
65 |
66 | // Connect / Disconnect from bot
67 | async function toggleBot() {
68 | toggleBotButton.disabled = true;
69 | if (botRunning) {
70 | console.log("disconnecting bot");
71 | await disconnectBot();
72 | } else {
73 | console.log("connecting bot");
74 | await connectBot();
75 | }
76 | toggleBotButton.textContent = botRunning ? "Disconnect" : "Connect";
77 | }
78 |
79 | // Initialize the bot with configuration
80 | async function initBot() {
81 | const urlParams = new URLSearchParams(window.location.search);
82 | const service = urlParams.get("service") || "gemini";
83 | const { transport, service_options } =
84 | service === "gemini" ? initGeminiTransport() : initOpenAITransport();
85 |
86 | // Configure RTVI client options
87 | let RTVIConfig: RTVIClientOptions = {
88 | transport,
89 | params: {
90 | baseUrl: "api",
91 | requestData: { service_options },
92 | },
93 | enableMic: true,
94 | enableCam: false,
95 | timeout: 30 * 1000,
96 | };
97 | RTVIConfig.customConnectHandler = () => Promise.resolve();
98 |
99 | // Create new RTVI client instance
100 | rtviClient = new RTVIClient(RTVIConfig);
101 | llmHelper = new LLMHelper({});
102 | llmHelper.handleFunctionCall(async (fn: FunctionCallParams) => {
103 | return await handleFunctionCall(fn.functionName, fn.arguments);
104 | });
105 | rtviClient.registerHelper(service, llmHelper);
106 |
107 | // Make RTVI client and transport available globally for debugging
108 | (window as any).client = rtviClient;
109 |
110 | // Set up RTVI event handlers and initialize devices
111 | setupEventHandlers(rtviClient);
112 | await setupDevices();
113 | }
114 |
115 | // Initialize the Gemini LLM and its service options
116 | function initGeminiTransport() {
117 | // Configure Gemini LLM service options
118 | const llm_service_options: GeminiLLMServiceOptions = {
119 | api_key: import.meta.env.VITE_DANGEROUS_GEMINI_API_KEY,
120 | model: "models/gemini-2.0-flash-exp",
121 | initial_messages: [
122 | // Set up initial system and user messages.
123 | // Without the user message, the bot will not respond immediately
124 | // and wait for the user to speak first.
125 | {
126 | role: "model",
127 | content: "You are a pencil salesman...",
128 | },
129 | { role: "user", content: "Hello!" },
130 | ],
131 | settings: {
132 | speech_config: {
133 | voice_config: {
134 | prebuilt_voice_config: {
135 | // Options are: "Puck" | "Charon" | "Kore" | "Fenrir" | "Aoede"
136 | voice_name: "Charon",
137 | },
138 | },
139 | },
140 | },
141 | };
142 |
143 | // Initialize transport
144 | let transport: Transport = new GeminiLiveWebsocketTransport(
145 | llm_service_options
146 | );
147 |
148 | return { transport, service_options: llm_service_options };
149 | }
150 |
151 | function initOpenAITransport() {
152 | // Configure OpenAI LLM service options
153 | const llm_service_options: OpenAIServiceOptions = {
154 | api_key: import.meta.env.VITE_DANGEROUS_OPENAI_API_KEY,
155 | settings: {
156 | instructions: "You are a pirate. You are looking for buried treasure.",
157 | voice: "echo",
158 | input_audio_noise_reduction: { type: "near_field" },
159 | turn_detection: { type: "semantic_vad" },
160 | tools: [
161 | {
162 | type: "function",
163 | name: "changeBackgroundColor",
164 | description: "Change the background color of the page",
165 | parameters: {
166 | type: "object",
167 | properties: {
168 | color: {
169 | type: "string",
170 | description: "A hex value of the color",
171 | },
172 | },
173 | },
174 | },
175 | {
176 | type: "function",
177 | name: "getWeather",
178 | description: "Gets the current weather for a given location",
179 | parameters: {
180 | type: "object",
181 | properties: {
182 | location: {
183 | type: "string",
184 | description: "A city or location",
185 | },
186 | },
187 | },
188 | },
189 | ],
190 | },
191 | initial_messages: [{ role: "user", content: "Hello" }],
192 | };
193 |
194 | // Initialize transport
195 | let transport: Transport = new OpenAIRealTimeWebRTCTransport(
196 | llm_service_options
197 | );
198 |
199 | return { transport, service_options: llm_service_options };
200 | }
201 |
202 | // Initialize and update available audio devices
203 | async function setupDevices() {
204 | await rtviClient.initDevices();
205 | const mics = await rtviClient.getAllMics();
206 | updateMicList(mics);
207 | }
208 |
209 | // Updates the microphone selection dropdown
210 | function updateMicList(mics: MediaDeviceInfo[]) {
211 | const micPicker = document.getElementById("mic-picker")!;
212 | micPicker.replaceChildren();
213 | const curMic = rtviClient.selectedMic?.deviceId;
214 | mics.forEach((mic) => {
215 | let el = document.createElement("option");
216 | el.textContent = mic.label;
217 | el.value = mic.deviceId;
218 | micPicker.appendChild(el);
219 | if (mic.deviceId === curMic) {
220 | el.selected = true;
221 | }
222 | });
223 | }
224 |
225 | // Connect client to Gemini Multimodal Live bot
226 | async function connectBot() {
227 | statusDiv.textContent = "Joining...";
228 | try {
229 | await rtviClient.connect();
230 | console.log("READY! Let's GO!");
231 | } catch (e) {
232 | console.error("Error connecting", e);
233 | toggleBotButton.disabled = false;
234 | return;
235 | }
236 | toggleBotButton.disabled = false;
237 | submitBtn.disabled = false;
238 | botRunning = true;
239 | }
240 |
241 | // Disconnect client from Gemini Multimodal Live bot
242 | async function disconnectBot() {
243 | try {
244 | await rtviClient.disconnect();
245 | } catch (e) {
246 | console.error("Error disconnecting", e);
247 | }
248 | toggleBotButton.disabled = false;
249 | submitBtn.disabled = true;
250 | botRunning = false;
251 | }
252 |
253 | // Set up event handlers for RTVI client
254 | // https://docs.pipecat.ai/client/js/api-reference/callbacks#2-event-listeners
255 | export async function setupEventHandlers(rtviClient: RTVIClient) {
256 | audioDiv = document.getElementById("audio") as HTMLDivElement;
257 |
258 | rtviClient.on(RTVIEvent.TransportStateChanged, (state: string) => {
259 | console.log(`-- transport state change: ${state} --`);
260 | statusDiv.textContent = `Transport state: ${state}`;
261 | if (state === "disconnected") {
262 | botRunning = false;
263 | toggleBotButton.textContent = "Connect";
264 | }
265 | });
266 |
267 | rtviClient.on(RTVIEvent.Connected, () => {
268 | console.log("-- user connected --");
269 | });
270 |
271 | rtviClient.on(RTVIEvent.Disconnected, () => {
272 | console.log("-- user disconnected --");
273 | });
274 |
275 | rtviClient.on(RTVIEvent.BotConnected, () => {
276 | console.log("-- bot connected --");
277 | });
278 |
279 | rtviClient.on(RTVIEvent.BotDisconnected, () => {
280 | console.log("--bot disconnected --");
281 | });
282 |
283 | rtviClient.on(RTVIEvent.BotReady, () => {
284 | console.log("-- bot ready to chat! --");
285 | });
286 |
287 | // For realtime v2v transports, this event will only fire for the
288 | // local participant.
289 | rtviClient.on(
290 | RTVIEvent.TrackStarted,
291 | (track: MediaStreamTrack, participant?: Participant) => {
292 | console.log(" --> track started", participant, track);
293 | if (participant?.local) {
294 | return;
295 | }
296 | let audio = document.createElement("audio");
297 | audio.srcObject = new MediaStream([track]);
298 | audio.autoplay = true;
299 | audioDiv.appendChild(audio);
300 | }
301 | );
302 |
303 | // For realtime v2v transports, this event will only fire for the
304 | // local participant.
305 | rtviClient.on(
306 | RTVIEvent.TrackStopped,
307 | (track: MediaStreamTrack, participant?: Participant) => {
308 | console.log(" --> track stopped", participant, track);
309 | }
310 | );
311 |
312 | rtviClient.on(RTVIEvent.UserStartedSpeaking, () => {
313 | console.log("-- user started speaking -- ");
314 | });
315 |
316 | rtviClient.on(RTVIEvent.UserStoppedSpeaking, () => {
317 | console.log("-- user stopped speaking -- ");
318 | });
319 |
320 | rtviClient.on(RTVIEvent.BotStartedSpeaking, () => {
321 | console.log("-- bot started speaking -- ");
322 | });
323 |
324 | rtviClient.on(RTVIEvent.BotStoppedSpeaking, () => {
325 | console.log("-- bot stopped speaking -- ");
326 | });
327 |
328 | // multimodal live does not currently provide transcripts so this will not fire
329 | rtviClient.on(RTVIEvent.UserTranscript, (transcript: TranscriptData) => {
330 | console.log("[EVENT] UserTranscript", transcript);
331 | });
332 |
333 | // multimodal live does not currently provide transcripts so this will not fire
334 | rtviClient.on(RTVIEvent.BotTtsText, (data: BotTTSTextData) => {
335 | console.log("[EVENT] BotTtsText", data);
336 | });
337 |
338 | // multimodal live does not currently provide transcripts so this will not fire
339 | rtviClient.on(RTVIEvent.BotTranscript, (data: BotTTSTextData) => {
340 | console.log("[EVENT] BotTranscript", data);
341 | });
342 |
343 | rtviClient.on(RTVIEvent.Error, (message: RTVIMessage) => {
344 | console.log("[EVENT] RTVI Error!", message);
345 | });
346 |
347 | rtviClient.on(RTVIEvent.MessageError, (message: RTVIMessage) => {
348 | console.log("[EVENT] RTVI ErrorMessage error!", message);
349 | });
350 |
351 | // multimodal live does not currently provide metrics so this will not fire
352 | rtviClient.on(RTVIEvent.Metrics, (data) => {
353 | // let's only print out ttfb for now
354 | if (!data.ttfb) {
355 | return;
356 | }
357 | data.ttfb.map((metric) => {
358 | console.log(`[METRICS] ${metric.processor} ttfb: ${metric.value}`);
359 | });
360 | });
361 |
362 | rtviClient.on(RTVIEvent.MicUpdated, (mic: MediaDeviceInfo) => {
363 | const micPicker = document.getElementById("mic-picker")!;
364 | for (let i = 0; i < micPicker.children.length; i++) {
365 | let el = micPicker.children[i] as HTMLOptionElement;
366 | el.selected = el.value === mic.deviceId;
367 | }
368 | });
369 |
370 | rtviClient.on(RTVIEvent.AvailableMicsUpdated, (mics: MediaDeviceInfo[]) => {
371 | updateMicList(mics);
372 | });
373 |
374 | rtviClient.on(RTVIEvent.LocalAudioLevel, (level: number) => {
375 | updateSpeakerBubble(level, "user");
376 | });
377 | rtviClient.on(RTVIEvent.RemoteAudioLevel, (level: number) => {
378 | updateSpeakerBubble(level, "bot");
379 | });
380 | }
381 |
382 | // Send user message to bot.
383 | function sendUserMessage() {
384 | const textInput = document.getElementById("text-input")! as HTMLInputElement;
385 | llmHelper.appendToMessages({ role: "user", content: textInput.value }, true);
386 | textInput.value = "";
387 | }
388 |
389 | // Update the speaker bubble size based on the audio level
390 | function updateSpeakerBubble(level: number, whom: string) {
391 | const volume = level * 100;
392 | const userBubble = document.getElementById(
393 | whom === "user" ? "user-bubble" : "bot-bubble"
394 | )!;
395 | // Scale the bubble size based on the volume value
396 | const scale = 1 + volume / 50; // Adjust the divisor to control the scaling effect
397 | userBubble.style.transform = `scale(${scale})`;
398 | }
399 |
400 | function _generateRandomWeather() {
401 | const temperature = Math.random() * 200 - 80;
402 | const humidity = Math.random() * 100;
403 | const conditions = ["sunny", "cloudy", "rainy", "snowy"];
404 | const condition = conditions[Math.floor(Math.random() * conditions.length)];
405 | const windSpeed = Math.random() * 50;
406 | const windGusts = windSpeed + Math.random() * 20;
407 | return {
408 | temperature,
409 | humidity,
410 | condition,
411 | windSpeed,
412 | windGusts,
413 | };
414 | }
415 |
416 | async function handleFunctionCall(functionName: string, args: unknown) {
417 | console.log("[EVENT] LLMFunctionCall", functionName);
418 | const toolFunctions: { [key: string]: any } = {
419 | changeBackgroundColor: ({ color }: { [key: string]: string }) => {
420 | console.log("changing background color to", color);
421 | document.body.style.backgroundColor = color;
422 | return { success: true, color };
423 | },
424 | getWeather: async ({ location }: { [key: string]: string }) => {
425 | console.log("getting weather for", location);
426 | const key = import.meta.env.VITE_DANGEROUS_OPENWEATHER_API_KEY;
427 | if (!key) {
428 | const ret = { success: true, weather: _generateRandomWeather() };
429 | console.log("returning weather", ret);
430 | return ret;
431 | }
432 | const locationReq = await fetch(
433 | `http://api.openweathermap.org/geo/1.0/direct?q=${location}&limit=1&appid=${key}`
434 | );
435 | const locJson = await locationReq.json();
436 | const loc = { lat: locJson[0].lat, lon: locJson[0].lon };
437 | const exclude = ["minutely", "hourly", "daily"].join(",");
438 | const weatherRec = await fetch(
439 | `https://api.openweathermap.org/data/3.0/onecall?lat=${loc.lat}&lon=${loc.lon}&exclude=${exclude}&appid=${key}`
440 | );
441 | const weather = await weatherRec.json();
442 | return { success: true, weather: weather.current };
443 | },
444 | };
445 | const toolFunction = toolFunctions[functionName];
446 | if (toolFunction) {
447 | let result = await toolFunction(args);
448 | console.debug("returning result", result);
449 | return result;
450 | }
451 | }
452 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/src/styles.css:
--------------------------------------------------------------------------------
1 | body {
2 | max-width: 800px;
3 | margin: 20px 20px;
4 | font-family: system-ui, -apple-system, sans-serif;
5 | }
6 |
7 | #join-div a {
8 | color: #2563eb;
9 | cursor: pointer;
10 | }
11 |
12 | #connected-div {
13 | width: 90vw;
14 | margin: 20px 2vw;
15 | display: flex;
16 | flex-direction: column;
17 | justify-content: space-between;
18 | }
19 |
20 | #controls {
21 | display: flex;
22 | justify-content: left;
23 | margin: 10px 0;
24 | }
25 |
26 | #controls div {
27 | padding: 5px;
28 | margin: 0 5px;
29 | }
30 |
31 | button {
32 | padding: 5px;
33 | margin: 0 5px;
34 | width: 8em;
35 | border-radius: 10px;
36 | background-color: aliceblue;
37 | }
38 |
39 | button:active {
40 | transform: translateY(1px); /* Move the button down slightly */
41 | box-shadow: 2px 2px #666; /* Add a shadow to create a pressed effect */
42 | }
43 |
44 | #text-div {
45 | display: flex;
46 | flex-direction: column;
47 | margin: 10px 0;
48 | }
49 |
50 | #text-div label {
51 | margin: 5px 0;
52 | }
53 |
54 | #text-div textarea {
55 | margin: 5px 0;
56 | padding: 5px;
57 | width: 50%;
58 | border-radius: 10px;
59 | }
60 |
61 | #bubbles {
62 | margin: 20px 0px;
63 | border-radius: 16px;
64 | display: flex;
65 | flex-direction: row;
66 | }
67 |
68 | .bubble {
69 | width: 50px;
70 | height: 50px;
71 | border-radius: 50%;
72 | transition: transform 0.1s ease;
73 | margin: 15px;
74 | }
75 |
76 | #user-bubble {
77 | background-color: #4caf50;
78 | }
79 |
80 | #bot-bubble {
81 | background-color: #2196f3;
82 | }
83 |
84 | #content h2 {
85 | padding-left: 20px;
86 | }
87 |
88 | #chat-text {
89 | display: flex;
90 | flex-direction: column;
91 | align-items: left;
92 | overflow-y: auto;
93 | padding: 20px;
94 | flex: 1;
95 | }
96 |
97 | .user-message {
98 | display: flex;
99 | flex-direction: column;
100 | background-color: #f0f0f0;
101 | padding: 16px;
102 | margin: 12px 6px;
103 | border-radius: 8px;
104 | line-height: 1.5;
105 | }
106 |
107 | .user-message .interim {
108 | color: #707070;
109 | }
110 |
111 | .user-message::before {
112 | content: "User\A";
113 | font-size: 14px;
114 | color: #666;
115 | font-weight: 500;
116 | display: block;
117 | padding-bottom: 0.4em;
118 | }
119 |
120 | .assistant-message {
121 | display: flex;
122 | flex-direction: column;
123 | background-color: #ffffff;
124 | border: 1px solid #e0e0e0;
125 | margin: 12px 6px;
126 | padding: 16px;
127 | border-radius: 8px;
128 | line-height: 1.5;
129 | }
130 |
131 | .assistant-message::before {
132 | content: "Assistant";
133 | font-size: 14px;
134 | color: #666;
135 | font-weight: 500;
136 | display: block;
137 | padding-bottom: 0.4em;
138 | }
139 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/src/vite-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="vite/client" />
2 |
--------------------------------------------------------------------------------
/examples/directToLLMTransports/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "ESNext",
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "skipLibCheck": true,
7 | "jsx": "preserve",
8 |
9 | /* Bundler mode */
10 | "moduleResolution": "bundler",
11 | "allowImportingTsExtensions": true,
12 | "allowJs": true,
13 | "noEmit": true,
14 | "resolveJsonModule": true,
15 | "isolatedModules": true,
16 | "moduleDetection": "force",
17 |
18 | /* Linting */
19 | "strict": true,
20 | "noUnusedLocals": true,
21 | "noUnusedParameters": false,
22 | "noFallthroughCasesInSwitch": true
23 | },
24 | "include": ["src", "lib"]
25 | }
26 |
--------------------------------------------------------------------------------
/lib/media-mgmt/dailyMediaManager.ts:
--------------------------------------------------------------------------------
1 | import { MediaManager } from "./mediaManager";
2 | import { MediaStreamRecorder, WavStreamPlayer } from "../wavtools";
3 |
4 | import Daily, {
5 | DailyCall,
6 | DailyEventObjectAvailableDevicesUpdated,
7 | DailyEventObjectLocalAudioLevel,
8 | DailyEventObjectSelectedDevicesUpdated,
9 | DailyEventObjectTrack,
10 | DailyParticipant,
11 | DailyParticipantsObject,
12 | } from "@daily-co/daily-js";
13 | import { Participant, Tracks } from "@pipecat-ai/client-js";
14 |
15 | export class DailyMediaManager extends MediaManager {
16 | private _daily: DailyCall;
17 | private _mediaStreamRecorder: MediaStreamRecorder | undefined;
18 | private _wavStreamPlayer: WavStreamPlayer | undefined;
19 |
20 | private _initialized: boolean;
21 | private _connected: boolean;
22 | private _connectResolve: ((value: void | PromiseLike<void>) => void) | null;
23 |
24 | private _currentAudioTrack: MediaStreamTrack | null;
25 | private _selectedCam: MediaDeviceInfo | Record<string, never> = {};
26 | private _selectedMic: MediaDeviceInfo | Record<string, never> = {};
27 | private _selectedSpeaker: MediaDeviceInfo | Record<string, never> = {};
28 |
29 | private _remoteAudioLevelInterval: NodeJS.Timeout | null = null;
30 |
31 | private onTrackStartedCallback?: (event: DailyEventObjectTrack) => void;
32 | private onTrackStoppedCallback?: (event: DailyEventObjectTrack) => void;
33 |
34 | private _recorderChunkSize: number | undefined = undefined;
35 |
36 | constructor(
37 | enablePlayer: boolean = true,
38 | enableRecording: boolean = true,
39 | onTrackStartedCallback?: (event: DailyEventObjectTrack) => void,
40 | onTrackStoppedCallback?: (event: DailyEventObjectTrack) => void,
41 | recorderChunkSize: number | undefined = undefined,
42 | recorderSampleRate: number = 24000,
43 | ) {
44 | super();
45 | this._initialized = false;
46 | this._connected = false;
47 | this._currentAudioTrack = null;
48 | this._connectResolve = null;
49 | this.onTrackStartedCallback = onTrackStartedCallback;
50 | this.onTrackStoppedCallback = onTrackStoppedCallback;
51 | this._recorderChunkSize = recorderChunkSize;
52 |
53 | this._daily = Daily.getCallInstance() ?? Daily.createCallObject();
54 |
55 | if (enableRecording) {
56 | this._mediaStreamRecorder = new MediaStreamRecorder({
57 | sampleRate: recorderSampleRate,
58 | });
59 | }
60 | if (enablePlayer) {
61 | this._wavStreamPlayer = new WavStreamPlayer({ sampleRate: 24000 });
62 | }
63 |
64 | this._daily.on("track-started", this.handleTrackStarted.bind(this));
65 | this._daily.on("track-stopped", this.handleTrackStopped.bind(this));
66 | this._daily.on(
67 | "available-devices-updated",
68 | this._handleAvailableDevicesUpdated.bind(this),
69 | );
70 | this._daily.on(
71 | "selected-devices-updated",
72 | this._handleSelectedDevicesUpdated.bind(this),
73 | );
74 | this._daily.on("local-audio-level", this._handleLocalAudioLevel.bind(this));
75 | }
76 |
77 | async initialize(): Promise<void> {
78 | if (this._initialized) {
79 | console.warn("DailyMediaManager already initialized");
80 | return;
81 | }
82 | const infos = await this._daily.startCamera({
83 | startVideoOff: !this._camEnabled,
84 | startAudioOff: !this._micEnabled,
85 | });
86 | const { devices } = await this._daily.enumerateDevices();
87 | const cams = devices.filter((d) => d.kind === "videoinput");
88 | const mics = devices.filter((d) => d.kind === "audioinput");
89 | const speakers = devices.filter((d) => d.kind === "audiooutput");
90 | this._callbacks.onAvailableCamsUpdated?.(cams);
91 | this._callbacks.onAvailableMicsUpdated?.(mics);
92 | this._callbacks.onAvailableSpeakersUpdated?.(speakers);
93 | this._selectedCam = infos.camera;
94 | this._callbacks.onCamUpdated?.(infos.camera as MediaDeviceInfo);
95 | this._selectedMic = infos.mic;
96 | this._callbacks.onMicUpdated?.(infos.mic as MediaDeviceInfo);
97 | this._selectedSpeaker = infos.speaker;
98 | this._callbacks.onSpeakerUpdated?.(infos.speaker as MediaDeviceInfo);
99 |
100 | // Instantiate audio observers
101 | if (!this._daily.isLocalAudioLevelObserverRunning())
102 | await this._daily.startLocalAudioLevelObserver(100);
103 |
104 | if (this._wavStreamPlayer) {
105 | await this._wavStreamPlayer.connect();
106 | if (!this._remoteAudioLevelInterval) {
107 | this._remoteAudioLevelInterval = setInterval(() => {
108 | const frequencies = this._wavStreamPlayer!.getFrequencies();
109 | let aveVal = 0;
110 | if (frequencies.values?.length) {
111 | aveVal =
112 | frequencies.values.reduce((a, c) => a + c, 0) /
113 | frequencies.values.length;
114 | }
115 | this._handleRemoteAudioLevel(aveVal);
116 | }, 100);
117 | }
118 | }
119 | this._initialized = true;
120 | }
121 |
122 | async connect(): Promise<void> {
123 | if (this._connected) {
124 | console.warn("DailyMediaManager already connected");
125 | return;
126 | }
127 | this._connected = true;
128 | if (!this._initialized) {
129 | return new Promise((resolve) => {
130 | (async () => {
131 | this._connectResolve = resolve;
132 | await this.initialize();
133 | })();
134 | });
135 | }
136 | if (this._micEnabled) {
137 | this._startRecording();
138 | }
139 | }
140 |
141 | async disconnect(): Promise<void> {
142 | if (this._remoteAudioLevelInterval) {
143 | clearInterval(this._remoteAudioLevelInterval);
144 | }
145 | this._remoteAudioLevelInterval = null;
146 | this._daily.leave();
147 | this._currentAudioTrack = null;
148 | await this._mediaStreamRecorder?.end();
149 | this._wavStreamPlayer?.interrupt();
150 | this._initialized = false;
151 | this._connected = false;
152 | }
153 |
154 | async userStartedSpeaking(): Promise<unknown> {
155 | return this._wavStreamPlayer?.interrupt();
156 | }
157 |
158 | bufferBotAudio(
159 | data: ArrayBuffer | Int16Array,
160 | id?: string,
161 | ): Int16Array | undefined {
162 | return this._wavStreamPlayer?.add16BitPCM(data, id);
163 | }
164 |
165 | async getAllMics(): Promise<MediaDeviceInfo[]> {
166 | let devices = (await this._daily.enumerateDevices()).devices;
167 | return devices.filter((device) => device.kind === "audioinput");
168 | }
169 | async getAllCams(): Promise<MediaDeviceInfo[]> {
170 | let devices = (await this._daily.enumerateDevices()).devices;
171 | return devices.filter((device) => device.kind === "videoinput");
172 | }
173 | async getAllSpeakers(): Promise<MediaDeviceInfo[]> {
174 | let devices = (await this._daily.enumerateDevices()).devices;
175 | return devices.filter((device) => device.kind === "audiooutput");
176 | }
177 |
178 | updateMic(micId: string) {
179 | this._daily
180 | .setInputDevicesAsync({ audioDeviceId: micId })
181 | .then((deviceInfo) => {
182 | this._selectedMic = deviceInfo.mic;
183 | });
184 | }
185 | updateCam(camId: string) {
186 | this._daily
187 | .setInputDevicesAsync({ videoDeviceId: camId })
188 | .then((deviceInfo) => {
189 | this._selectedCam = deviceInfo.camera;
190 | });
191 | }
192 | async updateSpeaker(speakerId: string): Promise<void> {
193 | if (speakerId !== "default" && this._selectedSpeaker.deviceId === speakerId)
194 | return;
195 | let sID = speakerId;
196 | if (sID === "default") {
197 | const speakers = await this.getAllSpeakers();
198 | const defaultSpeaker = speakers.find((s) => s.deviceId === "default");
199 | if (!defaultSpeaker) {
200 | console.warn("No default speaker found");
201 | return;
202 | }
203 | speakers.splice(speakers.indexOf(defaultSpeaker), 1);
204 | const defaultSpeakerCp = speakers.find((s) =>
205 | defaultSpeaker.label.includes(s.label),
206 | );
207 | sID = defaultSpeakerCp?.deviceId ?? speakerId;
208 | }
209 | this._wavStreamPlayer?.updateSpeaker(sID).then(() => {
210 | this._selectedSpeaker = { deviceId: speakerId } as MediaDeviceInfo;
211 | this._callbacks.onSpeakerUpdated?.(this._selectedSpeaker);
212 | });
213 | }
214 |
215 | get selectedMic(): MediaDeviceInfo | Record<string, never> {
216 | return this._selectedMic;
217 | }
218 | get selectedCam(): MediaDeviceInfo | Record<string, never> {
219 | return this._selectedCam;
220 | }
221 | get selectedSpeaker(): MediaDeviceInfo | Record<string, never> {
222 | return this._selectedSpeaker;
223 | }
224 |
225 | async enableMic(enable: boolean): Promise<void> {
226 | this._micEnabled = enable;
227 | if (!this._daily.participants()?.local) return;
228 | this._daily.setLocalAudio(enable);
229 | if (this._mediaStreamRecorder) {
230 | if (enable) {
231 | if (this._mediaStreamRecorder.getStatus() === "paused") {
232 | this._startRecording();
233 | } // else, we'll record on the track-started event
234 | } else {
235 | if (this._mediaStreamRecorder.getStatus() === "recording") {
236 | this._mediaStreamRecorder.pause();
237 | }
238 | }
239 | }
240 | }
241 | enableCam(enable: boolean): void {
242 | this._camEnabled = enable;
243 | this._daily.setLocalVideo(enable);
244 | }
245 |
246 | get isCamEnabled(): boolean {
247 | return this._daily.localVideo();
248 | }
249 | get isMicEnabled(): boolean {
250 | return this._daily.localAudio();
251 | }
252 |
253 | tracks(): Tracks {
254 | const participants: DailyParticipantsObject = this._daily.participants();
255 | return {
256 | local: {
257 | audio: participants?.local?.tracks?.audio?.persistentTrack,
258 | video: participants?.local?.tracks?.video?.persistentTrack,
259 | },
260 | };
261 | }
262 |
263 | private _startRecording(): void {
264 | if (!this._connected || !this._mediaStreamRecorder) return;
265 | try {
266 | this._mediaStreamRecorder.record((data) => {
267 | this._userAudioCallback(data.mono);
268 | }, this._recorderChunkSize);
269 | } catch (e) {
270 | const err = e as Error;
271 | if (!err.message.includes("Already recording")) {
272 | console.error("Error starting recording", e);
273 | }
274 | }
275 | }
276 |
277 | private _handleAvailableDevicesUpdated(
278 | event: DailyEventObjectAvailableDevicesUpdated,
279 | ) {
280 | this._callbacks.onAvailableCamsUpdated?.(
281 | event.availableDevices.filter((d) => d.kind === "videoinput"),
282 | );
283 | this._callbacks.onAvailableMicsUpdated?.(
284 | event.availableDevices.filter((d) => d.kind === "audioinput"),
285 | );
286 | this._callbacks.onAvailableSpeakersUpdated?.(
287 | event.availableDevices.filter((d) => d.kind === "audiooutput"),
288 | );
289 | if (this._selectedSpeaker.deviceId === "default") {
290 | this.updateSpeaker("default");
291 | }
292 | }
293 |
294 | private _handleSelectedDevicesUpdated(
295 | event: DailyEventObjectSelectedDevicesUpdated,
296 | ) {
297 | if (this._selectedCam?.deviceId !== event.devices.camera) {
298 | this._selectedCam = event.devices.camera;
299 | this._callbacks.onCamUpdated?.(event.devices.camera as MediaDeviceInfo);
300 | }
301 | if (this._selectedMic?.deviceId !== event.devices.mic) {
302 | this._selectedMic = event.devices.mic;
303 | this._callbacks.onMicUpdated?.(event.devices.mic as MediaDeviceInfo);
304 | }
305 | }
306 |
307 | private _handleLocalAudioLevel(ev: DailyEventObjectLocalAudioLevel) {
308 | this._callbacks.onLocalAudioLevel?.(ev.audioLevel);
309 | }
310 |
311 | private _handleRemoteAudioLevel(audioLevel: number) {
312 | this._callbacks.onRemoteAudioLevel?.(audioLevel, botParticipant());
313 | }
314 |
315 | protected async handleTrackStarted(event: DailyEventObjectTrack) {
316 | if (!event.participant?.local) return;
317 | if (event.track.kind === "audio") {
318 | if (this._mediaStreamRecorder) {
319 | const status = this._mediaStreamRecorder.getStatus();
320 | switch (status) {
321 | case "ended":
322 | await this._mediaStreamRecorder.begin(event.track);
323 | if (this._connected) {
324 | this._startRecording();
325 | if (this._connectResolve) {
326 | this._connectResolve();
327 | this._connectResolve = null;
328 | }
329 | }
330 | break;
331 | case "paused":
332 | this._startRecording();
333 | break;
334 | case "recording":
335 | default:
336 | if (this._currentAudioTrack !== event.track) {
337 | await this._mediaStreamRecorder.end();
338 | await this._mediaStreamRecorder.begin(event.track);
339 | this._startRecording();
340 | } else {
341 | console.warn(
342 | "track-started event received for current track and already recording",
343 | );
344 | }
345 | break;
346 | }
347 | }
348 | this._currentAudioTrack = event.track;
349 | }
350 | this._callbacks.onTrackStarted?.(
351 | event.track,
352 | event.participant
353 | ? dailyParticipantToParticipant(event.participant)
354 | : undefined,
355 | );
356 | this.onTrackStartedCallback?.(event);
357 | }
358 |
359 | protected handleTrackStopped(event: DailyEventObjectTrack) {
360 | if (!event.participant?.local) return;
361 | if (event.track.kind === "audio") {
362 | if (
363 | this._mediaStreamRecorder &&
364 | this._mediaStreamRecorder.getStatus() === "recording"
365 | ) {
366 | this._mediaStreamRecorder.pause();
367 | }
368 | }
369 | this._callbacks.onTrackStopped?.(
370 | event.track,
371 | event.participant
372 | ? dailyParticipantToParticipant(event.participant)
373 | : undefined,
374 | );
375 | this.onTrackStoppedCallback?.(event);
376 | }
377 | }
378 |
379 | const dailyParticipantToParticipant = (p: DailyParticipant): Participant => ({
380 | id: p.user_id,
381 | local: p.local,
382 | name: p.user_name,
383 | });
384 |
385 | const botParticipant = () => ({
386 | id: "bot",
387 | local: false,
388 | name: "Bot",
389 | });
390 |
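
Note: the `updateSpeaker()` path above resolves the browser's synthetic `"default"` output device to a concrete device by label matching. A minimal standalone sketch of that resolution step (the helper name and usage are illustrative, not part of this transport):

```ts
// Hypothetical helper mirroring the "default"-speaker resolution above. Browsers
// expose a synthetic "default" audiooutput entry whose label usually embeds the
// concrete device's label (e.g. "Default - MacBook Pro Speakers"), so we look for
// the concrete entry whose label is contained in the default entry's label.
function resolveDefaultSpeaker(speakers: MediaDeviceInfo[]): string | undefined {
  const defaultSpeaker = speakers.find((s) => s.deviceId === "default");
  if (!defaultSpeaker) return undefined;
  const concrete = speakers.find(
    (s) => s.deviceId !== "default" && defaultSpeaker.label.includes(s.label),
  );
  return concrete?.deviceId;
}

// Illustrative usage (assumes device labels are available, i.e. permission granted):
// const speakers = (await navigator.mediaDevices.enumerateDevices())
//   .filter((d) => d.kind === "audiooutput");
// const sinkId = resolveDefaultSpeaker(speakers) ?? "default";
```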
--------------------------------------------------------------------------------
/lib/media-mgmt/mediaManager.ts:
--------------------------------------------------------------------------------
1 | import { WavRecorder, WavStreamPlayer } from "../wavtools";
2 |
3 | import {
4 | RTVIClientOptions,
5 | RTVIEventCallbacks,
6 | Tracks,
7 | } from "@pipecat-ai/client-js";
8 |
9 | export abstract class MediaManager {
10 | declare protected _userAudioCallback: (data: ArrayBuffer) => void;
11 | declare protected _options: RTVIClientOptions;
12 | protected _callbacks: RTVIEventCallbacks = {};
13 |
14 | protected _micEnabled: boolean;
15 | protected _camEnabled: boolean;
16 |
17 | constructor() {
18 | this._micEnabled = true;
19 | this._camEnabled = false;
20 | }
21 |
22 | setUserAudioCallback(userAudioCallback: (data: ArrayBuffer) => void) {
23 | this._userAudioCallback = userAudioCallback;
24 | }
25 | setRTVIOptions(options: RTVIClientOptions, override: boolean = false) {
26 | if (this._options && !override) return;
27 | this._options = options;
28 | this._callbacks = options.callbacks ?? {};
29 | this._micEnabled = options.enableMic ?? true;
30 | this._camEnabled = options.enableCam ?? false;
31 | }
32 |
33 |   abstract initialize(): Promise<void>;
34 |   abstract connect(): Promise<void>;
35 |   abstract disconnect(): Promise<void>;
36 |
37 |   abstract userStartedSpeaking(): Promise<unknown>;
38 | abstract bufferBotAudio(
39 | data: ArrayBuffer | Int16Array,
40 | id?: string,
41 | ): Int16Array | undefined;
42 |
43 |   abstract getAllMics(): Promise<MediaDeviceInfo[]>;
44 |   abstract getAllCams(): Promise<MediaDeviceInfo[]>;
45 |   abstract getAllSpeakers(): Promise<MediaDeviceInfo[]>;
46 |
47 | abstract updateMic(micId: string): void;
48 | abstract updateCam(camId: string): void;
49 | abstract updateSpeaker(speakerId: string): void;
50 |
51 |   abstract get selectedMic(): MediaDeviceInfo | Record<string, never>;
52 |   abstract get selectedCam(): MediaDeviceInfo | Record<string, never>;
53 |   abstract get selectedSpeaker(): MediaDeviceInfo | Record<string, never>;
54 |
55 | abstract enableMic(enable: boolean): void;
56 | abstract enableCam(enable: boolean): void;
57 |
58 | abstract get isCamEnabled(): boolean;
59 | abstract get isMicEnabled(): boolean;
60 |
61 | abstract tracks(): Tracks;
62 | }
63 |
64 | export class WavMediaManager extends MediaManager {
65 | private _wavRecorder;
66 | private _wavStreamPlayer;
67 |
68 | private _initialized = false;
69 | private _recorderChunkSize: number | undefined = undefined;
70 |
71 | constructor(
72 | recorderChunkSize: number | undefined = undefined,
73 | recorderSampleRate: number | undefined = 24000,
74 | ) {
75 | super();
76 | this._recorderChunkSize = recorderChunkSize;
77 | this._wavRecorder = new WavRecorder({ sampleRate: recorderSampleRate });
78 | this._wavStreamPlayer = new WavStreamPlayer({ sampleRate: 24000 });
79 | }
80 |
81 |   async initialize(): Promise<void> {
82 | await this._wavRecorder.begin();
83 | this._wavRecorder.listenForDeviceChange(null);
84 | this._wavRecorder.listenForDeviceChange(
85 | this._handleAvailableDevicesUpdated.bind(this),
86 | );
87 | await this._wavStreamPlayer.connect();
88 | this._initialized = true;
89 | }
90 |
91 |   async connect(): Promise<void> {
92 | if (!this._initialized) {
93 | await this.initialize();
94 | }
95 | const isAlreadyRecording = this._wavRecorder.getStatus() == "recording";
96 | if (this._micEnabled && !isAlreadyRecording) {
97 | await this._startRecording();
98 | }
99 | }
100 |
101 |   async disconnect(): Promise<void> {
102 | if (!this._initialized) {
103 | return;
104 | }
105 | await this._wavRecorder.end();
106 | await this._wavStreamPlayer.interrupt();
107 | this._initialized = false;
108 | }
109 |
110 |   async userStartedSpeaking(): Promise<unknown> {
111 | return this._wavStreamPlayer.interrupt();
112 | }
113 |
114 | bufferBotAudio(data: ArrayBuffer | Int16Array, id?: string): Int16Array {
115 | return this._wavStreamPlayer.add16BitPCM(data, id);
116 | }
117 |
118 |   getAllMics(): Promise<MediaDeviceInfo[]> {
119 | return this._wavRecorder.listDevices();
120 | }
121 |   getAllCams(): Promise<MediaDeviceInfo[]> {
122 | // TODO: Video not supported yet
123 | return Promise.resolve([]);
124 | }
125 |   getAllSpeakers(): Promise<MediaDeviceInfo[]> {
126 | // TODO: Implement speaker support
127 | return Promise.resolve([]);
128 | }
129 |
130 |   async updateMic(micId: string): Promise<void> {
131 | const prevMic = this._wavRecorder.deviceSelection;
132 | await this._wavRecorder.end();
133 | await this._wavRecorder.begin(micId);
134 | if (this._micEnabled) {
135 | await this._startRecording();
136 | }
137 | const curMic = this._wavRecorder.deviceSelection;
138 | if (curMic && prevMic && prevMic.label !== curMic.label) {
139 | this._callbacks.onMicUpdated?.(curMic);
140 | }
141 | }
142 |
143 | updateCam(camId: string): void {
144 | // TODO: Video not supported yet
145 | }
146 | updateSpeaker(speakerId: string): void {
147 | // TODO: Implement speaker support
148 | }
149 |
150 |   get selectedMic(): MediaDeviceInfo | Record<string, never> {
151 | return this._wavRecorder.deviceSelection ?? {};
152 | }
153 |   get selectedCam(): MediaDeviceInfo | Record<string, never> {
154 | // TODO: Video not supported yet
155 | return {};
156 | }
157 |   get selectedSpeaker(): MediaDeviceInfo | Record<string, never> {
158 | // TODO: Implement speaker support
159 | return {};
160 | }
161 |
162 |   async enableMic(enable: boolean): Promise<void> {
163 | this._micEnabled = enable;
164 | if (!this._wavRecorder.stream) return;
165 | this._wavRecorder.stream
166 | .getAudioTracks()
167 | .forEach((track: MediaStreamTrack) => {
168 | track.enabled = enable;
169 | if (!enable) {
170 | this._callbacks.onTrackStopped?.(track, localParticipant());
171 | }
172 | });
173 | if (enable) {
174 | await this._startRecording();
175 | } else {
176 | await this._wavRecorder.pause();
177 | }
178 | }
179 | enableCam(enable: boolean): void {
180 | // TODO: Video not supported yet
181 | }
182 |
183 | get isCamEnabled(): boolean {
184 | // TODO: Video not supported yet
185 | return false;
186 | }
187 | get isMicEnabled(): boolean {
188 | return this._micEnabled;
189 | }
190 |
191 | tracks(): Tracks {
192 |     const track = this._wavRecorder.stream?.getTracks()[0];
193 |     return { local: track ? { audio: track } : {} };
194 | }
195 |
196 | private async _startRecording() {
197 | await this._wavRecorder.record((data) => {
198 | this._userAudioCallback(data.mono);
199 | }, this._recorderChunkSize);
200 | const track = this._wavRecorder.stream?.getAudioTracks()[0];
201 | if (track) {
202 | this._callbacks.onTrackStarted?.(track, localParticipant());
203 | }
204 | }
205 |
206 | private _handleAvailableDevicesUpdated(devices: MediaDeviceInfo[]) {
207 | this._callbacks.onAvailableCamsUpdated?.(
208 | devices.filter((d) => d.kind === "videoinput"),
209 | );
210 | this._callbacks.onAvailableMicsUpdated?.(
211 | devices.filter((d) => d.kind === "audioinput"),
212 | );
213 | // if the current device went away or we're using the default and
214 | // the default changed, reset the mic.
215 | const defaultDevice = devices.find((d) => d.deviceId === "default");
216 | const currentDevice = this._wavRecorder.deviceSelection;
217 | if (
218 | currentDevice &&
219 | (!devices.some((d) => d.deviceId === currentDevice.deviceId) ||
220 | (currentDevice.deviceId === "default" &&
221 | currentDevice.label !== defaultDevice?.label))
222 | ) {
223 | this.updateMic("");
224 | }
225 | }
226 | }
227 |
228 | const localParticipant = () => {
229 | return {
230 | id: "local",
231 | name: "",
232 | local: true,
233 | };
234 | };
235 |
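
A minimal usage sketch of `WavMediaManager` on its own (outside a transport), using only the methods defined above; the callback body, track id, and import path are illustrative:

```ts
import { WavMediaManager } from "./lib/media-mgmt/mediaManager";

async function demo(): Promise<void> {
  // Default chunk size, 24 kHz recorder sample rate (the constructor defaults above).
  const media = new WavMediaManager(undefined, 24000);

  // Receive recorded PCM16 chunks; a transport would forward these to the server/LLM.
  media.setUserAudioCallback((data: ArrayBuffer) => {
    console.log(`got ${data.byteLength} bytes of user audio`);
  });

  await media.initialize(); // opens the mic recorder and the output player
  await media.connect();    // starts recording if the mic is enabled

  // Queue bot audio for playback (PCM16 at 24 kHz); here, one second of silence.
  media.bufferBotAudio(new Int16Array(24000), "bot-track");

  await media.disconnect();
}
```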
--------------------------------------------------------------------------------
/lib/wavtools/index.js:
--------------------------------------------------------------------------------
1 | import { WavPacker } from './lib/wav_packer.js';
2 | import { AudioAnalysis } from './lib/analysis/audio_analysis.js';
3 | import { WavStreamPlayer } from './lib/wav_stream_player.js';
4 | import { WavRecorder } from './lib/wav_recorder.js';
5 | import { MediaStreamRecorder } from './lib/mediastream_recorder.js';
6 |
7 | export {
8 | AudioAnalysis,
9 | MediaStreamRecorder,
10 | WavPacker,
11 | WavStreamPlayer,
12 | WavRecorder,
13 | };
14 |
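
The entry point above simply re-exports the wavtools classes, so consumers import from `lib/wavtools` rather than from the individual files, for example:

```ts
import { WavRecorder, WavStreamPlayer } from "./lib/wavtools";

const recorder = new WavRecorder({ sampleRate: 24000 });
const player = new WavStreamPlayer({ sampleRate: 24000 });
```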
--------------------------------------------------------------------------------
/lib/wavtools/lib/analysis/audio_analysis.js:
--------------------------------------------------------------------------------
1 | import {
2 | noteFrequencies,
3 | noteFrequencyLabels,
4 | voiceFrequencies,
5 | voiceFrequencyLabels,
6 | } from './constants.js';
7 |
8 | /**
9 | * Output of AudioAnalysis for the frequency domain of the audio
10 | * @typedef {Object} AudioAnalysisOutputType
11 | * @property {Float32Array} values Amplitude of this frequency between {0, 1} inclusive
12 | * @property {number[]} frequencies Raw frequency bucket values
13 | * @property {string[]} labels Labels for the frequency bucket values
14 | */
15 |
16 | /**
17 | * Analyzes audio for visual output
18 | * @class
19 | */
20 | export class AudioAnalysis {
21 | /**
22 | * Retrieves frequency domain data from an AnalyserNode adjusted to a decibel range
23 | * returns human-readable formatting and labels
24 | * @param {AnalyserNode} analyser
25 | * @param {number} sampleRate
26 | * @param {Float32Array} [fftResult]
27 | * @param {"frequency"|"music"|"voice"} [analysisType]
28 | * @param {number} [minDecibels] default -100
29 | * @param {number} [maxDecibels] default -30
30 | * @returns {AudioAnalysisOutputType}
31 | */
32 | static getFrequencies(
33 | analyser,
34 | sampleRate,
35 | fftResult,
36 | analysisType = 'frequency',
37 | minDecibels = -100,
38 | maxDecibels = -30,
39 | ) {
40 | if (!fftResult) {
41 | fftResult = new Float32Array(analyser.frequencyBinCount);
42 | analyser.getFloatFrequencyData(fftResult);
43 | }
44 | const nyquistFrequency = sampleRate / 2;
45 | const frequencyStep = (1 / fftResult.length) * nyquistFrequency;
46 | let outputValues;
47 | let frequencies;
48 | let labels;
49 | if (analysisType === 'music' || analysisType === 'voice') {
50 | const useFrequencies =
51 | analysisType === 'voice' ? voiceFrequencies : noteFrequencies;
52 | const aggregateOutput = Array(useFrequencies.length).fill(minDecibels);
53 | for (let i = 0; i < fftResult.length; i++) {
54 | const frequency = i * frequencyStep;
55 | const amplitude = fftResult[i];
56 | for (let n = useFrequencies.length - 1; n >= 0; n--) {
57 | if (frequency > useFrequencies[n]) {
58 | aggregateOutput[n] = Math.max(aggregateOutput[n], amplitude);
59 | break;
60 | }
61 | }
62 | }
63 | outputValues = aggregateOutput;
64 | frequencies =
65 | analysisType === 'voice' ? voiceFrequencies : noteFrequencies;
66 | labels =
67 | analysisType === 'voice' ? voiceFrequencyLabels : noteFrequencyLabels;
68 | } else {
69 | outputValues = Array.from(fftResult);
70 | frequencies = outputValues.map((_, i) => frequencyStep * i);
71 | labels = frequencies.map((f) => `${f.toFixed(2)} Hz`);
72 | }
73 | // We normalize to {0, 1}
74 | const normalizedOutput = outputValues.map((v) => {
75 | return Math.max(
76 | 0,
77 | Math.min((v - minDecibels) / (maxDecibels - minDecibels), 1),
78 | );
79 | });
80 | const values = new Float32Array(normalizedOutput);
81 | return {
82 | values,
83 | frequencies,
84 | labels,
85 | };
86 | }
87 |
88 | /**
89 | * Creates a new AudioAnalysis instance for an HTMLAudioElement
90 | * @param {HTMLAudioElement} audioElement
91 | * @param {AudioBuffer|null} [audioBuffer] If provided, will cache all frequency domain data from the buffer
92 | * @returns {AudioAnalysis}
93 | */
94 | constructor(audioElement, audioBuffer = null) {
95 | this.fftResults = [];
96 | if (audioBuffer) {
97 | /**
98 | * Modified from
99 | * https://stackoverflow.com/questions/75063715/using-the-web-audio-api-to-analyze-a-song-without-playing
100 | *
101 | * We do this to populate FFT values for the audio if provided an `audioBuffer`
102 | * The reason to do this is that Safari fails when using `createMediaElementSource`
103 | * This has a non-zero RAM cost so we only opt-in to run it on Safari, Chrome is better
104 | */
105 | const { length, sampleRate } = audioBuffer;
106 | const offlineAudioContext = new OfflineAudioContext({
107 | length,
108 | sampleRate,
109 | });
110 | const source = offlineAudioContext.createBufferSource();
111 | source.buffer = audioBuffer;
112 | const analyser = offlineAudioContext.createAnalyser();
113 | analyser.fftSize = 8192;
114 | analyser.smoothingTimeConstant = 0.1;
115 | source.connect(analyser);
116 | // limit is :: 128 / sampleRate;
117 | // but we just want 60fps - cuts ~1s from 6MB to 1MB of RAM
118 | const renderQuantumInSeconds = 1 / 60;
119 | const durationInSeconds = length / sampleRate;
120 | const analyze = (index) => {
121 | const suspendTime = renderQuantumInSeconds * index;
122 | if (suspendTime < durationInSeconds) {
123 | offlineAudioContext.suspend(suspendTime).then(() => {
124 | const fftResult = new Float32Array(analyser.frequencyBinCount);
125 | analyser.getFloatFrequencyData(fftResult);
126 | this.fftResults.push(fftResult);
127 | analyze(index + 1);
128 | });
129 | }
130 | if (index === 1) {
131 | offlineAudioContext.startRendering();
132 | } else {
133 | offlineAudioContext.resume();
134 | }
135 | };
136 | source.start(0);
137 | analyze(1);
138 | this.audio = audioElement;
139 | this.context = offlineAudioContext;
140 | this.analyser = analyser;
141 | this.sampleRate = sampleRate;
142 | this.audioBuffer = audioBuffer;
143 | } else {
144 | const audioContext = new AudioContext();
145 | const track = audioContext.createMediaElementSource(audioElement);
146 | const analyser = audioContext.createAnalyser();
147 | analyser.fftSize = 8192;
148 | analyser.smoothingTimeConstant = 0.1;
149 | track.connect(analyser);
150 | analyser.connect(audioContext.destination);
151 | this.audio = audioElement;
152 | this.context = audioContext;
153 | this.analyser = analyser;
154 | this.sampleRate = this.context.sampleRate;
155 | this.audioBuffer = null;
156 | }
157 | }
158 |
159 | /**
160 | * Gets the current frequency domain data from the playing audio track
161 | * @param {"frequency"|"music"|"voice"} [analysisType]
162 | * @param {number} [minDecibels] default -100
163 | * @param {number} [maxDecibels] default -30
164 | * @returns {AudioAnalysisOutputType}
165 | */
166 | getFrequencies(
167 | analysisType = 'frequency',
168 | minDecibels = -100,
169 | maxDecibels = -30,
170 | ) {
171 | let fftResult = null;
172 | if (this.audioBuffer && this.fftResults.length) {
173 | const pct = this.audio.currentTime / this.audio.duration;
174 | const index = Math.min(
175 | (pct * this.fftResults.length) | 0,
176 | this.fftResults.length - 1,
177 | );
178 | fftResult = this.fftResults[index];
179 | }
180 | return AudioAnalysis.getFrequencies(
181 | this.analyser,
182 | this.sampleRate,
183 | fftResult,
184 | analysisType,
185 | minDecibels,
186 | maxDecibels,
187 | );
188 | }
189 |
190 | /**
191 | * Resume the internal AudioContext if it was suspended due to the lack of
192 | * user interaction when the AudioAnalysis was instantiated.
193 | * @returns {Promise}
194 | */
195 | async resumeIfSuspended() {
196 | if (this.context.state === 'suspended') {
197 | await this.context.resume();
198 | }
199 | return true;
200 | }
201 | }
202 |
203 | globalThis.AudioAnalysis = AudioAnalysis;
204 |
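
A short sketch of driving a visualization from `AudioAnalysis`; the element id and the render step are illustrative:

```ts
import { AudioAnalysis } from "./lib/wavtools";

// Assumes an <audio id="player" src="..."> element exists on the page.
const audioEl = document.getElementById("player") as HTMLAudioElement;
const analysis = new AudioAnalysis(audioEl);

async function startVisualizer(): Promise<void> {
  // The AudioContext may start suspended until a user gesture occurs.
  await analysis.resumeIfSuspended();
  const frame = () => {
    const { values, labels } = analysis.getFrequencies("voice");
    // values are normalized to [0, 1]; draw them however you like.
    console.log(labels[0], values[0]);
    requestAnimationFrame(frame);
  };
  frame();
}
```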
--------------------------------------------------------------------------------
/lib/wavtools/lib/analysis/constants.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Constants for help with visualization
3 | * Helps map frequency ranges from Fast Fourier Transform
4 | * to human-interpretable ranges, notably music ranges and
5 | * human vocal ranges.
6 | */
7 |
8 | // Eighth octave frequencies
9 | const octave8Frequencies = [
10 | 4186.01, 4434.92, 4698.63, 4978.03, 5274.04, 5587.65, 5919.91, 6271.93,
11 | 6644.88, 7040.0, 7458.62, 7902.13,
12 | ];
13 |
14 | // Labels for each of the above frequencies
15 | const octave8FrequencyLabels = [
16 | 'C',
17 | 'C#',
18 | 'D',
19 | 'D#',
20 | 'E',
21 | 'F',
22 | 'F#',
23 | 'G',
24 | 'G#',
25 | 'A',
26 | 'A#',
27 | 'B',
28 | ];
29 |
30 | /**
31 | * All note frequencies from 1st to 8th octave
32 | * in format "A#8" (A#, 8th octave)
33 | */
34 | export const noteFrequencies = [];
35 | export const noteFrequencyLabels = [];
36 | for (let i = 1; i <= 8; i++) {
37 | for (let f = 0; f < octave8Frequencies.length; f++) {
38 | const freq = octave8Frequencies[f];
39 | noteFrequencies.push(freq / Math.pow(2, 8 - i));
40 | noteFrequencyLabels.push(octave8FrequencyLabels[f] + i);
41 | }
42 | }
43 |
44 | /**
45 | * Subset of the note frequencies between 32 and 2000 Hz
46 | * 6 octave range: C1 to B6
47 | */
48 | const voiceFrequencyRange = [32.0, 2000.0];
49 | export const voiceFrequencies = noteFrequencies.filter((_, i) => {
50 | return (
51 | noteFrequencies[i] > voiceFrequencyRange[0] &&
52 | noteFrequencies[i] < voiceFrequencyRange[1]
53 | );
54 | });
55 | export const voiceFrequencyLabels = noteFrequencyLabels.filter((_, i) => {
56 | return (
57 | noteFrequencies[i] > voiceFrequencyRange[0] &&
58 | noteFrequencies[i] < voiceFrequencyRange[1]
59 | );
60 | });
61 |
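
The note table above is built by halving eighth-octave frequencies: a note in octave `i` has frequency `octave8Frequencies[f] / 2^(8 - i)`. A quick check with A4:

```ts
// A8 is 7040 Hz (index 9 of octave8Frequencies), so A4 = 7040 / 2^(8 - 4) = 440 Hz.
const a8 = 7040.0;
const a4 = a8 / Math.pow(2, 8 - 4);
console.log(a4); // 440
```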
--------------------------------------------------------------------------------
/lib/wavtools/lib/mediastream_recorder.js:
--------------------------------------------------------------------------------
1 | import { AudioProcessorSrc } from "./worklets/audio_processor.js";
2 | import { AudioAnalysis } from "./analysis/audio_analysis.js";
3 | import { WavPacker } from "./wav_packer.js";
4 |
5 | /**
6 | * Decodes audio into a wav file
7 | * @typedef {Object} DecodedAudioType
8 | * @property {Blob} blob
9 | * @property {string} url
10 | * @property {Float32Array} values
11 | * @property {AudioBuffer} audioBuffer
12 | */
13 |
14 | /**
15 | * Records live stream of user audio as PCM16 "audio/wav" data
16 | * @class
17 | */
18 | export class MediaStreamRecorder {
19 | /**
20 | * Create a new MediaStreamRecorder instance
21 | * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} [options]
22 | * @returns {MediaStreamRecorder}
23 | */
24 | constructor({
25 | sampleRate = 44100,
26 | outputToSpeakers = false,
27 | debug = false,
28 | } = {}) {
29 | // Script source
30 | this.scriptSrc = AudioProcessorSrc;
31 | // Config
32 | this.sampleRate = sampleRate;
33 | this.outputToSpeakers = outputToSpeakers;
34 | this.debug = !!debug;
35 | // State variables
36 | this.stream = null;
37 | this.processor = null;
38 | this.source = null;
39 | this.node = null;
40 | this.recording = false;
41 | // Event handling with AudioWorklet
42 | this._lastEventId = 0;
43 | this.eventReceipts = {};
44 | this.eventTimeout = 5000;
45 | // Process chunks of audio
46 | this._chunkProcessor = () => {};
47 | this._chunkProcessorSize = void 0;
48 | this._chunkProcessorBuffer = {
49 | raw: new ArrayBuffer(0),
50 | mono: new ArrayBuffer(0),
51 | };
52 | }
53 |
54 | /**
55 | * Logs data in debug mode
56 | * @param {...any} arguments
57 | * @returns {true}
58 | */
59 | log() {
60 | if (this.debug) {
61 |       console.log(...arguments);
62 | }
63 | return true;
64 | }
65 |
66 | /**
67 | * Retrieves the current sampleRate for the recorder
68 | * @returns {number}
69 | */
70 | getSampleRate() {
71 | return this.sampleRate;
72 | }
73 |
74 | /**
75 | * Retrieves the current status of the recording
76 | * @returns {"ended"|"paused"|"recording"}
77 | */
78 | getStatus() {
79 | if (!this.processor) {
80 | return "ended";
81 | } else if (!this.recording) {
82 | return "paused";
83 | } else {
84 | return "recording";
85 | }
86 | }
87 |
88 | /**
89 | * Sends an event to the AudioWorklet
90 | * @private
91 | * @param {string} name
92 | * @param {{[key: string]: any}} data
93 | * @param {AudioWorkletNode} [_processor]
94 | * @returns {Promise<{[key: string]: any}>}
95 | */
96 | async _event(name, data = {}, _processor = null) {
97 | _processor = _processor || this.processor;
98 | if (!_processor) {
99 | throw new Error("Can not send events without recording first");
100 | }
101 | const message = {
102 | event: name,
103 | id: this._lastEventId++,
104 | data,
105 | };
106 | _processor.port.postMessage(message);
107 | const t0 = new Date().valueOf();
108 | while (!this.eventReceipts[message.id]) {
109 | if (new Date().valueOf() - t0 > this.eventTimeout) {
110 | throw new Error(`Timeout waiting for "${name}" event`);
111 | }
112 | await new Promise((res) => setTimeout(() => res(true), 1));
113 | }
114 | const payload = this.eventReceipts[message.id];
115 | delete this.eventReceipts[message.id];
116 | return payload;
117 | }
118 |
119 | /**
120 | * Begins a recording session for the given audioTrack
121 |    * Recording status will be "paused" until .record() is called
122 |    * @param {MediaStreamTrack} audioTrack an audio MediaStreamTrack to record (required)
123 | * @returns {Promise}
124 | */
125 | async begin(audioTrack) {
126 | if (this.processor) {
127 | throw new Error(
128 | `Already connected: please call .end() to start a new session`
129 | );
130 | }
131 |
132 | if (!audioTrack || audioTrack.kind !== "audio") {
133 | throw new Error("No audio track provided");
134 | }
135 |
136 | this.stream = new MediaStream([audioTrack]);
137 |
138 | const context = new AudioContext({ sampleRate: this.sampleRate });
139 | const source = context.createMediaStreamSource(this.stream);
140 | // Load and execute the module script.
141 | try {
142 | await context.audioWorklet.addModule(this.scriptSrc);
143 | } catch (e) {
144 | console.error(e);
145 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`);
146 | }
147 | const processor = new AudioWorkletNode(context, "audio_processor");
148 | processor.port.onmessage = (e) => {
149 | const { event, id, data } = e.data;
150 | if (event === "receipt") {
151 | this.eventReceipts[id] = data;
152 | } else if (event === "chunk") {
153 | if (this._chunkProcessorSize) {
154 | const buffer = this._chunkProcessorBuffer;
155 | this._chunkProcessorBuffer = {
156 | raw: WavPacker.mergeBuffers(buffer.raw, data.raw),
157 | mono: WavPacker.mergeBuffers(buffer.mono, data.mono),
158 | };
159 | if (
160 | this._chunkProcessorBuffer.mono.byteLength >=
161 | this._chunkProcessorSize
162 | ) {
163 | this._chunkProcessor(this._chunkProcessorBuffer);
164 | this._chunkProcessorBuffer = {
165 | raw: new ArrayBuffer(0),
166 | mono: new ArrayBuffer(0),
167 | };
168 | }
169 | } else {
170 | this._chunkProcessor(data);
171 | }
172 | }
173 | };
174 |
175 | const node = source.connect(processor);
176 | const analyser = context.createAnalyser();
177 | analyser.fftSize = 8192;
178 | analyser.smoothingTimeConstant = 0.1;
179 | node.connect(analyser);
180 | if (this.outputToSpeakers) {
181 | // eslint-disable-next-line no-console
182 | console.warn(
183 | "Warning: Output to speakers may affect sound quality,\n" +
184 | "especially due to system audio feedback preventative measures.\n" +
185 | "use only for debugging"
186 | );
187 | analyser.connect(context.destination);
188 | }
189 |
190 | this.source = source;
191 | this.node = node;
192 | this.analyser = analyser;
193 | this.processor = processor;
194 | return true;
195 | }
196 |
197 | /**
198 | * Gets the current frequency domain data from the recording track
199 | * @param {"frequency"|"music"|"voice"} [analysisType]
200 | * @param {number} [minDecibels] default -100
201 | * @param {number} [maxDecibels] default -30
202 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType}
203 | */
204 | getFrequencies(
205 | analysisType = "frequency",
206 | minDecibels = -100,
207 | maxDecibels = -30
208 | ) {
209 | if (!this.processor) {
210 | throw new Error("Session ended: please call .begin() first");
211 | }
212 | return AudioAnalysis.getFrequencies(
213 | this.analyser,
214 | this.sampleRate,
215 | null,
216 | analysisType,
217 | minDecibels,
218 | maxDecibels
219 | );
220 | }
221 |
222 | /**
223 | * Pauses the recording
224 | * Keeps microphone stream open but halts storage of audio
225 | * @returns {Promise}
226 | */
227 | async pause() {
228 | if (!this.processor) {
229 | throw new Error("Session ended: please call .begin() first");
230 | } else if (!this.recording) {
231 | throw new Error("Already paused: please call .record() first");
232 | }
233 | if (this._chunkProcessorBuffer.raw.byteLength) {
234 | this._chunkProcessor(this._chunkProcessorBuffer);
235 | }
236 | this.log("Pausing ...");
237 | await this._event("stop");
238 | this.recording = false;
239 | return true;
240 | }
241 |
242 | /**
243 | * Start recording stream and storing to memory from the connected audio source
244 | * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor]
245 |    * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold is met in mono audio
246 | * @returns {Promise}
247 | */
248 | async record(chunkProcessor = () => {}, chunkSize = 8192) {
249 | if (!this.processor) {
250 | throw new Error("Session ended: please call .begin() first");
251 | } else if (this.recording) {
252 |       throw new Error("Already recording: please call .pause() first");
253 | } else if (typeof chunkProcessor !== "function") {
254 | throw new Error(`chunkProcessor must be a function`);
255 | }
256 | this._chunkProcessor = chunkProcessor;
257 | this._chunkProcessorSize = chunkSize;
258 | this._chunkProcessorBuffer = {
259 | raw: new ArrayBuffer(0),
260 | mono: new ArrayBuffer(0),
261 | };
262 | this.log("Recording ...");
263 | await this._event("start");
264 | this.recording = true;
265 | return true;
266 | }
267 |
268 | /**
269 | * Clears the audio buffer, empties stored recording
270 | * @returns {Promise}
271 | */
272 | async clear() {
273 | if (!this.processor) {
274 | throw new Error("Session ended: please call .begin() first");
275 | }
276 | await this._event("clear");
277 | return true;
278 | }
279 |
280 | /**
281 | * Reads the current audio stream data
282 | * @returns {Promise<{meanValues: Float32Array, channels: Array}>}
283 | */
284 | async read() {
285 | if (!this.processor) {
286 | throw new Error("Session ended: please call .begin() first");
287 | }
288 | this.log("Reading ...");
289 | const result = await this._event("read");
290 | return result;
291 | }
292 |
293 | /**
294 | * Saves the current audio stream to a file
295 | * @param {boolean} [force] Force saving while still recording
296 | * @returns {Promise}
297 | */
298 | async save(force = false) {
299 | if (!this.processor) {
300 | throw new Error("Session ended: please call .begin() first");
301 | }
302 | if (!force && this.recording) {
303 | throw new Error(
304 | "Currently recording: please call .pause() first, or call .save(true) to force"
305 | );
306 | }
307 | this.log("Exporting ...");
308 | const exportData = await this._event("export");
309 | const packer = new WavPacker();
310 | const result = packer.pack(this.sampleRate, exportData.audio);
311 | return result;
312 | }
313 |
314 | /**
315 | * Ends the current recording session and saves the result
316 | * @returns {Promise}
317 | */
318 | async end() {
319 | if (!this.processor) {
320 | throw new Error("Session ended: please call .begin() first");
321 | }
322 |
323 | const _processor = this.processor;
324 |
325 | this.log("Stopping ...");
326 | await this._event("stop");
327 | this.recording = false;
328 |
329 | this.log("Exporting ...");
330 | const exportData = await this._event("export", {}, _processor);
331 |
332 | this.processor.disconnect();
333 | this.source.disconnect();
334 | this.node.disconnect();
335 | this.analyser.disconnect();
336 | this.stream = null;
337 | this.processor = null;
338 | this.source = null;
339 | this.node = null;
340 |
341 | const packer = new WavPacker();
342 | const result = packer.pack(this.sampleRate, exportData.audio);
343 | return result;
344 | }
345 |
346 |    * Performs a full cleanup of the MediaStreamRecorder instance
347 |    * Ends any active recording session and releases audio resources
348 | * Stops actively listening via microphone and removes existing listeners
349 | * @returns {Promise}
350 | */
351 | async quit() {
352 |     // no device-change listener to remove here; recording uses an externally provided MediaStreamTrack
353 | if (this.processor) {
354 | await this.end();
355 | }
356 | return true;
357 | }
358 | }
359 |
360 | globalThis.MediaStreamRecorder = MediaStreamRecorder;
361 |
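
A minimal sketch of recording an existing `MediaStreamTrack` (for example one obtained from getUserMedia, or a Daily participant track) with `MediaStreamRecorder`; the chunk size and logging are illustrative:

```ts
import { MediaStreamRecorder } from "./lib/wavtools";

async function captureTrack(): Promise<void> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const track = stream.getAudioTracks()[0];

  const recorder = new MediaStreamRecorder({ sampleRate: 24000 });
  await recorder.begin(track); // status is now "paused"

  await recorder.record((chunk: { mono: ArrayBuffer; raw: ArrayBuffer }) => {
    // chunk.mono is PCM16 mono audio; chunk.raw is the unprocessed source data.
    console.log(`chunk: ${chunk.mono.byteLength} bytes`);
  }, 8192);

  // ... later: stop recording and export what was captured as a WAV file.
  const wav = await recorder.end();
  console.log(wav.url);
}
```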
--------------------------------------------------------------------------------
/lib/wavtools/lib/wav_packer.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Raw wav audio file contents
3 | * @typedef {Object} WavPackerAudioType
4 | * @property {Blob} blob
5 | * @property {string} url
6 | * @property {number} channelCount
7 | * @property {number} sampleRate
8 | * @property {number} duration
9 | */
10 |
11 | /**
12 | * Utility class for assembling PCM16 "audio/wav" data
13 | * @class
14 | */
15 | export class WavPacker {
16 | /**
17 | * Converts Float32Array of amplitude data to ArrayBuffer in Int16Array format
18 | * @param {Float32Array} float32Array
19 | * @returns {ArrayBuffer}
20 | */
21 | static floatTo16BitPCM(float32Array) {
22 | const buffer = new ArrayBuffer(float32Array.length * 2);
23 | const view = new DataView(buffer);
24 | let offset = 0;
25 | for (let i = 0; i < float32Array.length; i++, offset += 2) {
26 | let s = Math.max(-1, Math.min(1, float32Array[i]));
27 | view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
28 | }
29 | return buffer;
30 | }
31 |
32 | /**
33 | * Concatenates two ArrayBuffers
34 | * @param {ArrayBuffer} leftBuffer
35 | * @param {ArrayBuffer} rightBuffer
36 | * @returns {ArrayBuffer}
37 | */
38 | static mergeBuffers(leftBuffer, rightBuffer) {
39 | const tmpArray = new Uint8Array(
40 | leftBuffer.byteLength + rightBuffer.byteLength
41 | );
42 | tmpArray.set(new Uint8Array(leftBuffer), 0);
43 | tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength);
44 | return tmpArray.buffer;
45 | }
46 |
47 | /**
48 | * Packs data into an Int16 format
49 | * @private
50 | * @param {number} size 0 = 1x Int16, 1 = 2x Int16
51 | * @param {number} arg value to pack
52 | * @returns
53 | */
54 | _packData(size, arg) {
55 | return [
56 | new Uint8Array([arg, arg >> 8]),
57 | new Uint8Array([arg, arg >> 8, arg >> 16, arg >> 24]),
58 | ][size];
59 | }
60 |
61 | /**
62 | * Packs audio into "audio/wav" Blob
63 | * @param {number} sampleRate
64 | * @param {{bitsPerSample: number, channels: Array, data: Int16Array}} audio
65 | * @returns {WavPackerAudioType}
66 | */
67 | pack(sampleRate, audio) {
68 | if (!audio?.bitsPerSample) {
69 | throw new Error(`Missing "bitsPerSample"`);
70 | } else if (!audio?.channels) {
71 | throw new Error(`Missing "channels"`);
72 | } else if (!audio?.data) {
73 | throw new Error(`Missing "data"`);
74 | }
75 | const { bitsPerSample, channels, data } = audio;
76 | const output = [
77 | // Header
78 | 'RIFF',
79 | this._packData(
80 | 1,
81 | 4 + (8 + 24) /* chunk 1 length */ + (8 + 8) /* chunk 2 length */
82 | ), // Length
83 | 'WAVE',
84 | // chunk 1
85 | 'fmt ', // Sub-chunk identifier
86 | this._packData(1, 16), // Chunk length
87 | this._packData(0, 1), // Audio format (1 is linear quantization)
88 | this._packData(0, channels.length),
89 | this._packData(1, sampleRate),
90 | this._packData(1, (sampleRate * channels.length * bitsPerSample) / 8), // Byte rate
91 | this._packData(0, (channels.length * bitsPerSample) / 8),
92 | this._packData(0, bitsPerSample),
93 | // chunk 2
94 | 'data', // Sub-chunk identifier
95 | this._packData(
96 | 1,
97 | (channels[0].length * channels.length * bitsPerSample) / 8
98 | ), // Chunk length
99 | data,
100 | ];
101 |     const blob = new Blob(output, { type: 'audio/wav' });
102 | const url = URL.createObjectURL(blob);
103 | return {
104 | blob,
105 | url,
106 | channelCount: channels.length,
107 | sampleRate,
108 | duration: data.byteLength / (channels.length * sampleRate * 2),
109 | };
110 | }
111 | }
112 |
113 | globalThis.WavPacker = WavPacker;
114 |
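
A small sketch of packing a mono Float32 buffer into a WAV blob with `WavPacker`; the generated tone is only for illustration:

```ts
import { WavPacker } from "./lib/wavtools";

// One second of a 440 Hz sine wave at 24 kHz.
const sampleRate = 24000;
const float32 = new Float32Array(sampleRate);
for (let i = 0; i < float32.length; i++) {
  float32[i] = 0.5 * Math.sin((2 * Math.PI * 440 * i) / sampleRate);
}

// Convert to PCM16 and assemble the WAV container.
const pcm16 = new Int16Array(WavPacker.floatTo16BitPCM(float32));
const packer = new WavPacker();
const wav = packer.pack(sampleRate, {
  bitsPerSample: 16,
  channels: [float32],
  data: pcm16,
});
console.log(wav.url, wav.duration); // object URL and duration in seconds (≈ 1)
```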
--------------------------------------------------------------------------------
/lib/wavtools/lib/wav_recorder.js:
--------------------------------------------------------------------------------
1 | import { AudioProcessorSrc } from './worklets/audio_processor.js';
2 | import { AudioAnalysis } from './analysis/audio_analysis.js';
3 | import { WavPacker } from './wav_packer.js';
4 |
5 | /**
6 | * Decodes audio into a wav file
7 | * @typedef {Object} DecodedAudioType
8 | * @property {Blob} blob
9 | * @property {string} url
10 | * @property {Float32Array} values
11 | * @property {AudioBuffer} audioBuffer
12 | */
13 |
14 | /**
15 | * Records live stream of user audio as PCM16 "audio/wav" data
16 | * @class
17 | */
18 | export class WavRecorder {
19 | /**
20 | * Create a new WavRecorder instance
21 | * @param {{sampleRate?: number, outputToSpeakers?: boolean, debug?: boolean}} [options]
22 | * @returns {WavRecorder}
23 | */
24 | constructor({
25 | sampleRate = 44100,
26 | outputToSpeakers = false,
27 | debug = false,
28 | } = {}) {
29 | // Script source
30 | this.scriptSrc = AudioProcessorSrc;
31 | // Config
32 | this.sampleRate = sampleRate;
33 | this.outputToSpeakers = outputToSpeakers;
34 | this.debug = !!debug;
35 | this._deviceChangeCallback = null;
36 | this._devices = [];
37 | this.deviceSelection = null;
38 | // State variables
39 | this.stream = null;
40 | this.processor = null;
41 | this.source = null;
42 | this.node = null;
43 | this.recording = false;
44 | // Event handling with AudioWorklet
45 | this._lastEventId = 0;
46 | this.eventReceipts = {};
47 | this.eventTimeout = 5000;
48 | // Process chunks of audio
49 | this._chunkProcessor = () => {};
50 | this._chunkProcessorSize = void 0;
51 | this._chunkProcessorBuffer = {
52 | raw: new ArrayBuffer(0),
53 | mono: new ArrayBuffer(0),
54 | };
55 | }
56 |
57 | /**
58 | * Decodes audio data from multiple formats to a Blob, url, Float32Array and AudioBuffer
59 | * @param {Blob|Float32Array|Int16Array|ArrayBuffer|number[]} audioData
60 | * @param {number} sampleRate
61 | * @param {number} fromSampleRate
62 | * @returns {Promise}
63 | */
64 | static async decode(audioData, sampleRate = 44100, fromSampleRate = -1) {
65 | const context = new AudioContext({ sampleRate });
66 | let arrayBuffer;
67 | let blob;
68 | if (audioData instanceof Blob) {
69 | if (fromSampleRate !== -1) {
70 | throw new Error(
71 | `Can not specify "fromSampleRate" when reading from Blob`
72 | );
73 | }
74 | blob = audioData;
75 | arrayBuffer = await blob.arrayBuffer();
76 | } else if (audioData instanceof ArrayBuffer) {
77 | if (fromSampleRate !== -1) {
78 | throw new Error(
79 | `Can not specify "fromSampleRate" when reading from ArrayBuffer`
80 | );
81 | }
82 | arrayBuffer = audioData;
83 | blob = new Blob([arrayBuffer], { type: 'audio/wav' });
84 | } else {
85 | let float32Array;
86 | let data;
87 | if (audioData instanceof Int16Array) {
88 | data = audioData;
89 | float32Array = new Float32Array(audioData.length);
90 | for (let i = 0; i < audioData.length; i++) {
91 | float32Array[i] = audioData[i] / 0x8000;
92 | }
93 | } else if (audioData instanceof Float32Array) {
94 | float32Array = audioData;
95 | } else if (audioData instanceof Array) {
96 | float32Array = new Float32Array(audioData);
97 | } else {
98 | throw new Error(
99 |           `"audioData" must be one of: Blob, Float32Array, Int16Array, ArrayBuffer, Array`
100 | );
101 | }
102 | if (fromSampleRate === -1) {
103 | throw new Error(
104 |           `Must specify "fromSampleRate" when reading from Float32Array, Int16Array or Array`
105 | );
106 | } else if (fromSampleRate < 3000) {
107 | throw new Error(`Minimum "fromSampleRate" is 3000 (3kHz)`);
108 | }
109 | if (!data) {
110 | data = WavPacker.floatTo16BitPCM(float32Array);
111 | }
112 | const audio = {
113 | bitsPerSample: 16,
114 | channels: [float32Array],
115 | data,
116 | };
117 | const packer = new WavPacker();
118 | const result = packer.pack(fromSampleRate, audio);
119 | blob = result.blob;
120 | arrayBuffer = await blob.arrayBuffer();
121 | }
122 | const audioBuffer = await context.decodeAudioData(arrayBuffer);
123 | const values = audioBuffer.getChannelData(0);
124 | const url = URL.createObjectURL(blob);
125 | return {
126 | blob,
127 | url,
128 | values,
129 | audioBuffer,
130 | };
131 | }
132 |
133 | /**
134 | * Logs data in debug mode
135 | * @param {...any} arguments
136 | * @returns {true}
137 | */
138 | log() {
139 | if (this.debug) {
140 |       console.log(...arguments);
141 | }
142 | return true;
143 | }
144 |
145 | /**
146 | * Retrieves the current sampleRate for the recorder
147 | * @returns {number}
148 | */
149 | getSampleRate() {
150 | return this.sampleRate;
151 | }
152 |
153 | /**
154 | * Retrieves the current status of the recording
155 | * @returns {"ended"|"paused"|"recording"}
156 | */
157 | getStatus() {
158 | if (!this.processor) {
159 | return 'ended';
160 | } else if (!this.recording) {
161 | return 'paused';
162 | } else {
163 | return 'recording';
164 | }
165 | }
166 |
167 | /**
168 | * Sends an event to the AudioWorklet
169 | * @private
170 | * @param {string} name
171 | * @param {{[key: string]: any}} data
172 | * @param {AudioWorkletNode} [_processor]
173 | * @returns {Promise<{[key: string]: any}>}
174 | */
175 | async _event(name, data = {}, _processor = null) {
176 | _processor = _processor || this.processor;
177 | if (!_processor) {
178 | throw new Error('Can not send events without recording first');
179 | }
180 | const message = {
181 | event: name,
182 | id: this._lastEventId++,
183 | data,
184 | };
185 | _processor.port.postMessage(message);
186 | const t0 = new Date().valueOf();
187 | while (!this.eventReceipts[message.id]) {
188 | if (new Date().valueOf() - t0 > this.eventTimeout) {
189 | throw new Error(`Timeout waiting for "${name}" event`);
190 | }
191 | await new Promise((res) => setTimeout(() => res(true), 1));
192 | }
193 | const payload = this.eventReceipts[message.id];
194 | delete this.eventReceipts[message.id];
195 | return payload;
196 | }
197 |
198 | /**
199 | * Sets device change callback, remove if callback provided is `null`
200 |    * @param {(Array<MediaDeviceInfo>): void|null} callback
201 | * @returns {true}
202 | */
203 | listenForDeviceChange(callback) {
204 | if (callback === null && this._deviceChangeCallback) {
205 | navigator.mediaDevices.removeEventListener(
206 | 'devicechange',
207 | this._deviceChangeCallback
208 | );
209 | this._deviceChangeCallback = null;
210 | } else if (callback !== null) {
211 | // Basically a debounce; we only want this called once when devices change
212 | // And we only want the most recent callback() to be executed
213 | // if a few are operating at the same time
214 | let lastId = 0;
215 | let lastDevices = [];
216 | const serializeDevices = (devices) =>
217 | devices
218 | .map((d) => d.deviceId)
219 | .sort()
220 | .join(',');
221 | const cb = async () => {
222 | let id = ++lastId;
223 | const devices = await this.listDevices();
224 | if (id === lastId) {
225 | if (serializeDevices(lastDevices) !== serializeDevices(devices)) {
226 | lastDevices = devices;
227 | callback(devices.slice());
228 | }
229 | }
230 | };
231 | navigator.mediaDevices.addEventListener('devicechange', cb);
232 | cb();
233 | this._deviceChangeCallback = cb;
234 | }
235 | return true;
236 | }
237 |
238 | /**
239 | * Manually request permission to use the microphone
240 | * @returns {Promise}
241 | */
242 | async requestPermission() {
243 | const permissionStatus = await navigator.permissions.query({
244 | name: 'microphone',
245 | });
246 | if (permissionStatus.state === 'denied') {
247 | window.alert('You must grant microphone access to use this feature.');
248 | } else if (permissionStatus.state === 'prompt') {
249 | try {
250 | const stream = await navigator.mediaDevices.getUserMedia({
251 | audio: true,
252 | });
253 | const tracks = stream.getTracks();
254 | tracks.forEach((track) => track.stop());
255 | } catch (e) {
256 | window.alert('You must grant microphone access to use this feature.');
257 | }
258 | }
259 | return true;
260 | }
261 |
262 | /**
263 | * List all eligible devices for recording, will request permission to use microphone
264 |    * @returns {Promise<Array<MediaDeviceInfo>>}
265 | */
266 | async listDevices() {
267 | if (
268 | !navigator.mediaDevices ||
269 | !('enumerateDevices' in navigator.mediaDevices)
270 | ) {
271 | throw new Error('Could not request user devices');
272 | }
273 | await this.requestPermission();
274 | const devices = await navigator.mediaDevices.enumerateDevices();
275 | const audioDevices = devices.filter(
276 | (device) => device.kind === 'audioinput'
277 | );
278 | return audioDevices;
279 | // const defaultDeviceIndex = audioDevices.findIndex(
280 | // (device) => device.deviceId === 'default'
281 | // );
282 | // const deviceList = [];
283 | // if (defaultDeviceIndex !== -1) {
284 | // let defaultDevice = audioDevices.splice(defaultDeviceIndex, 1)[0];
285 | // let existingIndex = audioDevices.findIndex(
286 | // (device) => device.groupId === defaultDevice.groupId
287 | // );
288 | // if (existingIndex !== -1) {
289 | // defaultDevice = audioDevices.splice(existingIndex, 1)[0];
290 | // }
291 | // defaultDevice.default = true;
292 | // deviceList.push(defaultDevice);
293 | // }
294 | // return deviceList.concat(audioDevices);
295 | }
296 |
297 | /**
298 | * Begins a recording session and requests microphone permissions if not already granted
299 | * Microphone recording indicator will appear on browser tab but status will be "paused"
300 | * @param {string} [deviceId] if no device provided, default device will be used
301 | * @returns {Promise}
302 | */
303 | async begin(deviceId) {
304 | if (this.processor) {
305 | throw new Error(
306 | `Already connected: please call .end() to start a new session`
307 | );
308 | }
309 |
310 | if (
311 | !navigator.mediaDevices ||
312 | !('getUserMedia' in navigator.mediaDevices)
313 | ) {
314 | throw new Error('Could not request user media');
315 | }
316 | deviceId = deviceId ?? this.deviceSelection?.deviceId;
317 | try {
318 | const config = { audio: true };
319 | if (deviceId) {
320 | config.audio = { deviceId: { exact: deviceId } };
321 | }
322 | this.stream = await navigator.mediaDevices.getUserMedia(config);
323 | } catch (err) {
324 | throw new Error('Could not start media stream');
325 | }
326 |
327 | this.listDevices().then((devices) => {
328 | deviceId = this.stream.getAudioTracks()[0].getSettings().deviceId;
329 |       this.log(
330 |         'find current device',
331 |         devices,
332 |         deviceId,
333 |         this.stream.getAudioTracks()[0].getSettings()
334 |       );
335 |       this.deviceSelection = devices.find((d) => d.deviceId === deviceId);
336 |       this.log('current device', this.deviceSelection);
337 | });
338 | const context = new AudioContext({ sampleRate: this.sampleRate });
339 | const source = context.createMediaStreamSource(this.stream);
340 | // Load and execute the module script.
341 | try {
342 | await context.audioWorklet.addModule(this.scriptSrc);
343 | } catch (e) {
344 | console.error(e);
345 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`);
346 | }
347 | const processor = new AudioWorkletNode(context, 'audio_processor');
348 | processor.port.onmessage = (e) => {
349 | const { event, id, data } = e.data;
350 | if (event === 'receipt') {
351 | this.eventReceipts[id] = data;
352 | } else if (event === 'chunk') {
353 | if (this._chunkProcessorSize) {
354 | const buffer = this._chunkProcessorBuffer;
355 | this._chunkProcessorBuffer = {
356 | raw: WavPacker.mergeBuffers(buffer.raw, data.raw),
357 | mono: WavPacker.mergeBuffers(buffer.mono, data.mono),
358 | };
359 | if (
360 | this._chunkProcessorBuffer.mono.byteLength >=
361 | this._chunkProcessorSize
362 | ) {
363 | this._chunkProcessor(this._chunkProcessorBuffer);
364 | this._chunkProcessorBuffer = {
365 | raw: new ArrayBuffer(0),
366 | mono: new ArrayBuffer(0),
367 | };
368 | }
369 | } else {
370 | this._chunkProcessor(data);
371 | }
372 | }
373 | };
374 |
375 | const node = source.connect(processor);
376 | const analyser = context.createAnalyser();
377 | analyser.fftSize = 8192;
378 | analyser.smoothingTimeConstant = 0.1;
379 | node.connect(analyser);
380 | if (this.outputToSpeakers) {
381 | // eslint-disable-next-line no-console
382 | console.warn(
383 | 'Warning: Output to speakers may affect sound quality,\n' +
384 | 'especially due to system audio feedback preventative measures.\n' +
385 | 'use only for debugging'
386 | );
387 | analyser.connect(context.destination);
388 | }
389 |
390 | this.source = source;
391 | this.node = node;
392 | this.analyser = analyser;
393 | this.processor = processor;
394 |     this.log('begin completed');
395 | return true;
396 | }
397 |
398 | /**
399 | * Gets the current frequency domain data from the recording track
400 | * @param {"frequency"|"music"|"voice"} [analysisType]
401 | * @param {number} [minDecibels] default -100
402 | * @param {number} [maxDecibels] default -30
403 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType}
404 | */
405 | getFrequencies(
406 | analysisType = 'frequency',
407 | minDecibels = -100,
408 | maxDecibels = -30
409 | ) {
410 | if (!this.processor) {
411 | throw new Error('Session ended: please call .begin() first');
412 | }
413 | return AudioAnalysis.getFrequencies(
414 | this.analyser,
415 | this.sampleRate,
416 | null,
417 | analysisType,
418 | minDecibels,
419 | maxDecibels
420 | );
421 | }
422 |
423 | /**
424 | * Pauses the recording
425 | * Keeps microphone stream open but halts storage of audio
426 | * @returns {Promise}
427 | */
428 | async pause() {
429 | if (!this.processor) {
430 | throw new Error('Session ended: please call .begin() first');
431 | } else if (!this.recording) {
432 | throw new Error('Already paused: please call .record() first');
433 | }
434 | if (this._chunkProcessorBuffer.raw.byteLength) {
435 | this._chunkProcessor(this._chunkProcessorBuffer);
436 | }
437 | this.log('Pausing ...');
438 | await this._event('stop');
439 | this.recording = false;
440 | return true;
441 | }
442 |
443 | /**
444 | * Start recording stream and storing to memory from the connected audio source
445 | * @param {(data: { mono: Int16Array; raw: Int16Array }) => any} [chunkProcessor]
446 |    * @param {number} [chunkSize] chunkProcessor will not be triggered until this size threshold is met in mono audio
447 | * @returns {Promise}
448 | */
449 | async record(chunkProcessor = () => {}, chunkSize = 8192) {
450 | if (!this.processor) {
451 | throw new Error('Session ended: please call .begin() first');
452 | } else if (this.recording) {
453 | throw new Error('Already recording: please call .pause() first');
454 | } else if (typeof chunkProcessor !== 'function') {
455 | throw new Error(`chunkProcessor must be a function`);
456 | }
457 | this._chunkProcessor = chunkProcessor;
458 | this._chunkProcessorSize = chunkSize;
459 | this._chunkProcessorBuffer = {
460 | raw: new ArrayBuffer(0),
461 | mono: new ArrayBuffer(0),
462 | };
463 | this.log('Recording ...');
464 | await this._event('start');
465 | this.recording = true;
466 | return true;
467 | }
468 |
469 | /**
470 | * Clears the audio buffer, empties stored recording
471 | * @returns {Promise}
472 | */
473 | async clear() {
474 | if (!this.processor) {
475 | throw new Error('Session ended: please call .begin() first');
476 | }
477 | await this._event('clear');
478 | return true;
479 | }
480 |
481 | /**
482 | * Reads the current audio stream data
483 | * @returns {Promise<{meanValues: Float32Array, channels: Array}>}
484 | */
485 | async read() {
486 | if (!this.processor) {
487 | throw new Error('Session ended: please call .begin() first');
488 | }
489 | this.log('Reading ...');
490 | const result = await this._event('read');
491 | return result;
492 | }
493 |
494 | /**
495 | * Saves the current audio stream to a file
496 | * @param {boolean} [force] Force saving while still recording
497 | * @returns {Promise}
498 | */
499 | async save(force = false) {
500 | if (!this.processor) {
501 | throw new Error('Session ended: please call .begin() first');
502 | }
503 | if (!force && this.recording) {
504 | throw new Error(
505 | 'Currently recording: please call .pause() first, or call .save(true) to force'
506 | );
507 | }
508 | this.log('Exporting ...');
509 | const exportData = await this._event('export');
510 | const packer = new WavPacker();
511 | const result = packer.pack(this.sampleRate, exportData.audio);
512 | return result;
513 | }
514 |
515 | /**
516 | * Ends the current recording session and saves the result
517 | * @returns {Promise}
518 | */
519 | async end() {
520 | if (!this.processor) {
521 | throw new Error('Session ended: please call .begin() first');
522 | }
523 |
524 | const _processor = this.processor;
525 |
526 | this.log('Stopping ...');
527 | await this._event('stop');
528 | this.recording = false;
529 | const tracks = this.stream.getTracks();
530 | tracks.forEach((track) => track.stop());
531 |
532 | this.log('Exporting ...');
533 | const exportData = await this._event('export', {}, _processor);
534 |
535 | this.processor.disconnect();
536 | this.source.disconnect();
537 | this.node.disconnect();
538 | this.analyser.disconnect();
539 | this.stream = null;
540 | this.processor = null;
541 | this.source = null;
542 | this.node = null;
543 |
544 | const packer = new WavPacker();
545 | const result = packer.pack(this.sampleRate, exportData.audio);
546 | return result;
547 | }
548 |
549 | /**
550 | * Performs a full cleanup of WavRecorder instance
551 | * Stops actively listening via microphone and removes existing listeners
552 | * @returns {Promise}
553 | */
554 | async quit() {
555 | this.listenForDeviceChange(null);
556 |     // deviceSelection persists across end()/begin() cycles; quit() is a full cleanup, so clear it
557 | this.deviceSelection = null;
558 | if (this.processor) {
559 | await this.end();
560 | }
561 | return true;
562 | }
563 | }
564 |
565 | globalThis.WavRecorder = WavRecorder;
566 |
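
A minimal end-to-end sketch of `WavRecorder`: list input devices, stream PCM16 chunks while recording, then stop and export a WAV file. Values are illustrative:

```ts
import { WavRecorder } from "./lib/wavtools";

async function recordSnippet(): Promise<void> {
  const recorder = new WavRecorder({ sampleRate: 24000, debug: true });

  const mics = await recorder.listDevices(); // prompts for microphone permission
  await recorder.begin(mics[0]?.deviceId);   // omit the argument to use the default device

  await recorder.record((chunk: { mono: ArrayBuffer; raw: ArrayBuffer }) => {
    // Chunks arrive once the mono buffer reaches the size threshold (8192 bytes by default).
    console.log(`chunk: ${chunk.mono.byteLength} bytes`);
  });

  // ... later: stop the microphone and export the captured audio.
  const wav = await recorder.end(); // { blob, url, channelCount, sampleRate, duration }
  console.log(wav.url);
}
```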
--------------------------------------------------------------------------------
/lib/wavtools/lib/wav_stream_player.js:
--------------------------------------------------------------------------------
1 | import { StreamProcessorSrc } from "./worklets/stream_processor.js";
2 | import { AudioAnalysis } from "./analysis/audio_analysis.js";
3 |
4 | /**
5 | * Plays audio streams received in raw PCM16 chunks from the browser
6 | * @class
7 | */
8 | export class WavStreamPlayer {
9 | /**
10 | * Creates a new WavStreamPlayer instance
11 | * @param {{sampleRate?: number}} options
12 | * @returns {WavStreamPlayer}
13 | */
14 | constructor({ sampleRate = 44100 } = {}) {
15 | this.scriptSrc = StreamProcessorSrc;
16 | this.sampleRate = sampleRate;
17 | this.context = null;
18 | this.stream = null;
19 | this.analyser = null;
20 | this.trackSampleOffsets = {};
21 | this.interruptedTrackIds = {};
22 | }
23 |
24 | /**
25 | * Connects the audio context and enables output to speakers
26 | * @returns {Promise}
27 | */
28 | async connect() {
29 | this.context = new AudioContext({ sampleRate: this.sampleRate });
30 | if (this._speakerID) {
31 | this.context.setSinkId(this._speakerID);
32 | }
33 | if (this.context.state === "suspended") {
34 | await this.context.resume();
35 | }
36 | try {
37 | await this.context.audioWorklet.addModule(this.scriptSrc);
38 | } catch (e) {
39 | console.error(e);
40 | throw new Error(`Could not add audioWorklet module: ${this.scriptSrc}`);
41 | }
42 | const analyser = this.context.createAnalyser();
43 | analyser.fftSize = 8192;
44 | analyser.smoothingTimeConstant = 0.1;
45 | this.analyser = analyser;
46 | return true;
47 | }
48 |
49 | /**
50 | * Gets the current frequency domain data from the playing track
51 | * @param {"frequency"|"music"|"voice"} [analysisType]
52 | * @param {number} [minDecibels] default -100
53 | * @param {number} [maxDecibels] default -30
54 | * @returns {import('./analysis/audio_analysis.js').AudioAnalysisOutputType}
55 | */
56 | getFrequencies(
57 | analysisType = "frequency",
58 | minDecibels = -100,
59 | maxDecibels = -30
60 | ) {
61 | if (!this.analyser) {
62 | throw new Error("Not connected, please call .connect() first");
63 | }
64 | return AudioAnalysis.getFrequencies(
65 | this.analyser,
66 | this.sampleRate,
67 | null,
68 | analysisType,
69 | minDecibels,
70 | maxDecibels
71 | );
72 | }
73 |
74 | /**
75 | * @param {string} speaker deviceId
76 | */
77 | async updateSpeaker(speaker) {
78 | const _prevSpeaker = this._speakerID;
79 | this._speakerID = speaker;
80 | if (this.context) {
81 | try {
82 | if (speaker === "default") {
83 | await this.context.setSinkId();
84 | } else {
85 | await this.context.setSinkId(speaker);
86 | }
87 | } catch (e) {
88 | console.error(`Could not set sinkId to ${speaker}: ${e}`);
89 | this._speakerID = _prevSpeaker;
90 | }
91 | }
92 | }
93 |
94 | /**
95 | * Starts audio streaming
96 | * @private
97 | * @returns {Promise}
98 | */
99 | _start() {
100 | const streamNode = new AudioWorkletNode(this.context, "stream_processor");
101 | streamNode.connect(this.context.destination);
102 | streamNode.port.onmessage = (e) => {
103 | const { event } = e.data;
104 | if (event === "stop") {
105 | streamNode.disconnect();
106 | this.stream = null;
107 | } else if (event === "offset") {
108 | const { requestId, trackId, offset } = e.data;
109 | const currentTime = offset / this.sampleRate;
110 | this.trackSampleOffsets[requestId] = { trackId, offset, currentTime };
111 | }
112 | };
113 | this.analyser.disconnect();
114 | streamNode.connect(this.analyser);
115 | this.stream = streamNode;
116 | return true;
117 | }
118 |
119 | /**
120 | * Adds 16BitPCM data to the currently playing audio stream
121 | * You can add chunks beyond the current play point and they will be queued for play
122 | * @param {ArrayBuffer|Int16Array} arrayBuffer
123 | * @param {string} [trackId]
124 | * @returns {Int16Array}
125 | */
126 | add16BitPCM(arrayBuffer, trackId = "default") {
127 | if (typeof trackId !== "string") {
128 | throw new Error(`trackId must be a string`);
129 | } else if (this.interruptedTrackIds[trackId]) {
130 | return;
131 | }
132 | if (!this.stream) {
133 | this._start();
134 | }
135 | let buffer;
136 | if (arrayBuffer instanceof Int16Array) {
137 | buffer = arrayBuffer;
138 | } else if (arrayBuffer instanceof ArrayBuffer) {
139 | buffer = new Int16Array(arrayBuffer);
140 | } else {
141 | throw new Error(`argument must be Int16Array or ArrayBuffer`);
142 | }
143 | this.stream.port.postMessage({ event: "write", buffer, trackId });
144 | return buffer;
145 | }
146 |
147 | /**
148 | * Gets the offset (sample count) of the currently playing stream
149 | * @param {boolean} [interrupt]
150 | * @returns {{trackId: string|null, offset: number, currentTime: number}}
151 | */
152 | async getTrackSampleOffset(interrupt = false) {
153 | if (!this.stream) {
154 | return null;
155 | }
156 | const requestId = crypto.randomUUID();
157 | this.stream.port.postMessage({
158 | event: interrupt ? "interrupt" : "offset",
159 | requestId,
160 | });
161 | let trackSampleOffset;
162 | while (!trackSampleOffset) {
163 | trackSampleOffset = this.trackSampleOffsets[requestId];
164 | await new Promise((r) => setTimeout(() => r(), 1));
165 | }
166 | const { trackId } = trackSampleOffset;
167 | if (interrupt && trackId) {
168 | this.interruptedTrackIds[trackId] = true;
169 | }
170 | return trackSampleOffset;
171 | }
172 |
173 | /**
174 |    * Interrupts the current stream and returns the sample offset of the audio
175 | * @param {boolean} [interrupt]
176 | * @returns {{trackId: string|null, offset: number, currentTime: number}}
177 | */
178 | async interrupt() {
179 | return this.getTrackSampleOffset(true);
180 | }
181 | }
182 |
183 | globalThis.WavStreamPlayer = WavStreamPlayer;
184 |
--------------------------------------------------------------------------------
/lib/wavtools/lib/worklets/audio_processor.js:
--------------------------------------------------------------------------------
1 | const AudioProcessorWorklet = `
2 | class AudioProcessor extends AudioWorkletProcessor {
3 |
4 | constructor() {
5 | super();
6 | this.port.onmessage = this.receive.bind(this);
7 | this.initialize();
8 | }
9 |
10 | initialize() {
11 | this.foundAudio = false;
12 | this.recording = false;
13 | this.chunks = [];
14 | }
15 |
16 | /**
17 | * Concatenates sampled chunks into channels
18 | * Format is chunk[Left[], Right[]]
19 | */
20 | readChannelData(chunks, channel = -1, maxChannels = 9) {
21 | let channelLimit;
22 | if (channel !== -1) {
23 | if (chunks[0] && chunks[0].length - 1 < channel) {
24 | throw new Error(
25 | \`Channel \${channel} out of range: max \${chunks[0].length}\`
26 | );
27 | }
28 | channelLimit = channel + 1;
29 | } else {
30 | channel = 0;
31 | channelLimit = Math.min(chunks[0] ? chunks[0].length : 1, maxChannels);
32 | }
33 | const channels = [];
34 | for (let n = channel; n < channelLimit; n++) {
35 | const length = chunks.reduce((sum, chunk) => {
36 | return sum + chunk[n].length;
37 | }, 0);
38 | const buffers = chunks.map((chunk) => chunk[n]);
39 | const result = new Float32Array(length);
40 | let offset = 0;
41 | for (let i = 0; i < buffers.length; i++) {
42 | result.set(buffers[i], offset);
43 | offset += buffers[i].length;
44 | }
45 | channels[n] = result;
46 | }
47 | return channels;
48 | }
49 |
50 | /**
51 | * Combines parallel audio data into correct format,
52 | * channels[Left[], Right[]] to float32Array[LRLRLRLR...]
53 | */
54 | formatAudioData(channels) {
55 | if (channels.length === 1) {
56 | // Simple case is only one channel
57 | const float32Array = channels[0].slice();
58 | const meanValues = channels[0].slice();
59 | return { float32Array, meanValues };
60 | } else {
61 | const float32Array = new Float32Array(
62 | channels[0].length * channels.length
63 | );
64 | const meanValues = new Float32Array(channels[0].length);
65 | for (let i = 0; i < channels[0].length; i++) {
66 | const offset = i * channels.length;
67 | let meanValue = 0;
68 | for (let n = 0; n < channels.length; n++) {
69 | float32Array[offset + n] = channels[n][i];
70 | meanValue += channels[n][i];
71 | }
72 | meanValues[i] = meanValue / channels.length;
73 | }
74 | return { float32Array, meanValues };
75 | }
76 | }
77 |
78 | /**
79 | * Converts 32-bit float data to 16-bit integers
80 | */
81 | floatTo16BitPCM(float32Array) {
82 | const buffer = new ArrayBuffer(float32Array.length * 2);
83 | const view = new DataView(buffer);
84 | let offset = 0;
85 | for (let i = 0; i < float32Array.length; i++, offset += 2) {
86 | let s = Math.max(-1, Math.min(1, float32Array[i]));
87 | view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
88 | }
89 | return buffer;
90 | }
91 |
92 | /**
93 | * Retrieves the most recent amplitude values from the audio stream
94 | * @param {number} channel
95 | */
96 | getValues(channel = -1) {
97 | const channels = this.readChannelData(this.chunks, channel);
98 | const { meanValues } = this.formatAudioData(channels);
99 | return { meanValues, channels };
100 | }
101 |
102 | /**
103 | * Exports chunks as an audio/wav file
104 | */
105 | export() {
106 | const channels = this.readChannelData(this.chunks);
107 | const { float32Array, meanValues } = this.formatAudioData(channels);
108 | const audioData = this.floatTo16BitPCM(float32Array);
109 | return {
110 | meanValues: meanValues,
111 | audio: {
112 | bitsPerSample: 16,
113 | channels: channels,
114 | data: audioData,
115 | },
116 | };
117 | }
118 |
119 | receive(e) {
120 | const { event, id } = e.data;
121 | let receiptData = {};
122 | switch (event) {
123 | case 'start':
124 | this.recording = true;
125 | break;
126 | case 'stop':
127 | this.recording = false;
128 | break;
129 | case 'clear':
130 | this.initialize();
131 | break;
132 | case 'export':
133 | receiptData = this.export();
134 | break;
135 | case 'read':
136 | receiptData = this.getValues();
137 | break;
138 | default:
139 | break;
140 | }
141 | // Always send back receipt
142 | this.port.postMessage({ event: 'receipt', id, data: receiptData });
143 | }
144 |
145 | sendChunk(chunk) {
146 | const channels = this.readChannelData([chunk]);
147 | const { float32Array, meanValues } = this.formatAudioData(channels);
148 | const rawAudioData = this.floatTo16BitPCM(float32Array);
149 | const monoAudioData = this.floatTo16BitPCM(meanValues);
150 | this.port.postMessage({
151 | event: 'chunk',
152 | data: {
153 | mono: monoAudioData,
154 | raw: rawAudioData,
155 | },
156 | });
157 | }
158 |
159 | process(inputList, outputList, parameters) {
160 | // Copy input to output (e.g. speakers)
161 | // Note that this creates choppy sounds with Mac products
162 | const sourceLimit = Math.min(inputList.length, outputList.length);
163 | for (let inputNum = 0; inputNum < sourceLimit; inputNum++) {
164 | const input = inputList[inputNum];
165 | const output = outputList[inputNum];
166 | const channelCount = Math.min(input.length, output.length);
167 | for (let channelNum = 0; channelNum < channelCount; channelNum++) {
168 | input[channelNum].forEach((sample, i) => {
169 | output[channelNum][i] = sample;
170 | });
171 | }
172 | }
173 | const inputs = inputList[0];
174 | // There's latency at the beginning of a stream before recording starts
175 | // Make sure we actually receive audio data before we start storing chunks
176 | let sliceIndex = 0;
177 | if (!this.foundAudio) {
178 | for (const channel of inputs) {
179 | sliceIndex = 0; // reset for each channel
180 | if (this.foundAudio) {
181 | break;
182 | }
183 | if (channel) {
184 | for (const value of channel) {
185 | if (value !== 0) {
186 | // find only one non-zero entry in any channel
187 | this.foundAudio = true;
188 | break;
189 | } else {
190 | sliceIndex++;
191 | }
192 | }
193 | }
194 | }
195 | }
196 | if (inputs && inputs[0] && this.foundAudio && this.recording) {
197 | // We need to copy the TypedArray, because the \`process\`
198 | // internals will reuse the same buffer to hold each input
199 | const chunk = inputs.map((input) => input.slice(sliceIndex));
200 | this.chunks.push(chunk);
201 | this.sendChunk(chunk);
202 | }
203 | return true;
204 | }
205 | }
206 |
207 | registerProcessor('audio_processor', AudioProcessor);
208 | `;
209 |
210 | const script = new Blob([AudioProcessorWorklet], {
211 | type: 'application/javascript',
212 | });
213 | const src = URL.createObjectURL(script);
214 | export const AudioProcessorSrc = src;
215 |
--------------------------------------------------------------------------------
/lib/wavtools/lib/worklets/stream_processor.js:
--------------------------------------------------------------------------------
1 | export const StreamProcessorWorklet = `
2 | class StreamProcessor extends AudioWorkletProcessor {
3 | constructor() {
4 | super();
5 | this.hasStarted = false;
6 | this.hasInterrupted = false;
7 | this.outputBuffers = [];
8 | this.bufferLength = 128;
9 | this.write = { buffer: new Float32Array(this.bufferLength), trackId: null };
10 | this.writeOffset = 0;
11 | this.trackSampleOffsets = {};
12 | this.port.onmessage = (event) => {
13 | if (event.data) {
14 | const payload = event.data;
15 | if (payload.event === 'write') {
16 | const int16Array = payload.buffer;
17 | const float32Array = new Float32Array(int16Array.length);
18 | for (let i = 0; i < int16Array.length; i++) {
19 | float32Array[i] = int16Array[i] / 0x8000; // Convert Int16 to Float32
20 | }
21 | this.writeData(float32Array, payload.trackId);
22 | } else if (
23 | payload.event === 'offset' ||
24 | payload.event === 'interrupt'
25 | ) {
26 | const requestId = payload.requestId;
27 | const trackId = this.write.trackId;
28 | const offset = this.trackSampleOffsets[trackId] || 0;
29 | this.port.postMessage({
30 | event: 'offset',
31 | requestId,
32 | trackId,
33 | offset,
34 | });
35 | if (payload.event === 'interrupt') {
36 | this.hasInterrupted = true;
37 | }
38 | } else {
39 | throw new Error(\`Unhandled event "\${payload.event}"\`);
40 | }
41 | }
42 | };
43 | }
44 |
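  // Re-chunks incoming Float32 samples into fixed 128-sample buffers (one
  // render quantum each) and queues them for process() to play out.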
45 | writeData(float32Array, trackId = null) {
46 | let { buffer } = this.write;
47 | let offset = this.writeOffset;
48 | for (let i = 0; i < float32Array.length; i++) {
49 | buffer[offset++] = float32Array[i];
50 | if (offset >= buffer.length) {
51 | this.outputBuffers.push(this.write);
52 | this.write = { buffer: new Float32Array(this.bufferLength), trackId };
53 | buffer = this.write.buffer;
54 | offset = 0;
55 | }
56 | }
57 | this.writeOffset = offset;
58 | return true;
59 | }
60 |
61 | process(inputs, outputs, parameters) {
62 | const output = outputs[0];
63 | const outputChannelData = output[0];
64 | const outputBuffers = this.outputBuffers;
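    // Three cases: (1) interrupted -> tell the main thread to stop and end the
    // node; (2) queued buffers remain -> play the next 128-sample frame and
    // advance the per-track sample offsets; (3) playback had started but the
    // queue drained -> signal stop; otherwise keep the node alive and wait.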
65 | if (this.hasInterrupted) {
66 | this.port.postMessage({ event: 'stop' });
67 | return false;
68 | } else if (outputBuffers.length) {
69 | this.hasStarted = true;
70 | const { buffer, trackId } = outputBuffers.shift();
71 | for (let i = 0; i < outputChannelData.length; i++) {
72 | outputChannelData[i] = buffer[i] || 0;
73 | }
74 | if (trackId) {
75 | this.trackSampleOffsets[trackId] =
76 | this.trackSampleOffsets[trackId] || 0;
77 | this.trackSampleOffsets[trackId] += buffer.length;
78 | }
79 | return true;
80 | } else if (this.hasStarted) {
81 | this.port.postMessage({ event: 'stop' });
82 | return false;
83 | } else {
84 | return true;
85 | }
86 | }
87 | }
88 |
89 | registerProcessor('stream_processor', StreamProcessor);
90 | `;
91 |
92 | const script = new Blob([StreamProcessorWorklet], {
93 | type: 'application/javascript',
94 | });
95 | const src = URL.createObjectURL(script);
96 | export const StreamProcessorSrc = src;
97 |
--------------------------------------------------------------------------------
/lib/websocket-utils/reconnectingWebSocket.ts:
--------------------------------------------------------------------------------
1 | import { EventEmitter } from "events";
2 |
3 | const readyStates = ["CONNECTING", "OPEN", "CLOSING", "CLOSED"];
4 | const KEEP_ALIVE_INTERVAL = 5000;
5 | const KEEP_ALIVE_TIMEOUT = 15000;
6 | // client side code in soupSFU has a timeout of 15 seconds for command response
7 | // 5 seconds seems reasonable, as it provides roughly 3 retry attempts
8 | const WEBSOCKET_CONNECTION_TIMEOUT = 150 * 1000;
9 | const DEFAULT_RECONNECT_ATTEMPTS = 2;
10 | const MAX_RECONNECT_ATTEMPTS = 10;
11 | const DEFAULT_RECONNECT_INTERVAL = 1000;
12 | const MAX_RECONNECT_INTERVAL = 30 * 1000;
13 | const DEFAULT_RECONNECT_DECAY = 1.5;
14 |
15 | const WEBSOCKET_TIMEOUT_CODE = 4100;
16 |
17 | const SIG_CONNECTION_CANCELED = "SIG_CONNECTION_CANCELED";
18 | const WEBSOCKET_ERROR = "WEBSOCKET_ERROR";
19 |
20 | enum LOG_LEVEL {
21 | DEBUG,
22 | ERROR,
23 | INFO,
24 | WARN,
25 | }
26 |
27 | class rWebSocket {
28 | private _ws: WebSocket;
29 | _closedManually: boolean = false;
30 | _errored: boolean = false;
31 | _rejected: boolean = false;
32 | _timed_out: boolean = false;
33 | _initialConnectionOk: string | boolean = false;
34 |
35 | constructor(url: string, protocols?: string | string[]) {
36 | this._ws = new WebSocket(url, protocols);
37 | }
38 |
39 | addEventListener(
40 | type: string,
41 | listener: (this: WebSocket, ev: Event) => any,
42 | ) {
43 | this._ws.addEventListener(type, listener);
44 | }
45 |
46 | // Add other WebSocket methods as needed
47 | close(code?: number, reason?: string) {
48 | this._ws.close(code, reason);
49 | }
50 |
51 | send(data: string | ArrayBuffer | Blob | ArrayBufferView) {
52 | this._ws.send(data);
53 | }
54 |
55 | // Add getters for WebSocket properties
56 | get url() {
57 | return this._ws.url;
58 | }
59 |
60 | get readyState() {
61 | return this._ws.readyState;
62 | }
63 | }
64 |
65 | interface WebSocketOptions {
66 | parseBlobToJson?: boolean;
67 | }
68 |
69 | /**
70 | * Builds on top of Javascript Websockets
71 | *
72 |  * This behaves like the WebSocket library in every way, except that if it fails
73 |  * to connect or gets disconnected, it will try to reconnect up to the configured
74 |  * maximum number of reconnect attempts. Retry is not enabled for the initial
75 |  * connection; when the initial connection fails, it is best to check yourself
76 |  * before you keep wreckin' yourself.
77 | *
78 | * It is API compatible, so when you have:
79 | * ws = new WebSocket('ws://....');
80 | * you can replace with:
81 | * ws = new ReconnectingWebSocket('ws://....');
82 | *
83 | * While it is API compatible with the NodeJS ws library, we provide the
84 | * following additional properties and events on the ReconnectingWebSocket.
85 | *
86 | * Events:
87 | *
88 | * connection-timeout
89 | * - Emitted when the web socket connection times out.
90 | *
91 | * reconnecting
92 | * - Emitted after a manual close of the web socket is done and before retrying
93 | * the connection.
94 | *
95 | * reconnect-failed
96 | * - Emitted when the number of connection attempts exceeds the set number of
97 | * reconnection attempts.
98 | *
99 | * keep-alive
100 | * - Emitted when the set keep alive interval elapses. This event may be used
101 | * to have ping pong keep-alive mechanism for web socket health.
102 | *
103 | * Properties:
104 | *
105 | * keepAliveTimeout
106 | * - The timeout for keep-alive. Default: 15000
107 | *
108 | * keepAliveInterval
109 | * - The interval at which to emit keep-alive event. Default: 5000
110 | *
111 | * shouldRetryFn
112 | * - A callback function which should return boolean to determine if a web
113 | * socket reconnection attempt should be made. When not set, connection is
114 | * always retried.
115 | *
116 | * connectionTimeout
117 | * - The timeout interval for considering whether the connection timed out.
118 | * Default: 20000 ms
119 | *
120 | * maxReconnectAttempts
121 | * - The maximum number of attempts to be made for reconnection. Default: 2
122 | *
123 | * reconnectInterval
124 | * - The interval to wait before attempting a reconnection. Default: 1000 ms
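 *
 * Example (an illustrative sketch; the "ping" payload is an assumption, not
 * something this class requires):
 *
 *   const ws = new ReconnectingWebSocket("wss://example.com/socket");
 *   ws.shouldRetryFn = () => true; // always retry after an unexpected drop
 *   ws.on("keep-alive", () => ws.send(JSON.stringify({ type: "ping" })));
 *   ws.on("reconnect-failed", () => console.error("gave up reconnecting"));
 *   await ws.connect();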
125 | */
126 | export class ReconnectingWebSocket extends EventEmitter {
127 | /** The connection is not yet open. */
128 | static readonly CONNECTING: 0;
129 | /** The connection is open and ready to communicate. */
130 | static readonly OPEN: 1;
131 | /** The connection is in the process of closing. */
132 | static readonly CLOSING: 2;
133 | /** The connection is closed. */
134 | static readonly CLOSED: 3;
135 |
136 | private _ws: rWebSocket | null;
137 |
138 | _url: string;
139 | _protocols: string | string[] | undefined;
140 |
141 | declare private _keepAliveTimeout: number;
142 | declare private _keepAliveInterval: number;
143 | declare private _lastMsgRecvTime: number;
144 | declare private _lastMsgSendTime: number;
145 | declare private _disconnected: boolean;
146 | declare private _keepIntervalID: NodeJS.Timeout | null;
147 | declare private _connectionTimeout: number;
148 | declare private _connectionTimeoutID: NodeJS.Timeout | undefined;
149 | declare private _reconnectTimeoutID: NodeJS.Timeout | undefined;
150 | declare private _shouldRetryFn: (() => boolean) | null;
151 | declare private _reconnectAttempts: number;
152 | declare private _allowedReconnectAttempts: number;
153 | declare private _reconnectInterval: number;
154 | declare private _maxReconnectInterval: number;
155 | declare private _reconnectDecay: number;
156 | declare private _parseBlobToJson: boolean;
157 |
158 | constructor(
159 | address: string,
160 | protocols?: string | string[],
161 | options: WebSocketOptions = {},
162 | ) {
163 | super();
164 |
165 | if (!address) {
166 | throw new Error("Need a valid WebSocket URL");
167 | }
168 |
169 | this._ws = null;
170 |
171 | this._url = address;
172 | this._protocols = protocols;
173 | this._parseBlobToJson = options?.parseBlobToJson ?? true;
174 |
175 | this.init();
176 | }
177 |
178 | private init() {
179 | this._keepAliveTimeout = KEEP_ALIVE_TIMEOUT;
180 | this._keepAliveInterval = KEEP_ALIVE_INTERVAL;
181 | this._disconnected = false;
182 | this._keepIntervalID = null;
183 | this._shouldRetryFn = null;
184 | this._connectionTimeout = WEBSOCKET_CONNECTION_TIMEOUT;
185 | this._reconnectAttempts = 0;
186 | this._allowedReconnectAttempts = DEFAULT_RECONNECT_ATTEMPTS;
187 | this._reconnectInterval = DEFAULT_RECONNECT_INTERVAL;
188 | this._maxReconnectInterval = MAX_RECONNECT_INTERVAL;
189 | this._reconnectDecay = DEFAULT_RECONNECT_DECAY;
190 | }
191 |
192 | public async connect() {
193 | return new Promise((resolve, reject) => {
194 | this._disconnected = false;
195 | this.clearReconnectTimeout();
196 |
197 | let ws: rWebSocket = new rWebSocket(this._url, this._protocols);
198 | this.setConnectionTimeout();
199 |
200 | ws.addEventListener("close", (evt) => {
201 | const closeEvent = evt as CloseEvent;
202 | let code = ws._timed_out ? WEBSOCKET_TIMEOUT_CODE : closeEvent.code;
203 | let reason = ws._timed_out
204 | ? "websocket connection timed out"
205 | : closeEvent.reason;
206 | ws._timed_out = false;
207 | if (!ws._closedManually && ws._initialConnectionOk) {
208 | console.warn(
209 | `signaling socket closed unexpectedly: ${code}${
210 | reason ? " " + reason : ""
211 | }`,
212 | );
213 | this._closeSocket();
214 | this.emit("close", code, reason);
215 | } else {
216 | this.log("signaling socket closed");
217 | }
218 | if (!ws._closedManually && (ws._errored || ws._timed_out)) {
219 | console.warn(
220 | `signaling socket closed on error: ${code}${
221 | reason ? " " + reason : ""
222 | }`,
223 | );
224 | if (!ws._rejected) {
225 | ws._rejected = true;
226 | const err = new Error(
227 | `WebSocket connection error (${code}): ${reason}`,
228 | );
229 | err.name = WEBSOCKET_ERROR;
230 | reject(err);
231 | }
232 | }
233 | });
234 | ws.addEventListener("open", (evt) => {
235 | this.log("wss connection opened to", LOG_LEVEL.DEBUG, this._url);
236 | this.clearConnectionTimeout();
237 | // now that the timeout closes the socket, in theory this onopen
238 | // callback should never happen in the first place, but seems
239 | // harmless to leave these safeguards in
240 | if (ws._rejected || ws._timed_out) {
241 | return;
242 | }
243 | if (ws._closedManually || (this._ws && this._ws !== ws)) {
244 | ws._rejected = true;
245 | ws.close();
246 | let err = Error(
247 | "wss connection interrupted by disconnect or newer connection",
248 | );
249 | err.name = SIG_CONNECTION_CANCELED;
250 | reject(err);
251 | return;
252 | }
253 | ws._initialConnectionOk = this._url;
254 | this._lastMsgRecvTime = Date.now();
255 | if (this._keepAliveInterval) {
256 | this._keepIntervalID = setInterval(
257 | () => this.checkSocketHealthAndSendKeepAlive(),
258 | this._keepAliveInterval,
259 | );
260 | }
261 | this._ws = ws;
262 | this.emit("open");
263 | resolve(ws);
264 | });
265 | ws.addEventListener("error", (evt) => {
266 |       // fyi: evt is an Event here, with no helpful info. If there
267 |       // happens to be info about the error, it's included in the
268 |       // accompanying close event (because that makes sense. shakes head)
269 |       // SO. We do not reject here. Instead, we just set the _errored
270 |       // flag on the socket so that when the close event occurs, it knows
271 |       // to reject the promise.
272 | if (!ws._closedManually) {
273 | const wsTarget = evt.currentTarget as WebSocket;
274 | this.log(`websocket error event: ${wsTarget?.url}`);
275 | }
276 | ws._errored = true;
277 | });
278 | ws.addEventListener("message", (msg) => {
279 | void this._handleMessage(msg as MessageEvent);
280 | });
281 | });
282 | }
283 |
284 | private setConnectionTimeout() {
285 | this._connectionTimeoutID = setTimeout(async () => {
286 | this.log("Connection reconnect attempt timed out.");
287 | this.emit("connection-timeout");
288 | this.clearConnectionTimeout();
289 | await this._closeSocket();
290 | }, this._connectionTimeout);
291 | }
292 |
293 | private clearConnectionTimeout() {
294 | clearTimeout(this._connectionTimeoutID);
295 | this._connectionTimeoutID = undefined;
296 | }
297 |
298 | private clearReconnectTimeout() {
299 | clearTimeout(this._reconnectTimeoutID);
300 | this._reconnectTimeoutID = undefined;
301 | }
302 |
303 | private clearKeepAliveInterval() {
304 | if (this._keepIntervalID) {
305 | clearInterval(this._keepIntervalID);
306 | this._keepIntervalID = null;
307 | }
308 | }
309 |
310 | private async checkSocketHealthAndSendKeepAlive() {
311 | if (!(this._ws && this._ws.readyState === WebSocket.OPEN)) {
312 | return;
313 | }
314 |
315 | if (!this._keepAliveTimeout || !this._keepAliveInterval) {
316 | return;
317 | }
318 |
319 |     // Check whether we've gone too long without receiving a message and,
320 |     // if so, close the socket. The OS timeouts for detecting a stale
321 |     // socket are longer than we want.
322 | if (Date.now() - this._lastMsgRecvTime > this._keepAliveTimeout) {
323 | this.log("Connection is stale, need to reconnect", LOG_LEVEL.WARN);
324 | await this._closeSocket();
325 | return;
326 | }
327 |
328 | // Only emit the keep-alive event if we haven't sent anything else recently
329 | if (Date.now() - this._lastMsgSendTime < this._keepAliveInterval) {
330 | return;
331 | }
332 |
333 | this.log("Emitting keep-alive", LOG_LEVEL.DEBUG);
334 | this.emit("keep-alive");
335 | }
336 |
337 |   // We use the word "manually" here to mean that either the application
338 |   // using this code or this code itself decided to close the socket.
339 | private async _closeSocket() {
340 | this.log("Closing");
341 | try {
342 | this.clearKeepAliveInterval();
343 | this._lastMsgRecvTime = 0;
344 |
345 | if (this._ws) {
346 | this._ws._closedManually = true;
347 | this._ws.close();
348 | }
349 |
350 | // query retry function if we want to retry.
351 | const shouldRetry =
352 | this._ws?._initialConnectionOk &&
353 | this._shouldRetryFn &&
354 | this._shouldRetryFn();
355 |
356 | this._ws = null;
357 |
358 | if (shouldRetry) {
359 | this.log("Emitting reconnect", LOG_LEVEL.DEBUG);
360 | this.emit("reconnecting");
361 | await this.retryFailedConnection();
362 | }
363 | } catch (error) {
364 | this.log(`Error while closing and retrying: ${error}`, LOG_LEVEL.ERROR);
365 | }
366 | }
367 |
368 | private async retryFailedConnection() {
369 | if (this._reconnectAttempts < this._allowedReconnectAttempts) {
370 | if (this._reconnectTimeoutID) {
371 | this.log("Retry already scheduled");
372 | return;
373 | }
374 | this.log("Retrying failed connection");
375 | let timeout =
376 | // The timeout logic is taken from
377 | // https://github.com/joewalnes/reconnecting-websocket
378 | this._reconnectInterval *
379 | Math.pow(this._reconnectDecay, this._reconnectAttempts);
380 | timeout =
381 | timeout > this._maxReconnectInterval
382 | ? this._maxReconnectInterval
383 | : timeout;
384 | this.log(`Reconnecting in ${timeout / 1000} seconds`);
385 |
386 | this._reconnectAttempts += 1;
387 | this._reconnectTimeoutID = setTimeout(() => this.connect(), timeout);
388 | } else {
389 | this.log("Maximum connection retry attempts exceeded", LOG_LEVEL.ERROR);
390 | this.emit("reconnect-failed");
391 | }
392 | }
393 |
394 | private log(
395 | msg: string,
396 | log_level: LOG_LEVEL = LOG_LEVEL.DEBUG,
397 | ...args: any
398 | ) {
399 | switch (log_level) {
400 | case LOG_LEVEL.DEBUG:
401 | console.debug(`websocket: ${msg}`, ...args);
402 | break;
403 | case LOG_LEVEL.ERROR:
404 | console.error(`websocket: ${msg}`, ...args);
405 | break;
406 | case LOG_LEVEL.WARN:
407 | console.warn(`websocket: ${msg}`, ...args);
408 | break;
409 | case LOG_LEVEL.INFO:
410 | default:
411 | console.log(`websocket: ${msg}`, ...args);
412 | break;
413 | }
414 | }
415 |
416 | async send(data: any) {
417 | try {
418 | if (this._ws && this._ws.readyState === WebSocket.OPEN) {
419 | this._lastMsgSendTime = Date.now();
420 | this._ws.send(data);
421 | } else {
422 | this.log(`Failed to send data, web socket not open.`, LOG_LEVEL.ERROR);
423 | }
424 | } catch (error) {
425 | this.log(`Failed to send data. ${error}`, LOG_LEVEL.ERROR);
426 | }
427 | }
428 |
429 | async close() {
430 | try {
431 | this.log("Closing websocket");
432 | this._disconnected = true;
433 | this.clearReconnectTimeout();
434 |       await this._closeSocket();
435 | } catch (error) {
436 | this.log(`Failed to close websocket. ${error}`);
437 | }
438 | }
439 |
440 | get readyState(): number {
441 | return this._ws?.readyState ?? WebSocket.CLOSED;
442 | }
443 |
444 | get url(): string {
445 | return this._url;
446 | }
447 |
448 | get keepAliveTimeout(): number {
449 | return this._keepAliveTimeout;
450 | }
451 |
452 | set keepAliveTimeout(keepAliveTimeout: number) {
453 | if (typeof keepAliveTimeout === "number") {
454 | this.log(`Setting ACK freshness timeout to ${keepAliveTimeout}`);
455 | this._keepAliveTimeout = keepAliveTimeout;
456 | }
457 | }
458 |
459 | get keepAliveInterval(): number {
460 | return this._keepAliveInterval;
461 | }
462 |
463 | set keepAliveInterval(keepAliveInterval: number) {
464 | if (typeof keepAliveInterval === "number") {
465 | this.log(`Setting keep-alive interval to ${keepAliveInterval}`);
466 | this._keepAliveInterval = keepAliveInterval;
467 | }
468 | }
469 |
470 | set shouldRetryFn(cb: () => boolean) {
471 | if (typeof cb === "function") {
472 | this._shouldRetryFn = cb;
473 | }
474 | }
475 |
476 | get connectionTimeout(): number {
477 | return this._connectionTimeout;
478 | }
479 |
480 | set connectionTimeout(timeout: number) {
481 | if (typeof timeout === "number") {
482 | this._connectionTimeout = timeout;
483 | }
484 | }
485 |
486 | get maxReconnectAttempts(): number {
487 | return this._allowedReconnectAttempts;
488 | }
489 |
490 | set maxReconnectAttempts(attempts: number) {
491 | if (attempts > 0 && attempts < MAX_RECONNECT_ATTEMPTS) {
492 | this.log(`Setting maximum connection retry attempts to ${attempts}`);
493 | this._allowedReconnectAttempts = attempts;
494 | } else {
495 | this._allowedReconnectAttempts = DEFAULT_RECONNECT_ATTEMPTS;
496 | }
497 | }
498 |
499 | get reconnectInterval(): number {
500 | return this._reconnectInterval;
501 | }
502 |
503 | set reconnectInterval(interval: number) {
504 | if (typeof interval === "number") {
505 | this._reconnectInterval =
506 | interval < this._maxReconnectInterval
507 | ? interval
508 | : this._maxReconnectInterval;
509 | }
510 | }
511 |
512 | async _handleMessage(event: MessageEvent) {
513 | this._lastMsgRecvTime = Date.now();
514 | const data = event.data;
515 |
516 | const _parsePromise = new Promise((resolve, reject) => {
517 | if (typeof data === "string") {
518 | // Handle text message
519 | resolve(data);
520 | } else if (data instanceof ArrayBuffer) {
521 | // Handle binary message
522 | const arrayBuffer = data;
523 | // Parse the ArrayBuffer as needed
524 | // Example: Convert ArrayBuffer to Uint8Array
525 | resolve(new Uint8Array(arrayBuffer));
526 | // Process the Uint8Array as needed
527 | } else if (data instanceof Blob) {
528 | if (!this._parseBlobToJson) {
529 | resolve(data);
530 | return;
531 | }
532 | // Handle Blob message
533 | const blob = data;
534 | // Convert Blob to ArrayBuffer
535 | const reader = new FileReader();
536 | reader.onload = () => {
537 | const text = reader.result as string;
538 | try {
539 | const json = JSON.parse(text);
540 | resolve(json);
541 | } catch (e) {
542 | console.error("Failed to parse JSON from Blob:", e);
543 | }
544 | };
545 | reader.readAsText(blob);
546 | }
547 | });
548 |
549 | let msg = await _parsePromise;
550 |
551 | this.emit("message", msg);
552 | }
553 | }
554 |
555 | [
556 | "binaryType",
557 | "bufferedAmount",
558 | "extensions",
559 | "protocol",
560 | "readyState",
561 | "url",
562 | "keepAliveTimeout",
563 | "keepAliveInterval",
564 | "shouldRetryFn",
565 | "connectionTimeout",
566 | "maxReconnectAttempts",
567 | "reconnectInterval",
568 | ].forEach((property) => {
569 | Object.defineProperty(ReconnectingWebSocket.prototype, property, {
570 | enumerable: true,
571 | });
572 | });
573 |
574 | ["CONNECTING", "OPEN", "CLOSING", "CLOSED"].forEach((property) => {
575 | Object.defineProperty(ReconnectingWebSocket.prototype, property, {
576 | enumerable: true,
577 | value: readyStates.indexOf(property),
578 | });
579 | });
580 |
581 | ["CONNECTING", "OPEN", "CLOSING", "CLOSED"].forEach((property) => {
582 | Object.defineProperty(ReconnectingWebSocket, property, {
583 | enumerable: true,
584 | value: readyStates.indexOf(property),
585 | });
586 | });
587 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "private": true,
3 | "name": "pipecat-client-web-transports",
4 | "version": "0.0.0",
5 | "workspaces": [
6 | "transports/*"
7 | ],
8 | "scripts": {
9 | "build": "npm run build --workspaces"
10 | },
11 | "devDependencies": {
12 | "@parcel/packager-ts": "^2.13.2",
13 | "@parcel/transformer-typescript-tsc": "^2.13.2",
14 | "@parcel/transformer-typescript-types": "^2.13.2",
15 | "@parcel/validator-typescript": "^2.12.0",
16 | "@swc/helpers": "^0.5.13",
17 | "parcel": "^2.13.2",
18 | "prettier": "^3.5.3",
19 | "typescript": "^5.5.4"
20 | },
21 | "peerDependencies": {
22 | "@daily-co/daily-js": "^0.77.0"
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/transports/daily/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to **Pipecat Daily WebRTC Transport** will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 | ## [0.4.0]
9 |
10 | - Bumped dependency to @pipecat-ai/client-js@~0.4.0
11 |
12 | ## [0.3.10]
13 |
14 | - Fix an issue where iOS devices have ~500ms of audio cut off after declaring
15 | that the track state is playable.
16 |
17 | ## [0.3.9]
18 |
19 | DO NOT USE
20 |
21 | ## [0.3.8]
22 |
23 | - Fix issue resulting in the camera starting despite enableCam setting.
24 |
25 | ## [0.3.7]
26 |
27 | - Added support for disconnecting the client if the Daily call errors out.
28 |
29 | ## [0.3.6]
30 |
31 | ### Fixed
32 |
33 | - Fixed an issue where the transport could call `clientReady()` multiple times,
34 | once for each `track-started` event. Now, `clientReady()` is called for the
35 | first track only.
36 |
37 | - Added support for buffering audio until the bot is ready using the
38 | `bufferLocalAudioUntilBotReady` property. Once the bot is ready, the buffered
39 | audio will be sent, allowing the user to begin speaking before the bot has
40 | joined the call.
41 |
42 | ## [0.3.4] - 2024-12-16
43 |
44 | ### Added
45 |
46 | - Screen sharing support
47 | - Added `startScreenShare` and `stopScreenShare` methods
48 | - Added `isSharingScreen` getter property
49 |
50 | ## [0.3.3] - 2024-12-11
51 |
52 | - Fixed READMEs
53 |
54 | ## [0.3.2] - 2024-12-11
55 |
56 | - Added new abstract `RealtimeWebsocketTransport` class for direct
57 | voice-to-voice transports
58 |
59 | - Added new `GeminiLiveWebsocketTransport`
60 |
61 | - Added [basic example](./examples/geminiMultiModalLive) for using
62 | `GeminiLiveWebsocketTransport`
63 |
64 | ## [0.2.3] - 2024-12-06
65 |
66 | ### Fixed
67 |
68 | - Added missing event support for managing audio speakers
69 |
70 | ## [0.2.2] - 2024-11-12
71 |
72 | ### Added
73 |
74 | - Implemented log levels as part of `realtime-ai` package.
75 |
76 | ## [0.2.1] - 2024-10-28
77 |
78 | - Version bump to align with core `realtime-ai` package.
79 |
--------------------------------------------------------------------------------
/transports/daily/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2024, Daily
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/daily/README.md:
--------------------------------------------------------------------------------
1 | # Pipecat's Real-Time Voice Inference - Daily Transport
2 |
3 | [](https://docs.pipecat.ai/client/js/transports/daily)
4 | 
5 | [](https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot)
6 |
7 | Daily transport package for use with `@pipecat-ai/client-js`.
8 |
9 | ## Installation
10 |
11 | ```bash copy
12 | npm install \
13 | @pipecat-ai/client-js \
14 | @pipecat-ai/daily-transport
15 | ```
16 |
17 | ## Overview
18 |
19 | The DailyTransport class provides a WebRTC transport layer using [Daily.co's](https://daily.co) infrastructure. It handles audio/video device management, WebRTC connections, and real-time communication between clients and bots.
20 |
21 | ## Features
22 |
23 | - 🎥 Complete camera device management
24 | - 🎤 Microphone input handling
25 | - 🔊 Speaker output control
26 | - 📡 WebRTC connection management
27 | - 🤖 Bot participant tracking
28 | - 📊 Audio level monitoring
29 | - 💬 Real-time messaging
30 |
31 | ## Usage
32 |
33 | ### Basic Setup
34 |
35 | ```javascript
36 | import { RTVIClient } from "@pipecat-ai/client-js";
37 | import { DailyTransport } from "@pipecat-ai/daily-transport";
38 |
39 | const transport = new DailyTransport({
40 | dailyFactoryOptions: {
41 | // Daily.co specific configuration
42 | }
43 | });
44 |
45 | const rtviClient = new RTVIClient({
46 | transport,
47 | enableCam: false, // Default camera off
48 | enableMic: true, // Default microphone on
49 | callbacks: {
50 | // Event handlers
51 | },
52 | params: {
53 | baseUrl,
54 | endpoints
55 | }
56 | // ...
57 | });
58 |
59 | await rtviClient.connect();
60 | ```
61 |
62 | ## API Reference
63 |
64 | ### Constructor Options
65 |
66 | ```typescript
67 | interface DailyTransportConstructorOptions {
68 | dailyFactoryOptions?: DailyFactoryOptions; // Daily.co specific configuration
69 | }
70 | ```
71 |
72 | ### States
73 |
74 | The transport can be in one of these states:
75 | - "initializing"
76 | - "initialized"
77 | - "connecting"
78 | - "connected"
79 | - "ready"
80 | - "disconnecting"
81 | - "error"
82 |
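You can observe these transitions through the client's `onTransportStateChanged`
callback. A minimal sketch (the callback name comes from the RTVI callback docs
linked in the Events section below; the logging is illustrative only):

```javascript
const rtviClient = new RTVIClient({
  transport,
  callbacks: {
    onTransportStateChanged: (state) => {
      // e.g. "connecting" -> "connected" -> "ready"
      console.log("Transport state:", state);
    },
  },
  params: { baseUrl, endpoints },
});
```
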
83 | ## Events
84 |
85 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info.
86 |
87 | ## Error Handling
88 |
89 | The transport includes error handling for:
90 | - Connection failures
91 | - Device errors
92 | - Authentication issues
93 | - Message transmission problems
94 |
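Connection failures typically surface as a rejected `connect()` promise and an
`"error"` transport state. A minimal sketch (illustrative handling only):

```javascript
try {
  await rtviClient.connect();
} catch (e) {
  // Connection, authentication, and device failures reject here; the transport
  // also moves to the "error" state, observable via onTransportStateChanged.
  console.error("Failed to connect:", e);
}
```
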
95 | ## License
96 | BSD-2 Clause
97 |
--------------------------------------------------------------------------------
/transports/daily/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@pipecat-ai/daily-transport",
3 | "version": "0.4.0",
4 | "license": "BSD-2-Clause",
5 | "main": "dist/index.js",
6 | "module": "dist/index.module.js",
7 | "types": "dist/index.d.ts",
8 | "source": "src/index.ts",
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git"
12 | },
13 | "files": [
14 | "dist",
15 | "package.json",
16 | "README.md"
17 | ],
18 | "scripts": {
19 | "build": "parcel build --no-cache",
20 | "dev": "parcel watch",
21 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0"
22 | },
23 | "devDependencies": {
24 | "@pipecat-ai/client-js": "^0.4.0",
25 | "eslint": "9.11.1",
26 | "eslint-config-prettier": "^9.1.0",
27 | "eslint-plugin-simple-import-sort": "^12.1.1"
28 | },
29 | "peerDependencies": {
30 | "@pipecat-ai/client-js": "~0.4.0"
31 | },
32 | "dependencies": {
33 | "@daily-co/daily-js": "^0.77.0"
34 | },
35 | "description": "Pipecat Daily Transport Package",
36 | "author": "Daily.co",
37 | "bugs": {
38 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues"
39 | },
40 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/daily-webrtc/README.md"
41 | }
42 |
--------------------------------------------------------------------------------
/transports/daily/src/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./transport";
2 |
--------------------------------------------------------------------------------
/transports/daily/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "ESNext",
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "skipLibCheck": true,
7 | "jsx": "preserve",
8 |
9 | /* Bundler mode */
10 | "moduleResolution": "bundler",
11 | "allowImportingTsExtensions": true,
12 | "allowJs": true,
13 | "noEmit": true,
14 | "resolveJsonModule": true,
15 | "isolatedModules": true,
16 | "moduleDetection": "force",
17 |
18 | /* Linting */
19 | "strict": true,
20 | "noUnusedLocals": true,
21 | "noUnusedParameters": false,
22 | "noFallthroughCasesInSwitch": true
23 | },
24 | "include": ["src"]
25 | }
26 |
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2024, Daily
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/README.md:
--------------------------------------------------------------------------------
1 | # Gemini Live Websocket Transport
2 |
3 | [](https://docs.pipecat.ai/client/js/transports/gemini)
4 | [](examples/directToLLMTransports/README.md)
5 | 
6 |
7 | A real-time websocket transport implementation for interacting with Google's Gemini Multimodal Live API, supporting bidirectional audio and unidirectional text communication.
8 |
9 | ## Installation
10 |
11 | ```bash copy
12 | npm install \
13 | @pipecat-ai/client-js \
14 | @pipecat-ai/real-time-websocket-transport \
15 | @pipecat-ai/gemini-live-websocket-transport
16 | ```
17 |
18 | ## Overview
19 |
20 | The `GeminiLiveWebsocketTransport` class extends the `DirectToLLMBaseWebSocketTransport` to implement a fully functional [RTVI `Transport`](https://docs.pipecat.ai/client/js/transports/transport). It provides a framework for implementing real-time communication directly with the [Gemini Multimodal Live](https://ai.google.dev/api/multimodal-live) voice-to-voice service. It handles media device management, audio/video streams, and state management for the connection.
21 |
22 | ## Features
23 |
24 | - Real-time bidirectional communication with Gemini Multimodal Live
25 | - Input device management
26 | - Audio streaming support
27 | - Text message support
28 | - Automatic reconnection handling
29 | - Configurable generation parameters
30 | - Support for initial conversation context
31 |
32 | ## Usage
33 |
34 | ### Basic Setup
35 |
36 | ```javascript
37 | import { GeminiLiveWebsocketTransport, GeminiLLMServiceOptions } from '@pipecat-ai/gemini-live-websocket-transport';
38 |
39 | const options: GeminiLLMServiceOptions = {
40 | api_key: 'YOUR_API_KEY',
41 |   settings: {
42 | temperature: 0.7,
43 | maxOutput_tokens: 1000
44 | }
45 | };
46 |
47 | const transport = new GeminiLiveWebsocketTransport(options);
48 | let RTVIConfig: RTVIClientOptions = {
49 | transport,
50 | ...
51 | };
52 |
53 | ```
54 |
55 | ### Configuration Options
56 |
57 | ```typescript
58 | interface GeminiLLMServiceOptions {
59 | api_key: string; // Required: Your Gemini API key
60 | initial_messages?: Array<{ // Optional: Initial conversation context
61 | content: string;
62 | role: string;
63 | }>;
64 |   settings?: { // Optional: Generation parameters
65 | candidate_count?: number;
66 | maxOutput_tokens?: number;
67 | temperature?: number;
68 | top_p?: number;
69 | top_k?: number;
70 | presence_penalty?: number;
71 | frequency_penalty?: number;
72 | response_modalities?: string;
73 | speech_config?: {
74 | voice_config?: {
75 | prebuilt_voice_config?: {
76 | voice_name: "Puck" | "Charon" | "Kore" | "Fenrir" | "Aoede";
77 | };
78 | };
79 | };
80 | };
81 | }
82 | ```
83 |
84 | ### Sending Messages
85 |
86 | ```javascript
87 | // at setup time...
88 | llmHelper = new LLMHelper({});
89 | rtviClient.registerHelper("llm", llmHelper);
90 | // the 'llm' name in the call above isn't used here;
91 | // that value is specific to working with a Pipecat pipeline
92 |
93 | // at time of sending message...
94 | // Send text prompt message
95 | llmHelper.appendToMessages({ role: "user", content: 'Hello Gemini!' });
96 | ```
97 |
98 | ### Handling Events
99 |
100 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info.
101 |
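For example, speaking and transcript events can be wired up through the client
callbacks. A minimal sketch (callback names follow the RTVI callback docs; the
handlers are illustrative only):

```javascript
let RTVIConfig = {
  transport,
  callbacks: {
    onBotStartedSpeaking: () => console.log("bot started speaking"),
    onBotStoppedSpeaking: () => console.log("bot stopped speaking"),
    onUserStartedSpeaking: () => console.log("user started speaking"),
    onUserTranscript: (transcript) => console.log("user said:", transcript.text),
  },
  // ...
};
```
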
102 | ## API Reference
103 |
104 | ### Methods
105 |
106 | - `initialize()`: Set up the transport and establish connection
107 | - `sendMessage(message)`: Send a text message
108 | - `handleUserAudioStream(data)`: Stream audio data to the model
109 | - `disconnectLLM()`: Close the connection
110 | - `sendReadyMessage()`: Signal ready state
111 |
112 | ### States
113 |
114 | The transport can be in one of the following states:
115 | - "disconnected"
116 | - "initializing"
117 | - "initialized"
118 | - "connecting"
119 | - "connected"
120 | - "ready"
121 | - "disconnecting
122 | - "error"
123 |
124 | ## Error Handling
125 |
126 | The transport includes comprehensive error handling for:
127 | - Connection failures
128 | - Websocket errors
129 | - API key validation
130 | - Message transmission errors
131 |
132 | ## License
133 | BSD-2 Clause
134 |
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@pipecat-ai/gemini-live-websocket-transport",
3 | "version": "0.4.0",
4 | "license": "BSD-2-Clause",
5 | "main": "dist/index.js",
6 | "module": "dist/index.module.js",
7 | "types": "dist/index.d.ts",
8 | "source": "src/index.ts",
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git"
12 | },
13 | "files": [
14 | "dist",
15 | "package.json",
16 | "README.md"
17 | ],
18 | "scripts": {
19 | "build": "parcel build --no-cache",
20 | "dev": "parcel watch",
21 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0"
22 | },
23 | "dependencies": {
24 | "@daily-co/daily-js": "^0.79.0"
25 | },
26 | "devDependencies": {
27 | "@pipecat-ai/client-js": "^0.4.0",
28 | "@types/node": "^22.9.0",
29 | "eslint": "9.11.1",
30 | "eslint-config-prettier": "^9.1.0",
31 | "eslint-plugin-simple-import-sort": "^12.1.1"
32 | },
33 | "peerDependencies": {
34 | "@pipecat-ai/client-js": "~0.4.0"
35 | },
36 | "description": "Pipecat Gemini Multimodal Live Transport Package",
37 | "author": "Daily.co",
38 | "bugs": {
39 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues"
40 | },
41 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/gemini-live-websocket-transport/README.md"
42 | }
43 |
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/src/directToLLMBaseWebSocketTransport.ts:
--------------------------------------------------------------------------------
1 | import {
2 | BotTTSTextData,
3 | RTVIClientOptions,
4 | RTVIMessage,
5 | Tracks,
6 | TranscriptData,
7 | Transport,
8 | TransportState,
9 | } from "@pipecat-ai/client-js";
10 |
11 | import { MediaManager } from "../../../lib/media-mgmt/mediaManager";
12 |
13 | export interface LLMServiceOptions {
14 | api_key?: string;
15 |   initial_messages?: Array<unknown>;
16 |   model?: string;
17 |   settings?: Record<string, unknown>;
18 | }
19 |
20 | /**
21 | * DirectToLLMBaseWebSocketTransport is an abstract class that provides a client-side
22 | * interface for connecting to a real-time AI service. It is intended to
23 | * connect directly to the service. (No Pipecat server is involved.)
24 | */
25 | export abstract class DirectToLLMBaseWebSocketTransport extends Transport {
26 | // Utilities for audio.
27 | private _mediaManager;
28 | protected _service_options: LLMServiceOptions;
29 |
30 | protected _botIsSpeaking = false;
31 |
32 | constructor(service_options: LLMServiceOptions, manager: MediaManager) {
33 | super();
34 | this._service_options = service_options;
35 | this._mediaManager = manager;
36 | this._mediaManager.setUserAudioCallback(
37 | this.handleUserAudioStream.bind(this),
38 | );
39 | }
40 |
41 | /**
42 | * This method will be called from initialize()
43 | * Subclasses should initialize the LLM client and media player/recorder
44 | * and call initializeAudio() from within this method.
45 | */
46 | abstract initializeLLM(): void;
47 | /**
48 | * This method will be called from initialize()
49 |    * Subclasses should set up listeners for LLM events from within this method
50 | */
51 | abstract attachLLMListeners(): void;
52 | /**
53 | * This method will be called from connect()
54 | * Subclasses should connect to the LLM and pass along the initial messages
55 | * @param initial_messages
56 | */
57 |   abstract connectLLM(): Promise<void>;
58 | /**
59 | * This method will be called from disconnect()
60 | * Subclasses should disconnect from the LLM
61 | */
62 |   abstract disconnectLLM(): Promise<void>;
63 | /**
64 | * This method will be called regularly with audio data from the user
65 | * Subclasses should handle this data and pass it along to the LLM
66 | * @param data ArrayBuffer of audio data
67 | */
68 | abstract handleUserAudioStream(data: ArrayBuffer): void;
69 |
70 | // subclasses should implement this method to initialize the LLM
71 | // client and call super() on this method
72 | initialize(
73 | options: RTVIClientOptions,
74 | messageHandler: (ev: RTVIMessage) => void,
75 | ): void {
76 | this._options = options;
77 | this._callbacks = options.callbacks ?? {};
78 | this._onMessage = messageHandler;
79 |
80 | this._mediaManager.setRTVIOptions(options);
81 |
82 | this.initializeLLM();
83 |
84 | this.attachDeviceListeners();
85 | this.attachLLMListeners();
86 |
87 | this.state = "disconnected";
88 | }
89 |
90 |   async initDevices(): Promise<void> {
91 | this.state = "initializing";
92 | await this._mediaManager.initialize();
93 | this.state = "initialized";
94 | }
95 |
96 | async connect(
97 | authBundle: unknown,
98 | abortController: AbortController,
99 |   ): Promise<void> {
100 | this.state = "connecting";
101 |
102 | await this.connectLLM();
103 |
104 | // connect user audio to llm
105 | this._mediaManager.connect();
106 | this.state = "connected";
107 | this._callbacks.onConnected?.();
108 | }
109 |
110 |   async disconnect(): Promise<void> {
111 | this.state = "disconnecting";
112 | await this._mediaManager.disconnect();
113 | await this.disconnectLLM();
114 | this.state = "disconnected";
115 | this._callbacks.onDisconnected?.();
116 | }
117 |
118 |   getAllMics(): Promise<MediaDeviceInfo[]> {
119 |     return this._mediaManager.getAllMics();
120 |   }
121 |   getAllCams(): Promise<MediaDeviceInfo[]> {
122 |     return this._mediaManager.getAllCams();
123 |   }
124 |   getAllSpeakers(): Promise<MediaDeviceInfo[]> {
125 |     return this._mediaManager.getAllSpeakers();
126 |   }
127 |
128 |   async updateMic(micId: string): Promise<void> {
129 | return this._mediaManager.updateMic(micId);
130 | }
131 | updateCam(camId: string): void {
132 | return this._mediaManager.updateCam(camId);
133 | }
134 | updateSpeaker(speakerId: string): void {
135 | return this._mediaManager.updateSpeaker(speakerId);
136 | }
137 |
138 |   get selectedMic(): MediaDeviceInfo | Record<string, never> {
139 |     return this._mediaManager.selectedMic;
140 |   }
141 |   get selectedCam(): MediaDeviceInfo | Record<string, never> {
142 |     return this._mediaManager.selectedCam;
143 |   }
144 |   get selectedSpeaker(): MediaDeviceInfo | Record<string, never> {
145 |     return this._mediaManager.selectedSpeaker;
146 |   }
147 |
148 | enableMic(enable: boolean): void {
149 | this._mediaManager.enableMic(enable);
150 | }
151 | enableCam(enable: boolean): void {
152 | this._mediaManager.enableCam(enable);
153 | }
154 |
155 | get isCamEnabled(): boolean {
156 | return this._mediaManager.isCamEnabled;
157 | }
158 | get isMicEnabled(): boolean {
159 | return this._mediaManager.isMicEnabled;
160 | }
161 |
162 | get state(): TransportState {
163 | return this._state;
164 | }
165 |
166 | set state(state: TransportState) {
167 | if (this._state === state) return;
168 |
169 | this._state = state;
170 | this._callbacks.onTransportStateChanged?.(state);
171 | }
172 |
173 | get expiry(): number | undefined {
174 | return this._expiry;
175 | }
176 |
177 | tracks(): Tracks {
178 | return this._mediaManager.tracks();
179 | }
180 |
181 | // Realtime event handlers
182 |   async userStartedSpeaking(): Promise<unknown> {
183 | // Handle interruption
184 | const trackSampleOffset = await this._mediaManager.userStartedSpeaking();
185 | this._callbacks.onUserStartedSpeaking?.();
186 | return trackSampleOffset;
187 | }
188 |
189 | userStoppedSpeaking(): void {
190 | this._callbacks.onUserStoppedSpeaking?.();
191 | }
192 |
193 | userTranscript(transcript: TranscriptData): void {
194 | this._callbacks.onUserTranscript?.(transcript);
195 | }
196 |
197 | botStartedSpeaking(): void {
198 | if (!this._botIsSpeaking) {
199 | this._botIsSpeaking = true;
200 | this._callbacks.onBotStartedSpeaking?.();
201 | }
202 | }
203 |
204 | botStoppedSpeaking(): void {
205 | if (this._botIsSpeaking) {
206 | this._botIsSpeaking = false;
207 | this._callbacks.onBotStoppedSpeaking?.();
208 | }
209 | }
210 |
211 | botTtsText(data: BotTTSTextData): void {
212 | this._callbacks.onBotTtsText?.(data);
213 | }
214 |
215 | bufferBotAudio(audio: ArrayBuffer, id?: string): void {
216 | this._mediaManager.bufferBotAudio(audio, id);
217 | }
218 |
219 | connectionError(errorMsg: string): void {
220 | console.error(errorMsg);
221 | this.state = "error";
222 | this.disconnect();
223 | }
224 |
225 | private attachDeviceListeners(): void {}
226 | }
227 |
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/src/geminiLiveWebSocketTransport.ts:
--------------------------------------------------------------------------------
1 | import { MediaManager } from "../../../lib/media-mgmt/mediaManager";
2 | import { DailyMediaManager } from "../../../lib/media-mgmt/dailyMediaManager";
3 |
4 | import {
5 | logger,
6 | RTVIActionRequestData,
7 | RTVIMessage,
8 | RTVIMessageType,
9 | TransportStartError,
10 | } from "@pipecat-ai/client-js";
11 | import { ReconnectingWebSocket } from "../../../lib/websocket-utils/reconnectingWebSocket";
12 | import {
13 | DirectToLLMBaseWebSocketTransport,
14 | LLMServiceOptions,
15 | } from "./directToLLMBaseWebSocketTransport";
16 |
17 | const HOST = `generativelanguage.googleapis.com`;
18 | const BIDI_PATH = `google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent`;
19 | const MODEL = "models/gemini-2.0-flash-exp";
20 |
21 | export interface GeminiLLMServiceOptions extends LLMServiceOptions {
22 | initial_messages?: Array<{ content: string; role: string }>;
23 | api_key: string;
24 | settings?: {
25 | candidate_count?: number;
26 | maxOutput_tokens?: number;
27 | temperature?: number;
28 | top_p?: number;
29 | top_k?: number;
30 | presence_penalty?: number;
31 | frequency_penalty?: number;
32 | response_modalities?: string;
33 | speech_config?: {
34 | voice_config?: {
35 | prebuilt_voice_config?: {
36 | voice_name: "Puck" | "Charon" | "Kore" | "Fenrir" | "Aoede";
37 | // | "Voice O";
38 | };
39 | };
40 | };
41 | };
42 | }
43 |
44 | export class GeminiLiveWebsocketTransport extends DirectToLLMBaseWebSocketTransport {
45 | declare private _ws: ReconnectingWebSocket | null;
46 | declare private _botResponseID: number;
47 | declare private _botIsReadyResolve:
48 |     | ((value: void | PromiseLike<void>) => void)
49 | | null;
50 |
51 | constructor(
52 | service_options: GeminiLLMServiceOptions,
53 | manager?: MediaManager,
54 | ) {
55 | if (!manager) {
56 | manager = new DailyMediaManager();
57 | }
58 | super(service_options, manager);
59 |
60 | this._ws = null;
61 |
62 | this._botResponseID = 0;
63 | }
64 |
65 | initializeLLM(): void {
66 | const service_options = this._service_options as GeminiLLMServiceOptions;
67 | const apiKey = service_options.api_key;
68 | if (!apiKey) {
69 | console.error("!!! No API key provided in llm_service_options");
70 | return;
71 | }
72 | const base_url = `wss://${HOST}/ws/${BIDI_PATH}`;
73 | this._ws = new ReconnectingWebSocket(`${base_url}?key=${apiKey}`);
74 | // don't run the keep alive interval until we determine if there's an api for it
75 | this._ws.keepAliveInterval = 0;
76 | }
77 |
78 | // This is called from super.initialize()
79 | attachLLMListeners(): void {
80 | if (!this._ws) {
81 | console.error(
82 | "attachLLMListeners called before the websocket is initialized. Be sure to call initializeLLM() first.",
83 | );
84 | return;
85 | }
86 | this._ws.on("open", () => {});
87 | this._ws.on("message", async (msg: any) => {
88 | const content = msg.serverContent;
89 | if (!content) {
90 | if ("setupComplete" in msg) {
91 | this.state = "ready";
92 | if (this._botIsReadyResolve) {
93 | this._botIsReadyResolve();
94 | this._botIsReadyResolve = null;
95 | }
96 | } else {
97 | console.log("received unknown message", msg);
98 | }
99 | return;
100 | }
101 | if (content.modelTurn) {
102 | let result: ArrayBuffer | null = null;
103 | content.modelTurn.parts?.forEach((part: { inlineData: any }) => {
104 | if (part.inlineData?.data) {
105 | if (result) {
106 |               result = mergeBuffers(result, base64ToArrayBuffer(part.inlineData.data));
107 | } else {
108 | result = base64ToArrayBuffer(part.inlineData.data);
109 | }
110 | }
111 | });
112 | if (result) {
113 | if (!this._botIsSpeaking) {
114 | this._botResponseID++;
115 | this.botStartedSpeaking();
116 | }
117 | this.bufferBotAudio(result, this._botResponseID.toString());
118 | }
119 | } else if (content.interrupted) {
120 | await this.userStartedSpeaking();
121 | } else if (content.turnComplete) {
122 | this.botStoppedSpeaking();
123 | } else {
124 | // console.log('unhandled message', content);
125 | }
126 | });
127 | this._ws.on("error", (error: Error) => {
128 | this.connectionError(`websocket error: ${error}`);
129 | });
130 | this._ws.on("connection-timeout", () => {
131 | this.connectionError("websocket connection timed out");
132 | });
133 | this._ws.on("close", (code: number) => {
134 | this.connectionError(`websocket connection closed. Code: ${code}`);
135 | });
136 | this._ws.on("reconnect-failed", () => {
137 | this.connectionError(`websocket reconnect failed`);
138 | });
139 | }
140 |
141 |   async connectLLM(): Promise<void> {
142 | if (!this._ws) {
143 | console.error(
144 | "connectLLM called before the websocket is initialized. Be sure to call initializeLLM() first.",
145 | );
146 | return;
147 | }
148 | try {
149 | await this._ws.connect();
150 | } catch (error) {
151 | const msg = `Failed to connect to LLM: ${error}`;
152 | console.error(msg);
153 | this.state = "error";
154 | throw new TransportStartError(msg);
155 | }
156 |
157 | const service_options = this._service_options as GeminiLLMServiceOptions;
158 | const model = service_options?.model ?? MODEL;
159 | const generation_config = service_options?.settings ?? {};
160 | let config = { setup: { model, generation_config } };
161 | await this._sendMsg(config);
162 |
163 | // For this bare-bones prototype, let's just see if we have any initial_messages in the params
164 | // we were constructed with.
165 | if (service_options?.initial_messages) {
166 | service_options.initial_messages.forEach(
167 | (msg: { content: string; role: string }) => {
168 | this._sendTextInput(msg.content, msg.role);
169 | },
170 | );
171 | }
172 | }
173 |
174 |   async disconnectLLM(): Promise<void> {
175 | await this._ws?.close();
176 | }
177 |
178 |   async sendReadyMessage(): Promise<void> {
179 |     const p = new Promise<void>((resolve) => {
180 | if (this.state === "ready") {
181 | resolve();
182 | } else {
183 | this._botIsReadyResolve = resolve;
184 | }
185 | });
186 | await p;
187 | this._onMessage({
188 | type: RTVIMessageType.BOT_READY,
189 | data: {},
190 | } as RTVIMessage);
191 | }
192 |
193 | handleUserAudioStream(data: ArrayBuffer): void {
194 | if (this.state === "ready") {
195 | try {
196 | void this._sendAudioInput(data);
197 | } catch (error) {
198 | console.error("Error adding audio to stream player", error);
199 | this.state = "error";
200 | // todo: should check this error more carefully, implement disconnect, implement
201 | // ping/ack connection monitoring and reconnection logic, etc.
202 | }
203 | }
204 | }
205 |
206 | sendMessage(message: RTVIMessage): void {
207 | switch (message.type) {
208 | case "action":
209 | {
210 | const data = message.data as RTVIActionRequestData;
211 | switch (data.action) {
212 | case "append_to_messages":
213 | if (data.arguments) {
214 | for (const a of data.arguments) {
215 | if (a.name === "messages") {
216 | const value = a.value as Array<{
217 | content: string;
218 | role: string;
219 | }>;
220 | for (const m of value) {
221 | this._sendTextInput(m.content, m.role);
222 | }
223 | }
224 | }
225 | }
226 | break;
227 | case "get_context":
228 | case "set_context":
229 | console.warn("get_context and set_context are not implemented");
230 | break;
231 | }
232 | }
233 | break;
234 | }
235 | }
236 |
237 |   async _sendAudioInput(data: ArrayBuffer): Promise<void> {
238 | // TODO: pull this number from the media manager
239 | const sampleRate = 24000;
240 | const msg = {
241 | realtimeInput: {
242 | mediaChunks: [
243 | {
244 | mimeType: `audio/pcm;rate=${sampleRate}`,
245 | data: arrayBufferToBase64(data),
246 | },
247 | ],
248 | },
249 | };
250 | await this._sendMsg(msg);
251 | }
252 |
253 |   async _sendTextInput(text: string, role: string): Promise<void> {
254 | const msg = {
255 | clientContent: {
256 | turns: [
257 | {
258 | role,
259 | parts: [{ text }],
260 | },
261 | ],
262 | turnComplete: role === "user" ? true : false,
263 | },
264 | };
265 | await this._sendMsg(msg);
266 | }
267 |
268 |   async _sendMsg(msg: unknown): Promise<void> {
269 | if (!this._ws) {
270 | console.error("sendMsg called but WS is null");
271 | return;
272 | }
273 | if (this._ws.readyState !== WebSocket.OPEN) {
274 | console.error("attempt to send to closed socket");
275 | return;
276 | }
277 | if (!msg) {
278 | console.error("need a msg to send a msg");
279 | return;
280 | }
281 | try {
282 | await this._ws.send(JSON.stringify(msg));
283 | } catch (e) {
284 | console.error("sendMsg error", e);
285 | }
286 | }
287 |
288 | // Not implemented
289 | enableScreenShare(enable: boolean): void {
290 | logger.error(
291 | "startScreenShare not implemented for GeminiLiveWebsocketTransport",
292 | );
293 | throw new Error("Not implemented");
294 | }
295 |
296 | public get isSharingScreen(): boolean {
297 | logger.error(
298 | "isSharingScreen not implemented for GeminiLiveWebsocketTransport",
299 | );
300 | return false;
301 | }
302 | }
303 |
304 | function base64ToArrayBuffer(base64: string): ArrayBuffer {
305 | const binaryString = atob(base64);
306 | const len = binaryString.length;
307 | const bytes = new Uint8Array(len);
308 | for (let i = 0; i < len; i++) {
309 | bytes[i] = binaryString.charCodeAt(i);
310 | }
311 | return bytes.buffer;
312 | }
313 |
314 | function arrayBufferToBase64(buffer: ArrayBuffer): string {
315 | const bytes = new Uint8Array(buffer);
316 | let binary = "";
317 | for (let i = 0; i < bytes.byteLength; i++) {
318 | binary += String.fromCharCode(bytes[i]);
319 | }
320 | return btoa(binary);
321 | }
322 |
323 | function mergeBuffers(
324 | leftBuffer: ArrayBuffer,
325 | rightBuffer: ArrayBuffer,
326 | ): ArrayBuffer {
327 | const tmpArray = new Uint8Array(
328 | leftBuffer.byteLength + rightBuffer.byteLength,
329 | );
330 | tmpArray.set(new Uint8Array(leftBuffer), 0);
331 | tmpArray.set(new Uint8Array(rightBuffer), leftBuffer.byteLength);
332 | return tmpArray.buffer;
333 | }
334 |
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/src/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./geminiLiveWebSocketTransport";
2 |
--------------------------------------------------------------------------------
/transports/gemini-live-websocket-transport/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "ESNext",
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "types": ["node"],
7 | "skipLibCheck": true,
8 | "jsx": "preserve",
9 |
10 | /* Bundler mode */
11 | "moduleResolution": "bundler",
12 | "allowImportingTsExtensions": true,
13 | "allowJs": true,
14 | "noEmit": true,
15 | "resolveJsonModule": true,
16 | "isolatedModules": true,
17 | "moduleDetection": "force",
18 |
19 | /* Linting */
20 | "strict": true,
21 | "noUnusedLocals": true,
22 | "noUnusedParameters": false,
23 | "noFallthroughCasesInSwitch": true
24 | },
25 | "include": ["src"]
26 | }
27 |
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2024, Daily
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/README.md:
--------------------------------------------------------------------------------
1 | # OpenAI RealTime WebRTC Transport
2 |
3 | [](https://docs.pipecat.ai/client/js/transports/openai-webrtc)
4 | [](examples/directToLLMTransports/README.md)
5 | 
6 |
7 | A real-time WebRTC transport implementation for interacting with the OpenAI Realtime API, supporting bidirectional audio and unidirectional text communication.
8 |
9 | ## Installation
10 |
11 | ```bash copy
12 | npm install \
13 | @pipecat-ai/client-js \
14 | @pipecat-ai/openai-realtime-webrtc-transport
15 | ```
16 |
17 | ## Overview
18 |
19 | The `OpenAIRealTimeWebRTCTransport` is a fully functional [RTVI `Transport`](https://docs.pipecat.ai/client/js/transports/transport). It provides a framework for implementing real-time communication directly with the [OpenAI Realtime API using WebRTC](https://platform.openai.com/docs/guides/realtime-webrtc) voice-to-voice service. It handles media device management, audio/video streams, and state management for the connection.
20 |
21 | ## Features
22 |
23 | - Real-time bidirectional communication with OpenAI Realtime API
24 | - Input device management
25 | - Audio streaming support
26 | - Text message support
27 | - Automatic reconnection handling
28 | - Configurable generation parameters
29 | - Support for initial conversation context
30 |
31 | ## Usage
32 |
33 | ### Basic Setup
34 |
35 | ```javascript
36 | import { OpenAIRealTimeWebRTCTransport, OpenAIServiceOptions } from '@pipecat-ai/openai-realtime-webrtc-transport';
37 |
38 | const options: OpenAIServiceOptions = {
39 | api_key: 'YOUR_API_KEY',
40 | session_config: {
41 | instructions: 'you are a confused jellyfish',
42 | }
43 | };
44 |
45 | const transport = new OpenAIRealTimeWebRTCTransport(options);
46 | let RTVIConfig: RTVIClientOptions = {
47 | transport,
48 | ...
49 | };
50 |
51 | ```
52 |
53 | ### Configuration Options
54 |
55 | ```typescript
56 | /**********************************
57 | * OpenAI-specific types
58 | * types and comments below are based on:
59 | * gpt-4o-realtime-preview-2024-12-17
60 | **********************************/
61 | type JSONSchema = { [key: string]: any };
62 | export type OpenAIFunctionTool = {
63 | type: "function";
64 | name: string;
65 | description: string;
66 | parameters: JSONSchema;
67 | };
68 |
69 | export type OpenAIServerVad = {
70 | type: "server_vad";
71 | create_response?: boolean; // defaults to true
72 | interrupt_response?: boolean; // defaults to true
73 | prefix_padding_ms?: number; // defaults to 300ms
74 | silence_duration_ms?: number; // defaults to 500ms
75 | threshold?: number; // range (0.0, 1.0); defaults to 0.5
76 | };
77 |
78 | export type OpenAISemanticVAD = {
79 | type: "semantic_vad";
80 | eagerness?: "low" | "medium" | "high" | "auto"; // defaults to "auto", equivalent to "medium"
81 | create_response?: boolean; // defaults to true
82 | interrupt_response?: boolean; // defaults to true
83 | };
84 |
85 | export type OpenAISessionConfig = Partial<{
86 | modalities?: string;
87 | instructions?: string;
88 | voice?:
89 | | "alloy"
90 | | "ash"
91 | | "ballad"
92 | | "coral"
93 | | "echo"
94 | | "sage"
95 | | "shimmer"
96 | | "verse";
97 | input_audio_noise_reduction?: {
98 | type: "near_field" | "far_field";
99 | } | null; // defaults to null/off
100 | input_audio_transcription?: {
101 | model: "whisper-1" | "gpt-4o-transcribe" | "gpt-4o-mini-transcribe";
102 | language?: string;
103 | prompt?: string[] | string; // gpt-4o models take a string
104 | } | null; // we default this to gpt-4o-transcribe
105 | turn_detection?: OpenAIServerVad | OpenAISemanticVAD | null; // defaults to server_vad
106 | temperature?: number;
107 | max_tokens?: number | "inf";
108 |   tools?: Array<OpenAIFunctionTool>;
109 | }>;
110 |
111 | export interface OpenAIServiceOptions {
112 | api_key: string;
113 | model?: string;
114 | initial_messages?: LLMContextMessage[];
115 | settings?: OpenAISessionConfig;
116 | }
117 | ```
118 |
119 | ### Sending Messages
120 |
121 | ```javascript
122 | // at setup time...
123 | llmHelper = new LLMHelper({});
124 | rtviClient.registerHelper("llm", llmHelper);
125 | // The 'llm' name in the call above isn't used;
126 | // that value is specific to working with a Pipecat pipeline.
127 |
128 | // at time of sending message...
129 | // Send text prompt message
130 | llmHelper.appendToMessages({ role: "user", content: 'Hello OpenAI!' });
131 | ```
132 |
133 | ### Handling Events
134 |
135 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info.
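As a minimal sketch (callback names follow the RTVI callback reference linked above; pick the handlers your app actually needs):

```javascript
const rtviClient = new RTVIClient({
  transport,
  callbacks: {
    // Fired as the bot starts and stops producing audio
    onBotStartedSpeaking: () => console.log("bot started speaking"),
    onBotStoppedSpeaking: () => console.log("bot stopped speaking"),
    // User transcription and bot TTS text, as they arrive
    onUserTranscript: (transcript) => console.log("user:", transcript.text),
    onBotTtsText: (data) => console.log("bot:", data.text),
  },
  // ...
});
```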
136 |
137 | ### Updating Session Configuration
138 |
139 | ```javascript
140 | transport.updateSessionConfig({
141 |   instructions: 'you are an over-sharing neighbor',
142 | input_audio_noise_reduction: {
143 | type: 'near_field'
144 | }
145 | });
146 | ```
147 |
148 | ## API Reference
149 |
150 | ### Methods
151 |
152 | - `initialize()`: Set up the transport and establish connection
153 | - `sendMessage(message)`: Send a text message
154 | - `handleUserAudioStream(data)`: Stream audio data to the model
155 | - `disconnectLLM()`: Close the connection
156 | - `sendReadyMessage()`: Signal ready state
157 |
158 | ### States
159 |
160 | The transport can be in one of the following states:
161 | - "disconnected"
162 | - "initializing"
163 | - "initialized"
164 | - "connecting"
165 | - "connected"
166 | - "ready"
167 | - "disconnecting
168 | - "error"
169 |
170 | ## Error Handling
171 |
172 | The transport includes comprehensive error handling for:
173 | - Connection failures
174 | - WebRTC connection errors
175 | - API key validation
176 | - Message transmission errors
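
As a rough sketch of surfacing these on the client (assuming the `RTVIClient` setup shown above), a failed connection rejects `connect()` and leaves the transport in the "error" state:

```javascript
try {
  await rtviClient.connect();
} catch (e) {
  // Thrown as a TransportStartError when the WebRTC connection
  // or API key validation fails
  console.error("Connection failed:", e);
  console.log("Transport state:", transport.state); // "error"
}
```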
177 |
178 | ## License
179 | BSD-2 Clause
180 |
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@pipecat-ai/openai-realtime-webrtc-transport",
3 | "version": "0.4.0",
4 | "license": "BSD-2-Clause",
5 | "main": "dist/index.js",
6 | "module": "dist/index.module.js",
7 | "types": "dist/index.d.ts",
8 | "source": "src/index.ts",
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git"
12 | },
13 | "files": [
14 | "dist",
15 | "package.json",
16 | "README.md"
17 | ],
18 | "scripts": {
19 | "build": "parcel build --no-cache",
20 | "dev": "parcel watch",
21 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0"
22 | },
23 | "devDependencies": {
24 | "@pipecat-ai/client-js": "^0.4.0",
25 | "@types/node": "^22.9.0",
26 | "eslint": "9.11.1",
27 | "eslint-config-prettier": "^9.1.0",
28 | "eslint-plugin-simple-import-sort": "^12.1.1"
29 | },
30 | "peerDependencies": {
31 | "@pipecat-ai/client-js": "~0.4.0"
32 | },
33 | "dependencies": {
34 | "@daily-co/daily-js": "^0.77.0",
35 | "dequal": "^2.0.3"
36 | },
37 | "description": "Pipecat OpenAI RealTime Transport Package",
38 | "author": "Daily.co",
39 | "bugs": {
40 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues"
41 | },
42 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/openai-llm-direct-transport/README.md"
43 | }
44 |
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/src/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./OpenAIRealTimeWebRTCTransport";
2 |
--------------------------------------------------------------------------------
/transports/openai-realtime-webrtc-transport/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "ESNext",
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "types": ["node"],
7 | "skipLibCheck": true,
8 | "jsx": "preserve",
9 |
10 | /* Bundler mode */
11 | "moduleResolution": "bundler",
12 | "allowImportingTsExtensions": true,
13 | "noEmit": true,
14 | "resolveJsonModule": true,
15 | "isolatedModules": true,
16 | "moduleDetection": "force",
17 |
18 | /* Linting */
19 | "strict": true,
20 | "noUnusedLocals": true,
21 | "noUnusedParameters": false,
22 | "noFallthroughCasesInSwitch": true
23 | },
24 | "include": ["src"]
25 | }
26 |
--------------------------------------------------------------------------------
/transports/small-webrtc-transport/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to **Pipecat Small WebRTC Transport** will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 | ## [0.4.0]
9 |
10 | - Bumped dependency to @pipecat-ai/client-js@~0.4.0
11 |
12 | ## [0.0.5] - 2025-05-19
13 |
14 | ### Fixed
15 |
16 | - `SmallWebRTCTransport` updates transport state to 'ready' when client ready message is sent.
17 |
18 | ## [0.0.4] - 2025-04-29
19 |
20 | ### Added
21 |
22 | - Added `waitForICEGathering` property: this allows users to configure whether the transport should
23 | explicitly wait for the iceGatheringState to become complete during the negotiation phase.
24 |
25 | ### Fixed
26 |
27 | - `SmallWebRTCTransport` class now accepts `RTCIceServer`[] instead of just the `String`[] of urls.
28 |
29 | ## [0.0.3] - 2025-04-11
30 |
31 | ### Added
32 |
33 | - Handle new incoming `peerLeft` signalling messages from Pipecat.
34 |
35 | ## [0.0.2] - 2025-04-10
36 |
37 | ### Added
38 |
39 | - Send a signalling message whenever a track is enabled or disabled.
40 | - Handle incoming `renegotiate` signalling messages from Pipecat in a new format.
41 |
42 | ## [0.0.1] - 2025-04-09
43 |
44 | ### Added
45 |
46 | - Web client transport for the Pipecat **SmallWebRTCTransport**.
47 |
--------------------------------------------------------------------------------
/transports/small-webrtc-transport/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2024, Daily
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/small-webrtc-transport/README.md:
--------------------------------------------------------------------------------
1 | # Pipecat's Real-Time Voice Inference - Small WebRTC Transport
2 |
3 | [](https://docs.pipecat.ai/client/js/transports/small-webrtc)
4 | 
5 | [](https://github.com/pipecat-ai/pipecat/tree/main/examples/p2p-webrtc)
6 |
7 | Small WebRTC transport package for use with `@pipecat-ai/client-js`.
8 |
9 | ## Installation
10 |
11 | ```bash copy
12 | npm install \
13 | @pipecat-ai/client-js \
14 | @pipecat-ai/small-webrtc-transport
15 | ```
16 |
17 | ## Overview
18 |
19 | The SmallWebRTCTransport class provides a WebRTC transport layer that establishes a peer connection with a Pipecat SmallWebRTCTransport running on the server. It handles audio/video device management, WebRTC connections, and real-time communication between client and bot.
20 |
21 | ## Features
22 |
23 | - 🎥 Complete camera device management
24 | - 🎤 Microphone input handling
25 | - 📡 WebRTC connection management
26 | - 🤖 Bot participant tracking
27 | - 💬 Real-time messaging
28 |
29 | ## Usage
30 |
31 | ### Basic Setup
32 |
33 | ```javascript
34 | import { RTVIClient } from "@pipecat-ai/client-js";
35 | import { SmallWebRTCTransport } from "@pipecat-ai/small-webrtc-transport";
36 |
37 | const transport = new SmallWebRTCTransport();
38 |
39 | const rtviClient = new RTVIClient({
40 | transport,
41 | enableCam: false, // Default camera off
42 | enableMic: true, // Default microphone on
43 | callbacks: {
44 | // Event handlers
45 | },
46 | params: {
47 | baseUrl,
48 | endpoints
49 | }
50 | // ...
51 | });
52 |
53 | await rtviClient.connect();
54 | ```
55 |
56 | ## API Reference
57 |
58 | ### States
59 |
60 | The transport can be in one of these states:
61 | - "initializing"
62 | - "initialized"
63 | - "connecting"
64 | - "connected"
65 | - "ready"
66 | - "disconnecting"
67 | - "error"
68 |
69 | ## Events
70 |
71 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info.
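
For example, a minimal sketch of observing the connection lifecycle (callback names follow the RTVI callback reference linked above):

```javascript
const rtviClient = new RTVIClient({
  transport,
  callbacks: {
    onConnected: () => console.log("connected to bot"),
    onDisconnected: () => console.log("disconnected"),
    // Tracks the transport through "connecting" -> "connected" -> "ready"
    onTransportStateChanged: (state) => console.log("transport state:", state),
  },
  // ...
});
```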
72 |
73 | ## Error Handling
74 |
75 | The transport includes error handling for:
76 | - Connection failures
77 | - Device errors
78 |
79 | ## License
80 | BSD-2 Clause
81 |
--------------------------------------------------------------------------------
/transports/small-webrtc-transport/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@pipecat-ai/small-webrtc-transport",
3 | "version": "0.4.0",
4 | "license": "BSD-2-Clause",
5 | "main": "dist/index.js",
6 | "module": "dist/index.module.js",
7 | "types": "dist/index.d.ts",
8 | "source": "src/index.ts",
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git"
12 | },
13 | "files": [
14 | "dist",
15 | "package.json",
16 | "README.md"
17 | ],
18 | "scripts": {
19 | "build": "parcel build --no-cache",
20 | "dev": "parcel watch",
21 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0",
22 | "prepare": "npm run build"
23 | },
24 | "devDependencies": {
25 | "@pipecat-ai/client-js": "^0.4.0",
26 | "@types/node": "^22.9.0",
27 | "eslint": "9.11.1",
28 | "eslint-config-prettier": "^9.1.0",
29 | "eslint-plugin-simple-import-sort": "^12.1.1"
30 | },
31 | "peerDependencies": {
32 | "@pipecat-ai/client-js": "~0.4.0"
33 | },
34 | "dependencies": {
35 | "@daily-co/daily-js": "^0.77.0",
36 | "dequal": "^2.0.3"
37 | },
38 | "description": "Pipecat Small WebRTC Transport Package",
39 | "author": "Daily.co",
40 | "bugs": {
41 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues"
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/transports/small-webrtc-transport/src/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./smallWebRTCTransport";
2 |
--------------------------------------------------------------------------------
/transports/small-webrtc-transport/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "ESNext",
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "types": ["node"],
7 | "skipLibCheck": true,
8 | "jsx": "preserve",
9 |
10 | /* Bundler mode */
11 | "moduleResolution": "bundler",
12 | "allowImportingTsExtensions": true,
13 | "allowJs": true,
14 | "noEmit": true,
15 | "resolveJsonModule": true,
16 | "isolatedModules": true,
17 | "moduleDetection": "force",
18 |
19 | /* Linting */
20 | "strict": true,
21 | "noUnusedLocals": true,
22 | "noUnusedParameters": false,
23 | "noFallthroughCasesInSwitch": true
24 | },
25 | "include": ["src"]
26 | }
27 |
--------------------------------------------------------------------------------
/transports/websocket-transport/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2024, Daily
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/transports/websocket-transport/README.md:
--------------------------------------------------------------------------------
1 | # Websocket Transport
2 |
3 | [](https://github.com/pipecat-ai/pipecat/tree/main/examples/websocket/README.md)
4 | 
5 |
6 | Websocket transport package for use with `@pipecat-ai/client-js`.
7 |
8 | ## Installation
9 |
10 | ```bash copy
11 | npm install \
12 | @pipecat-ai/client-js \
13 | @pipecat-ai/websocket-transport
14 | ```
15 |
16 | ## Overview
17 |
18 | The WebSocketTransport class provides a WebSocket transport layer that establishes a connection with a Pipecat websocket transport running on the server. It handles audio device management and real-time communication between client and bot.
19 |
20 | ## Features
21 |
22 | - 🎤 Microphone input handling
23 | - 🤖 Bot participant tracking
24 | - 💬 Real-time messaging
25 |
26 | ## Usage
27 |
28 | ### Basic Setup
29 |
30 | ```javascript
31 | import { RTVIClient } from "@pipecat-ai/client-js";
32 | import { WebSocketTransport } from "@pipecat-ai/websocket-transport";
33 |
34 | const transport = new WebSocketTransport();
35 |
36 | const rtviClient = new RTVIClient({
37 | transport,
38 | enableMic: true, // Default microphone on
39 | callbacks: {
40 | // Event handlers
41 | },
42 | params: {
43 | baseUrl,
44 | endpoints
45 | }
46 | // ...
47 | });
48 |
49 | await rtviClient.connect();
50 | ```
51 |
52 | ## API Reference
53 |
54 | ### States
55 |
56 | The transport can be in one of these states:
57 | - "initializing"
58 | - "initialized"
59 | - "connecting"
60 | - "connected"
61 | - "ready"
62 | - "disconnecting"
63 | - "error"
64 |
65 | ## Events
66 |
67 | The transport implements the various [RTVI event handlers](https://docs.pipecat.ai/client/js/api-reference/callbacks). Check out the docs or samples for more info.
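
Under the hood, these events originate from protobuf-encoded frames exchanged with the Pipecat pipeline. A simplified sketch of the decode path (mirroring `webSocketTransport.ts`; `Frame` is the class generated from `proto/frames.proto`):

```typescript
import { Frame } from "./generated/proto/frames";

// Incoming websocket messages are binary-encoded Frame protobufs
async function handleSocketMessage(data: Blob): Promise<void> {
  const bytes = new Uint8Array(await data.arrayBuffer());
  const parsed = Frame.fromBinary(bytes).frame;
  if (parsed.oneofKind === "audio") {
    // Raw PCM audio from the bot, buffered for playback
  } else if (parsed.oneofKind === "message") {
    const msg = JSON.parse(parsed.message.data);
    if (msg.label === "rtvi-ai") {
      // Dispatched to the RTVI callbacks described above
    }
  }
}
```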
68 |
69 | ## Error Handling
70 |
71 | The transport includes error handling for:
72 | - Connection failures
73 | - Device errors
74 |
75 | ## License
76 | BSD-2 Clause
77 |
78 |
--------------------------------------------------------------------------------
/transports/websocket-transport/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@pipecat-ai/websocket-transport",
3 | "version": "0.4.1",
4 | "license": "BSD-2-Clause",
5 | "main": "dist/index.js",
6 | "module": "dist/index.module.js",
7 | "types": "dist/index.d.ts",
8 | "source": "src/index.ts",
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/pipecat-ai/pipecat-client-web-transports.git"
12 | },
13 | "files": [
14 | "dist",
15 | "package.json",
16 | "README.md"
17 | ],
18 | "scripts": {
19 | "build_proto": "bash ./proto/generate_typescript.sh",
20 | "build": "parcel build --no-cache",
21 | "dev": "parcel watch",
22 | "lint": "eslint . --ext ts --report-unused-disable-directives --max-warnings 0"
23 | },
24 | "devDependencies": {
25 | "@pipecat-ai/client-js": "^0.4.0",
26 | "@types/node": "^22.9.0",
27 | "eslint": "9.11.1",
28 | "eslint-config-prettier": "^9.1.0",
29 | "eslint-plugin-simple-import-sort": "^12.1.1"
30 | },
31 | "peerDependencies": {
32 | "@pipecat-ai/client-js": "~0.4.0"
33 | },
34 | "dependencies": {
35 | "@daily-co/daily-js": "^0.79.0",
36 | "@protobuf-ts/plugin": "^2.11.0",
37 | "@protobuf-ts/runtime": "^2.11.0"
38 | },
39 | "description": "Pipecat Base Transport for RealTime WebSocket APIs Package",
40 | "author": "Daily.co",
41 | "bugs": {
42 | "url": "https://github.com/pipecat-ai/pipecat-client-web-transports/issues"
43 | },
44 | "homepage": "https://github.com/pipecat-ai/pipecat-client-web-transports/blob/main/transports/realtime-websocket-transport/README.md"
45 | }
46 |
--------------------------------------------------------------------------------
/transports/websocket-transport/proto/frames.proto:
--------------------------------------------------------------------------------
1 | //
2 | // Copyright (c) 2024–2025, Daily
3 | //
4 | // SPDX-License-Identifier: BSD 2-Clause License
5 | //
6 |
7 | // Generate frames_pb2.py with:
8 | //
9 | // python -m grpc_tools.protoc --proto_path=./ --python_out=./protobufs frames.proto
10 |
11 | syntax = "proto3";
12 |
13 | package pipecat;
14 |
15 | // Represents a basic unit of text data.
16 | message TextFrame {
17 | uint64 id = 1;
18 | string name = 2;
19 | string text = 3;
20 | }
21 |
22 | // Represents a raw chunk of audio data,
23 | // either generated by Pipecat for playback
24 | // or to be sent to Pipecat for processing.
25 | message AudioRawFrame {
26 | uint64 id = 1;
27 | string name = 2;
28 | bytes audio = 3;
29 | uint32 sample_rate = 4;
30 | uint32 num_channels = 5;
31 | optional uint64 pts = 6;
32 | }
33 |
34 | // Represents a transcribed text frame with speaker metadata.
35 | // Typically created when a participant speaks.
36 | message TranscriptionFrame {
37 | uint64 id = 1;
38 | string name = 2;
39 | string text = 3;
40 | string user_id = 4;
41 | string timestamp = 5;
42 | }
43 |
44 | // Wrapper for a generic message sent to or received from the transport layer.
45 | // Commonly used for RTVI protocol messages.
46 | message MessageFrame {
47 | string data = 1;
48 | }
49 |
50 | message Frame {
51 | oneof frame {
52 | TextFrame text = 1;
53 | AudioRawFrame audio = 2;
54 | TranscriptionFrame transcription = 3;
55 | MessageFrame message = 4;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/transports/websocket-transport/proto/generate_typescript.sh:
--------------------------------------------------------------------------------
1 | # Use this script to generate the typescript each time we change the frames.proto file
2 | rm -rf ./src/generated/*
3 | protoc \
4 | --ts_out=generate_dependencies:./src/generated \
5 | proto/frames.proto
6 |
--------------------------------------------------------------------------------
/transports/websocket-transport/src/index.ts:
--------------------------------------------------------------------------------
1 | // export * from "./realTimeWebSocketTransport";
2 | // export * from "../../../lib/wavtools/dist/index.d.ts";
3 |
4 | import { WavMediaManager } from "../../../lib/media-mgmt/mediaManager";
5 | import { DailyMediaManager } from "../../../lib/media-mgmt/dailyMediaManager";
6 | import { WebSocketTransport } from "./webSocketTransport.ts";
7 |
8 | export { WavMediaManager, DailyMediaManager, WebSocketTransport };
9 |
--------------------------------------------------------------------------------
/transports/websocket-transport/src/webSocketTransport.ts:
--------------------------------------------------------------------------------
1 | import {
2 | logger,
3 | RTVIClientOptions,
4 | RTVIMessage,
5 | Tracks,
6 | Transport,
7 | TransportStartError,
8 | TransportState,
9 | } from "@pipecat-ai/client-js";
10 |
11 | import { ReconnectingWebSocket } from "../../../lib/websocket-utils/reconnectingWebSocket";
12 | import { DailyMediaManager } from "../../../lib/media-mgmt/dailyMediaManager";
13 |
14 | import { Frame } from "./generated/proto/frames";
15 | import { MediaManager } from "../../../lib/media-mgmt/mediaManager";
16 |
17 | export class WebSocketTransport extends Transport {
18 | declare private _ws: ReconnectingWebSocket | null;
19 | private static RECORDER_SAMPLE_RATE = 16_000;
20 | private audioQueue: ArrayBuffer[] = [];
21 | private _mediaManager: MediaManager;
22 |
23 | constructor() {
24 | super();
25 | this._mediaManager = new DailyMediaManager(
26 | true,
27 | true,
28 | undefined,
29 | undefined,
30 | 512,
31 | WebSocketTransport.RECORDER_SAMPLE_RATE,
32 | );
33 | this._mediaManager.setUserAudioCallback(
34 | this.handleUserAudioStream.bind(this),
35 | );
36 | this._ws = null;
37 | }
38 |
39 | initialize(
40 | options: RTVIClientOptions,
41 | messageHandler: (ev: RTVIMessage) => void,
42 | ): void {
43 | this._options = options;
44 | this._callbacks = options.callbacks ?? {};
45 | this._onMessage = messageHandler;
46 | this._mediaManager.setRTVIOptions(options);
47 | this.state = "disconnected";
48 | }
49 |
50 |   async initDevices(): Promise<void> {
51 | this.state = "initializing";
52 | await this._mediaManager.initialize();
53 | this.state = "initialized";
54 | }
55 |
56 | async connect(
57 | authBundle: unknown,
58 | abortController: AbortController,
59 |   ): Promise<void> {
60 | this.state = "connecting";
61 | try {
62 | this._ws = this.initializeWebsocket(authBundle);
63 | await this._ws.connect();
64 | await this._mediaManager.connect();
65 | this.state = "connected";
66 | this._callbacks.onConnected?.();
67 | } catch (error) {
68 | const msg = `Failed to connect to websocket: ${error}`;
69 | logger.error(msg);
70 | this.state = "error";
71 | throw new TransportStartError(msg);
72 | }
73 | }
74 |
75 |   async disconnect(): Promise<void> {
76 | this.state = "disconnecting";
77 | await this._mediaManager.disconnect();
78 | await this._ws?.close();
79 | this.state = "disconnected";
80 | this._callbacks.onDisconnected?.();
81 | }
82 |
83 |   getAllMics(): Promise<MediaDeviceInfo[]> {
84 | return this._mediaManager.getAllMics();
85 | }
86 |   getAllCams(): Promise<MediaDeviceInfo[]> {
87 | return this._mediaManager.getAllCams();
88 | }
89 |   getAllSpeakers(): Promise<MediaDeviceInfo[]> {
90 | return this._mediaManager.getAllSpeakers();
91 | }
92 |
93 |   async updateMic(micId: string): Promise<void> {
94 | return this._mediaManager.updateMic(micId);
95 | }
96 | updateCam(camId: string): void {
97 | return this._mediaManager.updateCam(camId);
98 | }
99 | updateSpeaker(speakerId: string): void {
100 | return this._mediaManager.updateSpeaker(speakerId);
101 | }
102 |
103 |   get selectedMic(): MediaDeviceInfo | Record<string, never> {
104 | return this._mediaManager.selectedMic;
105 | }
106 |   get selectedSpeaker(): MediaDeviceInfo | Record<string, never> {
107 | return this._mediaManager.selectedSpeaker;
108 | }
109 |
110 | enableMic(enable: boolean): void {
111 | this._mediaManager.enableMic(enable);
112 | }
113 | get isMicEnabled(): boolean {
114 | return this._mediaManager.isMicEnabled;
115 | }
116 |
117 | get state(): TransportState {
118 | return this._state;
119 | }
120 |
121 | set state(state: TransportState) {
122 | if (this._state === state) return;
123 |
124 | this._state = state;
125 | this._callbacks.onTransportStateChanged?.(state);
126 | }
127 |
128 | get expiry(): number | undefined {
129 | return this._expiry;
130 | }
131 |
132 | tracks(): Tracks {
133 | return this._mediaManager.tracks();
134 | }
135 |
136 | initializeWebsocket(authBundle: any): ReconnectingWebSocket {
137 | const ws = new ReconnectingWebSocket(`${authBundle.ws_url}`, undefined, {
138 | parseBlobToJson: false,
139 | });
140 | // disabling the keep alive, there is no API for it inside Pipecat
141 | ws.keepAliveInterval = 0;
142 | ws.on("open", () => {
143 | logger.debug("Websocket connection opened");
144 | });
145 | ws.on("message", async (data: Blob) => {
146 | let arrayBuffer: ArrayBuffer = await data.arrayBuffer();
147 | const parsedFrame = Frame.fromBinary(new Uint8Array(arrayBuffer)).frame;
148 | if (parsedFrame.oneofKind === "audio") {
149 | // We should be able to use parsedFrame.audio.audio.buffer but for
150 | // some reason that contains all the bytes from the protobuf message.
151 | const audioVector = Array.from(parsedFrame.audio.audio);
152 | const uint8Array = new Uint8Array(audioVector);
153 | const int16Array = new Int16Array(uint8Array.buffer);
154 | this._mediaManager.bufferBotAudio(int16Array);
155 | } else if (parsedFrame.oneofKind === "message") {
156 | let jsonText = parsedFrame.message.data;
157 | try {
158 | let jsonMessage = JSON.parse(jsonText);
159 | if (jsonMessage.label === "rtvi-ai") {
160 | this._onMessage(jsonMessage as RTVIMessage);
161 | }
162 | } catch {
163 | logger.warn("Failed to parse message", jsonText);
164 | }
165 | }
166 | });
167 | ws.on("error", (error: Error) => {
168 | this.connectionError(`websocket error: ${error}`);
169 | });
170 | ws.on("connection-timeout", () => {
171 | this.connectionError("websocket connection timed out");
172 | });
173 | ws.on("close", (code: number) => {
174 | this.connectionError(`websocket connection closed. Code: ${code}`);
175 | });
176 | ws.on("reconnect-failed", () => {
177 | this.connectionError(`websocket reconnect failed`);
178 | });
179 | return ws;
180 | }
181 |
182 | sendReadyMessage(): void {
183 | this.state = "ready";
184 | this.sendMessage(RTVIMessage.clientReady());
185 | }
186 |
187 | handleUserAudioStream(data: ArrayBuffer): void {
188 | if (this.state === "ready") {
189 | try {
190 | void this.flushAudioQueue();
191 | void this._sendAudioInput(data);
192 | } catch (error) {
193 | logger.error("Error sending audio stream to websocket:", error);
194 | this.state = "error";
195 | }
196 | } else {
197 | this.audioQueue.push(data);
198 | }
199 | }
200 |
201 | private flushAudioQueue(): void {
202 | if (this.audioQueue.length <= 0) {
203 | return;
204 | }
205 | logger.info("Will flush audio queue", this.audioQueue.length);
206 | while (this.audioQueue.length > 0) {
207 | const queuedData = this.audioQueue.shift();
208 | if (queuedData) void this._sendAudioInput(queuedData);
209 | }
210 | }
211 |
212 | sendMessage(message: RTVIMessage): void {
213 | logger.debug("Received message to send to Web Socket", message);
214 | const frame = Frame.create({
215 | frame: {
216 | oneofKind: "message",
217 | message: {
218 | data: JSON.stringify(message),
219 | },
220 | },
221 | });
222 | void this._sendMsg(frame);
223 | }
224 |
225 |   async _sendAudioInput(data: ArrayBuffer): Promise<void> {
226 | const pcmByteArray = new Uint8Array(data);
227 | const frame = Frame.create({
228 | frame: {
229 | oneofKind: "audio",
230 | audio: {
231 | id: 0n,
232 | name: "audio",
233 | audio: pcmByteArray,
234 | sampleRate: WebSocketTransport.RECORDER_SAMPLE_RATE,
235 | numChannels: 1,
236 | },
237 | },
238 | });
239 | await this._sendMsg(frame);
240 | }
241 |
242 |   async _sendMsg(msg: Frame): Promise<void> {
243 | if (!this._ws) {
244 | logger.error("sendMsg called but WS is null");
245 | return;
246 | }
247 | if (this._ws.readyState !== WebSocket.OPEN) {
248 | logger.error("attempt to send to closed socket");
249 | return;
250 | }
251 | if (!msg) {
252 | logger.error("need a msg to send a msg");
253 | return;
254 | }
255 | try {
256 | const encodedFrame = new Uint8Array(Frame.toBinary(msg));
257 | await this._ws.send(encodedFrame);
258 | } catch (e) {
259 | logger.error("sendMsg error", e);
260 | }
261 | }
262 |
263 | connectionError(errorMsg: string): void {
264 | console.error(errorMsg);
265 | this.state = "error";
266 | void this.disconnect();
267 | }
268 |
269 | // Not implemented
270 | enableScreenShare(enable: boolean): void {
271 | logger.error("startScreenShare not implemented for WebSocketTransport");
272 | throw new Error("Not implemented");
273 | }
274 |
275 | public get isSharingScreen(): boolean {
276 | logger.error("isSharingScreen not implemented for WebSocketTransport");
277 | return false;
278 | }
279 |
280 | enableCam(enable: boolean) {
281 | logger.error("enableCam not implemented for WebSocketTransport");
282 | throw new Error("Not implemented");
283 | }
284 |
285 | get isCamEnabled(): boolean {
286 | logger.error("isCamEnabled not implemented for WebSocketTransport");
287 | return false;
288 | }
289 |
290 |   get selectedCam(): MediaDeviceInfo | Record<string, never> {
291 | logger.error("selectedCam not implemented for WebSocketTransport");
292 | throw new Error("Not implemented");
293 | }
294 | }
295 |
--------------------------------------------------------------------------------
/transports/websocket-transport/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "ESNext",
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "types": ["node"],
7 | "skipLibCheck": true,
8 | "jsx": "preserve",
9 |
10 | /* Bundler mode */
11 | "moduleResolution": "bundler",
12 | "allowImportingTsExtensions": true,
13 | "allowJs": true,
14 | "noEmit": true,
15 | "resolveJsonModule": true,
16 | "isolatedModules": true,
17 | "moduleDetection": "force",
18 |
19 | /* Linting */
20 | "strict": true,
21 | "noUnusedLocals": true,
22 | "noUnusedParameters": false,
23 | "noFallthroughCasesInSwitch": true
24 | },
25 | "include": ["src"]
26 | }
27 |
--------------------------------------------------------------------------------