├── .env.example ├── diagram.jpg ├── thumbnail.jpg ├── app ├── favicon.ico ├── layout.tsx ├── services │ ├── transcriptionService.ts │ └── geminiWebSocket.ts ├── globals.css ├── page.tsx ├── utils │ └── audioUtils.ts └── components │ └── CameraPreview.tsx ├── public ├── avatars │ └── gemini.png ├── vercel.svg ├── window.svg ├── file.svg ├── globe.svg ├── next.svg └── worklets │ └── audio-processor.js ├── next.config.ts ├── postcss.config.mjs ├── lib └── utils.ts ├── next-env.d.ts ├── eslint.config.mjs ├── next.config.js ├── components.json ├── tsconfig.json ├── components └── ui │ ├── progress.tsx │ ├── avatar.tsx │ ├── scroll-area.tsx │ ├── button.tsx │ └── card.tsx ├── package.json ├── README.md └── tailwind.config.ts /.env.example: -------------------------------------------------------------------------------- 1 | GEMINI_API_KEY=your_api_key_here 2 | -------------------------------------------------------------------------------- /diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/diagram.jpg -------------------------------------------------------------------------------- /thumbnail.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/thumbnail.jpg -------------------------------------------------------------------------------- /app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/app/favicon.ico -------------------------------------------------------------------------------- /public/avatars/gemini.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/public/avatars/gemini.png -------------------------------------------------------------------------------- /public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /next.config.ts: -------------------------------------------------------------------------------- 1 | import type { NextConfig } from "next"; 2 | 3 | const nextConfig: NextConfig = { 4 | /* config options here */ 5 | }; 6 | 7 | export default nextConfig; 8 | -------------------------------------------------------------------------------- /postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /lib/utils.ts: -------------------------------------------------------------------------------- 1 | import { clsx, type ClassValue } from "clsx" 2 | import { twMerge } from "tailwind-merge" 3 | 4 | export function cn(...inputs: ClassValue[]) { 5 | return twMerge(clsx(inputs)) 6 | } 7 | -------------------------------------------------------------------------------- /next-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | /// 3 | 4 | // NOTE: This file should not be edited 5 | // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. 
6 | -------------------------------------------------------------------------------- /public/window.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/file.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import { dirname } from "path"; 2 | import { fileURLToPath } from "url"; 3 | import { FlatCompat } from "@eslint/eslintrc"; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = dirname(__filename); 7 | 8 | const compat = new FlatCompat({ 9 | baseDirectory: __dirname, 10 | }); 11 | 12 | const eslintConfig = [ 13 | ...compat.extends("next/core-web-vitals", "next/typescript"), 14 | ]; 15 | 16 | export default eslintConfig; 17 | -------------------------------------------------------------------------------- /next.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = { 3 | async headers() { 4 | return [ 5 | { 6 | source: '/worklets/:path*', 7 | headers: [ 8 | { 9 | key: 'Content-Type', 10 | value: 'application/javascript', 11 | }, 12 | { 13 | key: 'Access-Control-Allow-Origin', 14 | value: '*', 15 | }, 16 | ], 17 | }, 18 | ]; 19 | }, 20 | }; 21 | 22 | module.exports = nextConfig; -------------------------------------------------------------------------------- /components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "new-york", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "app/globals.css", 9 | "baseColor": "neutral", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils", 16 | "ui": "@/components/ui", 17 | "lib": "@/lib", 18 | "hooks": "@/hooks" 19 | }, 20 | "iconLibrary": "lucide" 21 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2017", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "noEmit": true, 9 | "esModuleInterop": true, 10 | "module": "esnext", 11 | "moduleResolution": "bundler", 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "jsx": "preserve", 15 | "incremental": true, 16 | "plugins": [ 17 | { 18 | "name": "next" 19 | } 20 | ], 21 | "paths": { 22 | "@/*": ["./*"] 23 | }, 24 | "baseUrl": "." 
25 |   },
26 |   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
27 |   "exclude": ["node_modules"]
28 | }
29 | 
--------------------------------------------------------------------------------
/app/layout.tsx:
--------------------------------------------------------------------------------
 1 | // app/layout.tsx
 2 | import type { Metadata } from 'next';
 3 | import { Inter } from 'next/font/google';
 4 | import './globals.css';
 5 | 
 6 | const inter = Inter({ subsets: ['latin'] });
 7 | 
 8 | export const metadata: Metadata = {
 9 |   title: 'Camera Preview App',
10 |   description: 'Next.js Camera Interface',
11 | };
12 | 
13 | export default function RootLayout({
14 |   children,
15 | }: {
16 |   children: React.ReactNode;
17 | }) {
18 |   return (
19 |     <html lang="en">
20 |       <body className={inter.className}>
21 |         <main> {/* one wrapper tag was stripped from this dump; <main> is a stand-in for the original element */}
22 |           {children}
23 |         </main>
24 |       </body>
25 |     </html>
26 |   );
27 | }
28 | 
--------------------------------------------------------------------------------
/components/ui/progress.tsx:
--------------------------------------------------------------------------------
 1 | "use client"
 2 | 
 3 | import * as React from "react"
 4 | import * as ProgressPrimitive from "@radix-ui/react-progress"
 5 | 
 6 | import { cn } from "@/lib/utils"
 7 | 
 8 | const Progress = React.forwardRef<
 9 |   React.ElementRef<typeof ProgressPrimitive.Root>,
10 |   React.ComponentPropsWithoutRef<typeof ProgressPrimitive.Root>
11 | >(({ className, value, ...props }, ref) => (
12 |   <ProgressPrimitive.Root
13 |     ref={ref}
14 |     className={cn(
15 |       "relative h-2 w-full overflow-hidden rounded-full bg-primary/20",
16 |       className
17 |     )}
18 |     {...props}
19 |   >
20 |     <ProgressPrimitive.Indicator
21 |       className="h-full w-full flex-1 bg-primary transition-all"
22 |       style={{ transform: `translateX(-${100 - (value || 0)}%)` }}
23 |     />
24 |   </ProgressPrimitive.Root>
25 | ))
26 | Progress.displayName = ProgressPrimitive.Root.displayName
27 | 
28 | export { Progress }
29 | 
--------------------------------------------------------------------------------
/public/globe.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/services/transcriptionService.ts:
--------------------------------------------------------------------------------
 1 | import { GoogleGenerativeAI } from "@google/generative-ai";
 2 | 
 3 | const genAI = new GoogleGenerativeAI(process.env.NEXT_PUBLIC_GEMINI_API_KEY || '');
 4 | const MODEL_NAME = "gemini-1.5-flash-8b";
 5 | 
 6 | export class TranscriptionService {
 7 |   private model;
 8 | 
 9 |   constructor() {
10 |     this.model = genAI.getGenerativeModel({ model: MODEL_NAME });
11 |   }
12 | 
13 |   async transcribeAudio(audioBase64: string, mimeType: string = "audio/wav"): Promise<string> {
14 |     try {
15 |       const result = await this.model.generateContent([
16 |         {
17 |           inlineData: {
18 |             mimeType: mimeType,
19 |             data: audioBase64
20 |           }
21 |         },
22 |         { text: "Please transcribe the spoken language in this audio accurately. Ignore any background noise or non-speech sounds." },
23 |       ]);
24 | 
25 |       return result.response.text();
26 |     } catch (error) {
27 |       console.error("Transcription error:", error);
28 |       throw error;
29 |     }
30 |   }
31 | }
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "my-app",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "scripts": {
 6 |     "dev": "next dev --turbopack",
 7 |     "build": "next build",
 8 |     "start": "next start",
 9 |     "lint": "next lint"
10 |   },
11 |   "dependencies": {
12 |     "@google/generative-ai": "^0.21.0",
13 |     "@radix-ui/react-avatar": "^1.1.3",
14 |     "@radix-ui/react-dropdown-menu": "^2.1.6",
15 |     "@radix-ui/react-progress": "^1.1.2",
16 |     "@radix-ui/react-scroll-area": "^1.2.3",
17 |     "@radix-ui/react-slot": "^1.1.2",
18 |     "class-variance-authority": "^0.7.1",
19 |     "clsx": "^2.1.1",
20 |     "js-base64": "^3.7.7",
21 |     "lamejs": "^1.2.1",
22 |     "lucide-react": "^0.475.0",
23 |     "next": "15.1.7",
24 |     "react": "^19.0.0",
25 |     "react-dom": "^19.0.0",
26 |     "tailwind-merge": "^3.0.1",
27 |     "tailwindcss-animate": "^1.0.7"
28 |   },
29 |   "devDependencies": {
30 |     "@eslint/eslintrc": "^3",
31 |     "@types/node": "^20",
32 |     "@types/react": "^19",
33 |     "@types/react-dom": "^19",
34 |     "eslint": "^9",
35 |     "eslint-config-next": "15.1.7",
36 |     "postcss": "^8",
37 |     "tailwindcss": "^3.4.1",
38 |     "typescript": "^5"
39 |   }
40 | }
41 | 
--------------------------------------------------------------------------------
/public/next.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Multimodal Realtime App with Gemini 2.0 and the Next.js Framework
 2 | 
 3 | A demonstration project showing how to build a realtime multimodal application using Google's Gemini 2.0 API and Next.js. The app processes audio and video and generates transcripts in realtime.
 4 | 
 5 | ![Watch Demo Video](./thumbnail.jpg)
 6 | 
 7 | [Watch the Tutorial Video](https://youtu.be/YUfer6xyExY)
 8 | 
 9 | ## Features
10 | 
11 | - Realtime audio/video (image) interaction with the Gemini 2.0 Multimodal Live API
12 | - Live transcription via the Gemini 1.5/2.0 GenerativeAI API
13 | - Built with Next.js for optimal performance
14 | 
15 | ## Architecture
16 | 
17 | ![Block Diagram](./diagram.jpg)
18 | 
19 | ## Prerequisites
20 | 
21 | - Node.js 18+ installed
22 | - An API key for the Gemini 2.0 model
23 | 
24 | ## Getting Started
25 | 
26 | 1. Clone the repository
27 | ```bash
28 | git clone https://github.com/yeyu2/gemini-nextjs.git
29 | cd gemini-nextjs
30 | ```
31 | 
32 | 2. Install dependencies
33 | ```bash
34 | npm install
35 | # or
36 | yarn install
37 | ```
38 | 
39 | 3. Set up environment variables
40 | ```bash
41 | cp .env.example .env.local
42 | ```
43 | Add your Gemini API key to `.env.local`. The services under `app/services` read it as `process.env.NEXT_PUBLIC_GEMINI_API_KEY`, so the variable must carry the `NEXT_PUBLIC_` prefix (adjust the name copied from `.env.example` accordingly):
44 | ```
45 | NEXT_PUBLIC_GEMINI_API_KEY=your_api_key_here
46 | ```
47 | 
48 | 4. Run the development server
49 | ```bash
50 | npm run dev
51 | # or
52 | yarn dev
53 | ```
54 | 
55 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the application.
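Why the `NEXT_PUBLIC_` prefix matters here: both services call Gemini directly from the browser, and Next.js only inlines environment variables with that prefix into the client bundle. A minimal sketch of how the key is consumed — the `HOST` and `WS_URL` values are copied from `app/services/geminiWebSocket.ts`, while the missing-key guard is an illustrative addition that is not in the source:

```ts
// Client-side module: only NEXT_PUBLIC_* variables are inlined into the browser bundle.
const API_KEY = process.env.NEXT_PUBLIC_GEMINI_API_KEY;
const HOST = "generativelanguage.googleapis.com";
const WS_URL = `wss://${HOST}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${API_KEY}`;

if (!API_KEY) {
  // Hypothetical guard: fail fast rather than opening a socket with "undefined" in the URL.
  throw new Error("Set NEXT_PUBLIC_GEMINI_API_KEY in .env.local");
}
```

Keep in mind that any `NEXT_PUBLIC_` value ships to every visitor's browser. That is acceptable for a local demo like this one; a production deployment would typically proxy these calls through a server route so the key stays private.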
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/components/ui/avatar.tsx:
--------------------------------------------------------------------------------
 1 | "use client"
 2 | 
 3 | import * as React from "react"
 4 | import * as AvatarPrimitive from "@radix-ui/react-avatar"
 5 | 
 6 | import { cn } from "@/lib/utils"
 7 | 
 8 | const Avatar = React.forwardRef<
 9 |   React.ElementRef<typeof AvatarPrimitive.Root>,
10 |   React.ComponentPropsWithoutRef<typeof AvatarPrimitive.Root>
11 | >(({ className, ...props }, ref) => (
12 |   <AvatarPrimitive.Root
13 |     ref={ref}
14 |     className={cn(
15 |       "relative flex h-10 w-10 shrink-0 overflow-hidden rounded-full",
16 |       className
17 |     )}
18 |     {...props}
19 |   />
20 | ))
21 | Avatar.displayName = AvatarPrimitive.Root.displayName
22 | 
23 | const AvatarImage = React.forwardRef<
24 |   React.ElementRef<typeof AvatarPrimitive.Image>,
25 |   React.ComponentPropsWithoutRef<typeof AvatarPrimitive.Image>
26 | >(({ className, ...props }, ref) => (
27 |   <AvatarPrimitive.Image
28 |     ref={ref}
29 |     className={cn("aspect-square h-full w-full", className)}
30 |     {...props}
31 |   />
32 | ))
33 | AvatarImage.displayName = AvatarPrimitive.Image.displayName
34 | 
35 | const AvatarFallback = React.forwardRef<
36 |   React.ElementRef<typeof AvatarPrimitive.Fallback>,
37 |   React.ComponentPropsWithoutRef<typeof AvatarPrimitive.Fallback>
38 | >(({ className, ...props }, ref) => (
39 |   <AvatarPrimitive.Fallback
40 |     ref={ref}
41 |     className={cn(
42 |       "flex h-full w-full items-center justify-center rounded-full bg-muted",
43 |       className
44 |     )}
45 |     {...props}
46 |   />
47 | ))
48 | AvatarFallback.displayName = AvatarPrimitive.Fallback.displayName
49 | 
50 | export { Avatar, AvatarImage, AvatarFallback }
51 | 
--------------------------------------------------------------------------------
/public/worklets/audio-processor.js:
--------------------------------------------------------------------------------
 1 | // Note: AudioWorkletProcessor is available in the worklet scope
 2 | class AudioProcessor extends AudioWorkletProcessor {
 3 |   constructor() {
 4 |     super();
 5 |     this.bufferSize = 2048; // Reduced from 4096 to 2048 for faster response
 6 |     this.accumulatedSamples = new Float32Array(this.bufferSize);
 7 |     this.sampleCount = 0;
 8 |   }
 9 | 
10 |   process(inputs, outputs, parameters) {
11 |     const input = inputs[0][0];
12 |     if (!input) return true;
13 | 
14 |     // Accumulate samples
15 |     for (let i = 0; i < input.length && this.sampleCount < this.bufferSize; i++) {
16 |       this.accumulatedSamples[this.sampleCount++] = input[i];
17 |     }
18 | 
19 |     // Process when we have enough samples
20 |     if (this.sampleCount >= this.bufferSize) {
21 |       const pcm16 = new Int16Array(this.bufferSize);
22 |       let sum = 0;
23 | 
24 |       // Simple conversion like in the original implementation
25 |       for (let i = 0; i < this.bufferSize; i++) {
26 |         // Scale to 16-bit range directly
27 |         pcm16[i] = this.accumulatedSamples[i] * 0x7FFF;
28 |         sum += Math.abs(pcm16[i]);
29 |       }
30 | 
31 |       const buffer = new ArrayBuffer(this.bufferSize * 2);
32 |       const view = new DataView(buffer);
33 |       pcm16.forEach((value, index) => {
34 |         view.setInt16(index * 2, value, true);
35 |       });
36 | 
37 |       // Simplified level calculation
38 |       const level = (sum / (this.bufferSize * 0x7FFF)) * 100;
39 | 
40 |       this.port.postMessage({
41 |         pcmData: buffer,
42 |         level: Math.min(level * 5, 100)
43 |       }, [buffer]);
44 | 
45 |       this.sampleCount = 0;
46 |     }
47 | 
48 |     return true;
49 |   }
50 | }
51 | 
52 | registerProcessor('audio-processor', AudioProcessor);
--------------------------------------------------------------------------------
/components/ui/scroll-area.tsx:
--------------------------------------------------------------------------------
 1 | "use client"
 2 | 
 3 | import * as React from "react"
 4 | import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area"
 5 | 
 6 | import { cn } from "@/lib/utils"
 7 | 
 8 | const ScrollArea = React.forwardRef<
 9 |   React.ElementRef<typeof ScrollAreaPrimitive.Root>,
10 |   React.ComponentPropsWithoutRef<typeof ScrollAreaPrimitive.Root>
11 | >(({ className, children, ...props }, ref) => (
12 |   <ScrollAreaPrimitive.Root
13 |     ref={ref}
14 |     className={cn("relative overflow-hidden", className)}
15 |     {...props}
16 |   >
17 |     <ScrollAreaPrimitive.Viewport className="h-full w-full rounded-[inherit]">
18 |       {children}
19 |     </ScrollAreaPrimitive.Viewport>
20 |     <ScrollBar />
21 |     <ScrollAreaPrimitive.Corner />
22 |   </ScrollAreaPrimitive.Root>
23 | ))
24 | ScrollArea.displayName = ScrollAreaPrimitive.Root.displayName
25 | 
26 | const ScrollBar = React.forwardRef<
27 |   React.ElementRef<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>,
28 |   React.ComponentPropsWithoutRef<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>
29 | >(({ className, orientation = "vertical", ...props }, ref) => (
30 |   <ScrollAreaPrimitive.ScrollAreaScrollbar
31 |     ref={ref}
32 |     orientation={orientation}
33 |     className={cn(
34 |       "flex touch-none select-none transition-colors",
35 |       orientation === "vertical" &&
36 |         "h-full w-2.5 border-l border-l-transparent p-[1px]",
37 |       orientation === "horizontal" &&
38 |         "h-2.5 flex-col border-t border-t-transparent p-[1px]",
39 |       className
40 |     )}
41 |     {...props}
42 |   >
43 |     <ScrollAreaPrimitive.ScrollAreaThumb className="relative flex-1 rounded-full bg-border" />
44 |   </ScrollAreaPrimitive.ScrollAreaScrollbar>
45 | ))
46 | ScrollBar.displayName = ScrollAreaPrimitive.ScrollAreaScrollbar.displayName
47 | 
48 | export { ScrollArea, ScrollBar }
49 | 
--------------------------------------------------------------------------------
/tailwind.config.ts:
--------------------------------------------------------------------------------
 1 | import type { Config } from "tailwindcss";
 2 | 
 3 | export default {
 4 |   darkMode: ["class"],
 5 |   content: [
 6 |     "./pages/**/*.{js,ts,jsx,tsx,mdx}",
 7 |     "./components/**/*.{js,ts,jsx,tsx,mdx}",
 8 |     "./app/**/*.{js,ts,jsx,tsx,mdx}",
 9 |   ],
10 |   theme: {
11 |     extend: {
12 |       colors: {
13 |         background: 'hsl(var(--background))',
14 |         foreground: 'hsl(var(--foreground))',
15 |         card: {
16 |           DEFAULT: 'hsl(var(--card))',
17 |           foreground: 'hsl(var(--card-foreground))'
18 |         },
19 |         popover: {
20 |           DEFAULT: 'hsl(var(--popover))',
21 |           foreground: 'hsl(var(--popover-foreground))'
22 |         },
23 |         primary: {
24 |           DEFAULT: 'hsl(var(--primary))',
25 |           foreground: 'hsl(var(--primary-foreground))'
26 |         },
27 |         secondary: {
28 |           DEFAULT: 'hsl(var(--secondary))',
29 |           foreground: 'hsl(var(--secondary-foreground))'
30 |         },
31 |         muted: {
32 |           DEFAULT: 'hsl(var(--muted))',
33 |           foreground: 'hsl(var(--muted-foreground))'
34 |         },
35 |         accent: {
36 |           DEFAULT: 'hsl(var(--accent))',
37 |           foreground: 'hsl(var(--accent-foreground))'
38 |         },
39 |         destructive: {
40 |           DEFAULT: 'hsl(var(--destructive))',
41 |           foreground: 'hsl(var(--destructive-foreground))'
42 |         },
43 |         border: 'hsl(var(--border))',
44 |         input: 'hsl(var(--input))',
45 |         ring: 'hsl(var(--ring))',
46 |         chart: {
47 |           '1': 'hsl(var(--chart-1))',
48 |           '2': 'hsl(var(--chart-2))',
49 |           '3': 'hsl(var(--chart-3))',
50 |           '4': 'hsl(var(--chart-4))',
51 |           '5': 'hsl(var(--chart-5))'
52 |         }
53 |       },
54 |       borderRadius: {
55 |         lg: 'var(--radius)',
56 |         md: 'calc(var(--radius) - 2px)',
57 |         sm: 'calc(var(--radius) - 4px)'
58 |       }
59 |     }
60 |   },
61 |   plugins: [require("tailwindcss-animate")],
62 | } satisfies Config;
63 | 
--------------------------------------------------------------------------------
/app/globals.css:
--------------------------------------------------------------------------------
 1 | @tailwind base;
 2 | @tailwind components;
 3 | @tailwind utilities;
 4 | 
 5 | body {
 6 |   font-family: Arial, Helvetica, sans-serif;
 7 | }
 8 | 
 9 | @layer base {
10 |   :root {
11 |     --background: 0 0% 100%;
12 |     --foreground: 0 0% 3.9%;
13 |     --card: 0 0% 100%;
14 |     --card-foreground: 0 0% 3.9%;
15 |     --popover: 0 0% 100%;
16 |     --popover-foreground: 0 0% 3.9%;
17 |     --primary: 0 0% 9%;
18 |     --primary-foreground: 0 0% 98%;
19 |     --secondary: 0 0% 96.1%;
20 |     --secondary-foreground: 0 0% 9%;
21 |     --muted: 0 0% 96.1%;
22 |     --muted-foreground: 0 0% 45.1%;
23 |     --accent: 0 0% 96.1%;
24 |     --accent-foreground: 0 0% 9%;
25 |     --destructive: 0 84.2% 60.2%;
26 |     --destructive-foreground: 0 0% 98%;
27 |     --border: 0 0% 89.8%;
28 |     --input: 0 0% 89.8%;
29 |     --ring: 0 0% 3.9%;
30 |     --chart-1: 12 76% 61%;
31 |     --chart-2: 173 58% 39%;
32 |     --chart-3: 197 37% 24%;
33 |     --chart-4: 43 74% 66%;
34 |     --chart-5: 27 87% 67%;
35 |     --radius: 0.5rem;
36 |   }
37 |   .dark {
38 |     --background: 0 0% 3.9%;
39 |     --foreground: 0 0% 98%;
40 |     --card: 0 0% 3.9%;
41 |     --card-foreground: 0 0% 98%;
42 |     --popover: 0 0% 3.9%;
43 |     --popover-foreground: 0 0% 98%;
44 |     --primary: 0 0% 98%;
45 |     --primary-foreground: 0 0% 9%;
46 |     --secondary: 0 0% 14.9%;
47 |     --secondary-foreground: 0 0% 98%;
48 |     --muted: 0 0% 14.9%;
49 |     --muted-foreground: 0 0% 63.9%;
50 |     --accent: 0 0% 14.9%;
51 |     --accent-foreground: 0 0% 98%;
52 |     --destructive: 0 62.8% 30.6%;
53 |     --destructive-foreground: 0 0% 98%;
54 |     --border: 0 0% 14.9%;
55 |     --input: 0 0% 14.9%;
56 |     --ring: 0 0% 83.1%;
57 |     --chart-1: 220 70% 50%;
58 |     --chart-2: 160 60% 45%;
59 |     --chart-3: 30 80% 55%;
60 |     --chart-4: 280 65% 60%;
61 |     --chart-5: 340 75% 55%;
62 |   }
63 | }
64 | 
65 | @layer base {
66 |   * {
67 |     @apply border-border;
68 |   }
69 |   body {
70 |     @apply bg-background text-foreground;
71 |   }
72 | }
73 | 
--------------------------------------------------------------------------------
/components/ui/button.tsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react"
 2 | import { Slot } from "@radix-ui/react-slot"
 3 | import { cva, type VariantProps } from "class-variance-authority"
 4 | 
 5 | import { cn } from "@/lib/utils"
 6 | 
 7 | const buttonVariants = cva(
 8 |   "inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0",
 9 |   {
10 |     variants: {
11 |       variant: {
12 |         default:
13 |           "bg-primary text-primary-foreground shadow hover:bg-primary/90",
14 |         destructive:
15 |           "bg-destructive text-destructive-foreground shadow-sm hover:bg-destructive/90",
16 |         outline:
17 |           "border border-input bg-background shadow-sm hover:bg-accent hover:text-accent-foreground",
18 |         secondary:
19 |           "bg-secondary text-secondary-foreground shadow-sm hover:bg-secondary/80",
20 |         ghost: "hover:bg-accent hover:text-accent-foreground",
21 |         link: "text-primary underline-offset-4 hover:underline",
22 |       },
23 |       size: {
24 |         default: "h-9 px-4 py-2",
25 |         sm: "h-8 rounded-md px-3 text-xs",
26 |         lg: "h-10 rounded-md px-8",
27 |         icon: "h-9 w-9",
28 |       },
29 |     },
30 |     defaultVariants: {
31 |       variant: "default",
32 |       size: "default",
33 |     },
34 |   }
35 | )
36 | 
37 | export interface ButtonProps
38 |   extends React.ButtonHTMLAttributes<HTMLButtonElement>,
39 |     VariantProps<typeof buttonVariants> {
40 |   asChild?: boolean
41 | }
42 | 
43 | const Button = React.forwardRef<HTMLButtonElement, ButtonProps>(
44 |   ({ className, variant, size, asChild = false, ...props }, ref) => {
45 |     const Comp = asChild ? Slot : "button"
46 |     return (
47 |       <Comp
48 |         className={cn(buttonVariants({ variant, size, className }))}
49 |         ref={ref}
50 |         {...props}
51 |       />
52 |     )
53 |   }
54 | )
55 | Button.displayName = "Button"
56 | 
57 | export { Button, buttonVariants }
58 | 
--------------------------------------------------------------------------------
/components/ui/card.tsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react"
 2 | 
 3 | import { cn } from "@/lib/utils"
 4 | 
 5 | const Card = React.forwardRef<
 6 |   HTMLDivElement,
 7 |   React.HTMLAttributes<HTMLDivElement>
 8 | >(({ className, ...props }, ref) => (
 9 |   <div
10 |     ref={ref}
11 |     className={cn(
12 |       "rounded-xl border bg-card text-card-foreground shadow",
13 |       className
14 |     )}
15 |     {...props}
16 |   />
17 | ))
18 | Card.displayName = "Card"
19 | 
20 | const CardHeader = React.forwardRef<
21 |   HTMLDivElement,
22 |   React.HTMLAttributes<HTMLDivElement>
23 | >(({ className, ...props }, ref) => (
24 |   <div
25 |     ref={ref}
26 |     className={cn("flex flex-col space-y-1.5 p-6", className)}
27 |     {...props}
28 |   />
29 | ))
30 | CardHeader.displayName = "CardHeader"
31 | 
32 | const CardTitle = React.forwardRef<
33 |   HTMLDivElement,
34 |   React.HTMLAttributes<HTMLDivElement>
35 | >(({ className, ...props }, ref) => (
36 |   <div
37 |     ref={ref}
38 |     className={cn("font-semibold leading-none tracking-tight", className)}
39 |     {...props}
40 |   />
41 | ))
42 | CardTitle.displayName = "CardTitle"
43 | 
44 | const CardDescription = React.forwardRef<
45 |   HTMLDivElement,
46 |   React.HTMLAttributes<HTMLDivElement>
47 | >(({ className, ...props }, ref) => (
48 |   <div
49 |     ref={ref}
50 |     className={cn("text-sm text-muted-foreground", className)}
51 |     {...props}
52 |   />
53 | ))
54 | CardDescription.displayName = "CardDescription"
55 | 
56 | const CardContent = React.forwardRef<
57 |   HTMLDivElement,
58 |   React.HTMLAttributes<HTMLDivElement>
59 | >(({ className, ...props }, ref) => (
60 |   <div ref={ref} className={cn("p-6 pt-0", className)} {...props} />
61 | ))
62 | CardContent.displayName = "CardContent"
63 | 
64 | const CardFooter = React.forwardRef<
65 |   HTMLDivElement,
66 |   React.HTMLAttributes<HTMLDivElement>
67 | >(({ className, ...props }, ref) => (
68 |   <div
69 |     ref={ref}
70 |     className={cn("flex items-center p-6 pt-0", className)}
71 |     {...props}
72 |   />
73 | ))
74 | CardFooter.displayName = "CardFooter"
75 | 
76 | export { Card, CardHeader, CardFooter, CardTitle, CardDescription, CardContent }
77 | 
--------------------------------------------------------------------------------
/app/page.tsx:
--------------------------------------------------------------------------------
 1 | // app/page.tsx
 2 | "use client";
 3 | import { useState, useCallback } from 'react';
 4 | import CameraPreview from './components/CameraPreview';
 5 | import { ScrollArea } from "@/components/ui/scroll-area";
 6 | import { Avatar, AvatarImage, AvatarFallback } from "@/components/ui/avatar";
 7 | 
 8 | // Helper function to create message components
 9 | const HumanMessage = ({ text }: { text: string }) => (
10 |   <div className="flex items-start gap-3"> {/* JSX tags below reconstructed; classNames are approximate, the originals were stripped from this dump */}
11 |     <Avatar className="h-8 w-8">
12 |       <AvatarImage src="/avatars/human.png" alt="You" /> {/* path assumed; only gemini.png ships in public/avatars */}
13 |       <AvatarFallback>H</AvatarFallback>
14 |     </Avatar>
15 |     <div className="flex-1 space-y-1">
16 |       <div className="flex items-center gap-2">
17 |         <span className="text-sm font-medium">You</span>
18 |       </div>
19 |       <div className="rounded-lg bg-muted px-3 py-2 text-sm">
20 |         {text}
21 |       </div>
22 |     </div>
23 |   </div>
24 | );
25 | 
26 | const GeminiMessage = ({ text }: { text: string }) => (
27 |   <div className="flex items-start gap-3">
28 |     <Avatar className="h-8 w-8">
29 |       <AvatarImage src="/avatars/gemini.png" alt="Gemini" />
30 |       <AvatarFallback>AI</AvatarFallback>
31 |     </Avatar>
32 |     <div className="flex-1 space-y-1">
33 |       <div className="flex items-center gap-2">
34 |         <span className="text-sm font-medium">Gemini</span>
35 |       </div>
36 |       <div className="rounded-lg bg-muted px-3 py-2 text-sm">
37 |         {text}
38 |       </div>
39 |     </div>
40 |   </div>
41 | );
42 | 
43 | export default function Home() {
44 |   const [messages, setMessages] = useState<{ type: 'human' | 'gemini', text: string }[]>([]);
45 | 
46 |   const handleTranscription = useCallback((transcription: string) => {
47 |     setMessages(prev => [...prev, { type: 'gemini', text: transcription }]);
48 |   }, []);
49 | 
50 |   return (
51 |     <>
52 |       <div className="container mx-auto px-4 py-8">
53 |         <h1 className="mb-6 text-2xl font-bold">Multimodal Live Chat</h1>
54 | 
55 |         <div className="flex items-start gap-8">
56 |           <CameraPreview onTranscription={handleTranscription} />
57 | 
58 |           <div className="w-full max-w-xl">
59 | 
60 |             <ScrollArea className="h-[540px] rounded-lg border p-4">
61 | 
62 |               {messages.map((message, index) => (
63 |                 message.type === 'human' ? (
64 |                   <HumanMessage key={`msg-${index}`} text={message.text} />
65 |                 ) : (
66 |                   <GeminiMessage key={`msg-${index}`} text={message.text} />
67 |                 )
68 |               ))}
69 |             </ScrollArea>
70 |           </div>
71 |         </div>
72 |       </div>
73 |     </>
74 |   );
75 | }
76 | 
--------------------------------------------------------------------------------
/app/utils/audioUtils.ts:
--------------------------------------------------------------------------------
 1 | // Helper function to download and analyze WAV data
 2 | function debugSaveWav(wavData: string, filename: string = 'debug.wav') {
 3 |   const byteString = atob(wavData);
 4 |   const bytes = new Uint8Array(byteString.length);
 5 |   for (let i = 0; i < byteString.length; i++) {
 6 |     bytes[i] = byteString.charCodeAt(i);
 7 |   }
 8 | 
 9 |   // Create blob and download
10 |   const blob = new Blob([bytes], { type: 'audio/wav' });
11 |   const url = URL.createObjectURL(blob);
12 |   const a = document.createElement('a');
13 |   a.href = url;
14 |   a.download = filename;
15 |   document.body.appendChild(a);
16 |   a.click();
17 |   document.body.removeChild(a);
18 |   URL.revokeObjectURL(url);
19 | }
20 | 
21 | export function pcmToWav(pcmData: string, sampleRate: number = 24000): Promise<string> {
22 |   return new Promise((resolve, reject) => {
23 |     try {
24 |       // Decode base64 PCM data
25 |       const binaryString = atob(pcmData);
26 |       const pcmBytes = new Uint8Array(binaryString.length);
27 |       for (let i = 0; i < binaryString.length; i++) {
28 |         pcmBytes[i] = binaryString.charCodeAt(i);
29 |       }
30 | 
31 |       // Convert bytes to samples (assuming 16-bit PCM)
32 |       const samples = new Int16Array(pcmBytes.buffer);
33 | 
34 |       // Create WAV header
35 |       const wavHeader = new ArrayBuffer(44);
36 |       const view = new DataView(wavHeader);
37 | 
38 |       const pcmByteLength = samples.length * 2; // 16-bit = 2 bytes per sample
39 | 
40 |       // "RIFF" chunk descriptor
41 |       view.setUint8(0, 'R'.charCodeAt(0));
42 |       view.setUint8(1, 'I'.charCodeAt(0));
43 |       view.setUint8(2, 'F'.charCodeAt(0));
44 |       view.setUint8(3, 'F'.charCodeAt(0));
45 | 
46 |       // File length (header size + data size)
47 |       view.setUint32(4, 36 + pcmByteLength, true);
48 | 
49 |       // "WAVE" format
50 |       view.setUint8(8, 'W'.charCodeAt(0));
51 |       view.setUint8(9, 'A'.charCodeAt(0));
52 |       view.setUint8(10, 'V'.charCodeAt(0));
53 |       view.setUint8(11, 'E'.charCodeAt(0));
54 | 
55 |       // "fmt " sub-chunk
56 |       view.setUint8(12, 'f'.charCodeAt(0));
57 |       view.setUint8(13, 'm'.charCodeAt(0));
58 |       view.setUint8(14, 't'.charCodeAt(0));
59 |       view.setUint8(15, ' '.charCodeAt(0));
60 | 
61 |       // Sub-chunk size
62 |       view.setUint32(16, 16, true);
63 |       // Audio format (PCM = 1)
64 |       view.setUint16(20, 1, true);
65 |       // Number of channels
66 |       view.setUint16(22, 1, true);
67 |       // Sample rate
68 |       view.setUint32(24, sampleRate, true);
69 |       // Byte rate
70 |       view.setUint32(28, sampleRate * 2, true);
71 |       // Block align
72 |       view.setUint16(32, 2, true);
73 |       // Bits per sample
74 |       view.setUint16(34, 16, true);
75 | 
76 |       // "data" sub-chunk
77 |       view.setUint8(36, 'd'.charCodeAt(0));
78 |       view.setUint8(37, 'a'.charCodeAt(0));
79 |       view.setUint8(38, 't'.charCodeAt(0));
80 |       view.setUint8(39, 'a'.charCodeAt(0));
81 | 
82 |       // Data size
83 |       view.setUint32(40, pcmByteLength, true);
84 | 
85 |       // Create final buffer
86 |       const wavBuffer = new ArrayBuffer(wavHeader.byteLength + pcmByteLength);
87 |       const wavBytes = new Uint8Array(wavBuffer);
88 | 
89 |       // Copy header and PCM data
90 |       wavBytes.set(new Uint8Array(wavHeader), 0);
91 |       wavBytes.set(new Uint8Array(samples.buffer), wavHeader.byteLength);
92 | 
93 |       // Use Blob and FileReader to convert to base64
94 |       const blob = new Blob([wavBytes], { type: 'audio/wav' });
95 |       const reader = new FileReader();
96 |       reader.onloadend = () => {
97 |         const base64data =
reader.result?.toString().split(',')[1]; 98 | if (base64data) { 99 | resolve(base64data); 100 | } else { 101 | reject(new Error("Failed to convert WAV to base64")); 102 | } 103 | }; 104 | reader.onerror = reject; 105 | reader.readAsDataURL(blob); 106 | } catch (error) { 107 | reject(error); 108 | } 109 | }); 110 | } -------------------------------------------------------------------------------- /app/services/geminiWebSocket.ts: -------------------------------------------------------------------------------- 1 | import { Base64 } from 'js-base64'; 2 | import { TranscriptionService } from './transcriptionService'; 3 | import { pcmToWav } from '../utils/audioUtils'; 4 | 5 | const MODEL = "models/gemini-2.0-flash-exp"; 6 | const API_KEY = process.env.NEXT_PUBLIC_GEMINI_API_KEY; 7 | const HOST = "generativelanguage.googleapis.com"; 8 | const WS_URL = `wss://${HOST}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${API_KEY}`; 9 | 10 | export class GeminiWebSocket { 11 | private ws: WebSocket | null = null; 12 | private isConnected: boolean = false; 13 | private isSetupComplete: boolean = false; 14 | private onMessageCallback: ((text: string) => void) | null = null; 15 | private onSetupCompleteCallback: (() => void) | null = null; 16 | private audioContext: AudioContext | null = null; 17 | 18 | // Audio queue management 19 | private audioQueue: Float32Array[] = []; 20 | private isPlaying: boolean = false; 21 | private currentSource: AudioBufferSourceNode | null = null; 22 | private isPlayingResponse: boolean = false; 23 | private onPlayingStateChange: ((isPlaying: boolean) => void) | null = null; 24 | private onAudioLevelChange: ((level: number) => void) | null = null; 25 | private onTranscriptionCallback: ((text: string) => void) | null = null; 26 | private transcriptionService: TranscriptionService; 27 | private accumulatedPcmData: string[] = []; 28 | 29 | constructor( 30 | onMessage: (text: string) => void, 31 | onSetupComplete: () => void, 32 | onPlayingStateChange: (isPlaying: boolean) => void, 33 | onAudioLevelChange: (level: number) => void, 34 | onTranscription: (text: string) => void 35 | ) { 36 | this.onMessageCallback = onMessage; 37 | this.onSetupCompleteCallback = onSetupComplete; 38 | this.onPlayingStateChange = onPlayingStateChange; 39 | this.onAudioLevelChange = onAudioLevelChange; 40 | this.onTranscriptionCallback = onTranscription; 41 | // Create AudioContext for playback 42 | this.audioContext = new AudioContext({ 43 | sampleRate: 24000 // Match the response audio rate 44 | }); 45 | this.transcriptionService = new TranscriptionService(); 46 | } 47 | 48 | connect() { 49 | if (this.ws?.readyState === WebSocket.OPEN) { 50 | return; 51 | } 52 | 53 | this.ws = new WebSocket(WS_URL); 54 | 55 | this.ws.onopen = () => { 56 | this.isConnected = true; 57 | this.sendInitialSetup(); 58 | }; 59 | 60 | this.ws.onmessage = async (event) => { 61 | try { 62 | let messageText: string; 63 | if (event.data instanceof Blob) { 64 | const arrayBuffer = await event.data.arrayBuffer(); 65 | const bytes = new Uint8Array(arrayBuffer); 66 | messageText = new TextDecoder('utf-8').decode(bytes); 67 | } else { 68 | messageText = event.data; 69 | } 70 | 71 | await this.handleMessage(messageText); 72 | } catch (error) { 73 | console.error("[WebSocket] Error processing message:", error); 74 | } 75 | }; 76 | 77 | this.ws.onerror = (error) => { 78 | console.error("[WebSocket] Error:", error); 79 | }; 80 | 81 | this.ws.onclose = (event) => { 82 | this.isConnected = false; 83 | 84 
| // Only attempt to reconnect if we haven't explicitly called disconnect 85 | if (!event.wasClean && this.isSetupComplete) { 86 | setTimeout(() => this.connect(), 1000); 87 | } 88 | }; 89 | } 90 | 91 | private sendInitialSetup() { 92 | const setupMessage = { 93 | setup: { 94 | model: MODEL, 95 | generation_config: { 96 | response_modalities: ["AUDIO"] 97 | } 98 | } 99 | }; 100 | this.ws?.send(JSON.stringify(setupMessage)); 101 | } 102 | 103 | sendMediaChunk(b64Data: string, mimeType: string) { 104 | if (!this.isConnected || !this.ws || !this.isSetupComplete) return; 105 | 106 | const message = { 107 | realtime_input: { 108 | media_chunks: [{ 109 | mime_type: mimeType === "audio/pcm" ? "audio/pcm" : mimeType, 110 | data: b64Data 111 | }] 112 | } 113 | }; 114 | 115 | try { 116 | this.ws.send(JSON.stringify(message)); 117 | } catch (error) { 118 | console.error("[WebSocket] Error sending media chunk:", error); 119 | } 120 | } 121 | 122 | private async playAudioResponse(base64Data: string) { 123 | if (!this.audioContext) return; 124 | 125 | try { 126 | // Decode base64 to bytes 127 | const binaryString = atob(base64Data); 128 | const bytes = new Uint8Array(binaryString.length); 129 | for (let i = 0; i < binaryString.length; i++) { 130 | bytes[i] = binaryString.charCodeAt(i); 131 | } 132 | 133 | // Convert to Int16Array (PCM format) 134 | const pcmData = new Int16Array(bytes.buffer); 135 | 136 | // Convert to float32 for Web Audio API 137 | const float32Data = new Float32Array(pcmData.length); 138 | for (let i = 0; i < pcmData.length; i++) { 139 | float32Data[i] = pcmData[i] / 32768.0; 140 | } 141 | 142 | // Add to queue and start playing if not already playing 143 | this.audioQueue.push(float32Data); 144 | this.playNextInQueue(); 145 | } catch (error) { 146 | console.error("[WebSocket] Error processing audio:", error); 147 | } 148 | } 149 | 150 | private async playNextInQueue() { 151 | if (!this.audioContext || this.isPlaying || this.audioQueue.length === 0) return; 152 | 153 | try { 154 | this.isPlaying = true; 155 | this.isPlayingResponse = true; 156 | this.onPlayingStateChange?.(true); 157 | const float32Data = this.audioQueue.shift()!; 158 | 159 | // Calculate audio level 160 | let sum = 0; 161 | for (let i = 0; i < float32Data.length; i++) { 162 | sum += Math.abs(float32Data[i]); 163 | } 164 | const level = Math.min((sum / float32Data.length) * 100 * 5, 100); 165 | this.onAudioLevelChange?.(level); 166 | 167 | const audioBuffer = this.audioContext.createBuffer( 168 | 1, 169 | float32Data.length, 170 | 24000 171 | ); 172 | audioBuffer.getChannelData(0).set(float32Data); 173 | 174 | this.currentSource = this.audioContext.createBufferSource(); 175 | this.currentSource.buffer = audioBuffer; 176 | this.currentSource.connect(this.audioContext.destination); 177 | 178 | this.currentSource.onended = () => { 179 | this.isPlaying = false; 180 | this.currentSource = null; 181 | if (this.audioQueue.length === 0) { 182 | this.isPlayingResponse = false; 183 | this.onPlayingStateChange?.(false); 184 | } 185 | this.playNextInQueue(); 186 | }; 187 | 188 | this.currentSource.start(); 189 | } catch (error) { 190 | console.error("[WebSocket] Error playing audio:", error); 191 | this.isPlaying = false; 192 | this.isPlayingResponse = false; 193 | this.onPlayingStateChange?.(false); 194 | this.currentSource = null; 195 | this.playNextInQueue(); 196 | } 197 | } 198 | 199 | private stopCurrentAudio() { 200 | if (this.currentSource) { 201 | try { 202 | this.currentSource.stop(); 203 | } catch (e) { 204 | // Ignore 
errors if already stopped 205 | } 206 | this.currentSource = null; 207 | } 208 | this.isPlaying = false; 209 | this.isPlayingResponse = false; 210 | this.onPlayingStateChange?.(false); 211 | this.audioQueue = []; // Clear queue 212 | } 213 | 214 | private async handleMessage(message: string) { 215 | try { 216 | const messageData = JSON.parse(message); 217 | 218 | if (messageData.setupComplete) { 219 | this.isSetupComplete = true; 220 | this.onSetupCompleteCallback?.(); 221 | return; 222 | } 223 | 224 | // Handle audio data 225 | if (messageData.serverContent?.modelTurn?.parts) { 226 | const parts = messageData.serverContent.modelTurn.parts; 227 | for (const part of parts) { 228 | if (part.inlineData?.mimeType === "audio/pcm;rate=24000") { 229 | this.accumulatedPcmData.push(part.inlineData.data); 230 | this.playAudioResponse(part.inlineData.data); 231 | } 232 | } 233 | } 234 | 235 | // Handle turn completion separately 236 | if (messageData.serverContent?.turnComplete === true) { 237 | if (this.accumulatedPcmData.length > 0) { 238 | try { 239 | const fullPcmData = this.accumulatedPcmData.join(''); 240 | const wavData = await pcmToWav(fullPcmData, 24000); 241 | 242 | const transcription = await this.transcriptionService.transcribeAudio( 243 | wavData, 244 | "audio/wav" 245 | ); 246 | console.log("[Transcription]:", transcription); 247 | 248 | this.onTranscriptionCallback?.(transcription); 249 | this.accumulatedPcmData = []; // Clear accumulated data 250 | } catch (error) { 251 | console.error("[WebSocket] Transcription error:", error); 252 | } 253 | } 254 | } 255 | } catch (error) { 256 | console.error("[WebSocket] Error parsing message:", error); 257 | } 258 | } 259 | 260 | disconnect() { 261 | this.isSetupComplete = false; 262 | if (this.ws) { 263 | this.ws.close(1000, "Intentional disconnect"); 264 | this.ws = null; 265 | } 266 | this.isConnected = false; 267 | this.accumulatedPcmData = []; 268 | } 269 | } -------------------------------------------------------------------------------- /app/components/CameraPreview.tsx: -------------------------------------------------------------------------------- 1 | // app/components/CameraPreview.tsx 2 | "use client"; 3 | 4 | import { useEffect, useRef, useState, useCallback } from 'react'; 5 | import { Card, CardContent } from "../../components/ui/card"; 6 | import { Button } from "../../components/ui/button"; 7 | import { Video, VideoOff } from "lucide-react"; 8 | import { GeminiWebSocket } from '../services/geminiWebSocket'; 9 | import { Base64 } from 'js-base64'; 10 | import { Avatar, AvatarImage, AvatarFallback } from "@/components/ui/avatar"; 11 | 12 | interface CameraPreviewProps { 13 | onTranscription: (text: string) => void; 14 | } 15 | 16 | export default function CameraPreview({ onTranscription }: CameraPreviewProps) { 17 | const videoRef = useRef(null); 18 | const audioContextRef = useRef(null); 19 | const [isStreaming, setIsStreaming] = useState(false); 20 | const [stream, setStream] = useState(null); 21 | const [audioLevel, setAudioLevel] = useState(0); 22 | const geminiWsRef = useRef(null); 23 | const videoCanvasRef = useRef(null); 24 | const audioWorkletNodeRef = useRef(null); 25 | const [isAudioSetup, setIsAudioSetup] = useState(false); 26 | const setupInProgressRef = useRef(false); 27 | const [isWebSocketReady, setIsWebSocketReady] = useState(false); 28 | const imageIntervalRef = useRef(null); 29 | const [isModelSpeaking, setIsModelSpeaking] = useState(false); 30 | const [outputAudioLevel, setOutputAudioLevel] = useState(0); 31 | 
const [connectionStatus, setConnectionStatus] = useState<'disconnected' | 'connecting' | 'connected'>('disconnected'); 32 | 33 | const cleanupAudio = useCallback(() => { 34 | if (audioWorkletNodeRef.current) { 35 | audioWorkletNodeRef.current.disconnect(); 36 | audioWorkletNodeRef.current = null; 37 | } 38 | if (audioContextRef.current) { 39 | audioContextRef.current.close(); 40 | audioContextRef.current = null; 41 | } 42 | }, []); 43 | 44 | const cleanupWebSocket = useCallback(() => { 45 | if (geminiWsRef.current) { 46 | geminiWsRef.current.disconnect(); 47 | geminiWsRef.current = null; 48 | } 49 | }, []); 50 | 51 | // Simplify sendAudioData to just send continuously 52 | const sendAudioData = (b64Data: string) => { 53 | if (!geminiWsRef.current) return; 54 | geminiWsRef.current.sendMediaChunk(b64Data, "audio/pcm"); 55 | }; 56 | 57 | const toggleCamera = async () => { 58 | if (isStreaming && stream) { 59 | setIsStreaming(false); 60 | cleanupWebSocket(); 61 | cleanupAudio(); 62 | stream.getTracks().forEach(track => track.stop()); 63 | if (videoRef.current) { 64 | videoRef.current.srcObject = null; 65 | } 66 | setStream(null); 67 | } else { 68 | try { 69 | const videoStream = await navigator.mediaDevices.getUserMedia({ 70 | video: true, 71 | audio: false 72 | }); 73 | 74 | const audioStream = await navigator.mediaDevices.getUserMedia({ 75 | audio: { 76 | sampleRate: 16000, 77 | channelCount: 1, 78 | echoCancellation: true, 79 | autoGainControl: true, 80 | noiseSuppression: true, 81 | } 82 | }); 83 | 84 | audioContextRef.current = new AudioContext({ 85 | sampleRate: 16000, 86 | }); 87 | 88 | if (videoRef.current) { 89 | videoRef.current.srcObject = videoStream; 90 | videoRef.current.muted = true; 91 | } 92 | 93 | const combinedStream = new MediaStream([ 94 | ...videoStream.getTracks(), 95 | ...audioStream.getTracks() 96 | ]); 97 | 98 | setStream(combinedStream); 99 | setIsStreaming(true); 100 | } catch (err) { 101 | console.error('Error accessing media devices:', err); 102 | cleanupAudio(); 103 | } 104 | } 105 | }; 106 | 107 | // Initialize WebSocket connection 108 | useEffect(() => { 109 | if (!isStreaming) { 110 | setConnectionStatus('disconnected'); 111 | return; 112 | } 113 | 114 | setConnectionStatus('connecting'); 115 | geminiWsRef.current = new GeminiWebSocket( 116 | (text) => { 117 | console.log("Received from Gemini:", text); 118 | }, 119 | () => { 120 | console.log("[Camera] WebSocket setup complete, starting media capture"); 121 | setIsWebSocketReady(true); 122 | setConnectionStatus('connected'); 123 | }, 124 | (isPlaying) => { 125 | setIsModelSpeaking(isPlaying); 126 | }, 127 | (level) => { 128 | setOutputAudioLevel(level); 129 | }, 130 | onTranscription 131 | ); 132 | geminiWsRef.current.connect(); 133 | 134 | return () => { 135 | if (imageIntervalRef.current) { 136 | clearInterval(imageIntervalRef.current); 137 | imageIntervalRef.current = null; 138 | } 139 | cleanupWebSocket(); 140 | setIsWebSocketReady(false); 141 | setConnectionStatus('disconnected'); 142 | }; 143 | }, [isStreaming, onTranscription, cleanupWebSocket]); 144 | 145 | // Start image capture only after WebSocket is ready 146 | useEffect(() => { 147 | if (!isStreaming || !isWebSocketReady) return; 148 | 149 | console.log("[Camera] Starting image capture interval"); 150 | imageIntervalRef.current = setInterval(captureAndSendImage, 1000); 151 | 152 | return () => { 153 | if (imageIntervalRef.current) { 154 | clearInterval(imageIntervalRef.current); 155 | imageIntervalRef.current = null; 156 | } 157 | }; 158 | }, 
[isStreaming, isWebSocketReady]); 159 | 160 | // Update audio processing setup 161 | useEffect(() => { 162 | if (!isStreaming || !stream || !audioContextRef.current || 163 | !isWebSocketReady || isAudioSetup || setupInProgressRef.current) return; 164 | 165 | let isActive = true; 166 | setupInProgressRef.current = true; 167 | 168 | const setupAudioProcessing = async () => { 169 | try { 170 | const ctx = audioContextRef.current; 171 | if (!ctx || ctx.state === 'closed' || !isActive) { 172 | setupInProgressRef.current = false; 173 | return; 174 | } 175 | 176 | if (ctx.state === 'suspended') { 177 | await ctx.resume(); 178 | } 179 | 180 | await ctx.audioWorklet.addModule('/worklets/audio-processor.js'); 181 | 182 | if (!isActive) { 183 | setupInProgressRef.current = false; 184 | return; 185 | } 186 | 187 | audioWorkletNodeRef.current = new AudioWorkletNode(ctx, 'audio-processor', { 188 | numberOfInputs: 1, 189 | numberOfOutputs: 1, 190 | processorOptions: { 191 | sampleRate: 16000, 192 | bufferSize: 4096, // Larger buffer size like original 193 | }, 194 | channelCount: 1, 195 | channelCountMode: 'explicit', 196 | channelInterpretation: 'speakers' 197 | }); 198 | 199 | const source = ctx.createMediaStreamSource(stream); 200 | audioWorkletNodeRef.current.port.onmessage = (event) => { 201 | if (!isActive || isModelSpeaking) return; 202 | const { pcmData, level } = event.data; 203 | setAudioLevel(level); 204 | 205 | const pcmArray = new Uint8Array(pcmData); 206 | const b64Data = Base64.fromUint8Array(pcmArray); 207 | sendAudioData(b64Data); 208 | }; 209 | 210 | source.connect(audioWorkletNodeRef.current); 211 | setIsAudioSetup(true); 212 | setupInProgressRef.current = false; 213 | 214 | return () => { 215 | source.disconnect(); 216 | if (audioWorkletNodeRef.current) { 217 | audioWorkletNodeRef.current.disconnect(); 218 | } 219 | setIsAudioSetup(false); 220 | }; 221 | } catch (error) { 222 | if (isActive) { 223 | cleanupAudio(); 224 | setIsAudioSetup(false); 225 | } 226 | setupInProgressRef.current = false; 227 | } 228 | }; 229 | 230 | console.log("[Camera] Starting audio processing setup"); 231 | setupAudioProcessing(); 232 | 233 | return () => { 234 | isActive = false; 235 | setIsAudioSetup(false); 236 | setupInProgressRef.current = false; 237 | if (audioWorkletNodeRef.current) { 238 | audioWorkletNodeRef.current.disconnect(); 239 | audioWorkletNodeRef.current = null; 240 | } 241 | }; 242 | }, [isStreaming, stream, isWebSocketReady, isModelSpeaking]); 243 | 244 | // Capture and send image 245 | const captureAndSendImage = () => { 246 | if (!videoRef.current || !videoCanvasRef.current || !geminiWsRef.current) return; 247 | 248 | const canvas = videoCanvasRef.current; 249 | const context = canvas.getContext('2d'); 250 | if (!context) return; 251 | 252 | // Set canvas size to match video 253 | canvas.width = videoRef.current.videoWidth; 254 | canvas.height = videoRef.current.videoHeight; 255 | 256 | // Draw video frame to canvas 257 | context.drawImage(videoRef.current, 0, 0); 258 | 259 | // Convert to base64 and send 260 | const imageData = canvas.toDataURL('image/jpeg', 0.8); 261 | const b64Data = imageData.split(',')[1]; 262 | geminiWsRef.current.sendMediaChunk(b64Data, "image/jpeg"); 263 | }; 264 | 265 | return ( 266 |
267 |
268 |
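The dump cuts off inside CameraPreview's render method, but the service it drives is shown in full above. A minimal usage sketch of `GeminiWebSocket` — the constructor and `sendMediaChunk` signatures are taken from `app/services/geminiWebSocket.ts`, while the callback bodies, variable names, and placeholder data here are illustrative:

```ts
// Sketch: wiring GeminiWebSocket from a "use client" component, as CameraPreview does.
import { GeminiWebSocket } from '@/app/services/geminiWebSocket';

const socket = new GeminiWebSocket(
  (text) => console.log('model text:', text),               // onMessage
  () => console.log('setup complete'),                      // onSetupComplete: safe to start streaming
  (isPlaying) => console.log('model speaking:', isPlaying), // pause mic forwarding while true
  (level) => console.log('output audio level:', level),     // drives the playback level meter
  (transcript) => console.log('transcript:', transcript)    // per-turn transcription of model audio
);

socket.connect();

// Once setup completes, stream base64-encoded media:
// - 16 kHz mono PCM16 chunks produced by the audio worklet
// - ~1 fps JPEG frames captured from the <video> element
declare const base64Pcm: string, base64Jpeg: string; // placeholders for real captured data
socket.sendMediaChunk(base64Pcm, 'audio/pcm');
socket.sendMediaChunk(base64Jpeg, 'image/jpeg');

// Tear down when the camera is toggled off:
socket.disconnect();
```

CameraPreview follows exactly this lifecycle: it connects when streaming starts, begins the one-second image-capture interval only after the setup-complete callback fires, suppresses microphone chunks while `isModelSpeaking` is true, and disconnects in its cleanup effects.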