├── .env.example
├── README.md
├── app
│   ├── components
│   │   └── CameraPreview.tsx
│   ├── favicon.ico
│   ├── globals.css
│   ├── layout.tsx
│   ├── page.tsx
│   ├── services
│   │   ├── geminiWebSocket.ts
│   │   └── transcriptionService.ts
│   └── utils
│       └── audioUtils.ts
├── components
│   └── ui
│       ├── avatar.tsx
│       ├── button.tsx
│       ├── card.tsx
│       ├── progress.tsx
│       └── scroll-area.tsx
├── components.json
├── diagram.jpg
├── eslint.config.mjs
├── lib
│   └── utils.ts
├── next-env.d.ts
├── next.config.js
├── next.config.ts
├── package.json
├── postcss.config.mjs
├── public
│   ├── avatars
│   │   └── gemini.png
│   ├── file.svg
│   ├── globe.svg
│   ├── next.svg
│   ├── vercel.svg
│   ├── window.svg
│   └── worklets
│       └── audio-processor.js
├── tailwind.config.ts
├── thumbnail.jpg
└── tsconfig.json
/.env.example:
--------------------------------------------------------------------------------
1 | NEXT_PUBLIC_GEMINI_API_KEY=your_api_key_here
2 |
--------------------------------------------------------------------------------
/diagram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/diagram.jpg
--------------------------------------------------------------------------------
/thumbnail.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/thumbnail.jpg
--------------------------------------------------------------------------------
/app/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/app/favicon.ico
--------------------------------------------------------------------------------
/public/avatars/gemini.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyu2/gemini-nextjs/HEAD/public/avatars/gemini.png
--------------------------------------------------------------------------------
/public/vercel.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/next.config.ts:
--------------------------------------------------------------------------------
1 | import type { NextConfig } from "next";
2 |
3 | const nextConfig: NextConfig = {
4 | /* config options here */
5 | };
6 |
7 | export default nextConfig;
8 |
--------------------------------------------------------------------------------
/postcss.config.mjs:
--------------------------------------------------------------------------------
1 | /** @type {import('postcss-load-config').Config} */
2 | const config = {
3 | plugins: {
4 | tailwindcss: {},
5 | },
6 | };
7 |
8 | export default config;
9 |
--------------------------------------------------------------------------------
/lib/utils.ts:
--------------------------------------------------------------------------------
1 | import { clsx, type ClassValue } from "clsx"
2 | import { twMerge } from "tailwind-merge"
3 |
4 | export function cn(...inputs: ClassValue[]) {
5 | return twMerge(clsx(inputs))
6 | }
7 |
--------------------------------------------------------------------------------
/next-env.d.ts:
--------------------------------------------------------------------------------
1 | ///
2 | ///
3 |
4 | // NOTE: This file should not be edited
5 | // see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
6 |
--------------------------------------------------------------------------------
/public/window.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/file.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
1 | import { dirname } from "path";
2 | import { fileURLToPath } from "url";
3 | import { FlatCompat } from "@eslint/eslintrc";
4 |
5 | const __filename = fileURLToPath(import.meta.url);
6 | const __dirname = dirname(__filename);
7 |
8 | const compat = new FlatCompat({
9 | baseDirectory: __dirname,
10 | });
11 |
12 | const eslintConfig = [
13 | ...compat.extends("next/core-web-vitals", "next/typescript"),
14 | ];
15 |
16 | export default eslintConfig;
17 |
--------------------------------------------------------------------------------
/next.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('next').NextConfig} */
2 | const nextConfig = {
3 | async headers() {
4 | return [
5 | {
6 | source: '/worklets/:path*',
7 | headers: [
8 | {
9 | key: 'Content-Type',
10 | value: 'application/javascript',
11 | },
12 | {
13 | key: 'Access-Control-Allow-Origin',
14 | value: '*',
15 | },
16 | ],
17 | },
18 | ];
19 | },
20 | };
21 |
22 | module.exports = nextConfig;
--------------------------------------------------------------------------------
/components.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://ui.shadcn.com/schema.json",
3 | "style": "new-york",
4 | "rsc": true,
5 | "tsx": true,
6 | "tailwind": {
7 | "config": "tailwind.config.ts",
8 | "css": "app/globals.css",
9 | "baseColor": "neutral",
10 | "cssVariables": true,
11 | "prefix": ""
12 | },
13 | "aliases": {
14 | "components": "@/components",
15 | "utils": "@/lib/utils",
16 | "ui": "@/components/ui",
17 | "lib": "@/lib",
18 | "hooks": "@/hooks"
19 | },
20 | "iconLibrary": "lucide"
21 | }
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2017",
4 | "lib": ["dom", "dom.iterable", "esnext"],
5 | "allowJs": true,
6 | "skipLibCheck": true,
7 | "strict": true,
8 | "noEmit": true,
9 | "esModuleInterop": true,
10 | "module": "esnext",
11 | "moduleResolution": "bundler",
12 | "resolveJsonModule": true,
13 | "isolatedModules": true,
14 | "jsx": "preserve",
15 | "incremental": true,
16 | "plugins": [
17 | {
18 | "name": "next"
19 | }
20 | ],
21 | "paths": {
22 | "@/*": ["./*"]
23 | },
24 | "baseUrl": "."
25 | },
26 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
27 | "exclude": ["node_modules"]
28 | }
29 |
--------------------------------------------------------------------------------
/app/layout.tsx:
--------------------------------------------------------------------------------
1 | // app/layout.tsx
2 | import type { Metadata } from 'next';
3 | import { Inter } from 'next/font/google';
4 | import './globals.css';
5 |
6 | const inter = Inter({ subsets: ['latin'] });
7 |
8 | export const metadata: Metadata = {
9 | title: 'Camera Preview App',
10 | description: 'Next.js Camera Interface',
11 | };
12 |
13 | export default function RootLayout({
14 | children,
15 | }: {
16 | children: React.ReactNode;
17 | }) {
18 | return (
19 |     <html lang="en">
20 |       <body className={inter.className}>
21 |         <main>
22 |           {children}
23 |         </main>
24 |       </body>
25 |     </html>
26 | );
27 | }
28 |
--------------------------------------------------------------------------------
/components/ui/progress.tsx:
--------------------------------------------------------------------------------
1 | "use client"
2 |
3 | import * as React from "react"
4 | import * as ProgressPrimitive from "@radix-ui/react-progress"
5 |
6 | import { cn } from "@/lib/utils"
7 |
8 | const Progress = React.forwardRef<
9 |   React.ElementRef<typeof ProgressPrimitive.Root>,
10 |   React.ComponentPropsWithoutRef<typeof ProgressPrimitive.Root>
11 | >(({ className, value, ...props }, ref) => (
12 |   <ProgressPrimitive.Root
13 |     ref={ref}
14 |     className={cn(
15 |       "relative h-2 w-full overflow-hidden rounded-full bg-primary/20",
16 |       className
17 |     )}
18 |     {...props}
19 |   >
20 |     <ProgressPrimitive.Indicator
21 |       className="h-full w-full flex-1 bg-primary transition-all"
22 |       style={{ transform: `translateX(-${100 - (value || 0)}%)` }}
23 |     />
24 |   </ProgressPrimitive.Root>
25 | ))
26 | Progress.displayName = ProgressPrimitive.Root.displayName
27 |
28 | export { Progress }
29 |
--------------------------------------------------------------------------------
/public/globe.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/app/services/transcriptionService.ts:
--------------------------------------------------------------------------------
1 | import { GoogleGenerativeAI } from "@google/generative-ai";
2 |
3 | const genAI = new GoogleGenerativeAI(process.env.NEXT_PUBLIC_GEMINI_API_KEY || '');
4 | const MODEL_NAME = "gemini-1.5-flash-8b";
5 |
6 | export class TranscriptionService {
7 | private model;
8 |
9 | constructor() {
10 | this.model = genAI.getGenerativeModel({ model: MODEL_NAME });
11 | }
12 |
13 | async transcribeAudio(audioBase64: string, mimeType: string = "audio/wav"): Promise {
14 | try {
15 | const result = await this.model.generateContent([
16 | {
17 | inlineData: {
18 | mimeType: mimeType,
19 | data: audioBase64
20 | }
21 | },
22 | { text: "Please transcribe the spoken language in this audio accurately. Ignore any background noise or non-speech sounds." },
23 | ]);
24 |
25 | return result.response.text();
26 | } catch (error) {
27 | console.error("Transcription error:", error);
28 | throw error;
29 | }
30 | }
31 | }
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "my-app",
3 | "version": "0.1.0",
4 | "private": true,
5 | "scripts": {
6 | "dev": "next dev --turbopack",
7 | "build": "next build",
8 | "start": "next start",
9 | "lint": "next lint"
10 | },
11 | "dependencies": {
12 | "@google/generative-ai": "^0.21.0",
13 | "@radix-ui/react-avatar": "^1.1.3",
14 | "@radix-ui/react-dropdown-menu": "^2.1.6",
15 | "@radix-ui/react-progress": "^1.1.2",
16 | "@radix-ui/react-scroll-area": "^1.2.3",
17 | "@radix-ui/react-slot": "^1.1.2",
18 | "class-variance-authority": "^0.7.1",
19 | "clsx": "^2.1.1",
20 | "js-base64": "^3.7.7",
21 | "lamejs": "^1.2.1",
22 | "lucide-react": "^0.475.0",
23 | "next": "15.1.7",
24 | "react": "^19.0.0",
25 | "react-dom": "^19.0.0",
26 | "tailwind-merge": "^3.0.1",
27 | "tailwindcss-animate": "^1.0.7"
28 | },
29 | "devDependencies": {
30 | "@eslint/eslintrc": "^3",
31 | "@types/node": "^20",
32 | "@types/react": "^19",
33 | "@types/react-dom": "^19",
34 | "eslint": "^9",
35 | "eslint-config-next": "15.1.7",
36 | "postcss": "^8",
37 | "tailwindcss": "^3.4.1",
38 | "typescript": "^5"
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/public/next.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multimodal Realtime App with Gemini 2.0 and the Next.js Framework
2 |
3 | A demonstration project showing how to build a realtime multimodal application using Google's Gemini 2.0 API and Next.js. The app streams audio and video to the model and generates transcripts of the spoken replies in realtime.
4 |
5 | 
6 |
7 | [Watch the Tutorial Video](https://youtu.be/YUfer6xyExY)
8 |
9 | ## Features
10 |
11 | - Realtime audio/video (image frame) interaction with the Gemini 2.0 Multimodal Live API (see the connection sketch below)
12 | - Live transcription of the model's audio replies via the Gemini 1.5/2.0 GenerativeAI API
13 | - Built with Next.js for optimal performance
14 |
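For orientation, the sketch below shows how the app opens its Live API session. It is condensed from `app/services/geminiWebSocket.ts`; the endpoint and message shapes are taken from that file, while error handling, reconnection, and audio playback are omitted:

```typescript
// Minimal Live API handshake, condensed from app/services/geminiWebSocket.ts.
const HOST = "generativelanguage.googleapis.com";
const WS_URL = `wss://${HOST}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${process.env.NEXT_PUBLIC_GEMINI_API_KEY}`;

const ws = new WebSocket(WS_URL);

ws.onopen = () => {
  // The first message selects the model and the response modality.
  ws.send(JSON.stringify({
    setup: {
      model: "models/gemini-2.0-flash-exp",
      generation_config: { response_modalities: ["AUDIO"] },
    },
  }));
};

// After the server acknowledges with { setupComplete }, audio and video
// frames are streamed as base64-encoded media chunks:
function sendMediaChunk(b64Data: string, mimeType: string) {
  ws.send(JSON.stringify({
    realtime_input: {
      media_chunks: [{ mime_type: mimeType, data: b64Data }],
    },
  }));
}
```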
15 | ## Architecture
16 |
17 | 
18 |
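In code terms, the response side of the diagram works like this: Gemini streams its reply as base64 PCM16 audio at 24 kHz, and once the turn completes the accumulated PCM is wrapped in a WAV header and sent off for transcription. A condensed sketch (the functions are the repo's own, from `app/utils/audioUtils.ts` and `app/services/transcriptionService.ts`):

```typescript
// Response path: accumulate PCM parts, then transcribe when the turn ends.
import { pcmToWav } from "@/app/utils/audioUtils";
import { TranscriptionService } from "@/app/services/transcriptionService";

const transcriptionService = new TranscriptionService();

async function onTurnComplete(accumulatedPcmData: string[]): Promise<string> {
  const fullPcmData = accumulatedPcmData.join("");      // base64 PCM16 @ 24 kHz
  const wavData = await pcmToWav(fullPcmData, 24000);   // base64 WAV container
  return transcriptionService.transcribeAudio(wavData, "audio/wav");
}
```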
19 | ## Prerequisites
20 |
21 | - Node.js 18+ installed
22 | - A Gemini API key with access to the Gemini 2.0 model
23 |
24 | ## Getting Started
25 |
26 | 1. Clone the repository
27 | ```bash
28 | git clone https://github.com/yeyu2/gemini-nextjs.git
29 | cd gemini-nextjs
30 | ```
31 |
32 | 2. Install dependencies
33 | ```bash
34 | npm install
35 | # or
36 | yarn install
37 | ```
38 |
39 | 3. Set up environment variables
40 | ```bash
41 | cp .env.example .env.local
42 | ```
43 | Add your Gemini API key to `.env.local` (the `NEXT_PUBLIC_` prefix is required because the key is read by client-side code):
44 | ```
45 | NEXT_PUBLIC_GEMINI_API_KEY=your_api_key_here
46 | ```
47 |
48 | 4. Run the development server
49 | ```bash
50 | npm run dev
51 | # or
52 | yarn dev
53 | ```
54 |
55 | Open [http://localhost:3000](http://localhost:3000) in your browser to see the application.
56 |
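Under the hood, microphone audio is captured through an `AudioWorklet` at 16 kHz, converted to 16-bit PCM in `public/worklets/audio-processor.js`, base64-encoded, and streamed to the Live API. A trimmed sketch of that wiring from `app/components/CameraPreview.tsx` (the `sendAudioData` callback is assumed to forward chunks to the WebSocket):

```typescript
// Mic → AudioWorklet (PCM16 @ 16 kHz) → base64 → Live API media chunk.
import { Base64 } from "js-base64";

async function startAudioCapture(
  stream: MediaStream,
  sendAudioData: (b64Data: string) => void
) {
  const ctx = new AudioContext({ sampleRate: 16000 }); // input rate the API expects
  await ctx.audioWorklet.addModule("/worklets/audio-processor.js");

  const node = new AudioWorkletNode(ctx, "audio-processor");
  node.port.onmessage = (event) => {
    const { pcmData } = event.data; // ArrayBuffer of little-endian PCM16 samples
    sendAudioData(Base64.fromUint8Array(new Uint8Array(pcmData)));
  };

  ctx.createMediaStreamSource(stream).connect(node);
}
```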
57 |
58 |
--------------------------------------------------------------------------------
/components/ui/avatar.tsx:
--------------------------------------------------------------------------------
1 | "use client"
2 |
3 | import * as React from "react"
4 | import * as AvatarPrimitive from "@radix-ui/react-avatar"
5 |
6 | import { cn } from "@/lib/utils"
7 |
8 | const Avatar = React.forwardRef<
9 |   React.ElementRef<typeof AvatarPrimitive.Root>,
10 |   React.ComponentPropsWithoutRef<typeof AvatarPrimitive.Root>
11 | >(({ className, ...props }, ref) => (
12 |   <AvatarPrimitive.Root
13 |     ref={ref}
14 |     className={cn(
15 |       "relative flex h-10 w-10 shrink-0 overflow-hidden rounded-full",
16 |       className
17 |     )}
18 |     {...props}
19 |   />
20 | ))
21 | Avatar.displayName = AvatarPrimitive.Root.displayName
22 |
23 | const AvatarImage = React.forwardRef<
24 |   React.ElementRef<typeof AvatarPrimitive.Image>,
25 |   React.ComponentPropsWithoutRef<typeof AvatarPrimitive.Image>
26 | >(({ className, ...props }, ref) => (
27 |   <AvatarPrimitive.Image
28 |     ref={ref}
29 |     className={cn("aspect-square h-full w-full", className)}
30 |     {...props}
31 |   />
32 | ))
33 | AvatarImage.displayName = AvatarPrimitive.Image.displayName
34 |
35 | const AvatarFallback = React.forwardRef<
36 |   React.ElementRef<typeof AvatarPrimitive.Fallback>,
37 |   React.ComponentPropsWithoutRef<typeof AvatarPrimitive.Fallback>
38 | >(({ className, ...props }, ref) => (
39 |   <AvatarPrimitive.Fallback
40 |     ref={ref}
41 |     className={cn(
42 |       "flex h-full w-full items-center justify-center rounded-full bg-muted",
43 |       className
44 |     )}
45 |     {...props}
46 |   />
47 | ))
48 | AvatarFallback.displayName = AvatarPrimitive.Fallback.displayName
49 |
50 | export { Avatar, AvatarImage, AvatarFallback }
51 |
--------------------------------------------------------------------------------
/public/worklets/audio-processor.js:
--------------------------------------------------------------------------------
1 | // Note: AudioWorkletProcessor is available in the worklet scope
2 | class AudioProcessor extends AudioWorkletProcessor {
3 | constructor() {
4 | super();
5 | this.bufferSize = 2048; // Reduced from 4096 to 2048 for faster response
6 | this.accumulatedSamples = new Float32Array(this.bufferSize);
7 | this.sampleCount = 0;
8 | }
9 |
10 | process(inputs, outputs, parameters) {
11 | const input = inputs[0][0];
12 | if (!input) return true;
13 |
14 | // Accumulate samples
15 | for (let i = 0; i < input.length && this.sampleCount < this.bufferSize; i++) {
16 | this.accumulatedSamples[this.sampleCount++] = input[i];
17 | }
18 |
19 | // Process when we have enough samples
20 | if (this.sampleCount >= this.bufferSize) {
21 | const pcm16 = new Int16Array(this.bufferSize);
22 | let sum = 0;
23 |
24 | // Clamp each float sample to [-1, 1], then scale to the 16-bit range
25 | for (let i = 0; i < this.bufferSize; i++) {
26 | const s = Math.max(-1, Math.min(1, this.accumulatedSamples[i]));
27 | pcm16[i] = s * 0x7FFF;
28 | sum += Math.abs(pcm16[i]);
29 | }
30 |
31 | const buffer = new ArrayBuffer(this.bufferSize * 2);
32 | const view = new DataView(buffer);
33 | pcm16.forEach((value, index) => {
34 | view.setInt16(index * 2, value, true);
35 | });
36 |
37 | // Simplified level calculation
38 | const level = (sum / (this.bufferSize * 0x7FFF)) * 100;
39 |
40 | this.port.postMessage({
41 | pcmData: buffer,
42 | level: Math.min(level * 5, 100)
43 | }, [buffer]);
44 |
45 | this.sampleCount = 0;
46 | }
47 |
48 | return true;
49 | }
50 | }
51 |
52 | registerProcessor('audio-processor', AudioProcessor);
--------------------------------------------------------------------------------
/components/ui/scroll-area.tsx:
--------------------------------------------------------------------------------
1 | "use client"
2 |
3 | import * as React from "react"
4 | import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area"
5 |
6 | import { cn } from "@/lib/utils"
7 |
8 | const ScrollArea = React.forwardRef<
9 |   React.ElementRef<typeof ScrollAreaPrimitive.Root>,
10 |   React.ComponentPropsWithoutRef<typeof ScrollAreaPrimitive.Root>
11 | >(({ className, children, ...props }, ref) => (
12 |   <ScrollAreaPrimitive.Root
13 |     ref={ref}
14 |     className={cn("relative overflow-hidden", className)}
15 |     {...props}
16 |   >
17 |     <ScrollAreaPrimitive.Viewport className="h-full w-full rounded-[inherit]">
18 |       {children}
19 |     </ScrollAreaPrimitive.Viewport>
20 |     <ScrollBar />
21 |     <ScrollAreaPrimitive.Corner />
22 |   </ScrollAreaPrimitive.Root>
23 | ))
24 | ScrollArea.displayName = ScrollAreaPrimitive.Root.displayName
25 |
26 | const ScrollBar = React.forwardRef<
27 |   React.ElementRef<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>,
28 |   React.ComponentPropsWithoutRef<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>
29 | >(({ className, orientation = "vertical", ...props }, ref) => (
30 |   <ScrollAreaPrimitive.ScrollAreaScrollbar
31 |     ref={ref}
32 |     orientation={orientation}
33 |     className={cn(
34 |       "flex touch-none select-none transition-colors",
35 |       orientation === "vertical" &&
36 |         "h-full w-2.5 border-l border-l-transparent p-[1px]",
37 |       orientation === "horizontal" &&
38 |         "h-2.5 flex-col border-t border-t-transparent p-[1px]",
39 |       className
40 |     )}
41 |     {...props}
42 |   >
43 |     <ScrollAreaPrimitive.ScrollAreaThumb className="relative flex-1 rounded-full bg-border" />
44 |   </ScrollAreaPrimitive.ScrollAreaScrollbar>
45 | ))
46 | ScrollBar.displayName = ScrollAreaPrimitive.ScrollAreaScrollbar.displayName
47 |
48 | export { ScrollArea, ScrollBar }
49 |
--------------------------------------------------------------------------------
/tailwind.config.ts:
--------------------------------------------------------------------------------
1 | import type { Config } from "tailwindcss";
2 |
3 | export default {
4 | darkMode: ["class"],
5 | content: [
6 | "./pages/**/*.{js,ts,jsx,tsx,mdx}",
7 | "./components/**/*.{js,ts,jsx,tsx,mdx}",
8 | "./app/**/*.{js,ts,jsx,tsx,mdx}",
9 | ],
10 | theme: {
11 | extend: {
12 | colors: {
13 | background: 'hsl(var(--background))',
14 | foreground: 'hsl(var(--foreground))',
15 | card: {
16 | DEFAULT: 'hsl(var(--card))',
17 | foreground: 'hsl(var(--card-foreground))'
18 | },
19 | popover: {
20 | DEFAULT: 'hsl(var(--popover))',
21 | foreground: 'hsl(var(--popover-foreground))'
22 | },
23 | primary: {
24 | DEFAULT: 'hsl(var(--primary))',
25 | foreground: 'hsl(var(--primary-foreground))'
26 | },
27 | secondary: {
28 | DEFAULT: 'hsl(var(--secondary))',
29 | foreground: 'hsl(var(--secondary-foreground))'
30 | },
31 | muted: {
32 | DEFAULT: 'hsl(var(--muted))',
33 | foreground: 'hsl(var(--muted-foreground))'
34 | },
35 | accent: {
36 | DEFAULT: 'hsl(var(--accent))',
37 | foreground: 'hsl(var(--accent-foreground))'
38 | },
39 | destructive: {
40 | DEFAULT: 'hsl(var(--destructive))',
41 | foreground: 'hsl(var(--destructive-foreground))'
42 | },
43 | border: 'hsl(var(--border))',
44 | input: 'hsl(var(--input))',
45 | ring: 'hsl(var(--ring))',
46 | chart: {
47 | '1': 'hsl(var(--chart-1))',
48 | '2': 'hsl(var(--chart-2))',
49 | '3': 'hsl(var(--chart-3))',
50 | '4': 'hsl(var(--chart-4))',
51 | '5': 'hsl(var(--chart-5))'
52 | }
53 | },
54 | borderRadius: {
55 | lg: 'var(--radius)',
56 | md: 'calc(var(--radius) - 2px)',
57 | sm: 'calc(var(--radius) - 4px)'
58 | }
59 | }
60 | },
61 | plugins: [require("tailwindcss-animate")],
62 | } satisfies Config;
63 |
--------------------------------------------------------------------------------
/app/globals.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
5 | body {
6 | font-family: Arial, Helvetica, sans-serif;
7 | }
8 |
9 | @layer base {
10 | :root {
11 | --background: 0 0% 100%;
12 | --foreground: 0 0% 3.9%;
13 | --card: 0 0% 100%;
14 | --card-foreground: 0 0% 3.9%;
15 | --popover: 0 0% 100%;
16 | --popover-foreground: 0 0% 3.9%;
17 | --primary: 0 0% 9%;
18 | --primary-foreground: 0 0% 98%;
19 | --secondary: 0 0% 96.1%;
20 | --secondary-foreground: 0 0% 9%;
21 | --muted: 0 0% 96.1%;
22 | --muted-foreground: 0 0% 45.1%;
23 | --accent: 0 0% 96.1%;
24 | --accent-foreground: 0 0% 9%;
25 | --destructive: 0 84.2% 60.2%;
26 | --destructive-foreground: 0 0% 98%;
27 | --border: 0 0% 89.8%;
28 | --input: 0 0% 89.8%;
29 | --ring: 0 0% 3.9%;
30 | --chart-1: 12 76% 61%;
31 | --chart-2: 173 58% 39%;
32 | --chart-3: 197 37% 24%;
33 | --chart-4: 43 74% 66%;
34 | --chart-5: 27 87% 67%;
35 | --radius: 0.5rem;
36 | }
37 | .dark {
38 | --background: 0 0% 3.9%;
39 | --foreground: 0 0% 98%;
40 | --card: 0 0% 3.9%;
41 | --card-foreground: 0 0% 98%;
42 | --popover: 0 0% 3.9%;
43 | --popover-foreground: 0 0% 98%;
44 | --primary: 0 0% 98%;
45 | --primary-foreground: 0 0% 9%;
46 | --secondary: 0 0% 14.9%;
47 | --secondary-foreground: 0 0% 98%;
48 | --muted: 0 0% 14.9%;
49 | --muted-foreground: 0 0% 63.9%;
50 | --accent: 0 0% 14.9%;
51 | --accent-foreground: 0 0% 98%;
52 | --destructive: 0 62.8% 30.6%;
53 | --destructive-foreground: 0 0% 98%;
54 | --border: 0 0% 14.9%;
55 | --input: 0 0% 14.9%;
56 | --ring: 0 0% 83.1%;
57 | --chart-1: 220 70% 50%;
58 | --chart-2: 160 60% 45%;
59 | --chart-3: 30 80% 55%;
60 | --chart-4: 280 65% 60%;
61 | --chart-5: 340 75% 55%;
62 | }
63 | }
64 |
65 | @layer base {
66 | * {
67 | @apply border-border;
68 | }
69 | body {
70 | @apply bg-background text-foreground;
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/components/ui/button.tsx:
--------------------------------------------------------------------------------
1 | import * as React from "react"
2 | import { Slot } from "@radix-ui/react-slot"
3 | import { cva, type VariantProps } from "class-variance-authority"
4 |
5 | import { cn } from "@/lib/utils"
6 |
7 | const buttonVariants = cva(
8 | "inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0",
9 | {
10 | variants: {
11 | variant: {
12 | default:
13 | "bg-primary text-primary-foreground shadow hover:bg-primary/90",
14 | destructive:
15 | "bg-destructive text-destructive-foreground shadow-sm hover:bg-destructive/90",
16 | outline:
17 | "border border-input bg-background shadow-sm hover:bg-accent hover:text-accent-foreground",
18 | secondary:
19 | "bg-secondary text-secondary-foreground shadow-sm hover:bg-secondary/80",
20 | ghost: "hover:bg-accent hover:text-accent-foreground",
21 | link: "text-primary underline-offset-4 hover:underline",
22 | },
23 | size: {
24 | default: "h-9 px-4 py-2",
25 | sm: "h-8 rounded-md px-3 text-xs",
26 | lg: "h-10 rounded-md px-8",
27 | icon: "h-9 w-9",
28 | },
29 | },
30 | defaultVariants: {
31 | variant: "default",
32 | size: "default",
33 | },
34 | }
35 | )
36 |
37 | export interface ButtonProps
38 |   extends React.ButtonHTMLAttributes<HTMLButtonElement>,
39 |     VariantProps<typeof buttonVariants> {
40 |   asChild?: boolean
41 | }
42 |
43 | const Button = React.forwardRef<HTMLButtonElement, ButtonProps>(
44 |   ({ className, variant, size, asChild = false, ...props }, ref) => {
45 |     const Comp = asChild ? Slot : "button"
46 |     return (
47 |       <Comp
48 |         className={cn(buttonVariants({ variant, size, className }))}
49 |         ref={ref}
50 |         {...props}
51 |       />
52 |     )
53 | }
54 | )
55 | Button.displayName = "Button"
56 |
57 | export { Button, buttonVariants }
58 |
--------------------------------------------------------------------------------
/components/ui/card.tsx:
--------------------------------------------------------------------------------
1 | import * as React from "react"
2 |
3 | import { cn } from "@/lib/utils"
4 |
5 | const Card = React.forwardRef<
6 |   HTMLDivElement,
7 |   React.HTMLAttributes<HTMLDivElement>
8 | >(({ className, ...props }, ref) => (
9 |   <div
10 |     ref={ref}
11 |     className={cn(
12 |       "rounded-xl border bg-card text-card-foreground shadow",
13 |       className
14 |     )}
15 |     {...props}
16 |   />
17 | ))
18 | Card.displayName = "Card"
19 |
20 | const CardHeader = React.forwardRef<
21 |   HTMLDivElement,
22 |   React.HTMLAttributes<HTMLDivElement>
23 | >(({ className, ...props }, ref) => (
24 |   <div
25 |     ref={ref}
26 |     className={cn("flex flex-col space-y-1.5 p-6", className)}
27 |     {...props}
28 |   />
29 | ))
30 | CardHeader.displayName = "CardHeader"
31 |
32 | const CardTitle = React.forwardRef<
33 |   HTMLDivElement,
34 |   React.HTMLAttributes<HTMLDivElement>
35 | >(({ className, ...props }, ref) => (
36 |   <div
37 |     ref={ref}
38 |     className={cn("font-semibold leading-none tracking-tight", className)}
39 |     {...props}
40 |   />
41 | ))
42 | CardTitle.displayName = "CardTitle"
43 |
44 | const CardDescription = React.forwardRef<
45 |   HTMLDivElement,
46 |   React.HTMLAttributes<HTMLDivElement>
47 | >(({ className, ...props }, ref) => (
48 |   <div
49 |     ref={ref}
50 |     className={cn("text-sm text-muted-foreground", className)}
51 |     {...props}
52 |   />
53 | ))
54 | CardDescription.displayName = "CardDescription"
55 |
56 | const CardContent = React.forwardRef<
57 |   HTMLDivElement,
58 |   React.HTMLAttributes<HTMLDivElement>
59 | >(({ className, ...props }, ref) => (
60 |   <div ref={ref} className={cn("p-6 pt-0", className)} {...props} />
61 | ))
62 | CardContent.displayName = "CardContent"
63 |
64 | const CardFooter = React.forwardRef<
65 |   HTMLDivElement,
66 |   React.HTMLAttributes<HTMLDivElement>
67 | >(({ className, ...props }, ref) => (
68 |   <div
69 |     ref={ref}
70 |     className={cn("flex items-center p-6 pt-0", className)}
71 |     {...props}
72 |   />
73 | ))
74 | CardFooter.displayName = "CardFooter"
75 |
76 | export { Card, CardHeader, CardFooter, CardTitle, CardDescription, CardContent }
77 |
--------------------------------------------------------------------------------
/app/page.tsx:
--------------------------------------------------------------------------------
1 | // app/page.tsx
2 | "use client";
3 | import { useState, useCallback } from 'react';
4 | import CameraPreview from './components/CameraPreview';
5 | import { ScrollArea } from "@/components/ui/scroll-area";
6 | import { Avatar, AvatarImage, AvatarFallback } from "@/components/ui/avatar";
7 |
8 | // Helper function to create message components
9 | const HumanMessage = ({ text }: { text: string }) => (
10 |   <div className="flex gap-3 items-start">
11 |     <Avatar className="h-8 w-8">
12 |       <AvatarFallback className="bg-blue-600 text-white text-sm">
13 |         H
14 |       </AvatarFallback>
15 |     </Avatar>
16 |     <div
17 |       className="flex-1 rounded-lg bg-zinc-100 px-3 py-2"
18 |     >
19 |       <p className="text-sm text-zinc-800">
20 |         {text}
21 |       </p>
22 |     </div>
23 |   </div>
24 | );
25 |
26 | const GeminiMessage = ({ text }: { text: string }) => (
27 |   <div className="flex gap-3 items-start">
28 |     <Avatar className="h-8 w-8">
29 |       <AvatarImage src="/avatars/gemini.png" alt="Gemini" />
30 |       <AvatarFallback>AI</AvatarFallback>
31 |     </Avatar>
32 |     <div className="flex-1 space-y-2">
33 |       <div className="rounded-lg bg-white border border-zinc-200 px-3 py-2">
34 |         <p className="text-sm text-zinc-800">
35 |           {text}
36 |         </p>
37 |       </div>
38 |     </div>
39 |   </div>
40 | );
42 |
43 | export default function Home() {
44 | const [messages, setMessages] = useState<{ type: 'human' | 'gemini', text: string }[]>([]);
45 |
46 | const handleTranscription = useCallback((transcription: string) => {
47 | setMessages(prev => [...prev, { type: 'gemini', text: transcription }]);
48 | }, []);
49 |
50 |   return (
51 |     <>
52 |       <div className="min-h-screen bg-zinc-50 p-8">
53 |         <h1 className="text-2xl font-bold text-zinc-800 mb-6">Multimodal Live Chat</h1>
54 |         <div className="flex gap-8 items-start">
55 |           <CameraPreview onTranscription={handleTranscription} />
56 |           <div className="w-[640px]">
57 |             <ScrollArea className="h-[540px] rounded-lg border bg-white p-4">
58 |               <div className="space-y-4">
59 |                 {messages.map((message, index) => (
60 |                   message.type === 'human' ? (
61 |                     <HumanMessage key={`msg-${index}`} text={message.text} />
62 |                   ) : (
63 |                     <GeminiMessage key={`msg-${index}`} text={message.text} />
64 |                   )
65 |                 ))}
66 |               </div>
67 |             </ScrollArea>
68 |           </div>
69 |         </div>
70 |       </div>
71 |     </>
72 |   );
73 | }
76 |
--------------------------------------------------------------------------------
/app/utils/audioUtils.ts:
--------------------------------------------------------------------------------
1 | // Helper function to download and analyze WAV data
2 | function debugSaveWav(wavData: string, filename: string = 'debug.wav') {
3 | const byteString = atob(wavData);
4 | const bytes = new Uint8Array(byteString.length);
5 | for (let i = 0; i < byteString.length; i++) {
6 | bytes[i] = byteString.charCodeAt(i);
7 | }
8 |
9 | // Create blob and download
10 | const blob = new Blob([bytes], { type: 'audio/wav' });
11 | const url = URL.createObjectURL(blob);
12 | const a = document.createElement('a');
13 | a.href = url;
14 | a.download = filename;
15 | document.body.appendChild(a);
16 | a.click();
17 | document.body.removeChild(a);
18 | URL.revokeObjectURL(url);
19 | }
20 |
21 | export function pcmToWav(pcmData: string, sampleRate: number = 24000): Promise<string> {
22 | return new Promise((resolve, reject) => {
23 | try {
24 | // Decode base64 PCM data
25 | const binaryString = atob(pcmData);
26 | const pcmBytes = new Uint8Array(binaryString.length);
27 | for (let i = 0; i < binaryString.length; i++) {
28 | pcmBytes[i] = binaryString.charCodeAt(i);
29 | }
30 |
31 | // Convert bytes to samples (assuming 16-bit PCM)
32 | const samples = new Int16Array(pcmBytes.buffer);
33 |
34 | // Create WAV header
35 | const wavHeader = new ArrayBuffer(44);
36 | const view = new DataView(wavHeader);
37 |
38 | const pcmByteLength = samples.length * 2; // 16-bit = 2 bytes per sample
39 |
40 | // "RIFF" chunk descriptor
41 | view.setUint8(0, 'R'.charCodeAt(0));
42 | view.setUint8(1, 'I'.charCodeAt(0));
43 | view.setUint8(2, 'F'.charCodeAt(0));
44 | view.setUint8(3, 'F'.charCodeAt(0));
45 |
46 | // File length (header size + data size)
47 | view.setUint32(4, 36 + pcmByteLength, true);
48 |
49 | // "WAVE" format
50 | view.setUint8(8, 'W'.charCodeAt(0));
51 | view.setUint8(9, 'A'.charCodeAt(0));
52 | view.setUint8(10, 'V'.charCodeAt(0));
53 | view.setUint8(11, 'E'.charCodeAt(0));
54 |
55 | // "fmt " sub-chunk
56 | view.setUint8(12, 'f'.charCodeAt(0));
57 | view.setUint8(13, 'm'.charCodeAt(0));
58 | view.setUint8(14, 't'.charCodeAt(0));
59 | view.setUint8(15, ' '.charCodeAt(0));
60 |
61 | // Sub-chunk size
62 | view.setUint32(16, 16, true);
63 | // Audio format (PCM = 1)
64 | view.setUint16(20, 1, true);
65 | // Number of channels
66 | view.setUint16(22, 1, true);
67 | // Sample rate
68 | view.setUint32(24, sampleRate, true);
69 | // Byte rate
70 | view.setUint32(28, sampleRate * 2, true);
71 | // Block align
72 | view.setUint16(32, 2, true);
73 | // Bits per sample
74 | view.setUint16(34, 16, true);
75 |
76 | // "data" sub-chunk
77 | view.setUint8(36, 'd'.charCodeAt(0));
78 | view.setUint8(37, 'a'.charCodeAt(0));
79 | view.setUint8(38, 't'.charCodeAt(0));
80 | view.setUint8(39, 'a'.charCodeAt(0));
81 |
82 | // Data size
83 | view.setUint32(40, pcmByteLength, true);
84 |
85 | // Create final buffer
86 | const wavBuffer = new ArrayBuffer(wavHeader.byteLength + pcmByteLength);
87 | const wavBytes = new Uint8Array(wavBuffer);
88 |
89 | // Copy header and PCM data
90 | wavBytes.set(new Uint8Array(wavHeader), 0);
91 | wavBytes.set(new Uint8Array(samples.buffer), wavHeader.byteLength);
92 |
93 | // Use Blob and FileReader to convert to base64
94 | const blob = new Blob([wavBytes], { type: 'audio/wav' });
95 | const reader = new FileReader();
96 | reader.onloadend = () => {
97 | const base64data = reader.result?.toString().split(',')[1];
98 | if (base64data) {
99 | resolve(base64data);
100 | } else {
101 | reject(new Error("Failed to convert WAV to base64"));
102 | }
103 | };
104 | reader.onerror = reject;
105 | reader.readAsDataURL(blob);
106 | } catch (error) {
107 | reject(error);
108 | }
109 | });
110 | }
--------------------------------------------------------------------------------
/app/services/geminiWebSocket.ts:
--------------------------------------------------------------------------------
1 | import { Base64 } from 'js-base64';
2 | import { TranscriptionService } from './transcriptionService';
3 | import { pcmToWav } from '../utils/audioUtils';
4 |
5 | const MODEL = "models/gemini-2.0-flash-exp";
6 | const API_KEY = process.env.NEXT_PUBLIC_GEMINI_API_KEY;
7 | const HOST = "generativelanguage.googleapis.com";
8 | const WS_URL = `wss://${HOST}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${API_KEY}`;
9 |
10 | export class GeminiWebSocket {
11 | private ws: WebSocket | null = null;
12 | private isConnected: boolean = false;
13 | private isSetupComplete: boolean = false;
14 | private onMessageCallback: ((text: string) => void) | null = null;
15 | private onSetupCompleteCallback: (() => void) | null = null;
16 | private audioContext: AudioContext | null = null;
17 |
18 | // Audio queue management
19 | private audioQueue: Float32Array[] = [];
20 | private isPlaying: boolean = false;
21 | private currentSource: AudioBufferSourceNode | null = null;
22 | private isPlayingResponse: boolean = false;
23 | private onPlayingStateChange: ((isPlaying: boolean) => void) | null = null;
24 | private onAudioLevelChange: ((level: number) => void) | null = null;
25 | private onTranscriptionCallback: ((text: string) => void) | null = null;
26 | private transcriptionService: TranscriptionService;
27 | private accumulatedPcmData: string[] = [];
28 |
29 | constructor(
30 | onMessage: (text: string) => void,
31 | onSetupComplete: () => void,
32 | onPlayingStateChange: (isPlaying: boolean) => void,
33 | onAudioLevelChange: (level: number) => void,
34 | onTranscription: (text: string) => void
35 | ) {
36 | this.onMessageCallback = onMessage;
37 | this.onSetupCompleteCallback = onSetupComplete;
38 | this.onPlayingStateChange = onPlayingStateChange;
39 | this.onAudioLevelChange = onAudioLevelChange;
40 | this.onTranscriptionCallback = onTranscription;
41 | // Create AudioContext for playback
42 | this.audioContext = new AudioContext({
43 | sampleRate: 24000 // Match the response audio rate
44 | });
45 | this.transcriptionService = new TranscriptionService();
46 | }
47 |
48 | connect() {
49 | if (this.ws?.readyState === WebSocket.OPEN) {
50 | return;
51 | }
52 |
53 | this.ws = new WebSocket(WS_URL);
54 |
55 | this.ws.onopen = () => {
56 | this.isConnected = true;
57 | this.sendInitialSetup();
58 | };
59 |
60 | this.ws.onmessage = async (event) => {
61 | try {
62 | let messageText: string;
63 | if (event.data instanceof Blob) {
64 | const arrayBuffer = await event.data.arrayBuffer();
65 | const bytes = new Uint8Array(arrayBuffer);
66 | messageText = new TextDecoder('utf-8').decode(bytes);
67 | } else {
68 | messageText = event.data;
69 | }
70 |
71 | await this.handleMessage(messageText);
72 | } catch (error) {
73 | console.error("[WebSocket] Error processing message:", error);
74 | }
75 | };
76 |
77 | this.ws.onerror = (error) => {
78 | console.error("[WebSocket] Error:", error);
79 | };
80 |
81 | this.ws.onclose = (event) => {
82 | this.isConnected = false;
83 |
84 | // Only attempt to reconnect if we haven't explicitly called disconnect
85 | if (!event.wasClean && this.isSetupComplete) {
86 | setTimeout(() => this.connect(), 1000);
87 | }
88 | };
89 | }
90 |
91 | private sendInitialSetup() {
92 | const setupMessage = {
93 | setup: {
94 | model: MODEL,
95 | generation_config: {
96 | response_modalities: ["AUDIO"]
97 | }
98 | }
99 | };
100 | this.ws?.send(JSON.stringify(setupMessage));
101 | }
102 |
103 | sendMediaChunk(b64Data: string, mimeType: string) {
104 | if (!this.isConnected || !this.ws || !this.isSetupComplete) return;
105 |
106 | const message = {
107 | realtime_input: {
108 | media_chunks: [{
109 | mime_type: mimeType === "audio/pcm" ? "audio/pcm" : mimeType,
110 | data: b64Data
111 | }]
112 | }
113 | };
114 |
115 | try {
116 | this.ws.send(JSON.stringify(message));
117 | } catch (error) {
118 | console.error("[WebSocket] Error sending media chunk:", error);
119 | }
120 | }
121 |
122 | private async playAudioResponse(base64Data: string) {
123 | if (!this.audioContext) return;
124 |
125 | try {
126 | // Decode base64 to bytes
127 | const binaryString = atob(base64Data);
128 | const bytes = new Uint8Array(binaryString.length);
129 | for (let i = 0; i < binaryString.length; i++) {
130 | bytes[i] = binaryString.charCodeAt(i);
131 | }
132 |
133 | // Convert to Int16Array (PCM format)
134 | const pcmData = new Int16Array(bytes.buffer);
135 |
136 | // Convert to float32 for Web Audio API
137 | const float32Data = new Float32Array(pcmData.length);
138 | for (let i = 0; i < pcmData.length; i++) {
139 | float32Data[i] = pcmData[i] / 32768.0;
140 | }
141 |
142 | // Add to queue and start playing if not already playing
143 | this.audioQueue.push(float32Data);
144 | this.playNextInQueue();
145 | } catch (error) {
146 | console.error("[WebSocket] Error processing audio:", error);
147 | }
148 | }
149 |
150 | private async playNextInQueue() {
151 | if (!this.audioContext || this.isPlaying || this.audioQueue.length === 0) return;
152 |
153 | try {
154 | this.isPlaying = true;
155 | this.isPlayingResponse = true;
156 | this.onPlayingStateChange?.(true);
157 | const float32Data = this.audioQueue.shift()!;
158 |
159 | // Calculate audio level
160 | let sum = 0;
161 | for (let i = 0; i < float32Data.length; i++) {
162 | sum += Math.abs(float32Data[i]);
163 | }
164 | const level = Math.min((sum / float32Data.length) * 100 * 5, 100);
165 | this.onAudioLevelChange?.(level);
166 |
167 | const audioBuffer = this.audioContext.createBuffer(
168 | 1,
169 | float32Data.length,
170 | 24000
171 | );
172 | audioBuffer.getChannelData(0).set(float32Data);
173 |
174 | this.currentSource = this.audioContext.createBufferSource();
175 | this.currentSource.buffer = audioBuffer;
176 | this.currentSource.connect(this.audioContext.destination);
177 |
178 | this.currentSource.onended = () => {
179 | this.isPlaying = false;
180 | this.currentSource = null;
181 | if (this.audioQueue.length === 0) {
182 | this.isPlayingResponse = false;
183 | this.onPlayingStateChange?.(false);
184 | }
185 | this.playNextInQueue();
186 | };
187 |
188 | this.currentSource.start();
189 | } catch (error) {
190 | console.error("[WebSocket] Error playing audio:", error);
191 | this.isPlaying = false;
192 | this.isPlayingResponse = false;
193 | this.onPlayingStateChange?.(false);
194 | this.currentSource = null;
195 | this.playNextInQueue();
196 | }
197 | }
198 |
199 | private stopCurrentAudio() {
200 | if (this.currentSource) {
201 | try {
202 | this.currentSource.stop();
203 | } catch (e) {
204 | // Ignore errors if already stopped
205 | }
206 | this.currentSource = null;
207 | }
208 | this.isPlaying = false;
209 | this.isPlayingResponse = false;
210 | this.onPlayingStateChange?.(false);
211 | this.audioQueue = []; // Clear queue
212 | }
213 |
214 | private async handleMessage(message: string) {
215 | try {
216 | const messageData = JSON.parse(message);
217 |
218 | if (messageData.setupComplete) {
219 | this.isSetupComplete = true;
220 | this.onSetupCompleteCallback?.();
221 | return;
222 | }
223 |
224 | // Handle audio data
225 | if (messageData.serverContent?.modelTurn?.parts) {
226 | const parts = messageData.serverContent.modelTurn.parts;
227 | for (const part of parts) {
228 | if (part.inlineData?.mimeType === "audio/pcm;rate=24000") {
229 | this.accumulatedPcmData.push(part.inlineData.data);
230 | this.playAudioResponse(part.inlineData.data);
231 | }
232 | }
233 | }
234 |
235 | // Handle turn completion separately
236 | if (messageData.serverContent?.turnComplete === true) {
237 | if (this.accumulatedPcmData.length > 0) {
238 | try {
239 | const fullPcmData = this.accumulatedPcmData.join('');
240 | const wavData = await pcmToWav(fullPcmData, 24000);
241 |
242 | const transcription = await this.transcriptionService.transcribeAudio(
243 | wavData,
244 | "audio/wav"
245 | );
246 | console.log("[Transcription]:", transcription);
247 |
248 | this.onTranscriptionCallback?.(transcription);
249 | this.accumulatedPcmData = []; // Clear accumulated data
250 | } catch (error) {
251 | console.error("[WebSocket] Transcription error:", error);
252 | }
253 | }
254 | }
255 | } catch (error) {
256 | console.error("[WebSocket] Error parsing message:", error);
257 | }
258 | }
259 |
260 | disconnect() {
261 | this.isSetupComplete = false;
262 | if (this.ws) {
263 | this.ws.close(1000, "Intentional disconnect");
264 | this.ws = null;
265 | }
266 | this.isConnected = false;
267 | this.accumulatedPcmData = [];
268 | }
269 | }
--------------------------------------------------------------------------------
/app/components/CameraPreview.tsx:
--------------------------------------------------------------------------------
1 | // app/components/CameraPreview.tsx
2 | "use client";
3 |
4 | import { useEffect, useRef, useState, useCallback } from 'react';
5 | import { Card, CardContent } from "../../components/ui/card";
6 | import { Button } from "../../components/ui/button";
7 | import { Video, VideoOff } from "lucide-react";
8 | import { GeminiWebSocket } from '../services/geminiWebSocket';
9 | import { Base64 } from 'js-base64';
10 | import { Avatar, AvatarImage, AvatarFallback } from "@/components/ui/avatar";
11 |
12 | interface CameraPreviewProps {
13 | onTranscription: (text: string) => void;
14 | }
15 |
16 | export default function CameraPreview({ onTranscription }: CameraPreviewProps) {
17 | const videoRef = useRef<HTMLVideoElement>(null);
18 | const audioContextRef = useRef<AudioContext | null>(null);
19 | const [isStreaming, setIsStreaming] = useState(false);
20 | const [stream, setStream] = useState<MediaStream | null>(null);
21 | const [audioLevel, setAudioLevel] = useState(0);
22 | const geminiWsRef = useRef<GeminiWebSocket | null>(null);
23 | const videoCanvasRef = useRef<HTMLCanvasElement>(null);
24 | const audioWorkletNodeRef = useRef<AudioWorkletNode | null>(null);
25 | const [isAudioSetup, setIsAudioSetup] = useState(false);
26 | const setupInProgressRef = useRef(false);
27 | const [isWebSocketReady, setIsWebSocketReady] = useState(false);
28 | const imageIntervalRef = useRef<NodeJS.Timeout | null>(null);
29 | const [isModelSpeaking, setIsModelSpeaking] = useState(false);
30 | const [outputAudioLevel, setOutputAudioLevel] = useState(0);
31 | const [connectionStatus, setConnectionStatus] = useState<'disconnected' | 'connecting' | 'connected'>('disconnected');
32 |
33 | const cleanupAudio = useCallback(() => {
34 | if (audioWorkletNodeRef.current) {
35 | audioWorkletNodeRef.current.disconnect();
36 | audioWorkletNodeRef.current = null;
37 | }
38 | if (audioContextRef.current) {
39 | audioContextRef.current.close();
40 | audioContextRef.current = null;
41 | }
42 | }, []);
43 |
44 | const cleanupWebSocket = useCallback(() => {
45 | if (geminiWsRef.current) {
46 | geminiWsRef.current.disconnect();
47 | geminiWsRef.current = null;
48 | }
49 | }, []);
50 |
51 | // Forward audio chunks to the WebSocket as soon as they arrive
52 | const sendAudioData = (b64Data: string) => {
53 | if (!geminiWsRef.current) return;
54 | geminiWsRef.current.sendMediaChunk(b64Data, "audio/pcm");
55 | };
56 |
57 | const toggleCamera = async () => {
58 | if (isStreaming && stream) {
59 | setIsStreaming(false);
60 | cleanupWebSocket();
61 | cleanupAudio();
62 | stream.getTracks().forEach(track => track.stop());
63 | if (videoRef.current) {
64 | videoRef.current.srcObject = null;
65 | }
66 | setStream(null);
67 | } else {
68 | try {
69 | const videoStream = await navigator.mediaDevices.getUserMedia({
70 | video: true,
71 | audio: false
72 | });
73 |
74 | const audioStream = await navigator.mediaDevices.getUserMedia({
75 | audio: {
76 | sampleRate: 16000,
77 | channelCount: 1,
78 | echoCancellation: true,
79 | autoGainControl: true,
80 | noiseSuppression: true,
81 | }
82 | });
83 |
84 | audioContextRef.current = new AudioContext({
85 | sampleRate: 16000,
86 | });
87 |
88 | if (videoRef.current) {
89 | videoRef.current.srcObject = videoStream;
90 | videoRef.current.muted = true;
91 | }
92 |
93 | const combinedStream = new MediaStream([
94 | ...videoStream.getTracks(),
95 | ...audioStream.getTracks()
96 | ]);
97 |
98 | setStream(combinedStream);
99 | setIsStreaming(true);
100 | } catch (err) {
101 | console.error('Error accessing media devices:', err);
102 | cleanupAudio();
103 | }
104 | }
105 | };
106 |
107 | // Initialize WebSocket connection
108 | useEffect(() => {
109 | if (!isStreaming) {
110 | setConnectionStatus('disconnected');
111 | return;
112 | }
113 |
114 | setConnectionStatus('connecting');
115 | geminiWsRef.current = new GeminiWebSocket(
116 | (text) => {
117 | console.log("Received from Gemini:", text);
118 | },
119 | () => {
120 | console.log("[Camera] WebSocket setup complete, starting media capture");
121 | setIsWebSocketReady(true);
122 | setConnectionStatus('connected');
123 | },
124 | (isPlaying) => {
125 | setIsModelSpeaking(isPlaying);
126 | },
127 | (level) => {
128 | setOutputAudioLevel(level);
129 | },
130 | onTranscription
131 | );
132 | geminiWsRef.current.connect();
133 |
134 | return () => {
135 | if (imageIntervalRef.current) {
136 | clearInterval(imageIntervalRef.current);
137 | imageIntervalRef.current = null;
138 | }
139 | cleanupWebSocket();
140 | setIsWebSocketReady(false);
141 | setConnectionStatus('disconnected');
142 | };
143 | }, [isStreaming, onTranscription, cleanupWebSocket]);
144 |
145 | // Start image capture only after WebSocket is ready
146 | useEffect(() => {
147 | if (!isStreaming || !isWebSocketReady) return;
148 |
149 | console.log("[Camera] Starting image capture interval");
150 | imageIntervalRef.current = setInterval(captureAndSendImage, 1000);
151 |
152 | return () => {
153 | if (imageIntervalRef.current) {
154 | clearInterval(imageIntervalRef.current);
155 | imageIntervalRef.current = null;
156 | }
157 | };
158 | }, [isStreaming, isWebSocketReady]);
159 |
160 | // Update audio processing setup
161 | useEffect(() => {
162 | if (!isStreaming || !stream || !audioContextRef.current ||
163 | !isWebSocketReady || isAudioSetup || setupInProgressRef.current) return;
164 |
165 | let isActive = true;
166 | setupInProgressRef.current = true;
167 |
168 | const setupAudioProcessing = async () => {
169 | try {
170 | const ctx = audioContextRef.current;
171 | if (!ctx || ctx.state === 'closed' || !isActive) {
172 | setupInProgressRef.current = false;
173 | return;
174 | }
175 |
176 | if (ctx.state === 'suspended') {
177 | await ctx.resume();
178 | }
179 |
180 | await ctx.audioWorklet.addModule('/worklets/audio-processor.js');
181 |
182 | if (!isActive) {
183 | setupInProgressRef.current = false;
184 | return;
185 | }
186 |
187 | audioWorkletNodeRef.current = new AudioWorkletNode(ctx, 'audio-processor', {
188 | numberOfInputs: 1,
189 | numberOfOutputs: 1,
190 | processorOptions: {
191 | sampleRate: 16000,
192 | bufferSize: 4096, // Note: the worklet currently uses its own fixed 2048-sample buffer
193 | },
194 | channelCount: 1,
195 | channelCountMode: 'explicit',
196 | channelInterpretation: 'speakers'
197 | });
198 |
199 | const source = ctx.createMediaStreamSource(stream);
200 | audioWorkletNodeRef.current.port.onmessage = (event) => {
201 | if (!isActive || isModelSpeaking) return;
202 | const { pcmData, level } = event.data;
203 | setAudioLevel(level);
204 |
205 | const pcmArray = new Uint8Array(pcmData);
206 | const b64Data = Base64.fromUint8Array(pcmArray);
207 | sendAudioData(b64Data);
208 | };
209 |
210 | source.connect(audioWorkletNodeRef.current);
211 | setIsAudioSetup(true);
212 | setupInProgressRef.current = false;
213 |
214 | return () => {
215 | source.disconnect();
216 | if (audioWorkletNodeRef.current) {
217 | audioWorkletNodeRef.current.disconnect();
218 | }
219 | setIsAudioSetup(false);
220 | };
221 | } catch (error) {
222 | if (isActive) {
223 | cleanupAudio();
224 | setIsAudioSetup(false);
225 | }
226 | setupInProgressRef.current = false;
227 | }
228 | };
229 |
230 | console.log("[Camera] Starting audio processing setup");
231 | setupAudioProcessing();
232 |
233 | return () => {
234 | isActive = false;
235 | setIsAudioSetup(false);
236 | setupInProgressRef.current = false;
237 | if (audioWorkletNodeRef.current) {
238 | audioWorkletNodeRef.current.disconnect();
239 | audioWorkletNodeRef.current = null;
240 | }
241 | };
242 | }, [isStreaming, stream, isWebSocketReady, isModelSpeaking]);
243 |
244 | // Capture and send image
245 | const captureAndSendImage = () => {
246 | if (!videoRef.current || !videoCanvasRef.current || !geminiWsRef.current) return;
247 |
248 | const canvas = videoCanvasRef.current;
249 | const context = canvas.getContext('2d');
250 | if (!context) return;
251 |
252 | // Set canvas size to match video
253 | canvas.width = videoRef.current.videoWidth;
254 | canvas.height = videoRef.current.videoHeight;
255 |
256 | // Draw video frame to canvas
257 | context.drawImage(videoRef.current, 0, 0);
258 |
259 | // Convert to base64 and send
260 | const imageData = canvas.toDataURL('image/jpeg', 0.8);
261 | const b64Data = imageData.split(',')[1];
262 | geminiWsRef.current.sendMediaChunk(b64Data, "image/jpeg");
263 | };
264 |
265 |   return (
266 |     <Card className="w-fit">
267 |       <CardContent className="relative p-4 space-y-4">
268 |         <video
269 |           ref={videoRef}
270 |           autoPlay
271 |           playsInline
272 |           muted
273 |           className="w-[640px] h-[480px] rounded-lg bg-muted object-cover"
274 |         />
275 |
276 |         {/* Connection Status Overlay */}
277 |         {isStreaming && connectionStatus !== 'connected' && (
278 |           <div className="absolute inset-0 flex items-center justify-center rounded-lg bg-black/60 backdrop-blur-sm">
279 |             <div className="text-center space-y-2">
280 |               <div className="mx-auto h-8 w-8 animate-spin rounded-full border-2 border-white border-t-transparent" />
281 |               <p className="font-medium text-white">
282 |                 {connectionStatus === 'connecting' ? 'Connecting to Gemini...' : 'Disconnected'}
283 |               </p>
284 |               <p className="text-sm text-white/70">
285 |                 Please wait while we establish a secure connection
286 |               </p>
287 |             </div>
288 |           </div>
289 |         )}
290 |
291 |         <Button
292 |           onClick={toggleCamera}
293 |           size="icon"
294 |           variant={isStreaming ? "destructive" : "default"}
295 |           className="absolute bottom-8 left-1/2 h-12 w-12 -translate-x-1/2 rounded-full"
296 |         >
297 |           {isStreaming ? <VideoOff className="h-6 w-6" /> : <Video className="h-6 w-6" />}
298 |         </Button>
299 |
300 |         {/* Input / output audio level indicator */}
301 |         {isStreaming && (
302 |           <div className="h-2 w-full rounded-full bg-muted">
303 |             <div
304 |               className="h-full rounded-full bg-green-500 transition-all"
305 |               style={{ width: `${isModelSpeaking ? outputAudioLevel : audioLevel}%` }}
306 |             />
307 |           </div>
308 |         )}
309 |
310 |         {/* Hidden canvas used to grab JPEG frames from the video element */}
311 |         <canvas ref={videoCanvasRef} className="hidden" />
312 |       </CardContent>
313 |     </Card>
314 |   );
315 | }
--------------------------------------------------------------------------------