Close

9 | 10 | 11 | 12 | ); 13 | } 14 | -------------------------------------------------------------------------------- /pages/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer base { 6 | :root { 7 | --background: 0 0% 100%; 8 | --foreground: 222.2 84% 4.9%; 9 | 10 | --card: 0 0% 100%; 11 | --card-foreground: 222.2 84% 4.9%; 12 | 13 | --popover: 0 0% 100%; 14 | --popover-foreground: 222.2 84% 4.9%; 15 | 16 | --primary: 222.2 47.4% 11.2%; 17 | --primary-foreground: 210 40% 98%; 18 | 19 | --secondary: 210 40% 96.1%; 20 | --secondary-foreground: 222.2 47.4% 11.2%; 21 | 22 | --muted: 210 40% 96.1%; 23 | --muted-foreground: 215.4 16.3% 46.9%; 24 | 25 | --accent: 210 40% 96.1%; 26 | --accent-foreground: 222.2 47.4% 11.2%; 27 | 28 | --destructive: 0 84.2% 60.2%; 29 | --destructive-foreground: 210 40% 98%; 30 | 31 | --border: 214.3 31.8% 91.4%; 32 | --input: 214.3 31.8% 91.4%; 33 | --ring: 222.2 84% 4.9%; 34 | 35 | --radius: 0.5rem; 36 | } 37 | 38 | .dark { 39 | --background: 222.2 84% 4.9%; 40 | --foreground: 210 40% 98%; 41 | 42 | --card: 222.2 84% 4.9%; 43 | --card-foreground: 210 40% 98%; 44 | 45 | --popover: 222.2 84% 4.9%; 46 | --popover-foreground: 210 40% 98%; 47 | 48 | --primary: 210 40% 98%; 49 | --primary-foreground: 222.2 47.4% 11.2%; 50 | 51 | --secondary: 217.2 32.6% 17.5%; 52 | --secondary-foreground: 210 40% 98%; 53 | 54 | --muted: 217.2 32.6% 17.5%; 55 | --muted-foreground: 215 20.2% 65.1%; 56 | 57 | --accent: 217.2 32.6% 17.5%; 58 | --accent-foreground: 210 40% 98%; 59 | 60 | --destructive: 0 62.8% 30.6%; 61 | --destructive-foreground: 210 40% 98%; 62 | 63 | --border: 217.2 32.6% 17.5%; 64 | --input: 217.2 32.6% 17.5%; 65 | --ring: 212.7 26.8% 83.9%; 66 | } 67 | } 68 | 69 | @layer base { 70 | * { 71 | @apply border-border; 72 | } 73 | body { 74 | @apply bg-background text-foreground; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /pages/index.tsx: -------------------------------------------------------------------------------- 1 | import { AspectRatio } from "@/components/ui/aspect-ratio"; 2 | import { Button } from "@/components/ui/button"; 3 | import { 4 | Card, 5 | CardContent, 6 | CardDescription, 7 | CardFooter, 8 | CardHeader, 9 | CardTitle, 10 | } from "@/components/ui/card"; 11 | import { Skeleton } from "@/components/ui/skeleton"; 12 | import { storytellerSchema } from "@/storyteller/storytellerSchema"; 13 | import { Loader2, Mic } from "lucide-react"; 14 | import { delay } from "modelfusion"; 15 | import { 16 | convertAudioChunksToBase64, 17 | invokeFlow, 18 | } from "modelfusion-experimental/browser"; 19 | import { useRef, useState } from "react"; 20 | 21 | const baseUrl = process.env.NEXT_PUBLIC_BASE_URL; 22 | 23 | export default function Home() { 24 | const mediaRecorderRef = useRef(null); 25 | const audioChunksRef = useRef([]); 26 | const [isRecording, setIsRecording] = useState(false); 27 | const [waitingForUserInput, setWaitingForUserInput] = useState(true); 28 | const [imageUrl, setImageUrl] = useState(null); 29 | const [title, setTitle] = useState(null); 30 | const [input, setInput] = useState(null); 31 | const [audioUrls, setAudioUrls] = useState([]); 32 | const [activePart, setActivePart] = useState(0); 33 | const [generatingStory, setGeneratingStory] = useState(false); 34 | const [shouldAutoPlay, setShouldAutoPlay] = useState(false); 35 | const [error, setError] = useState(null); 36 | 37 | const resetError = () => { 38 | setError(null); 39 | }; 40 | 41 | const startRecording = () => { 42 | if (isRecording) return; 43 | 44 | resetError(); // Clear any previous errors 45 | 46 | navigator.mediaDevices 47 | .getUserMedia({ audio: true }) 48 | .then((stream) => { 49 | const mediaRecorder = new MediaRecorder(stream); 50 | mediaRecorderRef.current = mediaRecorder; 51 | 52 | mediaRecorder.ondataavailable = (e) => { 53 | audioChunksRef.current.push(e.data); 54 | }; 55 | 56 | // .start(1000): workaround for Safari/iphone 57 | // see https://community.openai.com/t/whisper-api-completely-wrong-for-mp4/289256/12 58 | mediaRecorder.start(1000); 59 | 60 | setIsRecording(true); 61 | }) 62 | .catch((error) => { 63 | setError( 64 | "Error accessing microphone. Please ensure you have given the necessary permissions." 65 | ); 66 | }); 67 | }; 68 | 69 | const stopRecording = () => { 70 | const mediaRecorder = mediaRecorderRef.current; 71 | 72 | if (mediaRecorder && isRecording) { 73 | mediaRecorder.onstop = async () => { 74 | setWaitingForUserInput(false); 75 | setGeneratingStory(true); 76 | setShouldAutoPlay(true); 77 | 78 | try { 79 | const mimeType = mediaRecorder.mimeType; 80 | const audioChunks = audioChunksRef.current; 81 | 82 | audioChunksRef.current = []; 83 | mediaRecorder.stream?.getTracks().forEach((track) => track.stop()); // stop microphone access 84 | 85 | invokeFlow({ 86 | url: `${baseUrl}/generate-story`, 87 | schema: storytellerSchema, 88 | input: { 89 | audioData: await convertAudioChunksToBase64({ 90 | audioChunks, 91 | mimeType, 92 | }), 93 | mimeType, 94 | }, 95 | onEvent(event) { 96 | switch (event.type) { 97 | case "transcribed-input": { 98 | setInput(event.input); 99 | break; 100 | } 101 | case "generated-image": { 102 | setImageUrl(event.url); 103 | break; 104 | } 105 | case "generated-title": { 106 | setTitle(event.title); 107 | break; 108 | } 109 | case "generated-audio-part": { 110 | audioUrls[event.index] = event.url; 111 | setAudioUrls(audioUrls.slice()); 112 | break; 113 | } 114 | } 115 | }, 116 | onStop() { 117 | setGeneratingStory(false); 118 | }, 119 | }); 120 | } catch (error) { 121 | console.error("Error generating story:", error); 122 | setError("An error occurred while generating the story:" + error); 123 | } 124 | }; 125 | 126 | mediaRecorder.stop(); 127 | setIsRecording(false); 128 | } 129 | }; 130 | 131 | const onPlaybackEnded = async () => { 132 | if (activePart === audioUrls.length - 1) { 133 | setActivePart(0); 134 | setShouldAutoPlay(false); 135 | } else { 136 | await delay(1000); // delay between parts to improve the quality of the story 137 | setActivePart(activePart + 1); 138 | } 139 | }; 140 | 141 | return ( 142 |

143 | {error && ( 144 |

148 | Error 149 | {error} 150 | 154 | 163 | 164 |

165 | )} 166 | 167 | {waitingForUserInput ? ( 168 | 169 | 170 | Story Teller 171 | 172 | Automatically generate stories for pre-school kids. 173 | 174 | 175 | 176 | 193 | 194 | 195 | ) : ( 196 | 197 | 198 | {input ? ( 199 | "{input}" 200 | ) : ( 201 | 202 | )} 203 | 204 | {title ?? } 205 | 206 | 207 | 208 | {imageUrl != null ? ( 209 |

210 | 211 | {title

216 | 217 |

218 | ) : ( 219 | 220 | )} 221 | 222 | 223 | {audioUrls[activePart] != null ? ( 224 | <> 225 |

241 | 242 | )} 243 |

244 | ); 245 | } 246 | -------------------------------------------------------------------------------- /postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } 7 | -------------------------------------------------------------------------------- /src/components/ui/aspect-ratio.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import * as AspectRatioPrimitive from "@radix-ui/react-aspect-ratio"; 4 | 5 | const AspectRatio = AspectRatioPrimitive.Root; 6 | 7 | export { AspectRatio }; 8 | -------------------------------------------------------------------------------- /src/components/ui/button.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | import { Slot } from "@radix-ui/react-slot" 3 | import { cva, type VariantProps } from "class-variance-authority" 4 | 5 | import { cn } from "@/lib/utils" 6 | 7 | const buttonVariants = cva( 8 | "inline-flex items-center justify-center rounded-md text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50", 9 | { 10 | variants: { 11 | variant: { 12 | default: "bg-primary text-primary-foreground hover:bg-primary/90", 13 | destructive: 14 | "bg-destructive text-destructive-foreground hover:bg-destructive/90", 15 | outline: 16 | "border border-input bg-background hover:bg-accent hover:text-accent-foreground", 17 | secondary: 18 | "bg-secondary text-secondary-foreground hover:bg-secondary/80", 19 | ghost: "hover:bg-accent hover:text-accent-foreground", 20 | link: "text-primary underline-offset-4 hover:underline", 21 | }, 22 | size: { 23 | default: "h-10 px-4 py-2", 24 | sm: "h-9 rounded-md px-3", 25 | lg: "h-11 rounded-md px-8", 26 | icon: "h-10 w-10", 27 | }, 28 | }, 29 | defaultVariants: { 30 | variant: "default", 31 | size: "default", 32 | }, 33 | } 34 | ) 35 | 36 | export interface ButtonProps 37 | extends React.ButtonHTMLAttributes, 38 | VariantProps { 39 | asChild?: boolean 40 | } 41 | 42 | const Button = React.forwardRef( 43 | ({ className, variant, size, asChild = false, ...props }, ref) => { 44 | const Comp = asChild ? Slot : "button" 45 | return ( 46 | 51 | ) 52 | } 53 | ) 54 | Button.displayName = "Button" 55 | 56 | export { Button, buttonVariants } 57 | -------------------------------------------------------------------------------- /src/components/ui/card.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | 3 | import { cn } from "@/lib/utils" 4 | 5 | const Card = React.forwardRef< 6 | HTMLDivElement, 7 | React.HTMLAttributes 8 | >(({ className, ...props }, ref) => ( 9 |

17 | )) 18 | Card.displayName = "Card" 19 | 20 | const CardHeader = React.forwardRef< 21 | HTMLDivElement, 22 | React.HTMLAttributes 23 | >(({ className, ...props }, ref) => ( 24 |

29 | )) 30 | CardHeader.displayName = "CardHeader" 31 | 32 | const CardTitle = React.forwardRef< 33 | HTMLParagraphElement, 34 | React.HTMLAttributes 35 | >(({ className, ...props }, ref) => ( 36 |

44 | )) 45 | CardTitle.displayName = "CardTitle" 46 | 47 | const CardDescription = React.forwardRef< 48 | HTMLParagraphElement, 49 | React.HTMLAttributes 50 | >(({ className, ...props }, ref) => ( 51 |
56 | )) 57 | CardDescription.displayName = "CardDescription" 58 | 59 | const CardContent = React.forwardRef< 60 | HTMLDivElement, 61 | React.HTMLAttributes 62 | >(({ className, ...props }, ref) => ( 63 |
64 | )) 65 | CardContent.displayName = "CardContent" 66 | 67 | const CardFooter = React.forwardRef< 68 | HTMLDivElement, 69 | React.HTMLAttributes 70 | >(({ className, ...props }, ref) => ( 71 |
76 | )) 77 | CardFooter.displayName = "CardFooter" 78 | 79 | export { Card, CardHeader, CardFooter, CardTitle, CardDescription, CardContent } 80 | -------------------------------------------------------------------------------- /src/components/ui/skeleton.tsx: -------------------------------------------------------------------------------- 1 | import { cn } from "@/lib/utils" 2 | 3 | function Skeleton({ 4 | className, 5 | ...props 6 | }: React.HTMLAttributes) { 7 | return ( 8 |
12 | ) 13 | } 14 | 15 | export { Skeleton } 16 | -------------------------------------------------------------------------------- /src/lib/utils.ts: -------------------------------------------------------------------------------- 1 | import { type ClassValue, clsx } from "clsx" 2 | import { twMerge } from "tailwind-merge" 3 | 4 | export function cn(...inputs: ClassValue[]) { 5 | return twMerge(clsx(inputs)) 6 | } 7 | -------------------------------------------------------------------------------- /src/storyteller/VoiceManager.ts: -------------------------------------------------------------------------------- 1 | import { 2 | MemoryVectorIndex, 3 | SpeechGenerationModel, 4 | VectorIndexRetriever, 5 | ZodSchema, 6 | elevenlabs, 7 | generateObject, 8 | lmnt, 9 | openai, 10 | retrieve, 11 | zodSchema, 12 | } from "modelfusion"; 13 | import { readFile } from "node:fs/promises"; 14 | import { z } from "zod"; 15 | 16 | const voiceSchema = z.object({ 17 | provider: z.enum(["lmnt", "elevenlabs"]), 18 | voiceId: z.string(), 19 | name: z.string(), 20 | gender: z.enum(["M", "F"]), 21 | description: z.string(), 22 | }); 23 | 24 | export type Voice = z.infer; 25 | 26 | export class VoiceManager { 27 | private readonly voiceIndex: MemoryVectorIndex; 28 | private readonly narrator: Voice; 29 | private readonly speakerToVoice = new Map(); 30 | 31 | static async fromFile({ 32 | voicesPath, 33 | narrator, 34 | }: { 35 | voicesPath: string; 36 | narrator: Voice; 37 | }): Promise { 38 | const voicesData = await readFile(voicesPath, "utf8"); 39 | 40 | const voiceIndex = await MemoryVectorIndex.deserialize({ 41 | serializedData: voicesData, 42 | schema: new ZodSchema(voiceSchema), 43 | }); 44 | 45 | return new VoiceManager({ voiceIndex, narrator }); 46 | } 47 | 48 | constructor({ 49 | voiceIndex, 50 | narrator, 51 | }: { 52 | voiceIndex: MemoryVectorIndex; 53 | narrator: Voice; 54 | }) { 55 | this.voiceIndex = voiceIndex; 56 | this.narrator = narrator; 57 | } 58 | 59 | async getSpeechModel({ 60 | speaker, 61 | story, 62 | }: { 63 | speaker: string; 64 | story: string; 65 | }): Promise { 66 | let voice = this.speakerToVoice.get(speaker); 67 | 68 | if (voice == null) { 69 | voice = await this.selectVoice({ speaker, story }); 70 | this.speakerToVoice.set(speaker, voice); 71 | } 72 | 73 | switch (voice.provider) { 74 | case "lmnt": 75 | return lmnt.SpeechGenerator({ voice: voice.voiceId }); 76 | case "elevenlabs": 77 | return elevenlabs.SpeechGenerator({ voice: voice.voiceId }); 78 | default: 79 | throw new Error(`Unknown voice provider: ${voice.provider}`); 80 | } 81 | } 82 | 83 | private async selectVoice({ 84 | speaker, 85 | story, 86 | }: { 87 | speaker: string; 88 | story: string; 89 | }): Promise { 90 | // pre-determined narrator voice: 91 | if (speaker.toLowerCase() === "narrator") { 92 | return this.narrator; 93 | } 94 | 95 | // generate voice descriptions for the speakers: 96 | const voiceDescription = await generateObject({ 97 | functionId: "generate-voice-description", 98 | model: openai 99 | .ChatTextGenerator({ model: "gpt-3.5-turbo", temperature: 0 }) 100 | .asFunctionCallObjectGenerationModel({ fnName: "voice" }) 101 | .withTextPrompt(), 102 | schema: zodSchema( 103 | z.object({ 104 | gender: z.string().describe("M for male, F for female)"), 105 | description: z.string().describe("Voice description"), 106 | }) 107 | ), 108 | prompt: [ 109 | `## Task`, 110 | `Generate a voice description for ${speaker} from the following story for an audio book.`, 111 | "The voice should be appropriate for a preschooler listener.", 112 | "Include the gender and age in the voice description.", 113 | "", 114 | "## Story", 115 | story, 116 | "", 117 | "## Speaker", 118 | speaker, 119 | "", 120 | "## Voice description (incl. age, gender)", 121 | ].join("\n"), 122 | }); 123 | 124 | // retrieve the voice vectors from the index: 125 | const potentialVoices = await retrieve( 126 | new VectorIndexRetriever({ 127 | vectorIndex: this.voiceIndex, 128 | embeddingModel: openai.TextEmbedder({ 129 | model: "text-embedding-ada-002", 130 | }), 131 | maxResults: 5, 132 | similarityThreshold: 0.2, 133 | filter: (indexVoice) => 134 | indexVoice.provider === "elevenlabs" && 135 | (["M", "F"].includes(voiceDescription.gender) 136 | ? indexVoice.gender === voiceDescription.gender 137 | : true), 138 | }), 139 | voiceDescription.description, 140 | { functionId: "retrieve-voice" } 141 | ); 142 | 143 | const unavailableVoices = Array.from(this.speakerToVoice.values()).map( 144 | (voice) => `${voice.provider}:${voice.voiceId}` 145 | ); 146 | 147 | const voice = potentialVoices.find( 148 | (voice) => 149 | !unavailableVoices.includes(`${voice.provider}:${voice.voiceId}`) 150 | ); 151 | 152 | if (!voice) { 153 | throw new Error(`No voice found for ${speaker}`); 154 | } 155 | 156 | return voice; 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/storyteller/prepareVoices.script.ts: -------------------------------------------------------------------------------- 1 | import dotenv from "dotenv"; 2 | import fs from "fs/promises"; 3 | import { MemoryVectorIndex, openai, upsertIntoVectorIndex } from "modelfusion"; 4 | import { z } from "zod"; 5 | import { Voice } from "./VoiceManager"; 6 | 7 | dotenv.config(); 8 | 9 | async function main() { 10 | try { 11 | const vectorIndex = new MemoryVectorIndex(); 12 | 13 | await addLmntVoices(vectorIndex); 14 | await addElevenLabsVoices(vectorIndex); 15 | 16 | await fs.writeFile("./data/voices.index.json", vectorIndex.serialize()); 17 | } catch (err) { 18 | console.error("Error reading file", err); 19 | } 20 | } 21 | 22 | const lmntVoiceSchema = z.object({ 23 | id: z.string(), 24 | name: z.string(), 25 | gender: z.enum(["M", "F"]), 26 | tags: z.array(z.string()), 27 | description: z.string(), 28 | }); 29 | 30 | type LmntVoice = z.infer; 31 | 32 | async function addLmntVoices(vectorIndex: MemoryVectorIndex) { 33 | const data = await fs.readFile("./data/voices.lmnt.json", "utf8"); 34 | const lmntVoices: LmntVoice[] = Object.values(JSON.parse(data).voices); 35 | 36 | const voices: Voice[] = lmntVoices.map((voice) => ({ 37 | voiceId: voice.id, 38 | name: voice.name, 39 | provider: "lmnt", 40 | gender: voice.gender, 41 | description: 42 | (voice.gender === "M" ? "Male voice. " : "Female voice. ") + 43 | voice.tags.join(" ") + 44 | ". " + 45 | voice.description, 46 | })); 47 | 48 | await upsertIntoVectorIndex({ 49 | vectorIndex, 50 | embeddingModel: openai.TextEmbedder({ model: "text-embedding-ada-002" }), 51 | objects: voices, 52 | getValueToEmbed: (voice) => voice.description, 53 | }); 54 | } 55 | 56 | const elevenLabsVoiceSchema = z.object({ 57 | voice_id: z.string(), 58 | name: z.string(), 59 | labels: z.record(z.string()), 60 | }); 61 | 62 | type ElevenLabsVoice = z.infer; 63 | 64 | async function addElevenLabsVoices(vectorIndex: MemoryVectorIndex) { 65 | const data = await fs.readFile("./data/voices.11labs.json", "utf8"); 66 | const elevenLabsVoices: ElevenLabsVoice[] = Object.values( 67 | JSON.parse(data).voices 68 | ); 69 | 70 | const voices: Voice[] = elevenLabsVoices 71 | .filter((voice) => voice.labels.age === "young") 72 | .map((voice) => ({ 73 | voiceId: voice.voice_id, 74 | name: voice.name, 75 | provider: "elevenlabs", 76 | gender: voice.labels.gender === "female" ? "F" : "M", 77 | description: Object.entries(voice.labels) 78 | .map(([key, value]) => `${key}: ${value}`) 79 | .join(", "), 80 | })); 81 | 82 | await upsertIntoVectorIndex({ 83 | vectorIndex, 84 | embeddingModel: openai.TextEmbedder({ model: "text-embedding-ada-002" }), 85 | objects: voices, 86 | getValueToEmbed: (voice) => voice.description, 87 | }); 88 | } 89 | 90 | main(); 91 | -------------------------------------------------------------------------------- /src/storyteller/server.ts: -------------------------------------------------------------------------------- 1 | import cors from "@fastify/cors"; 2 | import fastifyStatic from "@fastify/static"; 3 | import dotenv from "dotenv"; 4 | import Fastify from "fastify"; 5 | import { modelfusion } from "modelfusion"; 6 | import { 7 | FileSystemAssetStorage, 8 | FileSystemLogger, 9 | modelFusionFastifyPlugin, 10 | } from "modelfusion-experimental/fastify-server"; 11 | import path from "node:path"; 12 | import { storyTellerFlow } from "./storyTellerFlow"; 13 | 14 | dotenv.config(); 15 | 16 | modelfusion.setLogFormat("basic-text"); 17 | 18 | const port = process.env.PORT ? parseInt(process.env.PORT) : 3001; 19 | const host = process.env.HOST ?? "localhost"; 20 | const baseUrl = process.env.BASE_URL ?? `http://${host}:${port}`; 21 | const fsBasePath = process.env.BASE_PATH ?? "runs"; 22 | 23 | export async function main() { 24 | try { 25 | const fastify = Fastify(); 26 | 27 | await fastify.register(cors, {}); 28 | await fastify.register(fastifyStatic, { 29 | root: path.join(__dirname, "..", "..", "out"), 30 | prefix: "/", 31 | }); 32 | 33 | const logger = new FileSystemLogger({ 34 | path: (run) => path.join(fsBasePath, run.runId, "logs"), 35 | }); 36 | 37 | const assetStorage = new FileSystemAssetStorage({ 38 | path: (run) => path.join(fsBasePath, run.runId, "assets"), 39 | logger, 40 | }); 41 | 42 | fastify.register(modelFusionFastifyPlugin, { 43 | baseUrl, 44 | basePath: "/generate-story", 45 | flow: storyTellerFlow, 46 | logger, 47 | assetStorage, 48 | }); 49 | 50 | console.log(`Starting server on port ${port}...`); 51 | await fastify.listen({ port, host }); 52 | console.log("Server started"); 53 | } catch (error) { 54 | console.error("Failed to start server"); 55 | console.error(error); 56 | process.exit(1); 57 | } 58 | } 59 | 60 | main(); 61 | -------------------------------------------------------------------------------- /src/storyteller/storyTellerFlow.ts: -------------------------------------------------------------------------------- 1 | import { 2 | generateImage, 3 | generateSpeech, 4 | generateText, 5 | generateTranscription, 6 | openai, 7 | stability, 8 | streamObject, 9 | zodSchema, 10 | } from "modelfusion"; 11 | import { DefaultFlow } from "modelfusion-experimental/fastify-server"; 12 | import { z } from "zod"; 13 | import { VoiceManager } from "./VoiceManager"; 14 | import { storytellerSchema } from "./storytellerSchema"; 15 | 16 | export const storyTellerFlow = new DefaultFlow({ 17 | schema: storytellerSchema, 18 | async process({ input: { mimeType, audioData }, run }) { 19 | // Transcribe the user voice input: 20 | const transcription = await generateTranscription({ 21 | functionId: "transcribe", 22 | model: openai.Transcriber({ model: "whisper-1" }), 23 | mimeType, 24 | audioData, 25 | }); 26 | 27 | run.publishEvent({ type: "transcribed-input", input: transcription }); 28 | 29 | // Generate a story based on the transcription: 30 | const story = await generateText({ 31 | functionId: "generate-story", 32 | model: openai.CompletionTextGenerator({ 33 | model: "gpt-3.5-turbo-instruct", 34 | temperature: 1.2, 35 | maxGenerationTokens: 1000, 36 | }), 37 | prompt: 38 | "Generate a story aimed at preschoolers on the following topic: \n" + 39 | `'${transcription}'.`, 40 | }); 41 | 42 | // Run in parallel: 43 | await Promise.allSettled([ 44 | // Generate title: 45 | (async () => { 46 | const title = await generateText({ 47 | functionId: "generate-title", 48 | model: openai.CompletionTextGenerator({ 49 | model: "gpt-3.5-turbo-instruct", 50 | temperature: 0.7, 51 | maxGenerationTokens: 200, 52 | stopSequences: ['"'], 53 | }), 54 | prompt: 55 | "Generate a short title for the following story for pre-school children: \n\n" + 56 | `'${story}'.\n\n` + 57 | 'Title: "', 58 | }); 59 | 60 | run.publishEvent({ type: "generated-title", title }); 61 | })(), 62 | 63 | // Generate image that represents story: 64 | (async () => { 65 | const imagePrompt = await generateText({ 66 | functionId: "generate-story-image-prompt", 67 | model: openai 68 | .ChatTextGenerator({ 69 | model: "gpt-4", 70 | temperature: 0, 71 | maxGenerationTokens: 500, 72 | }) 73 | .withTextPrompt(), 74 | prompt: 75 | "Generate a short image generation prompt " + 76 | "(only abstract keywords, max 8 keywords) for the following story: " + 77 | story, 78 | }); 79 | 80 | const storyImage = await generateImage({ 81 | functionId: "generate-story-image", 82 | model: stability 83 | .ImageGenerator({ 84 | model: "stable-diffusion-xl-1024-v1-0", 85 | cfgScale: 7, 86 | height: 1024, 87 | width: 1024, 88 | steps: 30, 89 | }) 90 | .withTextPrompt(), 91 | prompt: `${imagePrompt} style of colorful illustration for a preschooler story`, 92 | }); 93 | 94 | const imagePath = await run.storeBinaryAsset({ 95 | name: "story.png", 96 | data: Buffer.from(storyImage), 97 | contentType: "image/png", 98 | }); 99 | 100 | run.publishEvent({ type: "generated-image", url: imagePath }); 101 | })(), 102 | 103 | // expand and narrate story: 104 | (async () => { 105 | const voiceManager = await VoiceManager.fromFile({ 106 | voicesPath: "./data/voices.index.json", 107 | narrator: { 108 | voiceId: "c8ea4f2a-06e6-4d7b-9484-db941bf7c657", 109 | name: "Joe", 110 | provider: "lmnt", 111 | gender: "M", 112 | description: "Male voice. Middle-aged.", 113 | }, 114 | }); 115 | 116 | const narratedStoryPartSchema = z.object({ 117 | type: z 118 | .enum(["narration", "dialogue"]) 119 | .describe("Type of story part. Either 'narration' or 'dialogue'."), 120 | speaker: z 121 | .string() 122 | .describe( 123 | "Speaker of a dialogue (direct speech) part. Must be a single speaker." 124 | ), 125 | content: z.string().describe("Content of the story part"), 126 | }); 127 | 128 | type NarratedStoryPart = z.infer; 129 | 130 | const structuredStorySchema = z.object({ 131 | parts: z.array(narratedStoryPartSchema), 132 | }); 133 | 134 | const processedParts: Array = []; 135 | 136 | const { objectStream: audioStoryStream, objectPromise } = 137 | await streamObject({ 138 | functionId: "generate-audio-story", 139 | model: openai 140 | .ChatTextGenerator({ 141 | model: "gpt-4", 142 | temperature: 0, 143 | }) 144 | .asFunctionCallObjectGenerationModel({ 145 | fnName: "story", 146 | fnDescription: "Kids story with narration.", 147 | }) 148 | .withTextPrompt(), 149 | schema: zodSchema(structuredStorySchema), 150 | prompt: [ 151 | "Expand the following story into a longer, narrated audio story for preschoolers.", 152 | "", 153 | "The audio story should include interesting dialogue by the main characters.", 154 | "The language should be understandable by a preschooler.", 155 | "", 156 | "Add details and dialog to make the story parts longer.", 157 | "Add the speaker to each dialogue part. A dialogue part can only have one speaker.", 158 | "There must only be one narrator.", 159 | "Each spoken part must be a dialogue part with a speaker.", 160 | "", 161 | "Story:", 162 | story, 163 | ].join("\n"), 164 | fullResponse: true, 165 | }); 166 | 167 | for await (const { partialObject } of audioStoryStream) { 168 | if (partialObject.parts == null) { 169 | continue; 170 | } 171 | 172 | // the last story part might not be complete yet: 173 | const partialParts = partialObject.parts.slice(0, -1); 174 | 175 | // ensure that the remaining story parts are complete: 176 | const partialPartsParseResult = z 177 | .array(narratedStoryPartSchema) 178 | .safeParse(partialParts); 179 | 180 | if (partialPartsParseResult.success) { 181 | await processNewParts(partialPartsParseResult.data); 182 | } 183 | } 184 | 185 | // process the remaining parts: 186 | const audioStory = await objectPromise; 187 | await processNewParts(audioStory.parts); 188 | 189 | async function processNewParts(parts: NarratedStoryPart[]) { 190 | const newParts = parts.slice(processedParts.length); 191 | processedParts.push(...newParts); 192 | 193 | for (const part of newParts) { 194 | const index = processedParts.indexOf(part); 195 | const speaker = part.speaker; 196 | 197 | const narrationAudio = await generateSpeech({ 198 | functionId: "narrate-story-part", 199 | model: await voiceManager.getSpeechModel({ speaker, story }), 200 | text: part.content, 201 | }); 202 | 203 | const path = await run.storeBinaryAsset({ 204 | name: `story-part-${index}.mp3`, 205 | data: Buffer.from(narrationAudio), 206 | contentType: "audio/mpeg", 207 | }); 208 | 209 | run.publishEvent({ 210 | type: "generated-audio-part", 211 | index, 212 | url: path, 213 | }); 214 | } 215 | } 216 | })(), 217 | ]); 218 | }, 219 | }); 220 | -------------------------------------------------------------------------------- /src/storyteller/storytellerSchema.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | export const storytellerSchema = { 4 | input: z.object({ 5 | mimeType: z.string(), 6 | audioData: z.string(), 7 | }), 8 | events: z.discriminatedUnion("type", [ 9 | z.object({ 10 | type: z.literal("transcribed-input"), 11 | input: z.string(), 12 | }), 13 | z.object({ 14 | type: z.literal("generated-title"), 15 | title: z.string(), 16 | }), 17 | z.object({ 18 | type: z.literal("generated-image"), 19 | url: z.string(), 20 | }), 21 | z.object({ 22 | type: z.literal("generated-audio-part"), 23 | index: z.number(), 24 | url: z.string(), 25 | }), 26 | ]), 27 | }; 28 | -------------------------------------------------------------------------------- /tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | module.exports = { 3 | darkMode: ["class"], 4 | content: [ 5 | './pages/**/*.{ts,tsx}', 6 | './components/**/*.{ts,tsx}', 7 | './app/**/*.{ts,tsx}', 8 | './src/**/.{ts,tsx}', 9 | ], 10 | theme: { 11 | container: { 12 | center: true, 13 | padding: "2rem", 14 | screens: { 15 | "2xl": "1400px", 16 | }, 17 | }, 18 | extend: { 19 | colors: { 20 | border: "hsl(var(--border))", 21 | input: "hsl(var(--input))", 22 | ring: "hsl(var(--ring))", 23 | background: "hsl(var(--background))", 24 | foreground: "hsl(var(--foreground))", 25 | primary: { 26 | DEFAULT: "hsl(var(--primary))", 27 | foreground: "hsl(var(--primary-foreground))", 28 | }, 29 | secondary: { 30 | DEFAULT: "hsl(var(--secondary))", 31 | foreground: "hsl(var(--secondary-foreground))", 32 | }, 33 | destructive: { 34 | DEFAULT: "hsl(var(--destructive))", 35 | foreground: "hsl(var(--destructive-foreground))", 36 | }, 37 | muted: { 38 | DEFAULT: "hsl(var(--muted))", 39 | foreground: "hsl(var(--muted-foreground))", 40 | }, 41 | accent: { 42 | DEFAULT: "hsl(var(--accent))", 43 | foreground: "hsl(var(--accent-foreground))", 44 | }, 45 | popover: { 46 | DEFAULT: "hsl(var(--popover))", 47 | foreground: "hsl(var(--popover-foreground))", 48 | }, 49 | card: { 50 | DEFAULT: "hsl(var(--card))", 51 | foreground: "hsl(var(--card-foreground))", 52 | }, 53 | }, 54 | borderRadius: { 55 | lg: "var(--radius)", 56 | md: "calc(var(--radius) - 2px)", 57 | sm: "calc(var(--radius) - 4px)", 58 | }, 59 | keyframes: { 60 | "accordion-down": { 61 | from: { height: 0 }, 62 | to: { height: "var(--radix-accordion-content-height)" }, 63 | }, 64 | "accordion-up": { 65 | from: { height: "var(--radix-accordion-content-height)" }, 66 | to: { height: 0 }, 67 | }, 68 | }, 69 | animation: { 70 | "accordion-down": "accordion-down 0.2s ease-out", 71 | "accordion-up": "accordion-up 0.2s ease-out", 72 | }, 73 | }, 74 | }, 75 | plugins: [require("tailwindcss-animate")], 76 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "noEmit": true, 9 | "esModuleInterop": true, 10 | "module": "esnext", 11 | "moduleResolution": "bundler", 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "jsx": "preserve", 15 | "incremental": true, 16 | "plugins": [ 17 | { 18 | "name": "next" 19 | } 20 | ], 21 | "paths": { 22 | "@/": ["./src/*"] 23 | } 24 | }, 25 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 26 | "exclude": ["node_modules"] 27 | } 28 | --------------------------------------------------------------------------------