├── .eslintrc.json ├── .gitignore ├── README.md ├── app ├── api │ └── generate │ │ └── route.tsx ├── favicon.ico ├── globals.css ├── layout.tsx ├── models │ └── sound.tsx ├── page.tsx └── views │ └── GenerateSoundView.tsx ├── components.json ├── components ├── GenerateSoundForm.tsx ├── Loader.tsx └── ui │ ├── button.tsx │ ├── form.tsx │ ├── input.tsx │ ├── label.tsx │ ├── select.tsx │ ├── textarea.tsx │ ├── toast.tsx │ ├── toaster.tsx │ └── use-toast.ts ├── lib ├── constants.tsx └── utils.ts ├── next.config.js ├── package-lock.json ├── package.json ├── postcss.config.js ├── public ├── next.svg ├── text-to-speech-diagram.png └── vercel.svg ├── tailwind.config.js └── tsconfig.json /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | 27 | # local env files 28 | .env*.local 29 | 30 | # vercel 31 | .vercel 32 | 33 | # typescript 34 | *.tsbuildinfo 35 | next-env.d.ts 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text to Speech Sound Generation with Next.js 2 | 3 | Welcome to the **Text to Speech Sound Generation** project built using Next.js! This project demonstrates the generation of sound using pre-trained hugging face models. 
Users can select different Hugging Face sound models and input text to generate corresponding audio. The generated audio can be played back directly on the web interface. 4 | 5 | ## Overview 6 | 7 | This project showcases how to leverage pre-trained Hugging Face models along with the Hugging Face inference API to convert input text into synthesized speech. The user interface provides a selection of sound models to choose from and an input field to enter the desired text. Once submitted, the application fetches the generated audio from the model's API and presents it in an audio player. 8 | 9 | 10 | ## Architecture 11 | ![image](https://tyhgectxutilszaayoua.supabase.co/storage/v1/object/public/misc/text-to-speech-model.png?t=2023-08-14T02%3A52%3A52.637Z) 12 | 13 | 14 | 15 | ## Notes 16 | This application relies heavily on the stability of the Hugging Face Inference API models. Please note that occasional random errors may occur. 17 | 18 | ## How to Run the Project 19 | 20 | ### Prerequisites 21 | 22 | - Node.js 23 | - npm 24 | 25 | ### Installation 26 | 27 | 1. Clone the repository: 28 | ```sh 29 | git clone https://github.com/sambowenhughes/your-nextjs-project.git 30 | ``` 31 | 2. Navigate to the project directory: 32 | ```sh 33 | cd your-nextjs-project 34 | ``` 35 | 3. Install dependencies: 36 | ```sh 37 | npm install 38 | ``` 39 | 4. Add your Hugging Face Access Token to your `.env.local` file: 40 | ```sh 41 | Hugging Face tokens can be created in the Hugging Face settings portal 42 | ``` 43 | 44 | ### Running the Application 45 | 46 | 1. Start the development server: 47 | ```sh 48 | npm run dev 49 | ``` 50 | 2. Open your browser and visit `http://localhost:3000` to access the application. 51 | 52 | ## Project Structure 53 | 54 | - `components`: Contains reusable UI components used across the application. 55 | - `lib`: Contains constants and utility functions. 56 | - `app`: Houses the main views and API routes of the application. 
57 | - `public`: Holds static assets like images, fonts, etc. 58 | - `styles`: Includes global and component-specific styles using CSS or CSS-in-JS. 59 | - `views`: Houses the main application views and components. 60 | 61 | ## Usage 62 | 63 | 1. Open the application in your browser after running it. 64 | 2. In the "Sound Model" dropdown, select a model for generating sound. 65 | 3. Enter the desired text in the "Text" input field. 66 | 4. Click the "Submit" button. 67 | 5. The generated audio will appear in an audio player below. 68 | 69 | ## Contributing 70 | 71 | Contributions are welcome! To contribute, follow these steps: 72 | 73 | 1. Fork the repository. 74 | 2. Create a new branch for your feature or bug fix. 75 | 3. Implement your changes and test them thoroughly. 76 | 4. Commit your changes and push to your forked repository. 77 | 5. Open a pull request to the main repository. 78 | 79 | ## License 80 | 81 | This project is licensed under the [MIT License](LICENSE). 82 | ``` -------------------------------------------------------------------------------- /app/api/generate/route.tsx: -------------------------------------------------------------------------------- 1 | /** 2 | * Handles an HTTP POST request to generate audio using a pre-trained model. 3 | * Expects a JSON payload with 'modelUrl' and 'input' fields. 4 | * @param {Request} request - The incoming HTTP request. 5 | * @returns {Response} - The generated audio as an HTTP response. 
6 | */ 7 | export async function POST(request: Request): Promise { 8 | // Parse the JSON payload from the request body 9 | const requestBody = await request.json(); 10 | 11 | // Check if the 'modelUrl' field is provided in the request body 12 | if (!requestBody.modelUrl) { 13 | throw new Error("Missing 'model url' field in the request body"); 14 | } 15 | 16 | // Check if the 'input' field is provided in the request body 17 | if (!requestBody.input) { 18 | throw new Error("Missing 'input' field in the request body"); 19 | } 20 | 21 | // Check if the 'input' field is provided in the request body 22 | if (!process.env.HUGGING_FACE_TOKEN) { 23 | throw new Error("Missing 'Hugging Face Access Token'"); 24 | } 25 | 26 | // Extract the 'modelUrl' and 'input' from the request body 27 | const modelUrl = requestBody.modelUrl; 28 | const input = requestBody.input; 29 | 30 | // Make a POST request to the specified 'modelUrl' using Hugging Face token for authorization 31 | const response = await fetch( 32 | modelUrl, 33 | { 34 | headers: { 35 | Authorization: `Bearer ${process.env.HUGGING_FACE_TOKEN}`, // Use the correct token 36 | "Content-Type": "application/json", 37 | }, 38 | method: "POST", 39 | body: JSON.stringify({ inputs: input }), 40 | } 41 | ); 42 | 43 | // Get the generated audio data as an ArrayBuffer 44 | const audioData = await response.arrayBuffer(); 45 | 46 | // Check if the HTTP response is not successful 47 | if (!response.ok) { 48 | throw new Error("Request failed"); 49 | } 50 | 51 | // Create an HTTP response with the generated audio data 52 | return new Response(audioData, { 53 | headers: { 54 | "Content-Type": "audio/mpeg", // Adjust the content type based on the actual audio format 55 | }, 56 | }); 57 | } 58 | -------------------------------------------------------------------------------- /app/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sambowenhughes/text-to-speech-using-hugging-face/420d4013093199f5845c45b8961c0d00917f6b09/app/favicon.ico -------------------------------------------------------------------------------- /app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | @layer base { 6 | :root { 7 | --background: 0 0% 100%; 8 | --foreground: 222.2 84% 4.9%; 9 | 10 | --muted: 210 40% 96.1%; 11 | --muted-foreground: 215.4 16.3% 46.9%; 12 | 13 | --popover: 0 0% 100%; 14 | --popover-foreground: 222.2 84% 4.9%; 15 | 16 | --card: 0 0% 100%; 17 | --card-foreground: 222.2 84% 4.9%; 18 | 19 | --border: 214.3 31.8% 91.4%; 20 | --input: 214.3 31.8% 91.4%; 21 | 22 | --primary: 222.2 47.4% 11.2%; 23 | --primary-foreground: 210 40% 98%; 24 | 25 | --secondary: 210 40% 96.1%; 26 | --secondary-foreground: 222.2 47.4% 11.2%; 27 | 28 | --accent: 210 40% 96.1%; 29 | --accent-foreground: 222.2 47.4% 11.2%; 30 | 31 | --destructive: 0 84.2% 60.2%; 32 | --destructive-foreground: 210 40% 98%; 33 | 34 | --ring: 215 20.2% 65.1%; 35 | 36 | --radius: 0.5rem; 37 | } 38 | 39 | .dark { 40 | --background: 222.2 84% 4.9%; 41 | --foreground: 210 40% 98%; 42 | 43 | --muted: 217.2 32.6% 17.5%; 44 | --muted-foreground: 215 20.2% 65.1%; 45 | 46 | --popover: 222.2 84% 4.9%; 47 | --popover-foreground: 210 40% 98%; 48 | 49 | --card: 222.2 84% 4.9%; 50 | --card-foreground: 210 40% 98%; 51 | 52 | --border: 217.2 32.6% 17.5%; 53 | --input: 217.2 32.6% 17.5%; 54 | 55 | --primary: 210 40% 98%; 56 | --primary-foreground: 222.2 47.4% 11.2%; 57 | 58 | --secondary: 217.2 32.6% 17.5%; 59 | --secondary-foreground: 210 40% 98%; 60 | 61 | --accent: 217.2 32.6% 17.5%; 62 | --accent-foreground: 210 40% 98%; 63 | 64 | --destructive: 0 62.8% 30.6%; 65 | --destructive-foreground: 0 85.7% 97.3%; 66 | 67 | --ring: 217.2 32.6% 17.5%; 68 | } 69 | } 70 | 71 | @layer base { 72 | * { 73 | @apply 
border-border; 74 | } 75 | body { 76 | @apply bg-background text-foreground; 77 | } 78 | } -------------------------------------------------------------------------------- /app/layout.tsx: -------------------------------------------------------------------------------- 1 | import './globals.css' 2 | import type { Metadata } from 'next' 3 | import { Inter } from 'next/font/google' 4 | 5 | const inter = Inter({ subsets: ['latin'] }) 6 | 7 | export const metadata: Metadata = { 8 | title: 'Create Next App', 9 | description: 'Generated by create next app', 10 | } 11 | 12 | export default function RootLayout({ 13 | children, 14 | }: { 15 | children: React.ReactNode 16 | }) { 17 | return ( 18 | 19 | {children} 20 | 21 | ) 22 | } 23 | -------------------------------------------------------------------------------- /app/models/sound.tsx: -------------------------------------------------------------------------------- 1 | /** 2 | * Represents the request payload for generating sound using a pre-trained model. 3 | */ 4 | interface CreateSoundRequest { 5 | /** 6 | * The URL of the pre-trained model to be used for sound generation. 7 | */ 8 | modelUrl: string; 9 | 10 | /** 11 | * The input text that will be used to generate the sound. 12 | */ 13 | text: string; 14 | } 15 | -------------------------------------------------------------------------------- /app/page.tsx: -------------------------------------------------------------------------------- 1 | import GenerateSoundView from "./views/GenerateSoundView"; 2 | 3 | /** 4 | * The main entry point component for the application. 5 | * It renders the GenerateSoundView component. 6 | */ 7 | export default function Home() { 8 | return ( 9 |
10 | {/* Render the GenerateSoundView component */} 11 | 12 |
13 | ); 14 | } 15 | -------------------------------------------------------------------------------- /app/views/GenerateSoundView.tsx: -------------------------------------------------------------------------------- 1 | 'use client' 2 | 3 | import { GenerateSoundForm } from "@/components/GenerateSoundForm"; 4 | import Loader from "@/components/Loader"; 5 | import { useState } from "react"; 6 | 7 | /** 8 | * The main view component for generating sound using a pre-trained model. 9 | */ 10 | export default function GenerateSoundView() { 11 | // State to manage loading status and audio URL 12 | const [isLoading, setIsLoading] = useState(false); 13 | const [audioUrl, setAudioUrl] = useState(null); 14 | 15 | /** 16 | * Handles the process of fetching audio data using the provided request. 17 | * @param {CreateSoundRequest} request - The request containing model URL and text. 18 | */ 19 | const handleGetAudio = async (request: CreateSoundRequest) => { 20 | setIsLoading(true); 21 | 22 | try { 23 | // Make a POST request to the server's API endpoint to generate audio 24 | const response = await fetch("/api/generate", { 25 | method: "POST", 26 | headers: { 27 | "Content-Type": "application/json", 28 | }, 29 | body: JSON.stringify({ 30 | input: request.text, 31 | modelUrl: request.modelUrl, 32 | }), 33 | }); 34 | 35 | if (!response.ok) { 36 | throw new Error("Failed to fetch audio data."); 37 | } 38 | 39 | // Get the audio data as an ArrayBuffer 40 | const data = await response.arrayBuffer(); 41 | 42 | // Convert ArrayBuffer to Blob and create a URL for the audio 43 | const blob = new Blob([data], { type: "audio/mpeg" }); 44 | const audioUrl = URL.createObjectURL(blob); 45 | setAudioUrl(audioUrl); 46 | setIsLoading(false); 47 | } catch (error) { 48 | setIsLoading(false); 49 | } 50 | }; 51 | 52 | return ( 53 |
54 |
55 |
56 |

Text to Speech

57 |
58 | {/* Render the form component for generating sound */} 59 | 60 |
61 |
62 |
63 | {isLoading ? ( 64 | // Show loader when fetching audio data 65 | 66 | ) : ( 67 | // Display audio player when audio is available 68 | <> 69 | {audioUrl && ( 70 | 73 | )} 74 | 75 | )} 76 |
77 |
78 |
79 | ); 80 | } 81 | -------------------------------------------------------------------------------- /components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "default", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.js", 8 | "css": "app/globals.css", 9 | "baseColor": "slate", 10 | "cssVariables": true 11 | }, 12 | "aliases": { 13 | "components": "@/components", 14 | "utils": "@/lib/utils" 15 | } 16 | } -------------------------------------------------------------------------------- /components/GenerateSoundForm.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | // Import necessary modules and components 4 | import { zodResolver } from "@hookform/resolvers/zod"; 5 | import { useForm } from "react-hook-form"; 6 | import * as z from "zod"; 7 | 8 | import { Button } from "@/components/ui/button"; 9 | import { 10 | Form, 11 | FormControl, 12 | FormDescription, 13 | FormField, 14 | FormItem, 15 | FormLabel, 16 | FormMessage, 17 | } from "@/components/ui/form"; 18 | import { 19 | Select, 20 | SelectContent, 21 | SelectItem, 22 | SelectTrigger, 23 | SelectValue, 24 | } from "@/components/ui/select"; 25 | import { useState } from "react"; 26 | import { Textarea } from "./ui/textarea"; 27 | import SOUND_MODELS, { SoundModel } from "@/lib/constants"; 28 | 29 | // Define the validation schema for the form fields 30 | const FormSchema = z.object({ 31 | soundModel: z.string({ 32 | required_error: "Please select a Hugging Face sound model to use.", 33 | }), 34 | text: z.string({ 35 | required_error: "Please select a text for the model to use.", 36 | }), 37 | }); 38 | 39 | // Define the props interface for the GenerateSoundForm component 40 | interface GenerateSoundFormProps { 41 | handleGetAudio: (data: CreateSoundRequest) => void; 42 | } 43 | 44 | // Main component 
function 45 | export function GenerateSoundForm({ handleGetAudio }: GenerateSoundFormProps) { 46 | // State for tracking form submission status 47 | const [formSubmitting, setFormSubmitting] = useState(false); 48 | 49 | // Initialize the react-hook-form with the validation schema 50 | const form = useForm>({ 51 | resolver: zodResolver(FormSchema), 52 | }); 53 | 54 | // Function to handle form submission 55 | function onSubmit(data: z.infer) { 56 | setFormSubmitting(true); 57 | 58 | // Prepare the sound request object 59 | const soundRequest: CreateSoundRequest = { 60 | modelUrl: data.soundModel, 61 | text: data.text, 62 | }; 63 | 64 | // Call the provided handler function with the sound request 65 | handleGetAudio(soundRequest); 66 | 67 | setFormSubmitting(false); 68 | } 69 | 70 | return ( 71 |
72 | {/* Form component that uses react-hook-form */} 73 |
74 | 75 | {/* Form field for selecting the sound model */} 76 | ( 80 | 81 | Sound Model 82 | {/* Select component for choosing a sound model */} 83 | 102 | 103 | This model will generate your sound. 104 | 105 | 106 | 107 | )} 108 | /> 109 | {/* Form field for entering the text */} 110 | ( 114 | 115 | Text 116 | 117 | {/* Textarea component for entering text */} 118 |