├── .env.example ├── .eslintrc.js ├── .gitignore ├── .prettierrc ├── .vscode └── settings.json ├── LICENSE ├── NOTICE ├── README.md ├── assets └── How-dubbing-works.png ├── bun.lockb ├── input └── example.mp4 ├── output └── example-result.mp4 ├── package.json ├── src ├── core │ └── index.ts ├── elevenlabs │ └── elevenlabs.ts ├── ffmpeg │ ├── audio-utils.ts │ ├── ffmpegPatch.ts │ └── video-utils.ts ├── lipsync │ └── lipsync.ts ├── llm │ ├── openai.ts │ └── prompt-builder.ts ├── separator │ └── spleeter.ts ├── smart-sync │ └── adaptation.ts ├── speech │ └── speechGenerator.ts ├── subtitles │ └── subtitles-generator.ts ├── transcription │ ├── formatter.ts │ ├── textTranslator.ts │ └── transcriber.ts ├── types │ ├── index.d.ts │ ├── lipsync.d.ts │ ├── speech.d.ts │ └── spleeter.d.ts └── utils │ ├── config.ts │ ├── constants.ts │ └── helpers.ts ├── start.sh ├── temporary-files └── example.txt └── tsconfig.json /.env.example: -------------------------------------------------------------------------------- 1 | PORT=4000 2 | OPENAI_API_KEY=your_openai_api_key_here 3 | GLADIA_API_KEY=your_gladia_api_key_here 4 | ELEVEN_LABS_API_KEY=your_eleven_labs_api_key_here 5 | LALAL_LICENSE_KEY=your_lalal_license_key_here 6 | SYNC_LAB_API_KEY=your_sync_lab_api_key_here 7 | 8 | #AWS (For lipsync) 9 | AWS_S3_REGION=your_aws_s3_region_here 10 | AWS_ACCESS_KEY_ID=your_aws_access_key_id_here 11 | AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here 12 | AWS_BUCKET_NAME=your_aws_bucket_name_here -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | parser: '@typescript-eslint/parser', 3 | parserOptions: { 4 | project: 'tsconfig.json', 5 | tsconfigRootDir: __dirname, 6 | sourceType: 'module', 7 | }, 8 | plugins: ['@typescript-eslint/eslint-plugin'], 9 | extends: ['plugin:@typescript-eslint/recommended', 'plugin:prettier/recommended'], 10 | root: true, 11 | env: { 12 | node: true, 13 | jest: true, 14 | }, 15 | ignorePatterns: ['.eslintrc.js'], 16 | rules: { 17 | '@typescript-eslint/interface-name-prefix': 'off', 18 | '@typescript-eslint/explicit-function-return-type': 'off', 19 | '@typescript-eslint/explicit-module-boundary-types': 'off', 20 | '@typescript-eslint/no-explicit-any': 'off', 21 | '@typescript-eslint/no-floating-promises': 'error', 22 | }, 23 | prettier: { 24 | printWidth: 110, 25 | }, 26 | }; 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore 2 | 3 | # Logs 4 | 5 | logs 6 | _.log 7 | npm-debug.log_ 8 | yarn-debug.log* 9 | yarn-error.log* 10 | lerna-debug.log* 11 | .pnpm-debug.log* 12 | 13 | .vscode 14 | 15 | # Caches 16 | 17 | .cache 18 | 19 | # Diagnostic reports (https://nodejs.org/api/report.html) 20 | 21 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 22 | 23 | # Runtime data 24 | 25 | pids 26 | _.pid 27 | _.seed 28 | *.pid.lock 29 | 30 | # Directory for instrumented libs generated by jscoverage/JSCover 31 | 32 | lib-cov 33 | 34 | # Coverage directory used by tools like istanbul 35 | 36 | coverage 37 | *.lcov 38 | 39 | # nyc test coverage 40 | 41 | .nyc_output 42 | 43 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 44 | 45 | .grunt 46 | 47 | # Bower dependency directory (https://bower.io/) 48 | 49 | 
bower_components 50 | 51 | # node-waf configuration 52 | 53 | .lock-wscript 54 | 55 | # Compiled binary addons (https://nodejs.org/api/addons.html) 56 | 57 | build/Release 58 | 59 | # Dependency directories 60 | 61 | node_modules/ 62 | jspm_packages/ 63 | 64 | # Snowpack dependency directory (https://snowpack.dev/) 65 | 66 | web_modules/ 67 | 68 | # TypeScript cache 69 | 70 | *.tsbuildinfo 71 | 72 | # Optional npm cache directory 73 | 74 | .npm 75 | 76 | # Optional eslint cache 77 | 78 | .eslintcache 79 | 80 | # Optional stylelint cache 81 | 82 | .stylelintcache 83 | 84 | # Microbundle cache 85 | 86 | .rpt2_cache/ 87 | .rts2_cache_cjs/ 88 | .rts2_cache_es/ 89 | .rts2_cache_umd/ 90 | 91 | # Optional REPL history 92 | 93 | .node_repl_history 94 | 95 | # Output of 'npm pack' 96 | 97 | *.tgz 98 | 99 | # Yarn Integrity file 100 | 101 | .yarn-integrity 102 | 103 | # dotenv environment variable files 104 | 105 | .env 106 | .env.development.local 107 | .env.test.local 108 | .env.production.local 109 | .env.local 110 | 111 | # parcel-bundler cache (https://parceljs.org/) 112 | 113 | .parcel-cache 114 | 115 | # Next.js build output 116 | 117 | .next 118 | out 119 | 120 | # Nuxt.js build / generate output 121 | 122 | .nuxt 123 | dist 124 | 125 | # Gatsby files 126 | 127 | # Comment in the public line in if your project uses Gatsby and not Next.js 128 | 129 | # https://nextjs.org/blog/next-9-1#public-directory-support 130 | 131 | # public 132 | 133 | # vuepress build output 134 | 135 | .vuepress/dist 136 | 137 | # vuepress v2.x temp and cache directory 138 | 139 | .temp 140 | 141 | # Docusaurus cache and generated files 142 | 143 | .docusaurus 144 | 145 | # Serverless directories 146 | 147 | .serverless/ 148 | 149 | # FuseBox cache 150 | 151 | .fusebox/ 152 | 153 | # DynamoDB Local files 154 | 155 | .dynamodb/ 156 | 157 | # TernJS port file 158 | 159 | .tern-port 160 | 161 | # Stores VSCode versions used for testing VSCode extensions 162 | 163 | .vscode-test 164 | 165 | # yarn v2 166 | 167 | .yarn/cache 168 | .yarn/unplugged 169 | .yarn/build-state.yml 170 | .yarn/install-state.gz 171 | .pnp.* 172 | 173 | # IntelliJ based IDEs 174 | .idea 175 | 176 | # Finder (MacOS) folder config 177 | .DS_Store 178 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "trailingComma": "all", 4 | "printWidth": 110 5 | } 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "prettier.prettierPath": "./node_modules/prettier" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial 4.0 International License 2 | 3 | This work is licensed under the Creative Commons 4 | Attribution-NonCommercial 4.0 International License. 5 | 6 | To view a copy of this license, visit 7 | https://creativecommons.org/licenses/by-nc/4.0/ or send a 8 | letter to Creative Commons, PO Box 1866, Mountain View, 9 | CA 94042, USA. 
10 | 11 | You are free to: 12 | 13 | - Share — copy and redistribute the material in any medium 14 | or format 15 | - Adapt — remix, transform, and build upon the material 16 | 17 | Under the following terms: 18 | 19 | - Attribution — You must give appropriate credit, provide a 20 | link to the license, and indicate if changes were made. 21 | You may do so in any reasonable manner, but not in any 22 | way that suggests the licensor endorses you or your use. 23 | - NonCommercial — You may not use the material for 24 | commercial purposes. 25 | 26 | No additional restrictions — You may not apply legal terms 27 | or technological measures that legally restrict others from 28 | doing anything the license permits. 29 | 30 | Notices: 31 | You do not have to comply with the license for elements of 32 | the material in the public domain or where your use is 33 | permitted by an applicable exception or limitation. 34 | 35 | No warranties are given. The license may not give you all 36 | of the permissions necessary for your intended use. For 37 | example, other rights such as publicity, privacy, or moral 38 | rights may limit how you use the material. 39 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Dubbing Engine 2 | Copyright (c) 2024 VoiceCheap.ai 3 | 4 | This product includes software developed at VoiceCheap.ai. 5 | 6 | This project is licensed under the Creative Commons 7 | Attribution-NonCommercial 4.0 International License. 8 | For commercial use, please contact: kevin.rousseau@voicecheap.ai 9 | 10 | Third-party dependencies and acknowledgments: 11 | 12 | - TypeScript (Apache License 2.0) 13 | - Bun (MIT License) 14 | - Various API integrations (see README.md for details) 15 | 16 | For more information about licensing, see the LICENSE file. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![How Dubbing Works](./assets/How-dubbing-works.png) 2 | 3 | # Dubbing Engine with Bun and Typescript 4 | 5 | [![Star this repo](https://img.shields.io/github/stars/kevinrss01/dubbing-engine?style=social)](https://github.com/kevinrss01/dubbing-engine) 6 | [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc/4.0/) 7 | 8 | ## 🌐 Demo 9 | 10 | ### Original video 11 | 12 | https://github.com/user-attachments/assets/73a22695-9457-4c10-8782-c663dae249f3 13 | 14 | ### Translated video 15 | 16 | https://github.com/user-attachments/assets/a7b07820-a99c-4c95-80f6-e2c76f8d191b 17 | 18 | This AI-powered translation and video dubbing engine can translate audio and video files while cloning the original voices, adding subtitles, and synchronizing lip movements. The engine powers [VoiceCheap.ai](https://voicecheap.ai). 19 | 20 | ## ✨ Features 21 | 22 | - Voice cloning & generation 23 | - Automatic language detection 24 | - Speech adaptation for natural timing (SmartSync) 25 | - Background audio separation 26 | - Subtitle generation 27 | - Lip synchronization 28 | - Supports 35 languages 29 | 30 | ## 🧠 How It Works 31 | 32 | The dubbing process follows these steps: 33 | 34 | 1. **Configuration**: Select target language and options 35 | 2. 
**Transcription & Analysis**: 36 | - Identify source language 37 | - Transcribe audio 38 | - Generate context summary 39 | - Perform speaker diarization (identify different speakers) 40 | 3. **Translation**: 41 | - Format speech segments 42 | - Translate with LLM contextual awareness 43 | 4. **Audio Processing**: 44 | - Separate voices and background audio 45 | - Measure audio levels 46 | - Create timeline for each speaker 47 | 5. **Voice Generation**: 48 | - Clone each speaker's voice 49 | - Apply SmartSync adaptation to match timing 50 | - Adjust speed if necessary 51 | 6. **Final Assembly**: 52 | - Concatenate translated segments 53 | - Adjust audio levels and equalize 54 | - Merge translated voices with background audio 55 | - Add subtitles 56 | - Apply lip synchronization 57 | 58 | ### SmartSync Adaptation 59 | 60 | SmartSync adapts the speaker's speech based on language and speaking speed to match the original timing as closely as possible. When a literal translation would run too long, it intelligently reformulates sentences to maintain natural pacing and synchronization with the original speech. 61 | 62 | ## 🚀 Getting Started 63 | 64 | ### Prerequisites 65 | 66 | Before launching the project, make sure you have the following software installed: 67 | 68 | - **Node.js**: [Download Node.js](https://nodejs.org/) 69 | - **Bun**: JavaScript runtime & toolkit 70 | - **FFmpeg**: Audio/video processing tool 71 | - **API Keys**: For various services (see below) 72 | 73 | #### How to Install Required Software 74 | 75 | **Node.js** 76 | 77 | - **Windows / macOS / Linux**: Download and install from [https://nodejs.org/](https://nodejs.org/) 78 | 79 | **Bun** 80 | 81 | - **macOS / Linux / Windows (WSL)**: 82 | ```bash 83 | curl -fsSL https://bun.sh/install | bash 84 | ``` 85 | For more details, see [Bun's official install guide](https://bun.sh/docs/installation). 86 | 87 | **FFmpeg** 88 | 89 | - **macOS**: Install via Homebrew: 90 | ```bash 91 | brew install ffmpeg 92 | ``` 93 | - **Windows**: Download the latest build from [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html), extract, and add the `bin` folder to your PATH. 94 | - **Linux**: Install via package manager (e.g. Ubuntu/Debian): 95 | ```bash 96 | sudo apt update && sudo apt install ffmpeg 97 | ``` 98 | For other distributions, see [FFmpeg's official download page](https://ffmpeg.org/download.html). 99 | 100 | #### API Keys Required 101 | 102 | You will need API keys from the following services: 103 | 104 | - **OpenAI**: [Get your API key here](https://platform.openai.com/account/api-keys) 105 | - **Gladia**: [Sign up and get your API key here](https://app.gladia.io/) 106 | - **Eleven Labs**: [Sign up and get your API key here](https://elevenlabs.io/) 107 | - **Lalal.ai**: [Sign up and get your license key here](https://www.lalal.ai/) 108 | - **SyncLab**: [Sign up and get your API key here](https://synclab.ai/) 109 | - **Note**: SyncLab requires a subscription. To add lipsync to videos longer than 5 minutes, you must have a "Scale" plan. 110 | - **AWS (for lipsync)**: Create an account at [AWS](https://aws.amazon.com/) and generate S3 credentials if you want to use the lipsync feature. 
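In addition to the service keys above, the dubbing run itself is controlled by a few optional environment variables read in `src/core/index.ts`. The defaults shown below are taken from that file; the inline comments are explanatory notes, and `start.sh` normally guides you through these choices, so setting them by hand (for example in your `.env`) is optional:

```
TARGET_LANGUAGE=english      # language to dub into
DEBUG_MODE=no                # yes|no, verbose logging
NUM_SPEAKERS=auto-detect     # auto-detect, or 1 to 10
APPLY_LIPSYNC=no             # yes|no, requires SyncLab and AWS keys
ACTIVATE_SUBTITLE=yes        # yes|no, burns subtitles into the output video
```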
111 | 112 | Create a `.env` file based on the `.env.example` and fill in your API keys: 113 | 114 | ``` 115 | PORT=4000 116 | OPENAI_API_KEY=your_openai_api_key_here 117 | GLADIA_API_KEY=your_gladia_api_key_here 118 | ELEVEN_LABS_API_KEY=your_eleven_labs_api_key_here 119 | LALAL_LICENSE_KEY=your_lalal_license_key_here 120 | SYNC_LAB_API_KEY=your_sync_lab_api_key_here 121 | 122 | #AWS (For lipsync) 123 | AWS_S3_REGION=your_aws_s3_region_here 124 | AWS_ACCESS_KEY_ID=your_aws_access_key_id_here 125 | AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here 126 | AWS_BUCKET_NAME=your_aws_bucket_name_here 127 | ``` 128 | 129 | > **Note**: AWS credentials are only required for the lipsync feature. Users need a "Scale" subscription for SyncLab to add lipsync to videos longer than 5 minutes. 130 | 131 | > **Important**: It is mandatory to add your own API keys in the `.env` file for all services (excluding the SyncLab API key, which is optional). Without these keys, you will not be able to start the project. 132 | 133 | ### Installation & Usage 134 | 135 | 1. Clone the repository 136 | 2. Create and configure your `.env` file with the necessary API keys 137 | 3. Run the start script: 138 | 139 | ```bash 140 | ./start.sh 141 | ``` 142 | 143 | The script will: 144 | 145 | - Check for required dependencies 146 | - Verify environment variables 147 | - Install necessary packages 148 | - Guide you through the dubbing process 149 | 150 | ## 🛠️ Technology 151 | 152 | - **TypeScript**: Core programming language 153 | - **Bun**: JavaScript runtime and toolkit 154 | - **OpenAI**: Translation and text adaptation 155 | - **Gladia**: Audio transcription 156 | - **Eleven Labs**: Voice cloning and speech generation 157 | - **Lalal.ai**: Audio separation 158 | - **SyncLab**: Lip synchronization 159 | 160 | ## 🔤 Supported Languages 161 | 162 | The engine supports all these languages: 163 | 164 | | Accepted Input Language | Output Language | 165 | | ----------------------- | ------------------------------------------ | 166 | | Afrikaans | | 167 | | Albanian | | 168 | | Amharic | | 169 | | Arabic | Arabic | 170 | | Armenian | | 171 | | Azerbaijani | | 172 | | Bashkir | | 173 | | Belarusian | | 174 | | Bengali | | 175 | | Bosnian | | 176 | | Breton | | 177 | | Bulgarian | Bulgarian | 178 | | Burmese | | 179 | | Catalan | | 180 | | Chinese | Mandarin | 181 | | Croatian | Croatian | 182 | | Czech | Czech | 183 | | Danish | Danish | 184 | | Dutch | Dutch | 185 | | English | English, American English, British English | 186 | | Estonian | | 187 | | Finnish | Finnish | 188 | | French | French, French Canadian | 189 | | Galician | | 190 | | Georgian | | 191 | | German | German | 192 | | Greek | Greek | 193 | | Gujarati | | 194 | | Haitian | | 195 | | Hausa | | 196 | | Hebrew | | 197 | | Hindi | Hindi | 198 | | Hungarian | Hungarian | 199 | | Icelandic | | 200 | | Indonesian | Indonesian | 201 | | Italian | Italian | 202 | | Japanese | Japanese | 203 | | Javanese | | 204 | | Kannada | | 205 | | Kazakh | | 206 | | Korean | Korean | 207 | | Lao | | 208 | | Latvian | | 209 | | Lingala | | 210 | | Lithuanian | | 211 | | Luxembourgish | | 212 | | Macedonian | | 213 | | Malagasy | | 214 | | Malay | Malay | 215 | | Malayalam | | 216 | | Marathi | | 217 | | Moldavian | | 218 | | Moldovan | | 219 | | Mongolian | | 220 | | Nepali | | 221 | | Norwegian | Norwegian | 222 | | Occitan | | 223 | | Panjabi | | 224 | | Pashto | | 225 | | Persian | | 226 | | Polish | Polish | 227 | | Portuguese | Portuguese | 228 | | Pushto | | 229 | | Romanian | 
Romanian |
230 | | Russian | Russian |
231 | | Serbian | |
232 | | Sindhi | |
233 | | Sinhala | |
234 | | Slovak | Slovak |
235 | | Slovenian | |
236 | | Somali | |
237 | | Spanish | Spanish |
238 | | Sundanese | |
239 | | Swahili | |
240 | | Swedish | Swedish |
241 | | Tagalog | Tagalog |
242 | | Tamil | Tamil |
243 | | Turkish | Turkish |
244 | | Ukrainian | Ukrainian |
245 | | Urdu | |
246 | | Uzbek | |
247 | | Valencian | |
248 | | Vietnamese | Vietnamese |
249 | | Welsh | |
250 | | Yiddish | |
251 | | Yoruba | |
252 |
253 | ## 🤝 Contributing
254 |
255 | Contributions are welcome! Feel free to:
256 |
257 | - Star this repository to show support
258 | - Open issues for bugs or feature requests
259 | - Submit pull requests to improve the codebase
260 |
261 | ## ⚠️ Requirements
262 |
263 | For optimal performance and to use all features:
264 |
265 | - Ensure FFmpeg is properly installed
266 | - Configure all API keys
267 | - For lipsync features, AWS S3 credentials are required
268 | - SyncLab "Scale" subscription for longer videos
269 |
270 | ## 📄 License
271 |
272 | This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License.
273 | Personal and non-commercial use only.
274 | **Commercial use / SaaS / API integrations require a separate license — contact kevin.rousseau@voicecheap.ai** to access an enhanced API.
275 |
276 | View the full license at https://creativecommons.org/licenses/by-nc/4.0/
277 |
278 | ## 📊 Translation Quality & Model Options
279 |
280 | **The current stack (OpenAI + Gladia + ElevenLabs + Lalal.ai) comes from months of benchmarking; it’s still the most accurate & stable combo I’ve found, even if it costs more.**
281 |
282 | The quality of translations can be increased, depending on your needs and budget, by changing the AI models used:
283 |
284 | - **Translation Models**: You can instead use reasoning models such as o3-mini, or newer models such as o4-mini.
285 | - **Adaptation Quality**: For models that support reasoning effort (o1, o3-mini, o3, o1-Pro), you can increase the `reasoning_effort` parameter from `medium` to `high`.
286 |
287 | These options allow you to balance cost versus quality based on your specific requirements.
288 |
289 | ## 🏆 Smarter Models
290 |
291 | You can leverage models with superior performance on the [MMLU-Pro benchmark](https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro) for enhanced translation quality. Avoid using DeepL, as it lacks comprehensive context handling and instruction adherence.
292 |
293 | ## 🔧 Alternative Open-Source Models
294 |
295 | To reduce external API dependencies, consider using open-source alternatives:
296 |
297 | - **Transcription**: Whisper
298 | - **Text-to-Speech**: `hexgrad/Kokoro-82M`, Orpheus Speech from Canopy, SESAME models
299 | - **Translation & Adaptation**: LLAMA (see the sketch at the end of this README)
300 | - **Multi-language Voice Cloning**: _TBD_
301 | - **Lip Synchronization**: Wav2Lip
302 |
303 | ---
304 |
305 | If you find this project helpful, please consider giving it a ⭐ to show support!
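As a concrete illustration of the open-source route above: because `src/llm/openai.ts` already talks to OpenAI through the official SDK, the least invasive way to try a LLAMA-family model for translation and adaptation is an OpenAI-compatible local server. The sketch below is a hypothetical example and is not part of this repository; the local URL and the model tag are placeholders for whatever your own server (for instance Ollama's OpenAI-compatible endpoint) exposes.

```typescript
// Hypothetical sketch - not used anywhere in this repository.
// Assumes a locally running OpenAI-compatible server (e.g. Ollama) and a
// Llama model tag that you have pulled locally; adjust both to your setup.
import OpenAI from 'openai';

const localClient = new OpenAI({
  baseURL: 'http://localhost:11434/v1', // placeholder: your local endpoint
  apiKey: 'unused', // most local servers do not check the key
});

export async function translateLocally(text: string, targetLanguage: string): Promise<string> {
  const completion = await localClient.chat.completions.create({
    model: 'llama3.1', // placeholder: the model your server exposes
    messages: [
      {
        role: 'system',
        content: `Translate the user's message into ${targetLanguage}. Return only the translation.`,
      },
      { role: 'user', content: text },
    ],
  });

  return completion.choices[0]?.message.content ?? '';
}
```

Quality and timing adaptation will differ from the benchmarked OpenAI setup, so treat this as a starting point rather than a drop-in replacement.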
306 | -------------------------------------------------------------------------------- /assets/How-dubbing-works.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/assets/How-dubbing-works.png -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/bun.lockb -------------------------------------------------------------------------------- /input/example.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/input/example.mp4 -------------------------------------------------------------------------------- /output/example-result.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/output/example-result.mp4 -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dubbing-engine-bun", 3 | "module": "index.ts", 4 | "type": "module", 5 | "devDependencies": { 6 | "@types/bun": "latest", 7 | "@types/fluent-ffmpeg": "^2.1.27", 8 | "@typescript-eslint/eslint-plugin": "^8.29.1", 9 | "@typescript-eslint/parser": "^8.29.1", 10 | "axios": "^1.8.4", 11 | "elevenlabs": "^1.56.1", 12 | "eslint": "^9.24.0", 13 | "eslint-config-prettier": "^10.1.2", 14 | "eslint-plugin-prettier": "^5.2.6", 15 | "ffprobe-static": "^3.1.0", 16 | "fluent-ffmpeg": "^2.1.3", 17 | "form-data": "^4.0.2", 18 | "openai": "^4.90.0", 19 | "prettier": "^3.5.3", 20 | "tmp-promise": "^3.0.3" 21 | }, 22 | "peerDependencies": { 23 | "typescript": "^5.0.0" 24 | }, 25 | "dependencies": { 26 | "@aws-sdk/client-s3": "^3.787.0", 27 | "@types/ffprobe-static": "^2.0.3", 28 | "@types/qs": "^6.9.18", 29 | "qs": "^6.14.0" 30 | } 31 | } -------------------------------------------------------------------------------- /src/core/index.ts: -------------------------------------------------------------------------------- 1 | import { SubtitlesGenerator } from './../subtitles/subtitles-generator'; 2 | import { AudioUtils } from '../ffmpeg/audio-utils'; 3 | import { Helpers } from '../utils/helpers'; 4 | import { Transcriber } from '../transcription/transcriber'; 5 | import type { AllowedLanguages, AudioOriginalLangAllowed, TranscriptionDataTypes } from '../types'; 6 | import { Formatter } from '../transcription/formatter'; 7 | import { TextTranslator } from '../transcription/textTranslator'; 8 | import { Spleeter } from '../separator/spleeter'; 9 | import { SpeechGenerator } from '../speech/speechGenerator'; 10 | import { Adaptation } from '../smart-sync/adaptation'; 11 | import { VideoUtils } from '../ffmpeg/video-utils'; 12 | import fsPromises from 'fs/promises'; 13 | import fs from 'fs'; 14 | import { Lipsync } from '../lipsync/lipsync'; 15 | import crypto from 'crypto'; 16 | 17 | export type DebugMode = 'yes' | 'no'; 18 | export type NumberOfSpeakers = 'auto-detect' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | '10'; 19 | export type ActivateLipSync = 'yes' | 'no'; 20 | export type ActivateSubtitle = 
'yes' | 'no'; 21 | 22 | export const translate = async () => { 23 | const targetLanguage = (process.env.TARGET_LANGUAGE || 'english') as AllowedLanguages; 24 | const debugMode: DebugMode = (process.env.DEBUG_MODE as DebugMode) || 'no'; 25 | const numberOfSpeakers: NumberOfSpeakers = (process.env.NUM_SPEAKERS as NumberOfSpeakers) || 'auto-detect'; 26 | const activateLipSync: ActivateLipSync = (process.env.APPLY_LIPSYNC as ActivateLipSync) || 'no'; 27 | const activateSubtitle: ActivateSubtitle = (process.env.ACTIVATE_SUBTITLE as ActivateSubtitle) || 'yes'; 28 | 29 | let clonedVoicesIdsToDelete: string[] = []; 30 | 31 | const transcriptionData: TranscriptionDataTypes = { 32 | summary: null, 33 | formattedSegments: [], 34 | detectedAudioLanguage: null, 35 | }; 36 | 37 | if (debugMode === 'no') { 38 | console.debug = () => {}; 39 | console.info('Dubbing Started successfully with the following parameters:'); 40 | console.info('Target Language: ', targetLanguage); 41 | console.info('Debug Mode: ', debugMode); 42 | console.info('Number of Speakers: ', numberOfSpeakers); 43 | console.info('Activate Lip Sync: ', activateLipSync); 44 | console.info('Activate Subtitle: ', activateSubtitle); 45 | } 46 | 47 | Helpers.verifyPrerequisitesForDubbing(); 48 | 49 | let inputFilePath = ''; 50 | let videoPathWithoutAudio = null; 51 | let audioPathWithoutVideo = null; 52 | let backgroundAudio = null; 53 | let vocalsIsolated = null; 54 | 55 | try { 56 | inputFilePath = await Helpers.getAllInputFilePaths(); 57 | const fileType = Helpers.getFileType(inputFilePath); 58 | 59 | if (fileType === 'video') { 60 | const { videoPath, audioPath } = await AudioUtils.separateAudioAndVideo(inputFilePath); 61 | videoPathWithoutAudio = videoPath; 62 | audioPathWithoutVideo = audioPath; 63 | } else { 64 | const audioPathCopy = `temporary-files/original-audio-${crypto.randomUUID()}.wav`; 65 | await fsPromises.copyFile(inputFilePath, audioPathCopy); 66 | audioPathWithoutVideo = audioPathCopy; 67 | } 68 | 69 | const transcription = await Transcriber.transcribeAudio({ 70 | audioPath: audioPathWithoutVideo, 71 | numberOfSpeakers, 72 | }); 73 | 74 | transcriptionData.detectedAudioLanguage = transcription.result.transcription 75 | .languages[0] as AudioOriginalLangAllowed; 76 | 77 | const transcriptionSummary = transcription.result.summarization.results; 78 | 79 | const formattedTranscription = Formatter.formatTranscription( 80 | transcription, 81 | transcriptionData.detectedAudioLanguage, 82 | ); 83 | 84 | const translatedTranscription = await TextTranslator.translateTranscriptionInTargetLanguage({ 85 | transcription: formattedTranscription, 86 | targetLanguage, 87 | originLanguage: transcriptionData.detectedAudioLanguage, 88 | transcriptionSummary: transcriptionSummary || '', 89 | }); 90 | 91 | const verifiedTranscription = Helpers.parseAndVerifyTranscriptionDetails( 92 | JSON.stringify(translatedTranscription), 93 | ); 94 | 95 | ({ backgroundAudio, vocalsIsolated } = await Spleeter.getSeparateAudio(audioPathWithoutVideo)); 96 | const isolatedVocalsAverageDecibel = await AudioUtils.getAverageDecibel(vocalsIsolated); 97 | 98 | const { allResultsSorted, clonedVoicesIds } = await SpeechGenerator.getSpeechArrayFromTranscriptions({ 99 | segments: verifiedTranscription, 100 | targetLanguage, 101 | isolatedVocalsPath: vocalsIsolated, 102 | }); 103 | 104 | clonedVoicesIdsToDelete = Object.values(clonedVoicesIds); 105 | 106 | const speechWithDuration = await SpeechGenerator.getEachSpeechDuration({ 107 | speechArray: allResultsSorted, 108 | 
transcriptions: verifiedTranscription, 109 | }); 110 | 111 | const speechesWithoutSilence = 112 | await SpeechGenerator.removeStartAndEndSilenceFromAllAudio(speechWithDuration); 113 | 114 | const adaptedSpeeches = await Adaptation.compareAndAdjustSpeeches({ 115 | transcriptions: verifiedTranscription, 116 | speeches: speechesWithoutSilence, 117 | clonedVoicesIds, 118 | originalLanguage: transcriptionData.detectedAudioLanguage, 119 | targetLanguage, 120 | transcriptionSummary, 121 | }); 122 | 123 | const finalVoicesAudioTrack = 124 | await SpeechGenerator.createAndAssembleSeparateAudioTracksEachSpeaker(adaptedSpeeches); 125 | 126 | const equalizedAudio = await AudioUtils.startEqualizeAudio(finalVoicesAudioTrack); 127 | 128 | await AudioUtils.adjustAudioToDecibel(equalizedAudio, isolatedVocalsAverageDecibel); 129 | 130 | const mergedAudio = await SpeechGenerator.overlayAudioAndBackgroundMusic(equalizedAudio, backgroundAudio); 131 | 132 | let finalContent = 133 | fileType === 'audio' 134 | ? mergedAudio 135 | : await VideoUtils.getAudioMergeWithVideo(videoPathWithoutAudio!, mergedAudio); 136 | 137 | if (fileType === 'video' && activateSubtitle === 'yes') { 138 | const filePathVideoSubtitles = await SubtitlesGenerator.addSubtitlesInVideo({ 139 | transcriptionData: verifiedTranscription, 140 | initialVideoPath: finalContent, 141 | lang: targetLanguage, 142 | }); 143 | 144 | finalContent = filePathVideoSubtitles; 145 | } 146 | 147 | if (fileType === 'video' && activateLipSync === 'yes') { 148 | const lipSyncedVideoUrl = await Lipsync.processLipSyncWithAwsUpload({ 149 | localAudioPath: mergedAudio, 150 | localVideoPath: finalContent, 151 | }); 152 | 153 | const lipSyncedVideo = await fetch(lipSyncedVideoUrl).then((res) => res.arrayBuffer()); 154 | const lipSyncedVideoBuffer = Buffer.from(lipSyncedVideo); 155 | const newFilePath = `output/result-${crypto.randomUUID()}.mp4`; 156 | await fsPromises.writeFile(newFilePath, lipSyncedVideoBuffer); 157 | 158 | finalContent = newFilePath; 159 | } 160 | 161 | if (fileType === 'video') { 162 | if (fs.existsSync(mergedAudio)) await fsPromises.unlink(mergedAudio); 163 | } 164 | 165 | console.info('Translation completed successfully, you can now find your video in the output folder.'); 166 | } catch (error) { 167 | if (error instanceof Error) { 168 | console.error('Error:', error.message); 169 | } else { 170 | console.error('Error:', error); 171 | } 172 | } finally { 173 | if (videoPathWithoutAudio && fs.existsSync(videoPathWithoutAudio)) 174 | await fsPromises.unlink(videoPathWithoutAudio); 175 | if (audioPathWithoutVideo && fs.existsSync(audioPathWithoutVideo)) 176 | await fsPromises.unlink(audioPathWithoutVideo); 177 | if (backgroundAudio && fs.existsSync(backgroundAudio)) await fsPromises.unlink(backgroundAudio); 178 | if (vocalsIsolated && fs.existsSync(vocalsIsolated)) await fsPromises.unlink(vocalsIsolated); 179 | } 180 | }; 181 | 182 | translate(); 183 | -------------------------------------------------------------------------------- /src/elevenlabs/elevenlabs.ts: -------------------------------------------------------------------------------- 1 | import fsPromise from 'fs/promises'; 2 | import axios from 'axios'; 3 | import * as crypto from 'crypto'; 4 | import { ElevenLabsClient } from 'elevenlabs'; 5 | import FormData from 'form-data'; 6 | import fs from 'fs'; 7 | import type { AllowedLanguages } from '../types/index'; 8 | import { Readable } from 'stream'; 9 | import { AudioUtils } from '../ffmpeg/audio-utils'; 10 | interface LabelPerLanguage { 11 
| [key: string]: { 12 | accent: string; 13 | langue: string; 14 | language: string; 15 | }; 16 | } 17 | 18 | interface SettingsElevenLabs { 19 | text: string; 20 | model_id: 'eleven_monolingual_v2' | 'eleven_multilingual_v2'; 21 | output_format: 22 | | 'mp3_22050_32' 23 | | 'mp3_44100_32' 24 | | 'mp3_44100_64' 25 | | 'mp3_44100_96' 26 | | 'mp3_44100_128' 27 | | 'mp3_44100_192' 28 | | 'pcm_16000' 29 | | 'pcm_22050' 30 | | 'pcm_24000' 31 | | 'pcm_44100' 32 | | 'ulaw_8000'; 33 | voice_settings: { 34 | similarity_boost: number; 35 | stability: number; 36 | use_speaker_boost: boolean; 37 | speed?: number; //max 1.2 min 0.8 38 | }; 39 | previous_text?: string; 40 | next_text?: string; 41 | labels?: { 42 | accent: string; 43 | langue: string; 44 | language: string; 45 | }; 46 | previous_request_ids?: PreviousRequestIdsEL; 47 | } 48 | 49 | //Max 3 previous request ids 50 | export type PreviousRequestIdsEL = string[]; 51 | /* 52 | 53 | 54 | 55 | 56 | **Stability 57 | *The stability slider determines how stable the voice is and the randomness between each generation. 58 | *Lowering this slider introduces a broader emotional range for the voice. 59 | *As mentioned before, this is also influenced heavily by the original voice. 60 | *Setting the slider too low may result in odd performances that are overly 61 | *random and cause the character to speak too quickly. 62 | *On the other hand, setting it too high can lead to a monotonous voice with limited emotion. 63 | 64 | 65 | **Similarity 66 | The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. 67 | If the original audio is of poor quality and the similarity slider is set too high, the AI may reproduce artifacts or background noise when trying to mimic the voice if those were present in the original recording. 68 | */ 69 | 70 | /* 71 | 72 | **Speaker Boost 73 | This is another setting that was introduced in the new models. 74 | The setting itself is quite self-explanatory – it boosts the similarity to the original speaker. 75 | However, using this setting requires a slightly higher computational load, which in turn increases latency. 76 | The differences introduced by this setting are generally rather subtle. 
77 | 78 | */ 79 | 80 | export type OutputFormat = 81 | | 'mp3_22050_32' 82 | | 'mp3_44100_32' 83 | | 'mp3_44100_64' 84 | | 'mp3_44100_96' 85 | | 'mp3_44100_128' 86 | | 'mp3_44100_192' 87 | | 'pcm_8000' 88 | | 'pcm_16000' 89 | | 'pcm_22050' 90 | | 'pcm_24000' 91 | | 'pcm_44100' 92 | | 'ulaw_8000' 93 | | 'alaw_8000' 94 | | 'opus_48000_32' 95 | | 'opus_48000_64' 96 | | 'opus_48000_96' 97 | | 'opus_48000_128' 98 | | 'opus_48000_192'; 99 | 100 | export class ElevenLabsService { 101 | elevenLabsApiKey: string | undefined; 102 | elevenLabsBaseUrl = 'https://api.elevenlabs.io/v1'; 103 | elevenLabsClient: ElevenLabsClient; 104 | 105 | constructor() { 106 | this.elevenLabsApiKey = process.env.ELEVEN_LABS_API_KEY; 107 | if (!this.elevenLabsApiKey) { 108 | throw new Error('ELEVEN_LABS_API_KEY is not defined'); 109 | } 110 | this.elevenLabsClient = new ElevenLabsClient({ 111 | apiKey: this.elevenLabsApiKey, 112 | }); 113 | } 114 | 115 | getLabels(targetLanguage: AllowedLanguages): 116 | | { 117 | accent: string; 118 | langue: string; 119 | language: string; 120 | } 121 | | undefined { 122 | const labelsPerLanguage: LabelPerLanguage = { 123 | french: { accent: 'french', langue: 'french', language: 'french' }, 124 | 'british english': { 125 | accent: 'british', 126 | langue: 'english', 127 | language: 'english', 128 | }, 129 | english: { 130 | accent: 'american', 131 | langue: 'english', 132 | language: 'english', 133 | }, 134 | 'french canadian': { 135 | accent: 'canadian', 136 | langue: 'french', 137 | language: 'french', 138 | }, 139 | vietnamese: { 140 | accent: 'vietnamese', 141 | langue: 'vietnamese', 142 | language: 'vietnamese', 143 | }, 144 | }; 145 | 146 | if (!labelsPerLanguage[targetLanguage]) { 147 | return undefined; 148 | } else { 149 | return labelsPerLanguage[targetLanguage]; 150 | } 151 | } 152 | 153 | // In the `cloneVoice` method of the `ElevenLabsService` class 154 | 155 | generateShortId(length: number): string { 156 | const characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'; 157 | let result = ''; 158 | const charactersLength = characters.length; 159 | for (let i = 0; i < length; i++) { 160 | result += characters.charAt(Math.floor(Math.random() * charactersLength)); 161 | } 162 | return result; 163 | } 164 | 165 | async cloneVoice( 166 | baseAudio: Buffer[], 167 | voiceName: string, 168 | totalDuration: number, 169 | ): Promise<{ voice_id: string }> { 170 | console.debug('Cloning voice...'); 171 | const maxDuration = 44 * 60; // 44 minutes in seconds 172 | const maxBufferSize = 10 * 1024 * 1024; // 10MB in bytes 173 | 174 | let processedAudio = baseAudio; 175 | 176 | let concatenatedBuffer = this.concatenateAudioBuffers(baseAudio); 177 | 178 | // Trim the audio buffer if it exceeds 44 minutes 179 | if (totalDuration > maxDuration) { 180 | concatenatedBuffer = await AudioUtils.trimAudioBuffer(concatenatedBuffer, maxDuration); 181 | 182 | processedAudio = this.splitBufferIntoChunks(concatenatedBuffer, maxBufferSize); 183 | } 184 | 185 | // Split the buffer into chunks not exceeding 10MB 186 | const uuid = crypto.randomUUID(); 187 | const shortId = this.generateShortId(6); 188 | const url = `${this.elevenLabsBaseUrl}/voices/add`; 189 | 190 | const formData = new FormData(); 191 | formData.append('name', `custom-voice-${shortId}`); 192 | formData.append('description', voiceName); 193 | 194 | processedAudio.forEach((audioBuffer, index) => { 195 | formData.append('files', audioBuffer, { 196 | filename: `${uuid}-${index}.mp3`, 197 | contentType: 'audio/mp3', 198 
| }); 199 | }); 200 | 201 | try { 202 | const response = await axios.post(url, formData, { 203 | headers: { 204 | ...formData.getHeaders(), 205 | 'xi-api-key': this.elevenLabsApiKey, 206 | }, 207 | }); 208 | console.debug('One Voice cloned.'); 209 | return response.data; 210 | } catch (error: any) { 211 | console.error('Error in voice cloning:', error.response.data); 212 | if (error.response.data?.detail?.message?.includes('corrupted')) { 213 | throw new Error('Error during voice cloning, audio file is corrupted.'); 214 | } 215 | throw new Error('Error during voice cloning'); 216 | } 217 | } 218 | 219 | private splitBufferIntoChunks(buffer: Buffer, maxChunkSize: number): Buffer[] { 220 | const chunks: Buffer[] = []; 221 | let start = 0; 222 | 223 | while (start < buffer.length) { 224 | const end = Math.min(buffer.length, start + maxChunkSize); 225 | chunks.push(buffer.slice(start, end)); 226 | start = end; 227 | } 228 | 229 | return chunks; 230 | } 231 | 232 | /** 233 | * Concatenates an array of audio buffers into a single buffer 234 | * @param audioBuffers Array of audio buffers to concatenate 235 | * @returns A single concatenated buffer 236 | */ 237 | concatenateAudioBuffers(audioBuffers: Buffer[]): Buffer { 238 | // Validate input 239 | if (!audioBuffers || !Array.isArray(audioBuffers) || audioBuffers.length === 0) { 240 | throw new Error('Invalid input: audioBuffers must be a non-empty array of Buffer objects'); 241 | } 242 | 243 | // Check if all elements are Buffer instances 244 | for (const buffer of audioBuffers) { 245 | if (!(buffer instanceof Buffer)) { 246 | throw new Error('Invalid input: all elements in audioBuffers must be Buffer instances'); 247 | } 248 | } 249 | 250 | // Concatenate all buffers into a single buffer 251 | return Buffer.concat(audioBuffers); 252 | } 253 | 254 | async generateAudioFile({ 255 | text, 256 | modelId, 257 | voiceId, 258 | previousText, 259 | nextText, 260 | targetLanguage, 261 | speedFactor, 262 | }: { 263 | text: string; 264 | modelId: 'eleven_monolingual_v2' | 'eleven_multilingual_v2'; 265 | voiceId: string; 266 | previousText?: string; 267 | nextText?: string; 268 | targetLanguage?: AllowedLanguages; 269 | speedFactor?: number; 270 | }): Promise<{ 271 | response: Buffer; 272 | requestId: string; 273 | }> { 274 | const outputFormat: OutputFormat = 'mp3_44100_128'; 275 | 276 | const settingsElevenLabs: SettingsElevenLabs = { 277 | text: text, 278 | model_id: modelId, 279 | labels: targetLanguage ? this.getLabels(targetLanguage) : undefined, 280 | voice_settings: { 281 | similarity_boost: 0.85, 282 | stability: 0.5, 283 | use_speaker_boost: true, 284 | }, 285 | output_format: outputFormat, 286 | //! MP3 with 192kbps bitrate requires you to be subscribed to Creator tier or above. PCM with 44.1kHz sample rate requires you to be subscribed to Pro tier or above. 
287 | //output_format: 'pcm_44100', 288 | }; 289 | 290 | if (previousText) settingsElevenLabs.previous_text = previousText + ' '; 291 | if (nextText) settingsElevenLabs.next_text = ' ' + nextText; 292 | if (speedFactor) settingsElevenLabs.voice_settings.speed = Number(speedFactor.toFixed(2)); 293 | 294 | // Maximum 3 tries 295 | const maxAttempts = 3; 296 | let attempt = 0; 297 | 298 | while (attempt < maxAttempts) { 299 | try { 300 | const res = await this.elevenLabsClient.textToSpeech.convert(voiceId, settingsElevenLabs); 301 | 302 | console.debug(`Speech 11labs generated on attempt ${attempt + 1}.`); 303 | 304 | async function readableToBuffer(readable: Readable): Promise { 305 | const chunks: Buffer[] = []; 306 | 307 | for await (const chunk of readable) { 308 | chunks.push(Buffer.from(chunk)); 309 | } 310 | 311 | return Buffer.concat(chunks); 312 | } 313 | 314 | const buffer = await readableToBuffer(res); 315 | 316 | const audioBuffer = 317 | outputFormat === 'mp3_22050_32' ? buffer : await AudioUtils.convertPCMBufferToWav(buffer); 318 | 319 | return { 320 | response: audioBuffer, 321 | requestId: crypto.randomUUID(), 322 | }; 323 | } catch (error: any) { 324 | console.error(`ERROR IN AUDIO GENERATION (attempt ${attempt + 1}):`, error); 325 | 326 | if (error.toString().includes('Status code: 401')) { 327 | throw new Error( 328 | 'The voice you are trying to translate cannot be cloned, because it is a protected voice.', 329 | ); 330 | } 331 | 332 | attempt++; 333 | 334 | if (attempt < maxAttempts) { 335 | console.debug('Waiting 10 seconds before next attempt...'); 336 | await new Promise((resolve) => setTimeout(resolve, 10000)); 337 | } else { 338 | throw new Error('Error during audio generation after multiple attempts'); 339 | } 340 | } 341 | } 342 | 343 | // In theory, we should never reach here, but just in case: 344 | throw new Error('Error during audio generation after multiple attempts'); 345 | } 346 | 347 | async isolateVoiceFromAudio(audioFilePath: string) { 348 | try { 349 | console.debug('Isolating voice from audio....'); 350 | 351 | const url = `${this.elevenLabsBaseUrl}/audio-isolation/stream`; 352 | const formData = new FormData(); 353 | formData.append('audio', fs.createReadStream(audioFilePath)); 354 | 355 | const response = await axios.post(url, formData, { 356 | headers: { 357 | ...formData.getHeaders(), 358 | 'xi-api-key': this.elevenLabsApiKey, 359 | }, 360 | responseType: 'arraybuffer', 361 | }); 362 | 363 | console.debug('Voice isolated successfully from audio.'); 364 | 365 | const vocalIsolatedBuffer = Buffer.from(response.data); 366 | const outputFilePath = audioFilePath.includes('.wav') 367 | ? audioFilePath.replace('.wav', '-vocal.wav') 368 | : audioFilePath.replace('.mp3', '-vocal.mp3'); 369 | 370 | await fsPromise.writeFile(outputFilePath, vocalIsolatedBuffer); 371 | 372 | return outputFilePath; 373 | } catch (err: any) { 374 | console.error('Error in isolateVoiceFromAudio:', err); 375 | throw new Error('Error during voice isolation'); 376 | } 377 | } 378 | } 379 | -------------------------------------------------------------------------------- /src/ffmpeg/ffmpegPatch.ts: -------------------------------------------------------------------------------- 1 | import ffmpeg from 'fluent-ffmpeg'; 2 | 3 | /** 4 | * Applies a temporary workaround to add the 'lavfi' format. 5 | * This patch adds 'lavfi' to the available formats returned by ffmpeg. 6 | * 7 | * @param command - An instance of ffmpeg.FfmpegCommand to patch. 
8 | * @returns The patched ffmpeg command instance. 9 | */ 10 | export function applyLavfiWorkaround( 11 | command: ffmpeg.FfmpegCommand, 12 | ): ffmpeg.FfmpegCommand { 13 | // Save the original availableFormats function. 14 | const originalAvailableFormats = command.availableFormats; 15 | 16 | // Override availableFormats to inject the 'lavfi' format. 17 | command.availableFormats = (callback: (err: any, data: any) => void) => { 18 | originalAvailableFormats.call(command, (err: any, data: any) => { 19 | // If lavfi is not present, add it. 20 | if (!data.lavfi) { 21 | data.lavfi = { 22 | canDemux: true, // lavfi can be used as input 23 | canMux: false, // lavfi cannot be used as output 24 | description: 'Libavfilter virtual input device', 25 | }; 26 | } 27 | callback(err, data); 28 | }); 29 | }; 30 | 31 | return command; 32 | } 33 | -------------------------------------------------------------------------------- /src/ffmpeg/video-utils.ts: -------------------------------------------------------------------------------- 1 | import ffmpeg from 'fluent-ffmpeg'; 2 | import fs from 'fs'; 3 | import type { Readable } from 'stream'; 4 | import path from 'path'; 5 | import crypto from 'crypto'; 6 | import { createReadStream } from 'fs'; 7 | import { promisify } from 'util'; 8 | 9 | export class VideoUtils { 10 | static async getFileDuration(filePath: string): Promise { 11 | return new Promise((resolve, reject) => { 12 | if (!filePath) { 13 | console.error('No file path provided'); 14 | return reject(new Error('No file path provided')); 15 | } 16 | 17 | if (!fs.existsSync(filePath)) { 18 | console.error(`File not found: ${filePath}`); 19 | return reject(new Error('File not found or inaccessible')); 20 | } 21 | 22 | try { 23 | ffmpeg.ffprobe(filePath, (err, metadata) => { 24 | if (err) { 25 | console.error('Error while getting file duration:', err, metadata); 26 | 27 | const errorMessage = err.message?.toLowerCase() || ''; 28 | if (errorMessage.includes('invalid data') || errorMessage.includes('unsupported format')) { 29 | return reject(new Error('Invalid or unsupported media format')); 30 | } 31 | if (errorMessage.includes('permission denied')) { 32 | return reject(new Error('Permission denied to access file')); 33 | } 34 | 35 | return reject(new Error('Failed to process media file')); 36 | } 37 | 38 | if (!metadata?.format?.duration) { 39 | console.error('No duration found in metadata:', { 40 | filePath, 41 | metadata: metadata?.format, 42 | }); 43 | return reject(new Error('Could not determine media duration')); 44 | } 45 | 46 | const duration = metadata.format.duration; 47 | if (typeof duration !== 'number' || isNaN(duration) || duration <= 0) { 48 | console.error('Invalid duration value:', duration); 49 | console.error('metadata of the file:', metadata); 50 | } 51 | 52 | resolve(duration); 53 | }); 54 | } catch (error) { 55 | console.error('Unexpected error in getFileDuration:', error); 56 | reject(new Error('Internal server error while processing media')); 57 | } 58 | }); 59 | } 60 | 61 | static async getAudioMergeWithVideo(videoPath: string, audioPath: string): Promise { 62 | console.debug('Merging audio and video...'); 63 | let filePath = ''; 64 | try { 65 | const outputPath = path.join(`output/result-${crypto.randomUUID()}.mp4`); 66 | const contentLength = await this.getFileDuration(audioPath); 67 | 68 | if (typeof contentLength !== 'number') 69 | throw new Error( 70 | `Error during audio duration when merging audio and video: duration is not a number: ${contentLength}`, 71 | ); 72 | 73 | 
filePath = await this.mergeAudioAndVideo({ 74 | videoPath, 75 | audioPath, 76 | outputPath, 77 | }); 78 | 79 | console.debug('Audio and video merged.'); 80 | 81 | return filePath; 82 | } catch (e) { 83 | console.error(e); 84 | throw new Error('Error while merging audio and video'); 85 | } 86 | } 87 | 88 | static mergeAudioAndVideo = async ({ 89 | videoPath, 90 | audioPath, 91 | outputPath, 92 | }: { 93 | videoPath: string; 94 | audioPath: string; 95 | outputPath: string; 96 | }): Promise => { 97 | console.debug('Merging audio and video...'); 98 | 99 | const fileExtension = path.extname(videoPath).substring(1).toLowerCase(); 100 | 101 | // Helper to probe the audio track 102 | const ffprobePromise = promisify(ffmpeg.ffprobe); 103 | const audioMetadata = (await ffprobePromise(audioPath)) as { 104 | streams: Array<{ codec_type: string; codec_name: string }>; 105 | }; 106 | const audioStreamIndex = audioMetadata.streams.findIndex((stream) => stream.codec_type === 'audio'); 107 | if (audioStreamIndex === -1) { 108 | throw new Error('No valid audio track found in the provided audio file'); 109 | } 110 | 111 | const videoMetadata = (await ffprobePromise(videoPath)) as { 112 | streams: Array<{ codec_type: string; codec_name: string }>; 113 | }; 114 | 115 | const videoStreamIndex = videoMetadata.streams.findIndex((stream) => stream.codec_type === 'video'); 116 | 117 | if (videoStreamIndex === -1) { 118 | throw new Error('No valid video track found in the provided video file'); 119 | } 120 | 121 | const isAAC = audioMetadata.streams.some( 122 | (stream) => stream.codec_type === 'audio' && stream.codec_name === 'aac', 123 | ); 124 | 125 | return new Promise((resolve, reject) => { 126 | const command = ffmpeg() 127 | .input(videoPath) 128 | .input(audioPath) 129 | // English comment: Map video from the first input, audio from the second 130 | .outputOptions([ 131 | // Map the correct video track from the 1st input 132 | `-map 0:${videoStreamIndex}`, 133 | // Map the correct audio track from the 2nd input 134 | `-map 1:${audioStreamIndex}`, 135 | 136 | // Always copy video to avoid re-encoding (faster + no quality loss) 137 | '-c:v copy', 138 | 139 | // If audio is already AAC, copy it; otherwise encode to AAC 140 | isAAC ? 
'-c:a copy' : '-c:a aac', 141 | 142 | // Only apply bitrate if we are encoding 143 | // (this will be ignored if we're copying) 144 | '-b:a 320k', 145 | '-ar 48000', 146 | 147 | // Enable faststart for quick playback start in MP4 148 | '-movflags +faststart', 149 | 150 | // Use all available CPU threads for any encoding 151 | '-threads 0', 152 | ]) 153 | .format(fileExtension) 154 | .output(outputPath) 155 | .on('error', (err) => { 156 | console.error('Error merging audio/video:', err); 157 | reject(err); 158 | }) 159 | .on('stderr', (line) => { 160 | if (line.toLowerCase().includes('error')) { 161 | console.error('FFmpeg error:', line); 162 | } 163 | }) 164 | .on('end', () => { 165 | console.debug('Merging succeeded with minimal re-encoding.'); 166 | resolve(outputPath); 167 | }); 168 | 169 | command.run(); 170 | }); 171 | }; 172 | 173 | static addSubtitles = async ({ 174 | videoPath, 175 | srtFilePath, 176 | outputFilePath, 177 | }: { 178 | videoPath: string; 179 | srtFilePath: string; 180 | outputFilePath: string; 181 | }) => { 182 | if (!fs.existsSync(srtFilePath)) { 183 | throw new Error('Srt file does not exist'); 184 | } 185 | 186 | return new Promise((resolve, reject) => { 187 | // Get input file info 188 | ffmpeg.ffprobe(videoPath, (err, metadata) => { 189 | if (err) { 190 | console.error('Error probing video file:', err); 191 | return reject(err); 192 | } 193 | 194 | // Check if we're dealing with an HEVC/H.265 video 195 | const videoStream = metadata.streams.find((stream) => stream.codec_type === 'video'); 196 | const isHEVC = 197 | videoStream && videoStream.codec_name && videoStream.codec_name.toLowerCase().includes('hevc'); 198 | const is10bit = videoStream && videoStream.pix_fmt && videoStream.pix_fmt.includes('10le'); 199 | 200 | console.debug( 201 | `Video info: codec=${videoStream?.codec_name}, pixel format=${videoStream?.pix_fmt}, isHEVC=${isHEVC}, is10bit=${is10bit}`, 202 | ); 203 | 204 | let command = ffmpeg(videoPath); 205 | 206 | // Add subtitles filter with compatible font 207 | const subtitlesFilter = `subtitles=${srtFilePath}:force_style='FontName=DejaVu'`; 208 | 209 | if (isHEVC || is10bit) { 210 | // For HEVC/10-bit videos that need browser compatibility: 211 | console.debug('Converting HEVC/10-bit video to browser-compatible format'); 212 | command = command 213 | .videoCodec('libx264') // Use H.264 which has better browser support 214 | .outputOptions([ 215 | '-vf', 216 | subtitlesFilter, 217 | '-pix_fmt', 218 | 'yuv420p', // Convert to 8-bit color 219 | '-crf', 220 | '18', // High quality 221 | '-preset', 222 | 'medium', // Balance between speed and quality 223 | '-movflags', 224 | '+faststart', // Optimize for web playback 225 | '-c:a', 226 | 'aac', // Convert audio to AAC for compatibility 227 | '-b:a', 228 | '320k', // Good audio quality 229 | ]); 230 | } else { 231 | // For already compatible videos, minimal processing 232 | command = command.videoCodec('libx264').outputOptions([ 233 | '-vf', 234 | subtitlesFilter, 235 | '-pix_fmt', 236 | 'yuv420p', // Ensure 8-bit color 237 | '-c:a', 238 | 'copy', // Copy audio stream 239 | '-movflags', 240 | '+faststart', // Optimize for web playback 241 | ]); 242 | } 243 | 244 | command 245 | .on('start', (commandLine) => { 246 | console.debug('FFmpeg command:', commandLine); 247 | }) 248 | .on('stderr', (stderrLine) => { 249 | if (stderrLine.includes('error')) { 250 | console.error('FFmpeg stderr:', stderrLine); 251 | } 252 | }) 253 | .on('end', () => { 254 | console.debug('Subtitles added successfully'); 255 | 
resolve(outputFilePath); 256 | }) 257 | .on('error', (err) => { 258 | console.error('Error adding subtitles:', err); 259 | reject(err); 260 | }) 261 | .save(outputFilePath); 262 | }); 263 | }); 264 | }; 265 | } 266 | -------------------------------------------------------------------------------- /src/lipsync/lipsync.ts: -------------------------------------------------------------------------------- 1 | import type { AxiosResponse } from 'axios'; 2 | import type { SyncLabInitialResponse, SynclabV2RequestBody } from '../types/lipsync'; 3 | import axios from 'axios'; 4 | import fs from 'fs'; 5 | import { S3Client, PutObjectCommand, HeadObjectCommand, DeleteObjectCommand } from '@aws-sdk/client-s3'; 6 | 7 | export class Lipsync { 8 | static async startLipSync({ audioPath, videoPath }: { audioPath: string; videoPath: string }) { 9 | try { 10 | console.debug('Verifying usage links for lip sync...'); 11 | 12 | const syncLabResponse = await this.sendLipSyncRequest({ 13 | audioUrl: audioPath, 14 | videoUrl: videoPath, 15 | }); 16 | 17 | return syncLabResponse; 18 | } catch (error) { 19 | console.error(error); 20 | throw new Error('Error during lip sync request'); 21 | } 22 | } 23 | 24 | static async sendLipSyncRequest({ 25 | audioUrl, 26 | videoUrl, 27 | }: { 28 | audioUrl: string; 29 | videoUrl: string; 30 | }): Promise { 31 | const url = 'https://api.sync.so/v2/generate'; 32 | const body: SynclabV2RequestBody = { 33 | input: [ 34 | { 35 | type: 'video', 36 | url: videoUrl, 37 | }, 38 | { 39 | type: 'audio', 40 | url: audioUrl, 41 | }, 42 | ], 43 | options: { 44 | output_format: 'mp4', 45 | active_speaker: true, 46 | }, 47 | model: 'lipsync-2', 48 | }; 49 | 50 | const headers = { 51 | accept: 'application/json', 52 | 'x-api-key': process.env.SYNC_LAB_API_KEY, 53 | 'Content-Type': 'application/json', 54 | }; 55 | 56 | try { 57 | const response: AxiosResponse = await axios.post(url, body, { 58 | headers, 59 | }); 60 | 61 | return response.data as SyncLabInitialResponse; 62 | } catch (error: any) { 63 | console.error('Error:', error.response.data); 64 | throw new Error(`Synclabs error: ${error.message}`); 65 | } 66 | } 67 | 68 | static async pollLipSyncResult( 69 | initialResponse: SyncLabInitialResponse, 70 | maxAttempts = 600, 71 | intervalMs = 10000, 72 | ): Promise { 73 | let attempts = 0; 74 | 75 | while (attempts < maxAttempts) { 76 | attempts++; 77 | 78 | try { 79 | const url = `https://api.sync.so/v2/generate/${initialResponse.id}`; 80 | const headers = { 81 | accept: 'application/json', 82 | 'x-api-key': process.env.SYNC_LAB_API_KEY, 83 | }; 84 | 85 | const response = await axios.get(url, { headers }); 86 | const data = response.data; 87 | 88 | if (data.status === 'COMPLETED') { 89 | if (data.outputUrl) { 90 | return data.outputUrl; 91 | } else { 92 | throw new Error('Output URL is missing from completed response'); 93 | } 94 | } else if (['FAILED', 'REJECTED', 'CANCELED', 'TIMED_OUT'].includes(data.status)) { 95 | throw new Error( 96 | `Lipsync generation failed with status: ${data.status}, error: ${data.error || 'Unknown error'}`, 97 | ); 98 | } 99 | 100 | console.debug(`Lipsync job status: ${data.status}. 
Polling again in ${intervalMs / 1000} seconds...`); 101 | await new Promise((resolve) => setTimeout(resolve, intervalMs)); 102 | } catch (error: any) { 103 | console.error('Error polling lipsync result:', error); 104 | throw new Error(`Error polling lipsync result: ${error.message}`); 105 | } 106 | } 107 | 108 | throw new Error(`Lipsync generation timed out after ${maxAttempts} attempts`); 109 | } 110 | 111 | static async startLipSyncAndWaitForResult({ 112 | audioPath, 113 | videoPath, 114 | }: { 115 | audioPath: string; 116 | videoPath: string; 117 | }): Promise { 118 | try { 119 | console.debug('Starting lip sync process...'); 120 | 121 | const initialResponse = await this.sendLipSyncRequest({ 122 | audioUrl: audioPath, 123 | videoUrl: videoPath, 124 | }); 125 | 126 | console.debug(`Lip sync job started with ID: ${initialResponse.id}`); 127 | 128 | const outputUrl = await this.pollLipSyncResult(initialResponse); 129 | 130 | console.debug(`Lip sync completed. Output available at: ${outputUrl}`); 131 | return outputUrl; 132 | } catch (error) { 133 | console.error('Error during lip sync process:', error); 134 | throw new Error( 135 | `Failed to complete lip sync process: ${error instanceof Error ? error.message : String(error)}`, 136 | ); 137 | } 138 | } 139 | 140 | static async processLipSyncWithAwsUpload({ 141 | localVideoPath, 142 | localAudioPath, 143 | }: { 144 | localVideoPath: string; 145 | localAudioPath: string; 146 | }): Promise { 147 | // Check if required environment variables are set 148 | const requiredEnvVars = [ 149 | 'SYNC_LAB_API_KEY', 150 | 'AWS_S3_REGION', 151 | 'AWS_ACCESS_KEY_ID', 152 | 'AWS_SECRET_ACCESS_KEY', 153 | 'AWS_BUCKET_NAME', 154 | ]; 155 | 156 | for (const envVar of requiredEnvVars) { 157 | if (!process.env[envVar]) { 158 | throw new Error(`Missing required environment variable: ${envVar}`); 159 | } 160 | } 161 | 162 | // Check if files exist 163 | if (!fs.existsSync(localVideoPath)) { 164 | throw new Error(`Video file not found at path: ${localVideoPath}`); 165 | } 166 | if (!fs.existsSync(localAudioPath)) { 167 | throw new Error(`Audio file not found at path: ${localAudioPath}`); 168 | } 169 | 170 | // S3 configuration 171 | const s3BucketName = process.env.AWS_BUCKET_NAME || ''; 172 | const s3Region = process.env.AWS_S3_REGION || ''; 173 | 174 | // Create S3 client 175 | const s3client = new S3Client({ 176 | region: s3Region, 177 | }); 178 | 179 | // Store S3 file paths for later cleanup 180 | let videoFileName = ''; 181 | let audioFileName = ''; 182 | 183 | try { 184 | console.debug('Uploading files to AWS S3...'); 185 | 186 | // Generate unique file paths for S3 187 | const timestamp = Date.now(); 188 | videoFileName = `lipsync/video_${timestamp}_${localVideoPath.split('/').pop()}`; 189 | audioFileName = `lipsync/audio_${timestamp}_${localAudioPath.split('/').pop()}`; 190 | 191 | // Read files as buffers 192 | const videoBuffer = fs.readFileSync(localVideoPath); 193 | const audioBuffer = fs.readFileSync(localAudioPath); 194 | 195 | // Upload files to S3 196 | const [videoUrl, audioUrl] = await Promise.all([ 197 | uploadFileToS3(s3client, s3BucketName, s3Region, videoBuffer, videoFileName), 198 | uploadFileToS3(s3client, s3BucketName, s3Region, audioBuffer, audioFileName), 199 | ]); 200 | 201 | console.debug(`Files uploaded successfully. 
Video URL: ${videoUrl}, Audio URL: ${audioUrl}`); 202 | 203 | // Process the lipsync with the public URLs 204 | const lipSyncResultUrl = await this.startLipSyncAndWaitForResult({ 205 | videoPath: videoUrl, 206 | audioPath: audioUrl, 207 | }); 208 | 209 | console.debug(`Lipsync processing complete. Result available at: ${lipSyncResultUrl}`); 210 | 211 | // Clean up local files 212 | try { 213 | fs.unlinkSync(localVideoPath); 214 | fs.unlinkSync(localAudioPath); 215 | console.debug('Local files deleted successfully'); 216 | } catch (deleteError) { 217 | console.warn('Failed to delete local files:', deleteError); 218 | // Continue despite deletion failure 219 | } 220 | 221 | // Clean up S3 files 222 | try { 223 | await Promise.all([ 224 | deleteFileFromS3(s3client, s3BucketName, videoFileName), 225 | deleteFileFromS3(s3client, s3BucketName, audioFileName), 226 | ]); 227 | console.debug('S3 files deleted successfully'); 228 | } catch (deleteError) { 229 | console.warn('Failed to delete S3 files:', deleteError); 230 | // Continue despite deletion failure 231 | } 232 | 233 | return lipSyncResultUrl; 234 | } catch (error) { 235 | console.error('Error in lipsync processing with AWS upload:', error); 236 | 237 | // Attempt to clean up S3 files in case of error 238 | if (videoFileName && audioFileName) { 239 | try { 240 | await Promise.all([ 241 | deleteFileFromS3(s3client, s3BucketName, videoFileName), 242 | deleteFileFromS3(s3client, s3BucketName, audioFileName), 243 | ]); 244 | console.debug('S3 files deleted after error'); 245 | } catch (deleteError) { 246 | console.warn('Failed to delete S3 files after error:', deleteError); 247 | } 248 | } 249 | 250 | throw new Error( 251 | `Failed to process lipsync with AWS: ${error instanceof Error ? error.message : String(error)}`, 252 | ); 253 | } 254 | } 255 | } 256 | 257 | /** 258 | * Helper function to upload a file to S3 and return its public URL 259 | */ 260 | async function uploadFileToS3( 261 | s3client: S3Client, 262 | bucketName: string, 263 | region: string, 264 | fileBuffer: Buffer, 265 | filePath: string, 266 | ): Promise { 267 | // Check if file already exists 268 | try { 269 | await s3client.send( 270 | new HeadObjectCommand({ 271 | Bucket: bucketName, 272 | Key: filePath, 273 | }), 274 | ); 275 | // If no error is thrown, file exists 276 | return `https://${bucketName}.s3.${region}.amazonaws.com/${filePath}`; 277 | } catch (error: unknown) { 278 | // File doesn't exist, continue with upload 279 | } 280 | 281 | // Get expiration date (1 year from now) 282 | const expirationDate = new Date(); 283 | expirationDate.setFullYear(expirationDate.getFullYear() + 1); 284 | 285 | const uploadParams = { 286 | Bucket: bucketName, 287 | Key: filePath.trim(), 288 | Body: fileBuffer, 289 | Metadata: { 290 | 'x-amz-meta-expiration-date': expirationDate.toISOString(), 291 | }, 292 | }; 293 | 294 | try { 295 | const data = await s3client.send(new PutObjectCommand(uploadParams)); 296 | if (!data) { 297 | throw new Error('Error uploading file to AWS S3'); 298 | } 299 | 300 | return `https://${bucketName}.s3.${region}.amazonaws.com/${filePath.trim()}`; 301 | } catch (error: unknown) { 302 | const errorMessage = error instanceof Error ? 
error.message : 'Unknown error'; 303 | throw new Error(`Failed to upload file: ${errorMessage}`); 304 | } 305 | } 306 | 307 | /** 308 | * Helper function to delete a file from S3 309 | */ 310 | async function deleteFileFromS3(s3client: S3Client, bucketName: string, filePath: string): Promise<void> { 311 | try { 312 | await s3client.send( 313 | new DeleteObjectCommand({ 314 | Bucket: bucketName, 315 | Key: filePath, 316 | }), 317 | ); 318 | console.debug(`Successfully deleted file from S3: ${filePath}`); 319 | } catch (error: unknown) { 320 | const errorMessage = error instanceof Error ? error.message : 'Unknown error'; 321 | console.warn(`Failed to delete file from S3: ${filePath} - ${errorMessage}`); 322 | throw new Error(`Failed to delete file from S3: ${errorMessage}`); 323 | } 324 | } 325 | -------------------------------------------------------------------------------- /src/llm/openai.ts: -------------------------------------------------------------------------------- 1 | import OpenAI from 'openai'; 2 | import type { ChatCompletionCreateParamsNonStreaming, ChatCompletionMessageParam } from 'openai/resources'; 3 | 4 | export type OpenAIModel = string; 5 | 6 | export const models = { 7 | gpt4o: 'gpt-4o', 8 | chatgpt4oLatest: 'chatgpt-4o-latest', 9 | gpt4Turbo: 'gpt-4-turbo', 10 | gpt4: 'gpt-4', 11 | gpt3Turbo: 'gpt-3.5-turbo-0125', 12 | gpt3_16k: 'gpt-3.5-turbo-16k', 13 | gpt4oMini: 'gpt-4o-mini', 14 | o1: 'o1', 15 | o1Mini: 'o1-mini', 16 | o3Mini: 'o3-mini', 17 | o1Pro: 'o1-pro', 18 | gpt45Preview: 'gpt-4.5-preview', 19 | gpt4_1: 'gpt-4.1', 20 | o4Mini: 'o4-mini', 21 | o3: 'o3', 22 | }; 23 | 24 | const oModelsWithoutInstructions: OpenAIModel[] = [ 25 | models.o1Mini, 26 | models.o1, 27 | models.o3Mini, 28 | models.o4Mini, 29 | models.o3, 30 | ]; 31 | 32 | const oModelsWithAdjustableReasoningEffort: OpenAIModel[] = [ 33 | models.o1, 34 | models.o3Mini, 35 | models.o1Pro, 36 | models.o4Mini, 37 | models.o3, 38 | ]; 39 | const defaultInstructions = 'You are a helpful assistant.'; 40 | 41 | export const requestToGPT = async ({ 42 | prompt, 43 | maxTokens, 44 | temperature, 45 | responseFormat, 46 | model, 47 | instructions, 48 | topP, 49 | }: { 50 | prompt: string; 51 | maxTokens: number; 52 | temperature: number; 53 | responseFormat: 'text' | 'json_object'; 54 | model: OpenAIModel; 55 | instructions?: string; 56 | topP?: number; 57 | }): Promise<string> => { 58 | const openAi = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); 59 | 60 | if (!openAi.apiKey) { 61 | throw new Error('No API key found for OpenAI'); 62 | } 63 | 64 | const retryDelay = 1000; 65 | let attemptCount = 0; 66 | 67 | if (oModelsWithoutInstructions.includes(model) && instructions) { 68 | prompt = ` 69 | ${instructions} 70 | 71 | ------- 72 | 73 | ${prompt} 74 | `; 75 | } 76 | 77 | const timeoutId = setTimeout(() => { 78 | throw new Error('OpenAI API request timed out'); 79 | }, 90000); 80 | 81 | try { 82 | const messagesArray: ChatCompletionMessageParam[] = instructions 83 | ? 
[ 84 | { role: 'system', content: instructions || defaultInstructions }, 85 | { role: 'user', content: prompt }, 86 | ] 87 | : [{ role: 'user', content: prompt }]; 88 | 89 | const params: ChatCompletionCreateParamsNonStreaming = { 90 | model: model, 91 | messages: messagesArray, 92 | response_format: { type: responseFormat }, 93 | }; 94 | 95 | if (!oModelsWithoutInstructions.includes(model)) { 96 | params.max_tokens = maxTokens; 97 | params.temperature = temperature; 98 | params.top_p = topP || 1; 99 | params.presence_penalty = 0; 100 | params.frequency_penalty = 0; 101 | } 102 | 103 | if (oModelsWithAdjustableReasoningEffort.includes(model)) { 104 | params.reasoning_effort = 'medium'; 105 | } 106 | 107 | const response = await openAi.chat.completions.create(params); 108 | 109 | if (!response.choices[0]?.message?.content) { 110 | throw new Error('No content in response'); 111 | } 112 | 113 | const finalResponse = response.choices[0].message.content; 114 | 115 | if (finalResponse.trim().toLowerCase().replace('.', '') === "sorry i can't help you with that") { 116 | console.error('ChatGPT responded with a generic error'); 117 | throw new Error('Error with OpenAI API'); 118 | } 119 | 120 | clearTimeout(timeoutId); 121 | 122 | return finalResponse; 123 | } catch (error: any) { 124 | console.error('Error with OpenAI API:', error); 125 | 126 | if (attemptCount < 1) { 127 | console.error(`Retrying after ${retryDelay} milliseconds...`); 128 | await new Promise((resolve) => setTimeout(resolve, retryDelay)); 129 | attemptCount++; 130 | 131 | return requestToGPT({ 132 | prompt, 133 | maxTokens, 134 | temperature, 135 | responseFormat, 136 | model, 137 | instructions, 138 | topP, 139 | }); 140 | } else { 141 | console.error('Error with OpenAI after retry'); 142 | throw new Error('Error with OpenAI API'); 143 | } 144 | } 145 | }; 146 | -------------------------------------------------------------------------------- /src/llm/prompt-builder.ts: -------------------------------------------------------------------------------- 1 | import type { AllowedLanguages, CreatePromptArguments } from '../types'; 2 | 3 | export const defaultInstructions = ` 4 | You are a world-renowned professional translator with decades of experience, and you know everything about language, writing, and cultural nuances. 5 | 6 | Your goal: 7 | • Provide the best possible translation from the original language to the target language. 8 | • Preserve the exact meaning, style, tone, and context of the source text. 9 | • Maintain original punctuation, verbal tics, and formatting markers (e.g., “--” or “---”). 10 | • Remain consistent with prior segments (e.g., the same politeness form, references, etc.). 11 | • Do not add or omit information; do not generate commentary or explanations. 12 | • If the segment is already in the target language or contains no translatable content, return it as is. 13 | 14 | Additional guidelines: 15 | 1. **Contextual Consistency** 16 | - You receive three segments for context: the *previous* text, the *text to translate*, and the *next* text. 17 | - Only the middle one should be translated and returned. The other two are for context only. 18 | - If you receive a text that precedes or follows the text you have to translate, you must also base yourself on these texts to choose the correct politeness. Like "Vous" and "Tu" or "Monsieur" and "Mademoiselle", and same for other languages. 19 | 20 | 2. **Politeness & Pronouns** 21 | - Preserve the same level of politeness or pronoun usage across segments. 
For example, if the speaker uses “tu” in French, do not switch it to “vous.” 22 | 23 | 3. **Numbers and Units** 24 | - All numbers must be written out in full words appropriate to the target language (e.g., 1123 → one thousand one hundred twenty-three). 25 | - Units of measurement, and currencies should be expanded into full words and translated if there is an equivalent in the target language (e.g., “km/h” → “kilometers per hour,” “€” → “euros,”). 26 | - Acronyms should be translated if there is an equivalent in the target language (e.g., “SIDA” → “AIDS”), acronyms should not be expanded into full words. 27 | - If an acronym has *no* direct equivalent in the target language, leave it as-is. 28 | 29 | 4. **Verbatim vs. Naturalness** 30 | - Provide a *naturally flowing* translation. Do not introduce major changes in structure or meaning; remain faithful to the original text. 31 | - Keep verbal tics, interjections (e.g., “Oh la la,” “Umm,” “Eh”), or any markers of style or hesitation. 32 | 33 | 5. **Output Format** 34 | - Output **only** the translated text of the middle segment without quotes, titles, or other metadata. 35 | - Do not add additional text, commentary, or formatting beyond the translation itself. 36 | - If you are unsure how to translate a word or phrase, use your best judgment to provide the most statistically probable correct translation. 37 | 38 | 6. **Edge Cases** 39 | - If the source text is partially in the same language as the target, only translate the parts that need translating. 40 | - If it is entirely in the same language, simply return it unchanged. 41 | 42 | Remember: 43 | - Your translation should be culturally appropriate, preserving the intentions and style of the speaker. 44 | - You must not “denature” the text. Maintain verbal tics, punctuation, and overall sentence structure as much as possible, while still ensuring clarity and correctness in the target language. 45 | `; 46 | 47 | export class PromptBuilder { 48 | public static T_V_DistinctionInstruction = 49 | 'When translating, strictly preserve the original text’s level of formality and politeness (including T–V distinctions, formal/informal pronouns, honorifics, and appropriate vocabulary), adapting accurately according to the conventions of each target language. If you receive a text that precedes or follows the text you have to translate, you must also base yourself on these texts to choose the correct politeness.'; 50 | 51 | public static instructionForReformulatedTranscription = ` 52 | Your role here is to reformulate translated dialogues that are too long and don't match the length of the original dialogue. 53 | 54 | You have the expertise to rephrase a text while keeping EXACTLY the same meaning. 55 | 56 | You also know that dubbing adaptation is not just about shortening or lengthening sentences. It requires: 57 | • Understanding natural expressions in the target language. 58 | • Choosing words or structures that match the timing and intensity of the original scene. 59 | • A thorough knowledge of the target language and culture. 60 | • Taking into account context, nuances, and register (formal/informal) as they appear in the scene. 61 | 62 | Think carefully and take your time to respond. 63 | 64 | Here is the workflow context: 65 | 66 | 1. A user sends me a video or audio segment. 67 | 2. I retrieve the transcription of this audio via an API. This transcription is split into small segments. 68 | 3. 
For each segment, I have silence times between words and the total speaking time of that segment in the video. 69 | 4. I translate the segment. 70 | 5. I generate an audio file from the translated segment with a text-to-audio tool. 71 | 6. I try to speed up the audio so it fits into the original speaking time. 72 | 7. If the audio is still too long (requiring an unnatural speed-up), you step in to intelligently rephrase the sentence, making it shorter while preserving meaning, cultural fit, and overall fluency for dubbing. 73 | 74 | Remember: 75 | • You must adapt the text so that it sounds natural in the target language, preserves context, and stays true to the style (politeness or informality) of the original dialogue. 76 | • You may modify words, expressions, or structures as necessary for clarity and naturalness. 77 | • You must handle punctuation carefully to maintain the intended pauses, exclamations, etc. 78 | • If you encounter an extremely short text that cannot reasonably be shortened further, just return it as is. 79 | • Return only the reformulated text, with no extra commentary, headings, or metadata. 80 | • Never replace or remove essential meaning. If you can’t shorten without losing critical information, shorten only minimally or return the original text if that’s more appropriate. 81 | • Numbers must be spelled out in letters. 82 | • Units of measurement, acronyms, and currencies must be written out fully in the target language if applicable. 83 | • ${PromptBuilder.T_V_DistinctionInstruction} 84 | 85 | Take your time to ensure clarity and precision. 86 | `; 87 | 88 | public static instructionForHandlingToShortSpeech = ` 89 | ### Your Tasks 90 | 91 | 1. **Identify if text rewriting is allowed**: 92 | - If rewriting is allowed, you may add or slightly reformulate phrases in a natural way (while preserving meaning) to lengthen the text so that its spoken duration better matches the target duration. 93 | - If rewriting is not allowed, you can **only** insert specific markers for silence (either "--" or "<break time="x.xs" />", depending on the text-to-speech service). 94 | 95 | 2. **Decide when to add silences vs. rewriting**: 96 | - If the **difference** between the original speaking time and the translated speech duration is small to moderate, inserting silences (pauses) is typically sufficient. 97 | - If the difference is large (for example, if you must slow the TTS audio below "0.75x" speed to fit), then rewriting or expanding the text may be more natural than adding very long silences. 98 | 99 | 3. **Placement and distribution of silences**: 100 | - Base your insertion of silences on: 101 | 1. The provided silence times between each original word (highest priority). 102 | 2. Punctuation (commas, periods, semicolons, etc.). 103 | 3. The difference in total duration between the original audio and the TTS-generated audio. 104 | - You must distribute the total required silence ("difference") across the text in a way that sounds natural. 105 | - When using hyphens ("--"), each "--" indicates ~0.6s of silence. 106 | - When using "<break time="x.xs" />", you will specify the time in seconds. 107 | 108 | 4. **Output formatting rules**: 109 | - Return **only** the modified text (translated text) with added silences (and optional rewrites if allowed). 110 | - Do not add extra explanations or metadata in your final output. 111 | - Never put a silence marker at the very end (after the last word). 112 | - Preserve the order of the words and punctuation unless rewriting is explicitly allowed. 
In that case, only do minimal modifications or expansions. 113 | - Use spaces carefully around silence markers (e.g. "word -- word", or "word word"). 114 | 115 | 5. **Important details**: 116 | - This text is part of a larger user-authorized transcription. 117 | - ${PromptBuilder.T_V_DistinctionInstruction} 118 | - Respect the user’s instructions about how many silences to add: “A little less is better than too much.” 119 | - If rewriting is allowed, avoid adding filler words that distort meaning; choose expansions that stay faithful to the original intent. 120 | 121 | You will receive more specific data and parameters in the dynamic prompt below. 122 | `; 123 | 124 | static createPromptToTranslateTranscription(createPromptArguments: CreatePromptArguments) { 125 | return ` 126 | Target language: ${createPromptArguments?.targetLanguage} 127 | Origin language audio: ${createPromptArguments?.originLanguage} 128 | 129 | --- 130 | IMPORTANT INFORMATION: 131 | 132 | - You have three segments: previous, current (to translate), and next. 133 | - Translate ONLY the current text segment. Do not translate or output the previous or next segments. 134 | - If the text to translate is already in the target language or contains no actionable content, return it as is. 135 | - ${this.T_V_DistinctionInstruction} 136 | - Keep “--” or “---” for artificial silences. 137 | - Convert numbers to words. Expand units/acronyms/currencies appropriately in the target language. 138 | - If no direct equivalent exists for an acronym, keep the original acronym. 139 | - Return ONLY the translated text (without quotes, commentary, or additional formatting). 140 | 141 | --- 142 | --- PREVIOUS TEXT IN THE TRANSCRIPTION (SPEAKER ${createPromptArguments?.previousTranscriptionSpeaker}) (context only, do not translate): 143 | ${createPromptArguments?.lastTranscription} 144 | ---END--- 145 | 146 | --- TEXT TO TRANSLATE (SPEAKER ${createPromptArguments?.transcriptionToTranslateSpeaker}): 147 | ${createPromptArguments?.transcriptionToTranslate} 148 | ---END--- 149 | 150 | --- NEXT TEXT IN THE TRANSCRIPTION (SPEAKER ${createPromptArguments?.nextTranscriptionSpeaker}) (context only, do not translate): 151 | ${createPromptArguments?.nextTranscription} 152 | ---END--- 153 | 154 | Some information about the video/audio: 155 | Title: ${createPromptArguments?.videoTitle || ''} 156 | Main category: ${createPromptArguments?.mainCategoryVideo} 157 | Summary of the video transcription to give you a context: ${createPromptArguments?.transcriptionSummary} 158 | `; 159 | } 160 | 161 | static async createPromptForReformulatedTranscription({ 162 | transcriptionToReformulate, 163 | originalTranscription, 164 | targetLanguage, 165 | transcriptionDuration, 166 | translatedSpeechDuration, 167 | difference, 168 | transcriptionSummary, 169 | }: { 170 | transcriptionToReformulate: string; 171 | originalTranscription: string; 172 | targetLanguage: AllowedLanguages | string; 173 | transcriptionDuration: number; 174 | translatedSpeechDuration: number; 175 | difference: string; 176 | transcriptionSummary: string; 177 | }) { 178 | return ` 179 | Reformulate, shorten, and adapt the following text so that it fits perfectly into the original speaking time. 180 | In other words, reduce the word count or syllables without removing essential punctuation. 181 | Your aim is to preserve the original meaning and context while ensuring the dubbed speech duration matches the original timing. 182 | 183 | Length of time to match: ${transcriptionDuration} seconds. 
184 | 185 | ---Original text (untranslated)--- 186 | ${originalTranscription} 187 | ---END--- 188 | 189 | ---Text translated (too long to fit)--- 190 | ${transcriptionToReformulate} 191 | ---END--- 192 | 193 | Duration of the original text: ${transcriptionDuration} seconds. 194 | Duration of the translated text: ${translatedSpeechDuration} seconds. 195 | The text is ${difference} seconds too long; you must rewrite it to make it ${difference} seconds shorter. 196 | 197 | Important details: 198 | - If the text is already very short or cannot be shortened without losing meaning, keep it as is. 199 | - Maintain punctuation, style, and verbal tics. 200 | - Return only the reformulated text in ${targetLanguage.toUpperCase()}, with no extra explanations or formatting. 201 | 202 | RETURN ONLY THE REFORMULATED SHORTENED TEXT TRANSLATED IN ${targetLanguage.toUpperCase()} 203 | 204 | Summary of the video transcription to give you a context: "${transcriptionSummary} 205 | 206 | `; 207 | } 208 | 209 | static createPromptForHandlingToShortSpeech({ 210 | targetLanguage, 211 | orignalLanguage, 212 | transcriptionTranslated, 213 | wordsWithSilences, 214 | originalSegmentDuration, 215 | translatedSpeechDuration, 216 | difference, 217 | isSpeechForElevenLabs, 218 | allowRewrite, 219 | transcriptionSummary, 220 | }: { 221 | targetLanguage: string; 222 | orignalLanguage: string; 223 | transcriptionTranslated: string; 224 | wordsWithSilences: string; 225 | originalSegmentDuration: number; 226 | translatedSpeechDuration: string; 227 | difference: string; 228 | isSpeechForElevenLabs: boolean; 229 | allowRewrite: boolean; 230 | transcriptionSummary: string; 231 | }) { 232 | const adjustedTranslatedSpeechDuration = 233 | Number(difference) > 0.5 234 | ? (Number(translatedSpeechDuration) + 0.4).toFixed(4) 235 | : translatedSpeechDuration; 236 | const adjustedDifference = Number(difference) > 0.5 ? (Number(difference) - 0.4).toFixed(4) : difference; 237 | //I do this because AI have the habits to add too much silences 238 | 239 | if (!isSpeechForElevenLabs) { 240 | return ` 241 | You are receiving the following parameters: 242 | - allowRewrite: ${allowRewrite} 243 | - originalSegmentDuration: ${originalSegmentDuration} seconds 244 | - translatedSpeechDuration: ${adjustedTranslatedSpeechDuration} seconds 245 | - difference: ${adjustedDifference} seconds 246 | - wordsWithSilences: ${wordsWithSilences} 247 | - orignalLanguage: ${orignalLanguage} 248 | - targetLanguage: ${targetLanguage} 249 | - transcriptionTranslated: ${transcriptionTranslated} 250 | 251 | Your job: 252 | 1. If allowRewrite = true and the difference is large, you may add or reformulate words for a more natural length. 253 | - Keep original meaning and style. 254 | - Avoid changing proper nouns or technical terms. 255 | 2. Insert "--" (each equals ~0.600 seconds silence) intelligently: 256 | - Prioritize natural pauses based on punctuation and provided silence times. 257 | - Distribute ${adjustedDifference} seconds of total silence (in increments of 0.6s). 258 | 3. Return ONLY the final text with the inserted silences (and optional minimal rewrites if allowRewrite = true). 259 | 4. Never put silences at the very end. 260 | 5. Do not add extra commentary or headings. 
261 | 262 | ---Text translated in ${targetLanguage} from ${orignalLanguage} THAT YOU MUST RETURN UPDATED: 263 | ${transcriptionTranslated} 264 | ---END--- 265 | 266 | ---Words of the original text separated with silence in each word, here to help you: 267 | ${wordsWithSilences} 268 | ---END--- 269 | 270 | 271 | Remember: "Less is better than too much" for silences. 272 | 273 | Here is a summary of the video transcription to give you a context: "${transcriptionSummary}" 274 | `; 275 | } 276 | 277 | return ` 278 | You are receiving the following parameters: 279 | - allowRewrite: ${allowRewrite} 280 | - originalSegmentDuration: ${originalSegmentDuration} 281 | - translatedSpeechDuration: ${adjustedTranslatedSpeechDuration} 282 | - difference: ${adjustedDifference} 283 | - wordsWithSilences: ${wordsWithSilences} 284 | - orignalLanguage: ${orignalLanguage} 285 | - targetLanguage: ${targetLanguage} 286 | - transcriptionTranslated: ${transcriptionTranslated} 287 | 288 | Your job: 289 | 1. If allowRewrite = true and the difference is large, you may add or reformulate words for a more natural length. 290 | - Keep original meaning and style. 291 | - Avoid changing proper nouns or technical terms. 292 | - Avoid removing words unless removing them makes the sentence sound more natural. 293 | 2. Insert <break time="x.xs" /> tags in strategic places: 294 | - Prioritize natural pauses based on punctuation and based on the provided silence times between each word. 295 | - Silences between words have priority over punctuation. 296 | - Distribute ${adjustedDifference} seconds total across these tags. 297 | - Always put a space between the word, the break tag, and the next word. 298 | 3. For silences ≥ 0.800 seconds: 299 | - Use the <break time="x.xs" /> tag 300 | - Example: <break time="0.8s" /> 301 | 4. For silences < 0.800 seconds: 302 | - Use appropriate punctuation ONLY (comma, period, question mark) 303 | - NEVER use <break> tags for these short silences 304 | - Example: "Hello, how are you?" (comma represents a short pause) 305 | 5. For longer silences (> 1.5 seconds): 306 | - Divide into multiple smaller pauses distributed naturally in the text 307 | - Apply rules 1 & 2 to each divided portion 308 | - Example: A 2.5s silence could become <break time="1.3s" /> + comma + <break time="1.2s" /> 309 | 6. Return ONLY the final text with the inserted breaks (and optional minimal rewrites if allowRewrite = true). 310 | 7. NEVER put a break at the very end. 311 | 8. Do not add extra commentary or headings. 312 | 9. Round silences to the nearest decimal place; for example, <break time="1.04s" /> becomes <break time="1.0s" /> 313 | 314 | 315 | ---Text translated in ${targetLanguage} from ${orignalLanguage} THAT YOU MUST RETURN UPDATED: 316 | ${transcriptionTranslated} 317 | ---END--- 318 | 319 | ---Words of the original text separated with silence in each word, here to help you: 320 | ${wordsWithSilences} 321 | ---END--- 322 | 323 | Remember: "Less is better than too much" for silences. 
324 | 325 | Here is a summary of the video transcription to give you a context: "${transcriptionSummary}" 326 | `; 327 | } 328 | } 329 | -------------------------------------------------------------------------------- /src/separator/spleeter.ts: -------------------------------------------------------------------------------- 1 | import { ElevenLabsService } from './../elevenlabs/elevenlabs'; 2 | import axios from 'axios'; 3 | import FormData from 'form-data'; 4 | import fs from 'fs'; 5 | import fsPromises from 'fs/promises'; 6 | import qs from 'qs'; 7 | import { AudioUtils } from '../ffmpeg/audio-utils'; 8 | 9 | export class Spleeter { 10 | static async getSeparateAudio(audioFilePath: string) { 11 | const filePathMp3 = audioFilePath.replace('.wav', '.mp3'); 12 | try { 13 | await AudioUtils.convertToMp3(audioFilePath, filePathMp3); 14 | 15 | const backgroundAudio = (await this.separateAudioInTwoParts(filePathMp3)).accompaniment; 16 | 17 | const elevenLabsService = new ElevenLabsService(); 18 | const vocalsIsolated = await elevenLabsService.isolateVoiceFromAudio(audioFilePath); 19 | 20 | return { backgroundAudio, vocalsIsolated }; 21 | } catch (error) { 22 | console.error('Error in getSeparateAudio:', error); 23 | if (error instanceof Error) { 24 | throw error; 25 | } else { 26 | throw new Error('Error in getSeparateAudio'); 27 | } 28 | } finally { 29 | if (fs.existsSync(filePathMp3)) { 30 | await fsPromises.unlink(filePathMp3); 31 | } 32 | } 33 | } 34 | 35 | static async separateAudioInTwoParts(filePath: string): Promise<{ 36 | vocals: string; 37 | accompaniment: string; 38 | }> { 39 | console.debug('Separating audio into vocals and accompaniment...'); 40 | const licenseKey = process.env.LALAL_LICENSE_KEY; 41 | const apiUrlBase = 'https://www.lalal.ai/api'; 42 | let fileId: string = ''; 43 | 44 | const checkStatus = async (fileId: string): Promise => { 45 | let isCompleted = false; 46 | let statusData: LalalAPIResponse | null = null; 47 | 48 | while (!isCompleted) { 49 | try { 50 | const data = qs.stringify({ id: fileId }); 51 | const response = await axios.post(`${apiUrlBase}/check/`, data, { 52 | headers: { 53 | Authorization: `license ${licenseKey}`, 54 | 'Content-Type': 'application/x-www-form-urlencoded', 55 | }, 56 | }); 57 | 58 | if (response.data.status === 'success') { 59 | const taskState = response.data.result[fileId]?.task?.state; 60 | if (taskState === 'success') { 61 | isCompleted = true; 62 | statusData = response.data; 63 | } else { 64 | await new Promise((resolve) => setTimeout(resolve, 1000)); 65 | } 66 | } else { 67 | console.error('Error checking status:', response.data.error); 68 | throw new Error(response.data.error || 'Status check failed'); 69 | } 70 | } catch (error) { 71 | console.error('An error occurred while checking status:', error); 72 | throw error; 73 | } 74 | } 75 | 76 | if (!statusData) throw new Error('No status data found'); 77 | return statusData; 78 | }; 79 | 80 | const processAudio = async (filePath: string): Promise => { 81 | try { 82 | const form = new FormData(); 83 | form.append('file', fs.createReadStream(filePath), { 84 | filename: filePath.split('/').pop(), 85 | }); 86 | 87 | // Retry up to 2 additional times if the upload fails 88 | const uploadAttempt = async (maxRetries = 2): Promise => { 89 | let attempts = 0; 90 | let lastError: any; 91 | while (attempts <= maxRetries) { 92 | try { 93 | const uploadResponse = await axios.post(`${apiUrlBase}/upload/`, form, { 94 | headers: { 95 | ...form.getHeaders(), 96 | 'Content-Disposition': 
`attachment; filename=${filePath.split('/').pop()}`, 97 | Authorization: `license ${licenseKey}`, 98 | }, 99 | }); 100 | if (uploadResponse.data.status === 'success') { 101 | return uploadResponse.data; 102 | } else { 103 | lastError = new Error(uploadResponse.data.error || 'Upload failed'); 104 | console.error('Upload failed:', uploadResponse.data.error); 105 | } 106 | } catch (error) { 107 | lastError = error; 108 | console.error('Upload request error:', error); 109 | } 110 | 111 | attempts++; 112 | if (attempts <= maxRetries) { 113 | await new Promise((resolve) => setTimeout(resolve, 1000)); 114 | } 115 | } 116 | 117 | console.error('Upload failed after multiple attempts:', lastError); 118 | throw new Error('Upload failed after multiple attempts.'); 119 | }; 120 | 121 | const uploadResponse = await uploadAttempt(); 122 | 123 | if (!uploadResponse.id) throw new Error('No file ID received from upload'); 124 | fileId = uploadResponse.id; 125 | 126 | interface SplitParams { 127 | id: string; 128 | stem: 129 | | 'vocals' 130 | | 'drum' 131 | | 'bass' 132 | | 'piano' 133 | | 'electric_guitar' 134 | | 'acoustic_guitar' 135 | | 'synthesizer' 136 | | 'voice' 137 | | 'strings' 138 | | 'wind'; 139 | splitter: 'orion' | 'phoenix' | 'perseus'; 140 | filter: 0 | 1 | 2; 141 | } 142 | 143 | const params: SplitParams[] = [ 144 | { 145 | id: fileId, 146 | stem: 'voice', 147 | splitter: 'perseus', 148 | filter: 2, 149 | }, 150 | ]; 151 | 152 | const splitResponse = await axios.post( 153 | `${apiUrlBase}/split/`, 154 | qs.stringify({ params: JSON.stringify(params) }), 155 | { 156 | headers: { 157 | ...form.getHeaders(), 158 | Authorization: `license ${licenseKey}`, 159 | 'Content-Type': 'application/x-www-form-urlencoded', 160 | }, 161 | }, 162 | ); 163 | 164 | if (splitResponse.data.status !== 'success') { 165 | console.error('Split operation failed:', splitResponse.data.error); 166 | throw new Error('Split operation failed.'); 167 | } 168 | 169 | console.debug('Split operation initiated successfully'); 170 | return await checkStatus(fileId); 171 | } catch (error) { 172 | console.error('Process failed:', error); 173 | if (error instanceof Error) { 174 | throw error; 175 | } 176 | throw new Error('Error while processing audio'); 177 | } 178 | }; 179 | 180 | try { 181 | const lalalResponse = await processAudio(filePath); 182 | const vocals = lalalResponse.result[fileId].split.stem_track; 183 | const accompaniment = lalalResponse.result[fileId].split.back_track; 184 | return { vocals, accompaniment }; 185 | } catch (error) { 186 | console.error('separateAudioInTwoParts failed:', error); 187 | throw new Error('Failed to separate audio into two parts.'); 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /src/smart-sync/adaptation.ts: -------------------------------------------------------------------------------- 1 | import { models } from '../llm/openai'; 2 | import { requestToGPT } from '../llm/openai'; 3 | import { PromptBuilder } from '../llm/prompt-builder'; 4 | import type { 5 | AllowedLanguages, 6 | AudioOriginalLangAllowed, 7 | SegmentWitDurationAndOriginalSegment, 8 | } from '../types'; 9 | import type { 10 | CreateLongerSpeechArguments, 11 | CreateShorterSpeechArguments, 12 | SpeechAdjusted, 13 | SpeechResponseWithDuration, 14 | } from '../types/speech'; 15 | import { silenceBetweenSegmentConsideredAsPause } from '../utils/config'; 16 | import { AudioUtils } from '../ffmpeg/audio-utils'; 17 | import { SpeechGenerator } from 
'../speech/speechGenerator'; 18 | import { ElevenLabsService } from '../elevenlabs/elevenlabs'; 19 | import type { Readable } from 'form-data'; 20 | import fs from 'fs'; 21 | import crypto from 'crypto'; 22 | import fsPromises from 'fs/promises'; 23 | 24 | export class Adaptation { 25 | constructor() { 26 | // 27 | } 28 | 29 | static async compareAndAdjustSpeeches({ 30 | transcriptions, 31 | speeches, 32 | clonedVoicesIds, 33 | originalLanguage, 34 | targetLanguage, 35 | transcriptionSummary, 36 | }: { 37 | transcriptions: SegmentWitDurationAndOriginalSegment[]; 38 | speeches: SpeechResponseWithDuration[]; 39 | clonedVoicesIds: { [key: string]: string }; 40 | originalLanguage: AudioOriginalLangAllowed; 41 | targetLanguage: AllowedLanguages; 42 | transcriptionSummary: string; 43 | }): Promise { 44 | console.debug('Comparing speeches, and adjusting length...'); 45 | if (transcriptions.length !== speeches.length) { 46 | console.error('Array length mismatch'); 47 | throw new Error('Array length mismatch'); 48 | } 49 | 50 | const sortedSegments = transcriptions.sort((a, b) => a.index - b.index); 51 | 52 | const maxSpeedFactor = 1.15; 53 | 54 | const minSpeedFactor = 0.9; 55 | 56 | let previousTranscriptionText = ''; 57 | 58 | try { 59 | const adjustments: SpeechAdjusted[] = []; 60 | 61 | for (let index = 0; index < sortedSegments.length; index++) { 62 | let isSpeechModifiedToBeLonger = false; 63 | const transcription = sortedSegments[index]; 64 | const speech = speeches[index]; 65 | let speechBuffer = speech.speech; 66 | 67 | let newSpeechDuration = speech.duration; 68 | 69 | let speedFactor = newSpeechDuration / transcription.duration; 70 | let adjustedSpeedFactor = speedFactor; 71 | let reformulationAttempts = 0; 72 | const clonedVoiceId = clonedVoicesIds[transcription.speaker]; 73 | 74 | let transcriptionText = transcription.transcription; 75 | let nextTranscriptionText = ''; 76 | 77 | //next transcription text 78 | if (index + 1 < sortedSegments.length) { 79 | const silenceBetweenNextTranscription = sortedSegments[index + 1].begin - transcription.end; 80 | 81 | //1 = 1 second 82 | if ( 83 | silenceBetweenNextTranscription > silenceBetweenSegmentConsideredAsPause || 84 | sortedSegments[index + 1].speaker !== transcription.speaker 85 | ) { 86 | nextTranscriptionText = ''; 87 | } else { 88 | nextTranscriptionText = sortedSegments[index + 1].transcription; 89 | } 90 | } 91 | 92 | const activateSmartSync = true; 93 | const smartSyncMustBeTriggered = 94 | activateSmartSync && (speedFactor > maxSpeedFactor || speedFactor < minSpeedFactor); 95 | 96 | while (smartSyncMustBeTriggered && reformulationAttempts < 2) { 97 | if (speedFactor > maxSpeedFactor) { 98 | console.debug(`Too long (speedFactor: ${speedFactor}), reformulation needed`); 99 | 100 | const shorterSpeech = await this.createShorterSpeech({ 101 | translatedTranscription: transcriptionText, 102 | originalTranscription: transcription.originalTranscription, 103 | speechIndex: transcription.index, 104 | speakerIndex: transcription.speaker, 105 | targetLanguage: targetLanguage, 106 | previousText: previousTranscriptionText, 107 | nextText: nextTranscriptionText, 108 | transcriptionDuration: transcription.duration, 109 | translatedSpeechDuration: newSpeechDuration, 110 | difference: (newSpeechDuration - transcription.duration).toFixed(2), 111 | transcriptionSummary, 112 | clonedVoiceId, 113 | }); 114 | 115 | transcriptionText = shorterSpeech.reformulatedText as string; 116 | 117 | speechBuffer = shorterSpeech.speech; 118 | newSpeechDuration = 
shorterSpeech.duration; 119 | } else if (speedFactor < minSpeedFactor) { 120 | console.debug(`Too short (speedFactor: ${speedFactor}), reformulation needed`); 121 | const longerSpeech = await this.createLongerSpeech({ 122 | translatedTranscription: transcriptionText, 123 | speechIndex: transcription.index, 124 | speakerIndex: transcription.speaker, 125 | targetLanguage: targetLanguage, 126 | originalLanguage: originalLanguage, 127 | transcriptionWords: transcription.wordsWithSilence, 128 | previousText: previousTranscriptionText, 129 | nextText: nextTranscriptionText, 130 | originalSegmentDuration: transcription.duration, 131 | translatedSpeechDuration: newSpeechDuration, 132 | difference: (transcription.duration - newSpeechDuration).toFixed(2), 133 | speedFactor: speedFactor, 134 | transcriptionSummary, 135 | clonedVoiceId, 136 | }); 137 | 138 | transcriptionText = longerSpeech.longerText; 139 | 140 | speechBuffer = longerSpeech.speech; 141 | newSpeechDuration = longerSpeech.duration; 142 | isSpeechModifiedToBeLonger = true; 143 | } 144 | 145 | speedFactor = newSpeechDuration / transcription.duration; 146 | 147 | adjustedSpeedFactor = Math.min(Math.max(speedFactor, minSpeedFactor), maxSpeedFactor); 148 | reformulationAttempts++; 149 | 150 | console.debug( 151 | `Reformulation attempt ${reformulationAttempts}: adjustedSpeedFactor = ${adjustedSpeedFactor}`, 152 | ); 153 | } 154 | 155 | previousTranscriptionText = transcriptionText; 156 | 157 | if ( 158 | (speedFactor >= 0.8 && speedFactor <= 0.9 && !isSpeechModifiedToBeLonger) || 159 | (speedFactor >= 1.1 && speedFactor <= 1.2 && !isSpeechModifiedToBeLonger) 160 | ) { 161 | const { newSpeechBuffer, newSpeechDuration } = await this.adjustSpeechSpeedWithElevenLabs({ 162 | speedFactor, 163 | transcriptionText, 164 | voiceId: clonedVoiceId, 165 | }); 166 | 167 | const newSpeedFactor = newSpeechDuration / transcription.duration; 168 | 169 | if (newSpeedFactor > 0.9 && newSpeedFactor < 1.1) { 170 | speechBuffer = newSpeechBuffer; 171 | speedFactor = newSpeedFactor; 172 | } 173 | } 174 | 175 | const adjustedSpeech = await this.adjustSpeechSpeed(speechBuffer, adjustedSpeedFactor); 176 | 177 | const newSpeechDurationAdjusted = await this.getSpeechDuration(adjustedSpeech); 178 | 179 | if (typeof newSpeechDurationAdjusted !== 'number') 180 | throw new Error( 181 | `Error during audio duration calculation in compareAndAdjustSpeeches: duration is not a number: ${newSpeechDurationAdjusted}`, 182 | ); 183 | 184 | adjustments.push({ 185 | speech: adjustedSpeech, 186 | transcriptionDuration: transcription.duration, 187 | end: transcription.end, 188 | begin: transcription.begin, 189 | speaker: transcription.speaker, 190 | speechDuration: newSpeechDurationAdjusted, 191 | }); 192 | } 193 | 194 | return adjustments; 195 | } catch (err: unknown) { 196 | console.error(err); 197 | throw new Error('Error while adjusting speeches'); 198 | } 199 | } 200 | 201 | static async adjustSpeechSpeed(speech: Buffer, speedFactor: number): Promise { 202 | return new Promise((resolve, reject) => { 203 | if (speedFactor < 0.5 || speedFactor > 2.0) { 204 | console.error('Speed factor must be between 0.5 and 2.0'); 205 | reject(new Error('Speed factor must be between 0.5 and 2.0')); 206 | return; 207 | } 208 | 209 | if (speedFactor === 1) { 210 | console.debug('speedFactor is 1'); 211 | resolve(speech); 212 | return; 213 | } 214 | 215 | return AudioUtils.adjustSpeed(speech, speedFactor).then(resolve).catch(reject); 216 | }); 217 | } 218 | 219 | static async 
getSpeechDuration(speech: Readable | Buffer): Promise { 220 | try { 221 | const duration = await AudioUtils.getAudioDurationFromBuffer(speech); 222 | return duration; 223 | } catch (err) { 224 | console.error('Speech duration error : ' + err); 225 | throw new Error('Error while getting speech duration'); 226 | } 227 | } 228 | 229 | static async adjustSpeechSpeedWithElevenLabs({ 230 | speedFactor, 231 | transcriptionText, 232 | voiceId, 233 | }: { 234 | speedFactor: number; 235 | transcriptionText: string; 236 | voiceId: string; 237 | }): Promise<{ newSpeechBuffer: Buffer; newSpeechDuration: number }> { 238 | const elevenLabsService = new ElevenLabsService(); 239 | const elevenLabsResponse = await elevenLabsService.generateAudioFile({ 240 | text: transcriptionText, 241 | voiceId: voiceId, 242 | speedFactor, 243 | modelId: 'eleven_multilingual_v2', 244 | }); 245 | 246 | const buffer = elevenLabsResponse.response; 247 | const newSpeechDuration = await AudioUtils.getAudioDurationFromBuffer(buffer); 248 | 249 | if (typeof newSpeechDuration !== 'number') 250 | throw new Error( 251 | `Error during audio duration calculation in adjustSpeechSpeedWithElevenLabs: duration is not a number: ${newSpeechDuration}`, 252 | ); 253 | 254 | return { newSpeechBuffer: buffer, newSpeechDuration }; 255 | } 256 | 257 | static async createShorterSpeech({ 258 | translatedTranscription, 259 | originalTranscription, 260 | speechIndex, 261 | speakerIndex, 262 | targetLanguage, 263 | previousText, 264 | nextText, 265 | transcriptionDuration, 266 | translatedSpeechDuration, 267 | difference, 268 | transcriptionSummary, 269 | clonedVoiceId, 270 | }: CreateShorterSpeechArguments) { 271 | const reformulatedTranscription = await this.getReformulatedTranscription({ 272 | transcription: translatedTranscription, 273 | originalTranscription, 274 | targetLanguage, 275 | transcriptionDuration, 276 | translatedSpeechDuration, 277 | difference, 278 | transcriptionSummary, 279 | }); 280 | 281 | const speechShortened = await SpeechGenerator.getSpeechFromTTSEngine({ 282 | transcription: reformulatedTranscription as string, 283 | index: speechIndex, 284 | speakerIndex: speakerIndex, 285 | clonedVoiceId: clonedVoiceId, 286 | options: { 287 | previousTranscriptionText: previousText, 288 | nextTranscriptionText: nextText, 289 | }, 290 | targetLanguage, 291 | }); 292 | 293 | const speechBuffer = 294 | speechShortened.speech instanceof Response 295 | ? 
Buffer.from(await speechShortened.speech.arrayBuffer()) 296 | : speechShortened.speech; 297 | 298 | const speechBufferWithoutSilence = await this.removeStartAndEndSilenceFromAudio(speechBuffer); 299 | 300 | const speechDuration = await this.getSpeechDuration(speechBufferWithoutSilence); 301 | 302 | if (typeof speechDuration !== 'number') 303 | throw new Error( 304 | `Error during audio duration calculation in createShorterSpeech: duration is not a number: ${speechDuration}`, 305 | ); 306 | 307 | console.debug('Shorter speech created.'); 308 | 309 | return { 310 | speech: speechBufferWithoutSilence, 311 | duration: speechDuration, 312 | reformulatedText: reformulatedTranscription, 313 | requestId: speechShortened.requestId, 314 | }; 315 | } 316 | 317 | static async removeStartAndEndSilenceFromAudio(speech: Buffer): Promise { 318 | const temporaryInputFile = `temporary-files/input-for-trim-${crypto.randomUUID()}.wav`; 319 | const temporaryOutputFile = `temporary-files/output-for-trim-${crypto.randomUUID()}.wav`; 320 | 321 | try { 322 | await fsPromises.writeFile(temporaryInputFile, speech); 323 | 324 | try { 325 | await AudioUtils.removeStartAndEndSilenceFromAudioWithFFMPEG(temporaryInputFile, temporaryOutputFile); 326 | } catch (ffmpegError: any) { 327 | console.error('FFmpeg error during silence removal:', ffmpegError); 328 | 329 | if (!fs.existsSync(temporaryOutputFile)) { 330 | throw new Error(`FFmpeg failed to process audio: ${ffmpegError.message || 'Unknown error'}`); 331 | } 332 | 333 | console.debug('FFmpeg reported an error but output file exists, attempting to continue'); 334 | } 335 | 336 | if (!fs.existsSync(temporaryOutputFile)) { 337 | throw new Error('Output file was not created during silence removal'); 338 | } 339 | 340 | const stats = await fsPromises.stat(temporaryOutputFile); 341 | if (stats.size === 0) { 342 | throw new Error('Output file is empty after silence removal'); 343 | } 344 | 345 | const bufferNewSpeech = await fsPromises.readFile(temporaryOutputFile); 346 | 347 | return bufferNewSpeech; 348 | } catch (err: any) { 349 | console.error('Error in removeStartAndEndSilenceFromAudio:', err); 350 | throw new Error( 351 | `ERROR while removing start and end silence from audio: ${err.message || 'Unknown error'}`, 352 | ); 353 | } finally { 354 | try { 355 | if (fs.existsSync(temporaryInputFile)) await fsPromises.unlink(temporaryInputFile); 356 | } catch (unlinkError) { 357 | console.error('Error deleting temporary input file:', unlinkError); 358 | } 359 | 360 | try { 361 | if (fs.existsSync(temporaryOutputFile)) await fsPromises.unlink(temporaryOutputFile); 362 | } catch (unlinkError) { 363 | console.error('Error deleting temporary output file:', unlinkError); 364 | } 365 | } 366 | } 367 | 368 | static async requestUpdatedTextToAi({ prompt, instruction }: { prompt: string; instruction: string }) { 369 | try { 370 | const response = await requestToGPT({ 371 | prompt, 372 | maxTokens: 8000, 373 | temperature: 0.5, 374 | instructions: instruction, 375 | responseFormat: 'text', 376 | model: models.o4Mini, 377 | }); 378 | 379 | return response; 380 | } catch (error) { 381 | console.error('Error requesting updated text to AI with fallback (1) :', error); 382 | 383 | throw new Error('Error requesting updated text to AI with fallback (1)'); 384 | } 385 | } 386 | 387 | static async getReformulatedTranscription({ 388 | transcription, 389 | originalTranscription, 390 | targetLanguage, 391 | transcriptionDuration, 392 | translatedSpeechDuration, 393 | difference, 394 | 
transcriptionSummary, 395 | }: { 396 | transcription: string; 397 | originalTranscription: string; 398 | targetLanguage: string; 399 | transcriptionDuration: number; 400 | translatedSpeechDuration: number; 401 | difference: string; 402 | transcriptionSummary: string; 403 | }) { 404 | const params = { 405 | transcriptionToReformulate: transcription, 406 | originalTranscription: originalTranscription, 407 | targetLanguage: targetLanguage, 408 | transcriptionDuration: transcriptionDuration, 409 | translatedSpeechDuration: translatedSpeechDuration, 410 | difference: difference, 411 | transcriptionSummary: transcriptionSummary, 412 | }; 413 | 414 | const promptForLLM = await PromptBuilder.createPromptForReformulatedTranscription(params); 415 | 416 | const instruction = PromptBuilder.instructionForReformulatedTranscription; 417 | 418 | const LLMResponse = await this.requestUpdatedTextToAi({ 419 | prompt: promptForLLM, 420 | instruction, 421 | }); 422 | 423 | return LLMResponse; 424 | } 425 | 426 | static async getLongerText({ 427 | speedFactor, 428 | difference, 429 | targetLanguage, 430 | originalLanguage, 431 | translatedTranscription, 432 | transcriptionWords, 433 | originalSegmentDuration, 434 | translatedSpeechDuration, 435 | transcriptionSummary, 436 | }: { 437 | speedFactor: number; 438 | difference: string; 439 | targetLanguage: string; 440 | originalLanguage: string; 441 | translatedTranscription: string; 442 | transcriptionWords: string; 443 | originalSegmentDuration: number; 444 | translatedSpeechDuration: number; 445 | transcriptionSummary: string; 446 | }) { 447 | const isSpeechForElevenLabs = true; 448 | const isAiAllowedToRewrite = speedFactor < 0.75 || Number(difference) > 2; 449 | 450 | const prompt = PromptBuilder.createPromptForHandlingToShortSpeech({ 451 | targetLanguage: targetLanguage, 452 | orignalLanguage: originalLanguage, 453 | transcriptionTranslated: translatedTranscription, 454 | wordsWithSilences: transcriptionWords, 455 | originalSegmentDuration, 456 | translatedSpeechDuration: translatedSpeechDuration.toFixed(2), 457 | difference, 458 | isSpeechForElevenLabs, 459 | allowRewrite: isAiAllowedToRewrite, 460 | transcriptionSummary, 461 | }); 462 | 463 | const instruction = PromptBuilder.instructionForHandlingToShortSpeech; 464 | 465 | const translatedTextWithSilence = await this.requestUpdatedTextToAi({ 466 | prompt, 467 | instruction, 468 | }); 469 | 470 | return translatedTextWithSilence; 471 | } 472 | 473 | static async createLongerSpeech({ 474 | translatedTranscription, 475 | speechIndex, 476 | speakerIndex, 477 | targetLanguage, 478 | originalLanguage, 479 | transcriptionWords, 480 | nextText, 481 | previousText, 482 | originalSegmentDuration, 483 | translatedSpeechDuration, 484 | difference, 485 | speedFactor, 486 | transcriptionSummary, 487 | clonedVoiceId, 488 | }: CreateLongerSpeechArguments): Promise<{ 489 | speech: Buffer; 490 | duration: number; 491 | requestId: string; 492 | longerText: string; 493 | }> { 494 | const translatedTextWithSilence = await this.getLongerText({ 495 | speedFactor, 496 | difference, 497 | targetLanguage, 498 | originalLanguage, 499 | translatedTranscription, 500 | transcriptionWords, 501 | originalSegmentDuration, 502 | translatedSpeechDuration, 503 | transcriptionSummary, 504 | }); 505 | 506 | const longerSpeech = await SpeechGenerator.getSpeechFromTTSEngine({ 507 | transcription: translatedTextWithSilence as string, 508 | index: speechIndex, 509 | speakerIndex: speakerIndex, 510 | clonedVoiceId, 511 | options: { 512 | 
previousTranscriptionText: previousText, 513 | nextTranscriptionText: nextText, 514 | }, 515 | targetLanguage, 516 | }); 517 | 518 | const speechBuffer = 519 | longerSpeech.speech instanceof Response 520 | ? Buffer.from(await longerSpeech.speech.arrayBuffer()) 521 | : longerSpeech.speech; 522 | 523 | const speechBufferWithoutSilence = await this.removeStartAndEndSilenceFromAudio(speechBuffer); 524 | 525 | const speechDuration = await this.getSpeechDuration(speechBufferWithoutSilence); 526 | 527 | if (typeof speechDuration !== 'number') 528 | throw new Error( 529 | `Error during audio duration calculation in translation service: duration is not a number: ${speechDuration}`, 530 | ); 531 | 532 | return { 533 | speech: speechBufferWithoutSilence, 534 | duration: speechDuration, 535 | requestId: longerSpeech.requestId, 536 | longerText: translatedTextWithSilence, 537 | }; 538 | } 539 | } 540 | -------------------------------------------------------------------------------- /src/speech/speechGenerator.ts: -------------------------------------------------------------------------------- 1 | import type { PreviousRequestIdsEL } from '../elevenlabs/elevenlabs'; 2 | import type { AllowedLanguages, SegmentWitDurationAndOriginalSegment } from '../types'; 3 | import type { SpeechAdjusted, SpeechResponseWithDuration, SpeechResponseWithIndex } from '../types/speech'; 4 | import { maxSimultaneousFetchElevenLabs, silenceBetweenSegmentConsideredAsPause } from '../utils/config'; 5 | import { ElevenLabsService } from '../elevenlabs/elevenlabs'; 6 | import { AudioUtils } from '../ffmpeg/audio-utils'; 7 | import crypto from 'crypto'; 8 | import fs from 'fs'; 9 | import fsPromises from 'fs/promises'; 10 | import { Helpers } from '../utils/helpers'; 11 | import { VideoUtils } from '../ffmpeg/video-utils'; 12 | import type { Readable } from 'stream'; 13 | import * as path from 'path'; 14 | 15 | export class SpeechGenerator { 16 | constructor() { 17 | // 18 | } 19 | 20 | static async getSpeechArrayFromTranscriptions({ 21 | segments, 22 | targetLanguage, 23 | isolatedVocalsPath, 24 | }: { 25 | segments: SegmentWitDurationAndOriginalSegment[]; 26 | isolatedVocalsPath: string; 27 | targetLanguage: AllowedLanguages; 28 | }): Promise<{ 29 | allResultsSorted: SpeechResponseWithIndex[]; 30 | clonedVoicesIds: { [key: string]: string }; 31 | }> { 32 | console.debug('Getting speeches...'); 33 | const maxSimultaneousFetch = maxSimultaneousFetchElevenLabs; 34 | 35 | let allResults: SpeechResponseWithIndex[] = []; 36 | const clonedVoicesIds: { 37 | //speakerIndex/number: clonedVoiceId 38 | [key: string]: string; 39 | } = {}; 40 | 41 | const speakers = this.getNumberSpeakers(segments); 42 | for (const speaker of speakers) { 43 | clonedVoicesIds[speaker] = await this.cloneVideoVoice(isolatedVocalsPath, segments, speaker); 44 | } 45 | 46 | try { 47 | //Voice cloning or custom Voice return only an Array of one Item 48 | const processTranscriptionBatch = async ({ 49 | batch, 50 | previousTranscriptionText, 51 | nextTranscriptionText, 52 | targetLanguage, 53 | }: { 54 | batch: SegmentWitDurationAndOriginalSegment[]; 55 | previousTranscriptionText: string | ''; 56 | nextTranscriptionText: string | ''; 57 | previousRequestIds: PreviousRequestIdsEL; 58 | targetLanguage: AllowedLanguages; 59 | }) => { 60 | const promises = batch.map((transcription) => 61 | this.getSpeechFromTTSEngine({ 62 | transcription: transcription.transcription, 63 | index: transcription.index, 64 | speakerIndex: transcription.speaker, 65 | clonedVoiceId: 
clonedVoicesIds[transcription.speaker], 66 | options: { 67 | previousTranscriptionText, 68 | nextTranscriptionText, 69 | }, 70 | targetLanguage, 71 | }), 72 | ); 73 | 74 | return await Promise.all(promises); 75 | }; 76 | 77 | const pastSpeechIds: PreviousRequestIdsEL = []; 78 | for (let i = 0; i < segments.length; i += maxSimultaneousFetch) { 79 | const batchEndIndex = i + maxSimultaneousFetch; 80 | const nextTranscriptionData = segments[i + 1]; 81 | const transcriptionBatch = segments.slice(i, batchEndIndex); 82 | const previousTranscriptionText = i === 0 ? '' : segments[i - 1]?.transcription; 83 | let nextTranscriptionText = ''; 84 | 85 | if (batchEndIndex < segments.length) { 86 | const silenceBetweenNextTranscription = nextTranscriptionData?.begin - segments[i].end; 87 | 88 | if ( 89 | nextTranscriptionData?.speaker !== segments[i].speaker || 90 | silenceBetweenNextTranscription > silenceBetweenSegmentConsideredAsPause 91 | ) { 92 | nextTranscriptionText = ''; 93 | } else { 94 | nextTranscriptionText = nextTranscriptionData.transcription; 95 | } 96 | } 97 | 98 | const batchResults = await processTranscriptionBatch({ 99 | batch: transcriptionBatch, 100 | previousTranscriptionText: previousTranscriptionText, 101 | nextTranscriptionText: nextTranscriptionText, 102 | previousRequestIds: pastSpeechIds || '', 103 | targetLanguage, 104 | }); 105 | 106 | if (pastSpeechIds.length === 3) pastSpeechIds.shift(); 107 | pastSpeechIds.push(batchResults[0].requestId); 108 | 109 | allResults = allResults.concat(batchResults); 110 | } 111 | console.debug('Speeches got.'); 112 | const allResultsSorted = allResults.sort((a, b) => a.index - b.index); 113 | 114 | return { 115 | allResultsSorted, 116 | clonedVoicesIds, 117 | }; 118 | } catch (err: unknown) { 119 | console.error(err); 120 | if (err instanceof Error) { 121 | throw err; 122 | } 123 | throw new Error('Error while getting speeches'); 124 | } 125 | } 126 | 127 | static async cloneVideoVoice( 128 | vocalsAudioPath: string, 129 | segments: SegmentWitDurationAndOriginalSegment[], 130 | speakerIndex: number, 131 | ) { 132 | console.debug('Cloning video voice...'); 133 | function combineBuffers(buffers: Buffer[]): Buffer { 134 | const totalLength = buffers.reduce((sum, buffer) => sum + buffer.length, 0); 135 | 136 | const combinedBuffer = Buffer.alloc(totalLength); 137 | 138 | let offset = 0; 139 | for (const buffer of buffers) { 140 | buffer.copy(combinedBuffer, offset); 141 | offset += buffer.length; 142 | } 143 | 144 | return combinedBuffer; 145 | } 146 | 147 | const filePath = `temporary-files/audioFromOneSpeaker-${crypto.randomUUID()}.mp3`; 148 | 149 | try { 150 | let audioFromOneSpeakerBuffer = await this.getAudiosSpeakerAndMerge( 151 | segments, 152 | speakerIndex, 153 | vocalsAudioPath, 154 | ); 155 | 156 | fs.writeFileSync(filePath, combineBuffers(audioFromOneSpeakerBuffer)); 157 | console.debug('getting file duration for function cloneVideoVoice'); 158 | const audioDuration = await VideoUtils.getFileDuration(filePath); 159 | 160 | if (typeof audioDuration !== 'number') 161 | throw new Error( 162 | `Error during audio duration when cloning video voice: duration is not a number: ${audioDuration}`, 163 | ); 164 | 165 | if (audioDuration < 90) { 166 | const resultPath = await AudioUtils.duplicateAndConcatenateAudio(filePath, 3, 'mp3'); 167 | 168 | audioFromOneSpeakerBuffer = await Helpers.splitAudioIntoBuffers(resultPath); 169 | 170 | if (fs.existsSync(resultPath)) await fsPromises.unlink(resultPath); 171 | } 172 | 173 | const elevenLabsService = 
new ElevenLabsService(); 174 | const response = await elevenLabsService.cloneVoice( 175 | audioFromOneSpeakerBuffer, 176 | 'speaker-' + speakerIndex, 177 | audioDuration, 178 | ); 179 | 180 | return response.voice_id; 181 | } catch (err) { 182 | console.error(err); 183 | if (err instanceof Error) { 184 | throw err; 185 | } 186 | throw new Error('Error while cloning video voice'); 187 | } finally { 188 | if (fs.existsSync(filePath)) await fsPromises.unlink(filePath); 189 | } 190 | } 191 | 192 | static async getSpeechFromTTSEngine({ 193 | transcription, 194 | index, 195 | speakerIndex, 196 | options, 197 | targetLanguage, 198 | clonedVoiceId, 199 | }: { 200 | transcription: string; 201 | index: number; 202 | speakerIndex: number; 203 | clonedVoiceId: string; 204 | options?: { 205 | previousTranscriptionText: string | ''; 206 | nextTranscriptionText: string | ''; 207 | }; 208 | targetLanguage: AllowedLanguages; 209 | }): Promise { 210 | const elevenLabsService = new ElevenLabsService(); 211 | 212 | const createSpeechWithVoiceCloning = async () => { 213 | try { 214 | return await elevenLabsService.generateAudioFile({ 215 | text: transcription, 216 | modelId: 'eleven_multilingual_v2', 217 | voiceId: clonedVoiceId, 218 | previousText: options?.previousTranscriptionText, 219 | targetLanguage: targetLanguage, 220 | nextText: options?.nextTranscriptionText, 221 | }); 222 | } catch (err) { 223 | console.error(err); 224 | if (err instanceof Error) { 225 | throw err; 226 | } 227 | 228 | throw new Error('Error while getting speech with ElevenLabs'); 229 | } 230 | }; 231 | 232 | const response = await createSpeechWithVoiceCloning(); 233 | 234 | return { 235 | index: index, 236 | speech: response.response, 237 | speaker: speakerIndex, 238 | requestId: response?.requestId, 239 | }; 240 | } 241 | 242 | static async getAudiosSpeakerAndMerge( 243 | segments: SegmentWitDurationAndOriginalSegment[], 244 | speakerIndex: number, 245 | vocalsAudioPath: string, 246 | ): Promise { 247 | console.debug('Getting audios from one speaker...'); 248 | const uuid = crypto.randomUUID(); 249 | const finalAudioPath = `temporary-files/finalAudioPathFromSpeaker-${uuid}.mp3`; 250 | const audioPartsPathFromSpeaker: string[] = []; 251 | 252 | try { 253 | const segmentsFromThisSpeaker = segments.filter((segment) => segment.speaker === speakerIndex); 254 | 255 | for (const segmentWithDuration of segmentsFromThisSpeaker) { 256 | try { 257 | const singleVocalSpeakerPath = await AudioUtils.cutAudioToBufferAtSpecificTime( 258 | vocalsAudioPath, 259 | segmentWithDuration.begin - 0.2, 260 | segmentWithDuration.end + 0.2, 261 | false, 262 | ); 263 | 264 | if (typeof singleVocalSpeakerPath === 'string') { 265 | audioPartsPathFromSpeaker.push(singleVocalSpeakerPath); 266 | } else { 267 | throw new Error('singleVocalSpeakerPath is not type string'); 268 | } 269 | } catch (error) { 270 | for (const path of audioPartsPathFromSpeaker) { 271 | if (fs.existsSync(path)) await fsPromises.unlink(path); 272 | } 273 | throw error; 274 | } 275 | } 276 | 277 | await AudioUtils.concatenateAudio({ 278 | files: audioPartsPathFromSpeaker, 279 | outputPath: finalAudioPath, 280 | outputFormat: 'mp3', 281 | }); 282 | 283 | if (await this.isFileSizeMoreThan10MB(finalAudioPath)) { 284 | return await Helpers.splitAudioIntoBuffers(finalAudioPath); 285 | } else { 286 | const bufferFile = await fsPromises.readFile(finalAudioPath); 287 | return [bufferFile]; 288 | } 289 | } catch (error) { 290 | console.error(error); 291 | if (error instanceof Error) { 292 | throw 
error; 293 | } 294 | throw new Error('Error while getting audio from one speaker.'); 295 | } finally { 296 | if (fs.existsSync(finalAudioPath)) { 297 | try { 298 | await fsPromises.unlink(finalAudioPath); 299 | } catch (e) { 300 | console.error('Error cleaning up finalAudioPath:', e); 301 | } 302 | } 303 | // Use for...of so each unlink is awaited; forEach would not await the async callback 304 | for (const path of audioPartsPathFromSpeaker) { 305 | if (fs.existsSync(path)) { 306 | try { 307 | await fsPromises.unlink(path); 308 | } catch (e) { 309 | console.error(`Error cleaning up temp file ${path}:`, e); 310 | } 311 | } 312 | } 313 | } 314 | } 315 | 316 | static getNumberSpeakers(segments: SegmentWitDurationAndOriginalSegment[]) { 317 | const speakerArray = segments.map((segment) => segment.speaker); 318 | return Array.from(new Set(speakerArray)); 319 | } 320 | 321 | static async isFileSizeMoreThan10MB(filePath: string): Promise<boolean> { 322 | try { 323 | const stats = await fsPromises.stat(filePath); 324 | const fileSizeInBytes = stats.size; 325 | const fileSizeInMegabytes = fileSizeInBytes / (1024 * 1024); 326 | return fileSizeInMegabytes > 10; 327 | } catch (error) { 328 | console.error('Error while checking the file size:', error); 329 | throw error; 330 | } 331 | } 332 | 333 | static async getEachSpeechDuration({ 334 | speechArray, 335 | transcriptions, 336 | }: { 337 | speechArray: SpeechResponseWithIndex[]; 338 | transcriptions: SegmentWitDurationAndOriginalSegment[]; 339 | }): Promise<SpeechResponseWithDuration[]> { 340 | console.debug('Getting speech durations...'); 341 | try { 342 | const speechArraySorted = speechArray.sort((a, b) => a.index - b.index); 343 | 344 | const arraySpeechWithDuration: SpeechResponseWithDuration[] = []; 345 | 346 | for (let i = 0; i < speechArraySorted.length; i++) { 347 | const speech = speechArraySorted[i]; 348 | const audioBuffer = 349 | speech.speech instanceof Response ?
Buffer.from(await speech.speech.arrayBuffer()) : speech.speech; 350 | 351 | console.debug(`Getting initial speech duration for index ${i}`); 352 | 353 | const duration = await this.getSpeechDuration(audioBuffer); 354 | 355 | if (typeof duration !== 'number') { 356 | transcriptions = transcriptions.filter((transcription) => transcription.index !== speech.index); // assign the result, otherwise the filter call is a no-op 357 | continue; 358 | } 359 | 360 | arraySpeechWithDuration.push({ 361 | speech: audioBuffer, 362 | duration, 363 | speechIndex: i, 364 | speaker: speech.speaker, 365 | requestId: speech.requestId, 366 | }); 367 | } 368 | 369 | console.debug('All speech durations retrieved.'); 370 | return arraySpeechWithDuration.sort((a, b) => a.speechIndex - b.speechIndex); 371 | } catch (err: unknown) { 372 | console.error(err); 373 | throw new Error('Error while getting speech durations'); 374 | } 375 | } 376 | 377 | static async getSpeechDuration(speech: Readable | Buffer): Promise<number | undefined> { 378 | try { 379 | return await AudioUtils.getAudioDurationFromBuffer(speech); 380 | } catch (err) { 381 | console.error('Speech duration error : ' + err); 382 | throw new Error('Error while getting speech duration'); 383 | } 384 | } 385 | 386 | static async removeStartAndEndSilenceFromAllAudio(arraySpeeches: SpeechResponseWithDuration[]) { 387 | const results = []; 388 | 389 | for (const speech of arraySpeeches) { 390 | try { 391 | let retries = 0; 392 | const maxRetries = 3; 393 | let newSpeechBuffer: Buffer = speech.speech; 394 | let success = false; 395 | 396 | while (!success && retries < maxRetries) { 397 | try { 398 | const processedBuffer = await this.removeStartAndEndSilenceFromAudio(speech.speech); 399 | newSpeechBuffer = processedBuffer; 400 | success = true; 401 | } catch (error: any) { 402 | retries++; 403 | if (retries >= maxRetries) throw error; // only give up once all retries are exhausted 404 | } 405 | } 406 | 407 | const newSpeechDuration = await this.getSpeechDuration(newSpeechBuffer); 408 | 409 | if (typeof newSpeechDuration !== 'number') { 410 | console.warn( 411 | `Speech duration calculation failed for speech index ${speech.speechIndex}, using original duration`, 412 | ); 413 | results.push({ 414 | speech: speech.speech, // Use original speech buffer 415 | duration: speech.duration, // Use original duration 416 | speechIndex: speech.speechIndex, 417 | speaker: speech.speaker, 418 | requestId: speech.requestId, 419 | }); 420 | continue; 421 | } 422 | 423 | results.push({ 424 | speech: newSpeechBuffer, 425 | duration: newSpeechDuration, 426 | speechIndex: speech.speechIndex, 427 | speaker: speech.speaker, 428 | requestId: speech.requestId, 429 | }); 430 | } catch (error) { 431 | console.error(`Error processing speech at index ${speech.speechIndex}:`, error); 432 | 433 | // Instead of failing the entire batch, keep the original speech 434 | results.push({ 435 | speech: speech.speech, // Use original speech buffer 436 | duration: speech.duration, // Use original duration 437 | speechIndex: speech.speechIndex, 438 | speaker: speech.speaker, 439 | requestId: speech.requestId, 440 | }); 441 | } 442 | } 443 | 444 | return results; 445 | } 446 | 447 | static async removeStartAndEndSilenceFromAudio(speech: Buffer): Promise<Buffer> { 448 | console.debug('Removing start and end silence from audio...'); 449 | const temporaryInputFile = `temporary-files/input-for-trim-${crypto.randomUUID()}.wav`; 450 | const temporaryOutputFile = `temporary-files/output-for-trim-${crypto.randomUUID()}.wav`; 451 | 452 | try { 453 | await fsPromises.writeFile(temporaryInputFile, speech); 454 | 455 | try { 456 | await
AudioUtils.removeStartAndEndSilenceFromAudioWithFFMPEG(temporaryInputFile, temporaryOutputFile); 457 | } catch (ffmpegError: any) { 458 | console.error('FFmpeg error during silence removal:', ffmpegError); 459 | 460 | if (!fs.existsSync(temporaryOutputFile)) { 461 | throw new Error(`FFmpeg failed to process audio: ${ffmpegError.message || 'Unknown error'}`); 462 | } 463 | 464 | console.debug('FFmpeg reported an error but output file exists, attempting to continue'); 465 | } 466 | 467 | if (!fs.existsSync(temporaryOutputFile)) { 468 | throw new Error('Output file was not created during silence removal'); 469 | } 470 | 471 | const stats = await fsPromises.stat(temporaryOutputFile); 472 | if (stats.size === 0) { 473 | throw new Error('Output file is empty after silence removal'); 474 | } 475 | 476 | const bufferNewSpeech = await fsPromises.readFile(temporaryOutputFile); 477 | 478 | console.debug('Start and end silence removed from audio.'); 479 | return bufferNewSpeech; 480 | } catch (err: any) { 481 | console.error('Error in removeStartAndEndSilenceFromAudio:', err); 482 | throw new Error( 483 | `ERROR while removing start and end silence from audio: ${err.message || 'Unknown error'}`, 484 | ); 485 | } finally { 486 | try { 487 | if (fs.existsSync(temporaryInputFile)) await fsPromises.unlink(temporaryInputFile); 488 | } catch (unlinkError) { 489 | console.error('Error deleting temporary input file:', unlinkError); 490 | } 491 | 492 | try { 493 | if (fs.existsSync(temporaryOutputFile)) await fsPromises.unlink(temporaryOutputFile); 494 | } catch (unlinkError) { 495 | console.error('Error deleting temporary output file:', unlinkError); 496 | } 497 | } 498 | } 499 | 500 | static async createAndAssembleSeparateAudioTracksEachSpeaker(clips: SpeechAdjusted[]): Promise { 501 | const numberOfSpeakers = [...new Set(clips.map((clip) => clip.speaker))]; 502 | 503 | if (numberOfSpeakers.length === 1) { 504 | console.debug('starting assemble audio for one speaker'); 505 | const audioFrequency = 44100; 506 | const outputPath = await this.assembleAudio(clips, audioFrequency); 507 | console.debug('assemble audio for one speaker done'); 508 | return outputPath; 509 | } 510 | 511 | console.debug(`starting overlaying audio for ${numberOfSpeakers.length} speakers`); 512 | const timelineForEachSpeaker: string[] = []; 513 | 514 | for (const speaker of numberOfSpeakers) { 515 | console.debug(`starting assemble audio for speaker ${speaker}`); 516 | const speakerClips = clips.filter((clip) => clip.speaker === speaker); 517 | timelineForEachSpeaker.push(await this.assembleAudio(speakerClips, 44100)); 518 | } 519 | 520 | console.debug('assembling audio for all speakers done'); 521 | 522 | const outputPath = `temporary-files/${crypto.randomUUID()}-result-of-overlaying.wav`; 523 | 524 | await AudioUtils.overlayingAudio(outputPath, timelineForEachSpeaker); 525 | 526 | return outputPath; 527 | } 528 | 529 | static async assembleAudio(clips: SpeechAdjusted[], audioFrequency: number) { 530 | console.debug('Assembling audio...'); 531 | let previousEnd = 0; 532 | const tempFiles: string[] = []; 533 | 534 | try { 535 | for (const clip of clips) { 536 | if (clip.begin > previousEnd && parseFloat((clip.begin - previousEnd).toFixed(4)) > 0.001) { 537 | const silenceDuration = (clip.begin - previousEnd).toFixed(4); 538 | const silenceDurationFormatted = parseFloat(silenceDuration); 539 | const silenceFile = await AudioUtils.generateSilence(silenceDurationFormatted, audioFrequency); 540 | tempFiles.push(silenceFile); 541 | } 542 | 
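// Timeline assembly, step by step: each iteration first fills the gap between the previous
// clip's end and the current clip's begin with a generated silence file, then appends the
// speech clip itself, so the concatenated track keeps the original timing. With illustrative
// numbers, previousEnd = 3.2 s and clip.begin = 4.0 s would produce a 0.8 s silence file
// pushed just before the clip; gaps of roughly 1 ms or less are skipped by the check above.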
543 | if (clip.speech) { 544 | const audioFilePath = `temporary-files/${crypto.randomUUID()}-audio.wav`; 545 | await fsPromises.writeFile(audioFilePath, clip.speech); 546 | tempFiles.push(audioFilePath); 547 | } 548 | 549 | previousEnd = clip.begin + clip.speechDuration; 550 | } 551 | 552 | const outputPath = `temporary-files/${crypto.randomUUID()}-for-assemble-audio.wav`; 553 | 554 | const concatenatedAudioPath = await AudioUtils.concatenateAudio({ 555 | files: tempFiles, 556 | outputPath, 557 | outputFormat: 'wav', 558 | }); 559 | 560 | return concatenatedAudioPath; 561 | } catch (err: unknown) { 562 | console.error(err); 563 | throw new Error('Error while assembling audio'); 564 | } 565 | } 566 | 567 | static async overlayAudioAndBackgroundMusic( 568 | voicesAudioPath: string, 569 | backgroundMusicPath: string, 570 | ): Promise { 571 | console.debug('Merging audio and background music...'); 572 | try { 573 | const outputPath = path.join(`output/result-${crypto.randomUUID()}.wav`); 574 | 575 | //!Do not delete this line for the moment 576 | //await this.ffmpegService.amplifyAudio(backgroundMusicPath, 1.5); 577 | 578 | return await AudioUtils.mergeAudioFiles(voicesAudioPath, backgroundMusicPath, outputPath); 579 | } catch (err) { 580 | console.error(err); 581 | throw new Error('Error while merging audio and background music'); 582 | } finally { 583 | if (fs.existsSync(voicesAudioPath)) await fsPromises.unlink(voicesAudioPath); 584 | } 585 | } 586 | } 587 | -------------------------------------------------------------------------------- /src/subtitles/subtitles-generator.ts: -------------------------------------------------------------------------------- 1 | import { VideoUtils } from '../ffmpeg/video-utils'; 2 | import type { AllowedLanguages, SegmentWitDurationAndOriginalSegment } from '../types'; 3 | import { specialLanguagesWithSpecialCharacters } from '../utils/config'; 4 | import fs from 'fs'; 5 | import fsPromises from 'fs/promises'; 6 | import crypto from 'crypto'; 7 | 8 | export class SubtitlesGenerator { 9 | constructor() { 10 | // 11 | } 12 | 13 | static async addSubtitlesInVideo({ 14 | transcriptionData, 15 | initialVideoPath, 16 | lang, 17 | }: { 18 | transcriptionData: SegmentWitDurationAndOriginalSegment[]; 19 | initialVideoPath: string; 20 | lang: AllowedLanguages; 21 | }): Promise { 22 | console.debug('Adding subtitles in video...'); 23 | const maxLengthText = 50; 24 | const srtContent = this.createSrt(transcriptionData, maxLengthText, lang); 25 | const srtFilePath = `temporary-files/subtitles-${crypto.randomUUID()}.srt`; 26 | fs.writeFileSync(srtFilePath, srtContent, 'utf8'); 27 | const outputVideoFilePath = `output/result-${crypto.randomUUID()}.mp4`; 28 | 29 | try { 30 | await VideoUtils.addSubtitles({ 31 | videoPath: initialVideoPath, 32 | srtFilePath: srtFilePath, 33 | outputFilePath: outputVideoFilePath, 34 | }); 35 | 36 | return outputVideoFilePath; 37 | } catch (err) { 38 | console.error(err); 39 | throw new Error('Error while adding subtitles'); 40 | } finally { 41 | if (fs.existsSync(srtFilePath)) await fsPromises.unlink(srtFilePath); 42 | if (fs.existsSync(initialVideoPath)) await fsPromises.unlink(initialVideoPath); 43 | } 44 | } 45 | 46 | static createSrt( 47 | subtitles: SegmentWitDurationAndOriginalSegment[], 48 | maxLength: number, 49 | lang: AllowedLanguages, 50 | ): string { 51 | console.debug('Creating subtitles srt file...'); 52 | let srtIndex = 1; 53 | let srtContent = ''; 54 | 55 | for (const subtitle of subtitles) { 56 | const chunks = 
this.splitTextProportionally(subtitle.transcription, maxLength, lang); 57 | 58 | const totalWords = chunks.reduce((acc, chunk) => acc + chunk.split(' ').length, 0); 59 | 60 | let previousEnd = subtitle.begin; 61 | for (const chunk of chunks) { 62 | const words = chunk.split(' ').length; 63 | const chunkDuration = (subtitle.end - subtitle.begin) * (words / totalWords); 64 | const begin = this.secondsToSrtTime(previousEnd); 65 | const end = this.secondsToSrtTime(previousEnd + chunkDuration); 66 | 67 | srtContent += `${srtIndex}\n${begin} --> ${end}\n${chunk}\n\n`; 68 | srtIndex++; 69 | previousEnd += chunkDuration; 70 | } 71 | } 72 | 73 | console.debug('Subtitles srt file created'); 74 | return srtContent; 75 | } 76 | 77 | static secondsToSrtTime(seconds: number): string { 78 | const date = new Date(0); 79 | date.setSeconds(seconds); 80 | const iso = date.toISOString(); 81 | return iso.substring(11, 23).replace('.', ','); 82 | } 83 | 84 | static ddLineBreaks(text: string): string { 85 | const maxLength = 20; 86 | let result = ''; 87 | let lineLength = 0; 88 | 89 | for (const char of text) { 90 | result += char; 91 | lineLength++; 92 | if (lineLength >= maxLength) { 93 | result += '\n'; 94 | lineLength = 0; 95 | } 96 | } 97 | 98 | return result; 99 | } 100 | 101 | static splitTextProportionally(text: string, maxLength: number, lang: AllowedLanguages): string[] { 102 | const chunks: string[] = []; 103 | let currentChunk = ''; 104 | 105 | if (specialLanguagesWithSpecialCharacters.includes(lang)) { 106 | maxLength = 20; 107 | for (const char of text) { 108 | if ((currentChunk + char).length > maxLength) { 109 | chunks.push(currentChunk); 110 | currentChunk = ''; 111 | } 112 | currentChunk += char; 113 | } 114 | } else { 115 | const words = text.split(' '); 116 | for (const word of words) { 117 | if ((currentChunk + ' ' + word).trim().length > maxLength) { 118 | chunks.push(currentChunk.trim()); 119 | currentChunk = ''; 120 | } 121 | currentChunk += (currentChunk ? 
' ' : '') + word; 122 | } 123 | } 124 | 125 | if (currentChunk) { 126 | chunks.push(currentChunk.trim()); 127 | } 128 | 129 | return chunks; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/transcription/formatter.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | AllowedLanguages, 3 | AudioOriginalLangAllowed, 4 | GladiaResponse, 5 | Result, 6 | SegmentDetail, 7 | SegmentDetailOut, 8 | SegmentDetailOutWithDuration, 9 | Sentence, 10 | Utterance, 11 | Word, 12 | } from '../types/index'; 13 | import { maxCharactersPerSegmentForNonLatinScriptLanguages, threshold } from '../utils/config'; 14 | import { maxCharactersPerSegment } from '../utils/config'; 15 | import { languageCodes, nonLatinScriptLanguages } from '../utils/constants'; 16 | 17 | export class Formatter { 18 | static formatTranscription(transcription: GladiaResponse, detectedLanguage: AudioOriginalLangAllowed) { 19 | const initialFormattedTranscription = this.getDetailsAndFormatTranscription( 20 | transcription.result, 21 | detectedLanguage, 22 | ); 23 | 24 | const mergedSegments = this.mergeSegments(initialFormattedTranscription, threshold); 25 | 26 | const finalTranscription = this.addDurationForEachTranscription(mergedSegments); 27 | 28 | return finalTranscription; 29 | } 30 | 31 | static getDetailsAndFormatTranscription( 32 | transcriptionsData: Result, 33 | detectedLanguage: AudioOriginalLangAllowed, 34 | ) { 35 | const gladiaUtterances = (dataTranscriptionGladia: Result) => { 36 | return dataTranscriptionGladia?.transcription?.utterances; 37 | }; 38 | 39 | let formattedUtterances: { 40 | transcription: string; 41 | begin: number; 42 | end: number; 43 | wordsWithSilence: string; 44 | speaker: number; 45 | channel: number; 46 | confidence: number; 47 | language: string; 48 | }[] = []; 49 | 50 | const splittedUtterances = this.splitTooLongUtterances( 51 | gladiaUtterances(transcriptionsData) as Utterance[], 52 | ); 53 | 54 | formattedUtterances = splittedUtterances.map((part) => ({ 55 | transcription: part.text, 56 | begin: Number(part.start.toFixed(3)), 57 | end: Number(part.end.toFixed(3)), 58 | wordsWithSilence: this.addTimesInText(part.words), 59 | speaker: part?.speaker || 0, 60 | channel: part?.channel || 0, 61 | confidence: part.confidence, 62 | language: detectedLanguage, 63 | })); 64 | 65 | return formattedUtterances; 66 | } 67 | 68 | static splitTooLongUtterances(transcriptions: Utterance[]) { 69 | const maxCharactersPerSegment = 500; 70 | const adjustedTranscription: Utterance[] = []; 71 | 72 | transcriptions.forEach((transcription) => { 73 | if (transcription.text.length > maxCharactersPerSegment) { 74 | const splittedTranscription = this.splitSegment(transcription) as Utterance[]; 75 | adjustedTranscription.push(...splittedTranscription); 76 | } else { 77 | adjustedTranscription.push(transcription); 78 | } 79 | }); 80 | 81 | return adjustedTranscription; 82 | } 83 | 84 | static splitSegment(obj: Sentence | Utterance, maxSentenceLength: number = 500): Sentence[] | Utterance[] { 85 | const words = obj.words; 86 | const chunks: (Sentence | Utterance)[] = []; 87 | 88 | let currentChunkWords: Word[] = []; 89 | let currentSentenceLength = 0; 90 | 91 | const isSentence = 'sentence' in obj; 92 | const textKey = isSentence ? 
'sentence' : 'text'; 93 | 94 | for (let i = 0; i < words.length; i++) { 95 | const word = words[i]; 96 | const wordLength = word.word.length; 97 | 98 | if (currentSentenceLength + wordLength > maxSentenceLength && currentChunkWords.length > 0) { 99 | const sentence = currentChunkWords.map((w) => w.word).join(''); 100 | const start = currentChunkWords[0].start; 101 | const end = currentChunkWords[currentChunkWords.length - 1].end; 102 | const confidence = 103 | currentChunkWords.reduce((sum, w) => sum + w.confidence, 0) / currentChunkWords.length; 104 | 105 | const newSegment = { 106 | words: currentChunkWords, 107 | language: obj.language, 108 | start: start, 109 | end: end, 110 | speaker: obj?.speaker || 0, 111 | confidence: confidence, 112 | channel: obj?.channel || 0, 113 | [textKey]: sentence, 114 | }; 115 | 116 | //@ts-ignore 117 | chunks.push(newSegment as Sentence | Utterance); 118 | 119 | currentChunkWords = []; 120 | currentSentenceLength = 0; 121 | } 122 | 123 | currentChunkWords.push(word); 124 | currentSentenceLength += wordLength; 125 | } 126 | 127 | if (currentChunkWords.length > 0) { 128 | const sentence = currentChunkWords.map((w) => w.word).join(''); 129 | const start = currentChunkWords[0].start; 130 | const end = currentChunkWords[currentChunkWords.length - 1].end; 131 | const confidence = 132 | currentChunkWords.reduce((sum, w) => sum + w.confidence, 0) / currentChunkWords.length; 133 | 134 | const newSegment = { 135 | words: currentChunkWords, 136 | language: obj.language, 137 | start: start, 138 | end: end, 139 | speaker: obj?.speaker || 0, 140 | confidence: confidence, 141 | channel: obj?.channel || 0, 142 | [textKey]: sentence, 143 | }; 144 | 145 | //@ts-ignore 146 | chunks.push(newSegment as Sentence | Utterance); 147 | } 148 | 149 | return chunks as Sentence[] | Utterance[]; 150 | } 151 | 152 | private static addTimesInText(words: Word[]) { 153 | let enhancedText = ''; 154 | 155 | words.forEach((word, index) => { 156 | const timeBetweenNextWord = 157 | index !== words.length - 1 ? (words[index + 1].start - word.end).toString() : ''; 158 | 159 | enhancedText += word.word.trim() + (timeBetweenNextWord ? `<${timeBetweenNextWord.slice(0, 5)}s>` : ''); 160 | }); 161 | 162 | return enhancedText; 163 | } 164 | 165 | static mergeSegments(segments: SegmentDetail[], timeThreshold: number): SegmentDetailOut[] { 166 | console.debug('Merging segments...'); 167 | const mergedSegments = this.mergeUnderCondition(segments, timeThreshold); 168 | 169 | return mergedSegments; 170 | } 171 | 172 | static getMaxCharactersPerSegment(language: string): number { 173 | const languageCode = languageCodes[language as keyof typeof languageCodes]?.toLowerCase(); 174 | return nonLatinScriptLanguages.includes(languageCode as AllowedLanguages) 175 | ? 
maxCharactersPerSegmentForNonLatinScriptLanguages 176 | : maxCharactersPerSegment; 177 | } 178 | 179 | static mergeUnderCondition(segments: SegmentDetail[], timeThreshold: number) { 180 | // If any merged segment ends up longer than 4000 characters, an error is thrown; the merge should then be retried with a smaller timeThreshold 181 | 182 | const getMergedTranscription = () => { 183 | const mergedSegments: SegmentDetailOut[] = []; 184 | let currentSegment = segments[0]; 185 | let mergedPartIndex = 0; 186 | 187 | if (segments.length === 0) throw new Error('No transcription found in the response.'); 188 | 189 | for (let i = 1; i < segments.length; i++) { 190 | const nextSegment = segments[i]; 191 | const maxCharactersPerSegment = this.getMaxCharactersPerSegment(nextSegment.language); 192 | 193 | // Check if the start of the next segment is close to the end of the current segment 194 | const difference = nextSegment.begin - currentSegment.end; 195 | 196 | if ( 197 | difference <= timeThreshold && 198 | currentSegment.speaker === nextSegment.speaker && 199 | currentSegment.transcription.length + nextSegment.transcription.length < maxCharactersPerSegment 200 | ) { 201 | // Merge segments if close enough 202 | currentSegment = { 203 | ...currentSegment, 204 | transcription: currentSegment.transcription + ' ' + nextSegment.transcription, 205 | end: nextSegment.end, 206 | //*To get words with low confidence, simply add low confidence words in an array 207 | wordsWithSilence: currentSegment.wordsWithSilence.concat(nextSegment.wordsWithSilence), 208 | }; 209 | } else { 210 | // Add the current segment to the array and move on to the next one 211 | mergedSegments.push({ 212 | ...currentSegment, 213 | index: mergedPartIndex, 214 | }); 215 | currentSegment = nextSegment; 216 | mergedPartIndex++; 217 | } 218 | } 219 | 220 | mergedSegments.push({ 221 | ...currentSegment, 222 | index: mergedPartIndex, 223 | }); 224 | 225 | return mergedSegments; 226 | }; 227 | 228 | const finalMergedTranscriptions = getMergedTranscription(); 229 | 230 | const isEverySegmentsLessThan4000 = finalMergedTranscriptions.every( 231 | (transcription) => transcription.transcription.length < 4000, 232 | ); 233 | if (!isEverySegmentsLessThan4000) { 234 | console.error('Error while merging transcriptions: One of the transcriptions is too long (>4000)'); 235 | //Throw an error if the transcription is too long 236 | throw new Error('One of the transcriptions is too long (>4000)'); 237 | } else { 238 | return finalMergedTranscriptions; 239 | } 240 | } 241 | 242 | static addDurationForEachTranscription(transcription: SegmentDetail[]): SegmentDetailOutWithDuration[] { 243 | return transcription.map((part, index) => { 244 | const duration = part.end - part.begin; 245 | return { 246 | ...part, 247 | duration: Number(duration.toFixed(3)), 248 | index, 249 | }; 250 | }); 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/transcription/textTranslator.ts: -------------------------------------------------------------------------------- 1 | import { models, requestToGPT } from '../llm/openai'; 2 | import type { OpenAIModel } from '../llm/openai'; 3 | import { PromptBuilder } from '../llm/prompt-builder'; 4 | import { defaultInstructions } from '../llm/prompt-builder'; 5 | import type { 6 | AllowedLanguages, 7 | AudioOriginalLangAllowed, 8 | CreatePromptArguments, 9 | SegmentDetailOutWithDuration, 10 | SegmentWitDurationAndOriginalSegment, 11 | } from '../types'; 12 | 13 | export class TextTranslator { 14 | static async
translateTranscriptionInTargetLanguage({ 15 | transcription, 16 | targetLanguage, 17 | originLanguage, 18 | transcriptionSummary, 19 | }: { 20 | transcription: SegmentDetailOutWithDuration[]; 21 | targetLanguage: AllowedLanguages; 22 | originLanguage: AudioOriginalLangAllowed; 23 | transcriptionSummary: string; 24 | }) { 25 | const translatedTranscription = await this.translateTranscription({ 26 | transcription, 27 | targetLanguage, 28 | originLanguage, 29 | transcriptionSummary, 30 | }); 31 | 32 | return translatedTranscription; 33 | } 34 | 35 | static async translateTranscription({ 36 | transcription, 37 | targetLanguage, 38 | originLanguage, 39 | transcriptionSummary, 40 | }: { 41 | transcription: SegmentDetailOutWithDuration[]; 42 | targetLanguage: AllowedLanguages; 43 | originLanguage: string; 44 | transcriptionSummary: string; 45 | }) { 46 | console.debug('Translating transcription...'); 47 | const maxSimultaneousTranslation = 10; 48 | let translationPromises: Promise[] = []; 49 | const transcriptionTranslated: SegmentWitDurationAndOriginalSegment[] = []; 50 | const deepCopyTranscriptions = ( 51 | JSON.parse(JSON.stringify(transcription)) as SegmentWitDurationAndOriginalSegment[] 52 | ).sort((a, b) => a.index - b.index) as SegmentWitDurationAndOriginalSegment[]; 53 | 54 | try { 55 | for (let i = 0; i < deepCopyTranscriptions.length; i++) { 56 | // Skip for first transcription to avoid undefined reference 57 | const lastTranscription = i !== 0 ? deepCopyTranscriptions[i - 1].transcription : ''; 58 | 59 | const actualTranscription = deepCopyTranscriptions[i].transcription; 60 | 61 | deepCopyTranscriptions[i].transcription = actualTranscription; 62 | 63 | const actualTranscriptionSpeaker = deepCopyTranscriptions[i].speaker?.toString() || '0'; 64 | 65 | const nextTranscriptionSpeaker = 66 | i !== deepCopyTranscriptions.length - 1 67 | ? deepCopyTranscriptions[i + 1].speaker?.toString() || '0' 68 | : ''; 69 | 70 | const nextTranscription = 71 | i !== deepCopyTranscriptions.length - 1 ? deepCopyTranscriptions[i + 1].transcription || '' : ''; 72 | 73 | const lastTranscriptionSpeaker = lastTranscription 74 | ? 
deepCopyTranscriptions[i - 1].speaker?.toString() || '0' 75 | : ''; 76 | 77 | const translationPromise = this.getTranslationPromise({ 78 | actualTranscription, 79 | lastTranscription, 80 | targetLanguage: targetLanguage, 81 | transcriptionLanguage: originLanguage, 82 | actualTranscriptionSpeaker, 83 | nextTranscriptionSpeaker, 84 | nextTranscription, 85 | lastTranscriptionSpeaker, 86 | transcriptionSummary, 87 | }); 88 | 89 | translationPromises.push(translationPromise); 90 | 91 | // Resolve translations in batches or at the last item 92 | if ( 93 | translationPromises.length === maxSimultaneousTranslation || 94 | i === deepCopyTranscriptions.length - 1 95 | ) { 96 | const translations: string[] = await Promise.all(translationPromises); 97 | for (let j = 0; j < translations.length; j++) { 98 | const transcriptionToUpdate = deepCopyTranscriptions[transcriptionTranslated.length]; 99 | transcriptionToUpdate.originalTranscription = transcriptionToUpdate.transcription; // capture the source text before it is overwritten (indexing with j would point at the wrong segment after the first batch) 100 | transcriptionToUpdate.transcription = translations[j]; 101 | transcriptionToUpdate.language = targetLanguage; 102 | 103 | transcriptionTranslated.push(transcriptionToUpdate); 104 | } 105 | translationPromises = []; 106 | } 107 | } 108 | 109 | console.debug('Transcription translated.'); 110 | return transcriptionTranslated; 111 | } catch (error: unknown) { 112 | console.error(error); 113 | throw new Error('Error while translating transcription'); 114 | } 115 | } 116 | 117 | static async getTranslationPromise({ 118 | actualTranscription, 119 | lastTranscription, 120 | targetLanguage, 121 | transcriptionLanguage, 122 | nextTranscriptionSpeaker, 123 | nextTranscription, 124 | lastTranscriptionSpeaker, 125 | actualTranscriptionSpeaker, 126 | transcriptionSummary, 127 | }: { 128 | actualTranscription: string; 129 | lastTranscription: string; 130 | targetLanguage: AllowedLanguages; 131 | transcriptionLanguage: string; 132 | actualTranscriptionSpeaker: string; 133 | nextTranscriptionSpeaker?: string; 134 | nextTranscription?: string; 135 | lastTranscriptionSpeaker?: string; 136 | transcriptionSummary: string; 137 | }) { 138 | const maxAttempts = 3; 139 | let textTranslated = ''; 140 | let attempts = 0; 141 | 142 | do { 143 | textTranslated = await this.getTranslationPromiseFromAI({ 144 | actualTranscription, 145 | lastTranscription, 146 | targetLanguage, 147 | transcriptionLanguage, 148 | nextTranscription: nextTranscription || '', 149 | nextTranscriptionSpeaker: nextTranscriptionSpeaker || '', 150 | lastTranscriptionSpeaker: lastTranscriptionSpeaker || '', 151 | actualTranscriptionSpeaker, 152 | transcriptionSummary, 153 | }); 154 | attempts++; 155 | } while (textTranslated === actualTranscription && attempts < maxAttempts); 156 | 157 | return textTranslated; 158 | } 159 | 160 | static async getTranslationPromiseFromAI({ 161 | actualTranscription, 162 | lastTranscription, 163 | targetLanguage, 164 | transcriptionLanguage, 165 | nextTranscriptionSpeaker, 166 | nextTranscription, 167 | lastTranscriptionSpeaker, 168 | actualTranscriptionSpeaker, 169 | transcriptionSummary, 170 | }: { 171 | actualTranscription: string; 172 | lastTranscription: string; 173 | targetLanguage: AllowedLanguages; 174 | transcriptionLanguage: string; 175 | nextTranscription?: string; 176 | nextTranscriptionSpeaker?: string; 177 | lastTranscriptionSpeaker?: string; 178 | actualTranscriptionSpeaker: string; 179 | transcriptionSummary: string; 180 | }) { 181 | const promptSettings: CreatePromptArguments = { 182 | transcriptionToTranslate:
actualTranscription, 183 | lastTranscription: lastTranscription, 184 | targetLanguage: targetLanguage, 185 | originLanguage: transcriptionLanguage, 186 | mainCategoryVideo: '', 187 | nextTranscription: nextTranscription || '', 188 | nextTranscriptionSpeaker: nextTranscriptionSpeaker || '', 189 | previousTranscriptionSpeaker: lastTranscriptionSpeaker || '', 190 | transcriptionToTranslateSpeaker: actualTranscriptionSpeaker || '', 191 | transcriptionSummary: transcriptionSummary, 192 | }; 193 | 194 | const prompt = PromptBuilder.createPromptToTranslateTranscription(promptSettings); 195 | 196 | return this.translateWithLLM({ 197 | prompt, 198 | instruction: defaultInstructions, 199 | temperature: 0.5, 200 | }); 201 | } 202 | 203 | static async translateWithLLM({ 204 | prompt, 205 | temperature, 206 | instruction, 207 | responseFormat = 'text', 208 | }: { 209 | prompt: string; 210 | temperature: number; 211 | instruction: string; 212 | responseFormat?: 'text' | 'json'; 213 | }) { 214 | let model: OpenAIModel = models.gpt4_1; 215 | 216 | try { 217 | return await requestToGPT({ 218 | prompt, 219 | temperature, 220 | instructions: instruction, 221 | model, 222 | maxTokens: 8192, 223 | responseFormat: responseFormat === 'json' ? 'json_object' : 'text', 224 | }); 225 | } catch (error) { 226 | console.error(error); 227 | throw new Error('Error while translating transcription'); 228 | } 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /src/transcription/transcriber.ts: -------------------------------------------------------------------------------- 1 | import type { GladiaRequestBody, GladiaResponse } from '../types'; 2 | import axios from 'axios'; 3 | import fs from 'fs'; 4 | import FormData from 'form-data'; 5 | import fsPromise from 'fs/promises'; 6 | 7 | const baseUrlGladia = 'https://api.gladia.io/v2/pre-recorded/'; 8 | 9 | interface AudioUploadResponse { 10 | audio_url: string; 11 | audio_metadata: { 12 | id: string; 13 | filename: string; 14 | source: string; 15 | extension: string; 16 | size: number; 17 | audio_duration: number; 18 | number_of_channels: number; 19 | }; 20 | } 21 | 22 | export class Transcriber { 23 | static async transcribeAudio({ 24 | audioPath, 25 | numberOfSpeakers, 26 | }: { 27 | audioPath: string; 28 | numberOfSpeakers: string; 29 | }) { 30 | try { 31 | const speakerNumber = 32 | numberOfSpeakers !== 'auto-detect' && numberOfSpeakers !== undefined 33 | ? 
parseInt(numberOfSpeakers) 34 | : numberOfSpeakers; 35 | 36 | const audioUrl = await this.uploadAudioFile(audioPath); 37 | 38 | const transcription = await this.getGladiaTranscription({ 39 | fileUrl: audioUrl, 40 | numberOfSpeakers: speakerNumber, 41 | }); 42 | 43 | return transcription; 44 | } catch (error) { 45 | if (error instanceof Error) { 46 | throw new Error(error.message); 47 | } else { 48 | throw new Error('Error in transcribeAudio: ' + error); 49 | } 50 | } 51 | } 52 | 53 | static async getGladiaTranscription({ 54 | fileUrl, 55 | numberOfSpeakers, 56 | }: { 57 | fileUrl: string; 58 | numberOfSpeakers: number | 'auto-detect'; 59 | }): Promise { 60 | try { 61 | const requestData: GladiaRequestBody = { 62 | audio_url: fileUrl, 63 | detect_language: true, 64 | diarization: true, 65 | sentences: true, 66 | name_consistency: true, 67 | punctuation_enhanced: true, 68 | summarization: true, 69 | }; 70 | 71 | if (numberOfSpeakers !== 'auto-detect' && numberOfSpeakers !== undefined && numberOfSpeakers !== 0) { 72 | requestData.diarization_config = { 73 | number_of_speakers: numberOfSpeakers || 1, 74 | max_speakers: numberOfSpeakers || 1, 75 | }; 76 | } 77 | 78 | const headers = { 79 | 'x-gladia-key': process.env.GLADIA_API_KEY, 80 | 'Content-Type': 'application/json', 81 | }; 82 | 83 | console.debug('- Sending initial request to Gladia API...'); 84 | const initialResponse: any = await this.makeFetchRequest(baseUrlGladia, { 85 | method: 'POST', 86 | headers, 87 | body: JSON.stringify(requestData), 88 | }); 89 | 90 | if (!initialResponse.id) { 91 | throw new Error('Error with gladia initialization'); 92 | } 93 | 94 | const response = await this.pollForResult(initialResponse.id, headers); 95 | 96 | return response; 97 | } catch (error) { 98 | console.error('Error in Gladia transcription:', error); 99 | throw new Error('Error in Gladia transcription'); 100 | } 101 | } 102 | 103 | static async pollForResult(transcriptionId: string, headers: any): Promise { 104 | const pollUrl = `${baseUrlGladia}${transcriptionId}`; 105 | 106 | while (true) { 107 | const pollResponse: any = await this.makeFetchRequest(pollUrl, { 108 | method: 'GET', 109 | headers, 110 | }); 111 | 112 | if (pollResponse.status === 'done') { 113 | return pollResponse; 114 | } else if (pollResponse.status === 'error') { 115 | throw new Error(`Gladia transcription failed: ${pollResponse.error}`); 116 | } 117 | 118 | await new Promise((resolve) => setTimeout(resolve, 1000)); 119 | } 120 | } 121 | 122 | static async makeFetchRequest(url: string, options: any) { 123 | const response = await fetch(url, options); 124 | if (!response.ok) { 125 | throw new Error(`Gladia API error: ${response.statusText}`); 126 | } 127 | return response.json(); 128 | } 129 | 130 | static async uploadAudioFile(filePath: string): Promise { 131 | const apiKey = process.env.GLADIA_API_KEY; 132 | if (!apiKey) { 133 | throw new Error('Missing GLADIA_API_KEY environment variable.'); 134 | } 135 | 136 | try { 137 | console.debug('Uploading audio file to Gladia API...'); 138 | 139 | const form = new FormData(); 140 | const fileStream = fs.createReadStream(filePath); 141 | const filename = filePath.split('/').pop() || 'audio.mp3'; 142 | 143 | form.append('audio', fileStream, filename); 144 | 145 | const response = await axios.post('https://api.gladia.io/v2/upload', form, { 146 | headers: { 147 | 'x-gladia-key': apiKey, 148 | ...form.getHeaders(), 149 | }, 150 | }); 151 | 152 | const data = response.data as AudioUploadResponse; 153 | 154 | if (!data.audio_url) { 155 | 
console.error('Error uploading audio file to Gladia API: ', data); 156 | throw new Error('Error uploading audio file to Gladia API'); 157 | } 158 | 159 | console.debug('File uploaded to Gladia API'); 160 | 161 | return data.audio_url; 162 | } catch (error: any) { 163 | console.error('Error uploading audio file:', error.response?.data || error.message); 164 | throw new Error(`Upload failed: ${error.message}`); 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/types/index.d.ts: -------------------------------------------------------------------------------- 1 | export interface TranscriptionDataTypes { 2 | summary: SegmentDetailOutWithDuration | null; 3 | formattedSegments: string[]; 4 | detectedAudioLanguage: AudioOriginalLangAllowed | null; 5 | } 6 | 7 | export interface GladiaResponse { 8 | id: string; 9 | request_id: string; 10 | kind: string; 11 | status: string; 12 | created_at: string; 13 | completed_at: string; 14 | file: GladiaFile; 15 | request_params: RequestParams; 16 | result: Result; 17 | //Custom, not natively from Gladia 18 | original_audio_path: string; 19 | error_code?: string; 20 | } 21 | 22 | export interface Metadata { 23 | audio_duration: number; 24 | number_of_distinct_channels: number; 25 | billing_time: number; 26 | transcription_time: number; 27 | } 28 | 29 | export interface Word { 30 | word: string; 31 | start: number; 32 | end: number; 33 | confidence: number; 34 | } 35 | 36 | export interface Utterance { 37 | text: string; 38 | language: string; 39 | start: number; 40 | end: number; 41 | confidence: number; 42 | channel: number; 43 | speaker: number; 44 | words: Word[]; 45 | } 46 | 47 | export interface Sentence { 48 | sentence: string; 49 | language: string; 50 | start: number; 51 | end: number; 52 | confidence: number; 53 | channel: number; 54 | speaker: number; 55 | words: Word[]; 56 | } 57 | 58 | export interface SegmentDetail { 59 | transcription: string; 60 | language: string; 61 | begin: number; 62 | end: number; 63 | speaker: number; 64 | channel: number; 65 | confidence: number; 66 | wordsWithSilence: string; 67 | } 68 | 69 | export interface SegmentWitDurationAndOriginalSegment extends SegmentDetail { 70 | duration: number; 71 | index: number; 72 | originalTranscription: string; 73 | } 74 | 75 | export interface SegmentDetailOut extends SegmentDetail { 76 | index: number; 77 | } 78 | 79 | export interface SegmentDetailOutWithDuration extends SegmentDetailOut { 80 | duration: number; 81 | } 82 | 83 | export interface Result { 84 | metadata: Metadata; 85 | summarization: { 86 | success: boolean; 87 | is_empty: boolean; 88 | results: string; 89 | exec_time: number; 90 | error: string | null; 91 | }; 92 | transcription: Transcription; 93 | } 94 | 95 | export interface Transcription { 96 | languages: string[]; 97 | full_transcript: string; 98 | utterances: Utterance[]; 99 | sentences: Sentence[]; 100 | } 101 | 102 | export interface CreatePromptArguments { 103 | transcriptionToTranslate: string; 104 | lastTranscription: string; 105 | targetLanguage: string; 106 | originLanguage: string; 107 | mainCategoryVideo: string; 108 | nextTranscription?: string; 109 | transcriptionToTranslateSpeaker: string; 110 | previousTranscriptionSpeaker?: string; 111 | nextTranscriptionSpeaker?: string; 112 | videoTitle?: string; 113 | transcriptionSummary?: string; 114 | } 115 | 116 | export interface GladiaRequestBody { 117 | /** Context to feed the transcription model with for possible better performance */ 
118 | context_prompt?: string; 119 | 120 | /** Enable diarization enhanced for this audio */ 121 | diarization_enhanced?: boolean; 122 | 123 | /** Specific vocabulary list to feed the transcription model with */ 124 | custom_vocabulary?: string[]; 125 | 126 | /** Detect the language from the given audio */ 127 | detect_language?: boolean; 128 | 129 | /** Detect multiple languages in the given audio */ 130 | enable_code_switching?: boolean; 131 | 132 | /** Specify the configuration for code switching */ 133 | code_switching_config?: { 134 | // Specific details are not provided 135 | }; 136 | 137 | /** Set the spoken language for the given audio (ISO 639 standard) */ 138 | language?: keyof typeof languageCodes; 139 | 140 | /** Enable punctuation enhanced for this audio */ 141 | punctuation_enhanced?: boolean; 142 | 143 | /** Callback URL we will do a POST request to with the result of the transcription */ 144 | callback_url?: string; 145 | 146 | /** Enable subtitles generation for this transcription */ 147 | subtitles?: boolean; 148 | 149 | /** Configuration for subtitles generation if subtitles is enabled */ 150 | subtitles_config?: { 151 | // Specific details are not provided 152 | }; 153 | 154 | /** Enable speaker recognition (diarization) for this audio */ 155 | diarization?: boolean; 156 | 157 | /** Speaker recognition configuration, if diarization is enabled */ 158 | diarization_config?: { 159 | // Specific details are not provided 160 | }; 161 | 162 | /** Enable translation for this audio */ 163 | translation?: boolean; 164 | 165 | /** Translation configuration, if translation is enabled */ 166 | translation_config?: { 167 | // Specific details are not provided 168 | }; 169 | 170 | /** Enable summarization for this audio */ 171 | summarization?: boolean; 172 | 173 | /** Summarization configuration, if summarization is enabled */ 174 | summarization_config?: { 175 | // Specific details are not provided 176 | }; 177 | 178 | /** Enable moderation for this audio */ 179 | moderation?: boolean; 180 | 181 | /** Enable named entity recognition for this audio */ 182 | named_entity_recognition?: boolean; 183 | 184 | /** Enable chapterization for this audio */ 185 | chapterization?: boolean; 186 | 187 | /** Enable names consistency for this audio */ 188 | name_consistency?: boolean; 189 | 190 | /** Enable custom spelling for this audio */ 191 | custom_spelling?: boolean; 192 | 193 | /** Custom spelling configuration, if custom_spelling is enabled */ 194 | custom_spelling_config?: { 195 | // Specific details are not provided 196 | }; 197 | 198 | /** Enable structured data extraction for this audio */ 199 | structured_data_extraction?: boolean; 200 | 201 | /** Structured data extraction configuration, if structured_data_extraction is enabled */ 202 | structured_data_extraction_config?: { 203 | // Specific details are not provided 204 | }; 205 | 206 | /** Enable sentiment analysis for this audio */ 207 | sentiment_analysis?: boolean; 208 | 209 | /** Enable audio to llm processing for this audio */ 210 | audio_to_llm?: boolean; 211 | 212 | /** Audio to llm configuration, if audio_to_llm is enabled */ 213 | audio_to_llm_config?: { 214 | // Specific details are not provided 215 | }; 216 | 217 | /** Custom metadata you can attach to this transcription */ 218 | custom_metadata?: Record<string, unknown>; 219 | 220 | /** Enable sentences for this audio */ 221 | sentences?: boolean; 222 | 223 | /** Allows to change the output display_mode for
this audio. The output will be reordered, creating new utterances when speakers overlapped */ 224 | display_mode?: boolean; 225 | 226 | /** URL to a Gladia file or to an external audio or video file */ 227 | audio_url: string; 228 | } 229 | 230 | export type AllowedLanguages = 231 | | 'swedish' 232 | | 'korean' 233 | | 'ukrainian' 234 | | 'greek' 235 | | 'japanese' 236 | | 'english' 237 | | 'american english' 238 | | 'russian' 239 | | 'hindi' 240 | | 'german' 241 | | 'danish' 242 | | 'bulgarian' 243 | | 'czech' 244 | | 'polish' 245 | | 'slovak' 246 | | 'finnish' 247 | | 'spanish' 248 | | 'croatian' 249 | | 'dutch' 250 | | 'portuguese' 251 | | 'french' 252 | | 'malay' 253 | | 'italian' 254 | | 'romanian' 255 | | 'mandarin' 256 | | 'tamil' 257 | | 'turkish' 258 | | 'indonesian' 259 | | 'tagalog' 260 | | 'arabic' 261 | | 'estonian' 262 | | 'norwegian' 263 | | 'vietnamese' 264 | | 'hungarian' 265 | | 'british english' 266 | | 'french canadian'; 267 | 268 | export type AudioOriginalLangAllowed = 269 | | 'af' 270 | | 'sq' 271 | | 'am' 272 | | 'ar' 273 | | 'hy' 274 | | 'as' 275 | | 'ast' 276 | | 'az' 277 | | 'ba' 278 | | 'eu' 279 | | 'be' 280 | | 'bn' 281 | | 'bs' 282 | | 'br' 283 | | 'bg' 284 | | 'my' 285 | | 'ca' 286 | | 'ceb' 287 | | 'zh' 288 | | 'hr' 289 | | 'cs' 290 | | 'da' 291 | | 'nl' 292 | | 'en' 293 | | 'et' 294 | | 'at' 295 | | 'fo' 296 | | 'fi' 297 | | 'fr' 298 | | 'fy' 299 | | 'ff' 300 | | 'gd' 301 | | 'gl' 302 | | 'lg' 303 | | 'ka' 304 | | 'de' 305 | | 'el' 306 | | 'gu' 307 | | 'ht' 308 | | 'ha' 309 | | 'haw' 310 | | 'he' 311 | | 'hi' 312 | | 'hu' 313 | | 'is' 314 | | 'ig' 315 | | 'ilo' 316 | | 'id' 317 | | 'ga' 318 | | 'it' 319 | | 'ja' 320 | | 'jv' 321 | | 'kn' 322 | | 'kk' 323 | | 'km' 324 | | 'ko' 325 | | 'lo' 326 | | 'la' 327 | | 'lv' 328 | | 'lb' 329 | | 'ln' 330 | | 'lt' 331 | | 'mk' 332 | | 'mg' 333 | | 'ms' 334 | | 'ml' 335 | | 'mt' 336 | | 'mi' 337 | | 'mr' 338 | | 'mo' 339 | | 'mn' 340 | | 'ne' 341 | | 'no' 342 | | 'nn' 343 | | 'oc' 344 | | 'or' 345 | | 'pa' 346 | | 'ps' 347 | | 'fa' 348 | | 'pl' 349 | | 'pt' 350 | | 'ro' 351 | | 'ru' 352 | | 'sa' 353 | | 'sr' 354 | | 'sn' 355 | | 'sd' 356 | | 'si' 357 | | 'sk' 358 | | 'sl' 359 | | 'so' 360 | | 'es' 361 | | 'su' 362 | | 'sw' 363 | | 'ss' 364 | | 'sv' 365 | | 'tl' 366 | | 'tg' 367 | | 'ta' 368 | | 'tt' 369 | | 'te' 370 | | 'th' 371 | | 'bo' 372 | | 'tn' 373 | | 'tr' 374 | | 'tk' 375 | | 'uk' 376 | | 'ur' 377 | | 'uz' 378 | | 'vi' 379 | | 'cy' 380 | | 'wo' 381 | | 'xh' 382 | | 'yi' 383 | | 'yo' 384 | | 'zu'; 385 | -------------------------------------------------------------------------------- /src/types/lipsync.d.ts: -------------------------------------------------------------------------------- 1 | interface LipSyncResult { 2 | id: string; 3 | createdAt: string; 4 | status: StatusSyncLab; 5 | model: string; 6 | input: string; 7 | webhookUrl: string; 8 | options: { 9 | output_format: string; 10 | }; 11 | outputUrl: string; 12 | error: null | string; 13 | } 14 | 15 | export interface LipSyncResponse { 16 | id: string; 17 | createdAt: string; 18 | status: StatusSyncLab; 19 | model: string; 20 | input: string; 21 | webhookUrl: string; 22 | options: { 23 | output_format: string; 24 | }; 25 | outputUrl: string; 26 | error: null | string; 27 | } 28 | 29 | export interface SyncLabInitialResponse { 30 | id: string; 31 | createdAt: string; 32 | status: 'PENDING'; 33 | videoUrl: string | null; 34 | originalVideoUrl: string; 35 | originalAudioUrl: string; 36 | synergize: boolean; 37 | creditsDeducted: number | null; 38 | webhookUrl: 
string; 39 | errorMessage: string | null; 40 | message: string; 41 | } 42 | 43 | export interface SynclabInput { 44 | type: 'video' | 'audio'; 45 | url: string; 46 | segments_secs?: number[][]; 47 | segments_frames?: number[][]; 48 | } 49 | 50 | export interface SynclabOptions { 51 | output_format: 'mp4'; 52 | active_speaker?: boolean; 53 | } 54 | 55 | export interface SynclabV2RequestBody { 56 | model: string; 57 | input: SynclabInput[]; 58 | options: SynclabOptions; 59 | webhookUrl?: string; 60 | } 61 | 62 | export interface SynclabRequestBody { 63 | audioUrl: string; 64 | videoUrl: string; 65 | model: string; 66 | webhookUrl?: string; 67 | synergize?: boolean; 68 | pads?: number; 69 | maxCredits?: number; 70 | } 71 | 72 | export type StatusSyncLab = 'PENDING' | 'PROCESSING' | 'COMPLETED' | 'FAILED' | 'REJECTED' | 'CANCELED'; 73 | -------------------------------------------------------------------------------- /src/types/speech.d.ts: -------------------------------------------------------------------------------- 1 | export interface SpeechResponseWithIndex { 2 | speech: Response | Buffer; 3 | index: number; 4 | speaker: number; 5 | requestId: string; 6 | } 7 | 8 | export interface SpeechResponseWithDuration { 9 | speech: Buffer; 10 | duration: number; 11 | speechIndex: number; 12 | speaker: number; 13 | requestId: string; 14 | } 15 | 16 | export interface SpeechAdjusted { 17 | speech: Buffer | undefined; 18 | transcriptionDuration: number; 19 | end: number; 20 | begin: number; 21 | speaker: number; 22 | speechDuration: number; 23 | } 24 | 25 | export interface CreateLongerSpeechArguments { 26 | translatedTranscription: string; 27 | speechIndex: number; 28 | speakerIndex: number; 29 | targetLanguage: AllowedLanguages; 30 | originalLanguage: string; 31 | transcriptionWords: string; 32 | previousText: string; 33 | nextText: string; 34 | originalSegmentDuration: number; 35 | translatedSpeechDuration: number; 36 | difference: string; 37 | speedFactor: number; 38 | transcriptionSummary: string; 39 | clonedVoiceId: string; 40 | } 41 | 42 | export interface CreateShorterSpeechArguments { 43 | translatedTranscription: string; 44 | originalTranscription: string; 45 | speechIndex: number; 46 | speakerIndex: number; 47 | targetLanguage: AllowedLanguages; 48 | previousText: string; 49 | nextText: string; 50 | transcriptionDuration: number; 51 | translatedSpeechDuration: number; 52 | difference: string; 53 | transcriptionSummary: string; 54 | clonedVoiceId: string; 55 | } 56 | -------------------------------------------------------------------------------- /src/types/spleeter.d.ts: -------------------------------------------------------------------------------- 1 | interface LalalAPIResponse { 2 | status: 'success' | 'error'; 3 | result: Result; 4 | } 5 | 6 | interface Result { 7 | [key: string]: SplitDetail; 8 | archive: SplitDetail; 9 | batch: SplitDetail; 10 | } 11 | 12 | interface SplitDetail { 13 | status: 'success' | 'error'; 14 | name?: string; 15 | size?: number; 16 | duration?: number; 17 | stem?: string; 18 | splitter?: 'orion' | 'phoenix'; 19 | preview?: Preview | null; 20 | split?: any; 21 | player?: Player | null; 22 | task?: TaskDetail; 23 | error?: string; 24 | } 25 | 26 | interface Preview { 27 | duration: number; 28 | stem_track: string; 29 | stem_track_size: number; 30 | back_track: string; 31 | back_track_size: number; 32 | } 33 | 34 | interface Player { 35 | stem_track: string; 36 | stem_track_size: number; 37 | back_track: string; 38 | back_track_size: number; 39 | } 40 | 41 | 
interface TaskDetail { 42 | id: string[]; 43 | state: 'success' | 'error' | 'progress' | 'cancelled'; 44 | progress?: number; 45 | split_id?: string; 46 | error?: string; 47 | } 48 | 49 | interface ApiUploadResponse { 50 | status: 'success' | 'error'; 51 | id?: string; 52 | size?: number; 53 | duration?: number; 54 | expires?: number; 55 | error?: string; 56 | } 57 | -------------------------------------------------------------------------------- /src/utils/config.ts: -------------------------------------------------------------------------------- 1 | import type { AllowedLanguages } from '../types'; 2 | 3 | export const threshold = 0.7; // 0.8 seconds 4 | export const maxCharactersPerSegment = 350; 5 | export const maxCharactersPerSegmentForNonLatinScriptLanguages = 175; 6 | export const maxSimultaneousFetchElevenLabs = 1; 7 | export const maxSimultaneousFetchOpenAI = process.env.NODE_ENV === 'production' ? 4 : 10; 8 | export const silenceBetweenSegmentConsideredAsPause = 0.5; 9 | 10 | export const specialLanguagesWithSpecialCharacters: AllowedLanguages[] = ['mandarin', 'japanese', 'korean']; 11 | -------------------------------------------------------------------------------- /src/utils/constants.ts: -------------------------------------------------------------------------------- 1 | import type { AudioOriginalLangAllowed } from '../types/index'; 2 | 3 | export const audioExtensions = ['.mp3', '.wav', '.ogg', '.aac', '.flac', '.m4a', '.wma']; 4 | 5 | export const videoExtensions = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.wmv', '.flv', '.m4v']; 6 | 7 | export const allowedExtensions = [...audioExtensions, ...videoExtensions]; 8 | 9 | type LanguageObject = { 10 | [K in AudioOriginalLangAllowed]?: string; 11 | }; 12 | 13 | export const languageCodes: LanguageObject = { 14 | af: 'Afrikaans', 15 | sq: 'Albanian', 16 | am: 'Amharic', 17 | ar: 'Arabic', 18 | hy: 'Armenian', 19 | as: 'Assamese', 20 | ast: 'Asturian', 21 | az: 'Azerbaijani', 22 | ba: 'Bashkir', 23 | eu: 'Basque', 24 | be: 'Belarusian', 25 | bn: 'Bengali', 26 | bs: 'Bosnian', 27 | br: 'Breton', 28 | bg: 'Bulgarian', 29 | my: 'Burmese', 30 | ca: 'Catalan', 31 | ceb: 'Cebuano', 32 | zh: 'Mandarin', 33 | hr: 'Croatian', 34 | cs: 'Czech', 35 | da: 'Danish', 36 | nl: 'Dutch', 37 | en: 'English', 38 | at: 'estonian', 39 | et: 'estonian', 40 | fo: 'Faroese', 41 | fi: 'Finnish', 42 | fr: 'French', 43 | fy: 'Western Frisian', 44 | ff: 'Fulah', 45 | gd: 'Gaelic', 46 | gl: 'Galician', 47 | lg: 'Ganda', 48 | ka: 'Georgian', 49 | de: 'German', 50 | el: 'Greek', 51 | gu: 'Gujarati', 52 | ht: 'Haitian Creole', 53 | ha: 'Hausa', 54 | haw: 'Hawaiian', 55 | he: 'Hebrew', 56 | hi: 'Hindi', 57 | hu: 'Hungarian', 58 | is: 'Icelandic', 59 | ig: 'Igbo', 60 | ilo: 'Iloko', 61 | id: 'Indonesian', 62 | ga: 'Irish', 63 | it: 'Italian', 64 | ja: 'Japanese', 65 | jv: 'Javanese', 66 | kn: 'Kannada', 67 | kk: 'Kazakh', 68 | km: 'Khmer', 69 | ko: 'Korean', 70 | lo: 'Lao', 71 | la: 'Latin', 72 | lv: 'Latvian', 73 | lb: 'Luxembourgish', 74 | ln: 'Lingala', 75 | lt: 'Lithuanian', 76 | mk: 'Macedonian', 77 | mg: 'Malagasy', 78 | ms: 'Malay', 79 | ml: 'Malayalam', 80 | mt: 'Maltese', 81 | mi: 'Maori', 82 | mr: 'Marathi', 83 | mo: 'Moldovan', 84 | mn: 'Mongolian', 85 | ne: 'Nepali', 86 | no: 'Norwegian', 87 | nn: 'Nynorsk', 88 | oc: 'Occitan', 89 | or: 'Oriya', 90 | pa: 'Punjabi', 91 | ps: 'Pashto', 92 | fa: 'Persian', 93 | pl: 'Polish', 94 | pt: 'Portuguese', 95 | ro: 'Romanian', 96 | ru: 'Russian', 97 | sa: 'Sanskrit', 98 | sr: 'Serbian', 99 | sn: 'Shona', 100 | 
--------------------------------------------------------------------------------
/src/utils/constants.ts:
--------------------------------------------------------------------------------
import type { AudioOriginalLangAllowed } from '../types/index';

export const audioExtensions = ['.mp3', '.wav', '.ogg', '.aac', '.flac', '.m4a', '.wma'];

export const videoExtensions = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.wmv', '.flv', '.m4v'];

export const allowedExtensions = [...audioExtensions, ...videoExtensions];

type LanguageObject = {
  [K in AudioOriginalLangAllowed]?: string;
};

export const languageCodes: LanguageObject = {
  af: 'Afrikaans',
  sq: 'Albanian',
  am: 'Amharic',
  ar: 'Arabic',
  hy: 'Armenian',
  as: 'Assamese',
  ast: 'Asturian',
  az: 'Azerbaijani',
  ba: 'Bashkir',
  eu: 'Basque',
  be: 'Belarusian',
  bn: 'Bengali',
  bs: 'Bosnian',
  br: 'Breton',
  bg: 'Bulgarian',
  my: 'Burmese',
  ca: 'Catalan',
  ceb: 'Cebuano',
  zh: 'Mandarin',
  hr: 'Croatian',
  cs: 'Czech',
  da: 'Danish',
  nl: 'Dutch',
  en: 'English',
  at: 'Estonian', // note: 'at' is not an ISO 639-1 language code; 'et' below is the standard code for Estonian
  et: 'Estonian',
  fo: 'Faroese',
  fi: 'Finnish',
  fr: 'French',
  fy: 'Western Frisian',
  ff: 'Fulah',
  gd: 'Gaelic',
  gl: 'Galician',
  lg: 'Ganda',
  ka: 'Georgian',
  de: 'German',
  el: 'Greek',
  gu: 'Gujarati',
  ht: 'Haitian Creole',
  ha: 'Hausa',
  haw: 'Hawaiian',
  he: 'Hebrew',
  hi: 'Hindi',
  hu: 'Hungarian',
  is: 'Icelandic',
  ig: 'Igbo',
  ilo: 'Iloko',
  id: 'Indonesian',
  ga: 'Irish',
  it: 'Italian',
  ja: 'Japanese',
  jv: 'Javanese',
  kn: 'Kannada',
  kk: 'Kazakh',
  km: 'Khmer',
  ko: 'Korean',
  lo: 'Lao',
  la: 'Latin',
  lv: 'Latvian',
  lb: 'Luxembourgish',
  ln: 'Lingala',
  lt: 'Lithuanian',
  mk: 'Macedonian',
  mg: 'Malagasy',
  ms: 'Malay',
  ml: 'Malayalam',
  mt: 'Maltese',
  mi: 'Maori',
  mr: 'Marathi',
  mo: 'Moldovan',
  mn: 'Mongolian',
  ne: 'Nepali',
  no: 'Norwegian',
  nn: 'Nynorsk',
  oc: 'Occitan',
  or: 'Oriya',
  pa: 'Punjabi',
  ps: 'Pashto',
  fa: 'Persian',
  pl: 'Polish',
  pt: 'Portuguese',
  ro: 'Romanian',
  ru: 'Russian',
  sa: 'Sanskrit',
  sr: 'Serbian',
  sn: 'Shona',
  sd: 'Sindhi',
  si: 'Sinhala',
  sk: 'Slovak',
  sl: 'Slovenian',
  so: 'Somali',
  es: 'Spanish',
  su: 'Sundanese',
  sw: 'Swahili',
  ss: 'Swati',
  sv: 'Swedish',
  tl: 'Tagalog',
  tg: 'Tajik',
  ta: 'Tamil',
  tt: 'Tatar',
  te: 'Telugu',
  th: 'Thai',
  bo: 'Tibetan',
  tn: 'Tswana',
  tr: 'Turkish',
  tk: 'Turkmen',
  uk: 'Ukrainian',
  ur: 'Urdu',
  uz: 'Uzbek',
  vi: 'Vietnamese',
  cy: 'Welsh',
  wo: 'Wolof',
  xh: 'Xhosa',
  yi: 'Yiddish',
  yo: 'Yoruba',
  zu: 'Zulu',
};

export const nonLatinScriptLanguages: string[] = [
  'ar', // Arabic
  'am', // Amharic
  'as', // Assamese
  'bn', // Bengali
  'my', // Burmese
  'zh', // Mandarin
  'gu', // Gujarati
  'he', // Hebrew
  'hi', // Hindi
  'ja', // Japanese
  'kn', // Kannada
  'kk', // Kazakh
  'km', // Khmer
  'ko', // Korean
  'lo', // Lao
  'ml', // Malayalam
  'mr', // Marathi
  'mn', // Mongolian
  'ne', // Nepali
  'or', // Oriya
  'pa', // Punjabi
  'ps', // Pashto
  'fa', // Persian
  'sa', // Sanskrit
  'sd', // Sindhi
  'si', // Sinhala
  'ta', // Tamil
  'te', // Telugu
  'th', // Thai
  'bo', // Tibetan
  'ur', // Urdu
  'yi', // Yiddish
];

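A hedged example of how these tables might be combined with the limits from src/utils/config.ts: look up the display name for a detected language code and pick the per-segment character budget depending on whether the language uses a non-Latin script. segmentLimitFor is an illustrative helper, not part of this repository, and the real call sites are not shown in this part of the dump.

// Illustrative only (not part of the repository).
import { languageCodes, nonLatinScriptLanguages } from './constants';
import { maxCharactersPerSegment, maxCharactersPerSegmentForNonLatinScriptLanguages } from './config';

function segmentLimitFor(langCode: string): { name: string | undefined; limit: number } {
  const name = languageCodes[langCode as keyof typeof languageCodes];
  // Non-Latin scripts get the smaller character budget defined in config.ts.
  const limit = nonLatinScriptLanguages.includes(langCode)
    ? maxCharactersPerSegmentForNonLatinScriptLanguages
    : maxCharactersPerSegment;
  return { name, limit };
}

// segmentLimitFor('ja') -> { name: 'Japanese', limit: 175 }
// segmentLimitFor('fr') -> { name: 'French', limit: 350 }
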
--------------------------------------------------------------------------------
/src/utils/helpers.ts:
--------------------------------------------------------------------------------
import * as fs from 'fs';
import * as path from 'path';
import { allowedExtensions, audioExtensions, videoExtensions } from './constants';
import type { SegmentWitDurationAndOriginalSegment } from '../types';
import { VideoUtils } from '../ffmpeg/video-utils';
import fsPromises from 'fs/promises';

export class Helpers {
  static async verifyPrerequisitesForDubbing() {
    console.debug('Verifying prerequisites for dubbing...');
    const inputDir = path.join(process.cwd(), 'input');
    let foundInputFile = false;

    try {
      const files = await fs.promises.readdir(inputDir);
      for (const file of files) {
        const ext = path.extname(file).toLowerCase();
        if (allowedExtensions.includes(ext)) {
          foundInputFile = true;
          break;
        }
      }
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        throw new Error("Input directory 'input' not found at the project root.");
      }
      throw new Error(`Error reading input directory: ${error.message}`);
    }

    if (!foundInputFile) {
      throw new Error(
        `No valid video or audio file found in the 'input' directory. Allowed extensions: ${allowedExtensions.join(', ')}`,
      );
    }

    const numberOfSpeakers = process.env.NUM_SPEAKERS;
    const applyLipsync = process.env.APPLY_LIPSYNC;
    const targetLanguage = process.env.TARGET_LANGUAGE;
    const syncLabApiKey = process.env.SYNC_LAB_API_KEY;

    if (!numberOfSpeakers) {
      throw new Error('Environment variable NUM_SPEAKERS is missing.');
    }

    if (applyLipsync !== 'yes' && applyLipsync !== 'no') {
      throw new Error("Environment variable APPLY_LIPSYNC must be either 'yes' or 'no'.");
    }

    if (!targetLanguage) {
      throw new Error('Environment variable TARGET_LANGUAGE is missing.');
    }

    if (applyLipsync === 'yes' && !syncLabApiKey) {
      throw new Error("Environment variable SYNC_LAB_API_KEY is required when APPLY_LIPSYNC is 'yes'.");
    }

    console.debug('Prerequisites verified successfully.');
  }

  static async getInputFilePath(): Promise<string> {
    const inputDir = path.join(process.cwd(), 'input');

    try {
      const files = await fs.promises.readdir(inputDir);

      for (const file of files) {
        const ext = path.extname(file).toLowerCase();
        if (allowedExtensions.includes(ext)) {
          return path.join(inputDir, file);
        }
      }

      throw new Error(
        `No valid media file found in the input directory. Allowed extensions: ${allowedExtensions.join(', ')}`,
      );
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        throw new Error("Input directory 'input' not found at the project root.");
      }
      throw error;
    }
  }

  // NOTE: despite its name, this currently mirrors getInputFilePath and returns
  // only the first matching file found in the input directory.
  static async getAllInputFilePaths(): Promise<string> {
    console.debug('Getting all input file paths...');
    const inputDir = path.join(process.cwd(), 'input');

    try {
      const files = await fs.promises.readdir(inputDir);

      for (const file of files) {
        const ext = path.extname(file).toLowerCase();
        if (allowedExtensions.includes(ext)) {
          return path.join(inputDir, file);
        }
      }

      throw new Error(
        `No valid media file found in the input directory. Allowed extensions: ${allowedExtensions.join(', ')}`,
      );
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        throw new Error("Input directory 'input' not found at the project root.");
      }
      throw error;
    }
  }

  static getFileType(filePath: string): 'audio' | 'video' | null {
    const ext = path.extname(filePath).toLowerCase();

    if (audioExtensions.includes(ext)) {
      return 'audio';
    } else if (videoExtensions.includes(ext)) {
      return 'video';
    } else {
      throw new Error(`Unsupported file type: ${ext}`);
    }
  }

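  /**
   * Parses and validates transcription details before they are used downstream.
   * Accepts either a JSON string or an already-parsed array of segments:
   * - strips the `wordsWithSilence` field before validating the remaining values,
   * - defaults a missing `channel` to 0,
   * - throws if any remaining segment value is empty, null or undefined.
   */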
  static parseAndVerifyTranscriptionDetails(
    transcriptionDetails: string,
  ): SegmentWitDurationAndOriginalSegment[] {
    try {
      let parsedTranscriptions =
        typeof transcriptionDetails === 'string'
          ? (JSON.parse(transcriptionDetails) as SegmentWitDurationAndOriginalSegment[])
          : (transcriptionDetails as SegmentWitDurationAndOriginalSegment[]);

      parsedTranscriptions = parsedTranscriptions.map((partTranscription) => {
        // eslint-disable-next-line @typescript-eslint/no-unused-vars
        const { wordsWithSilence, ...rest } = partTranscription;
        const segment = rest;
        if (!partTranscription.channel) {
          partTranscription.channel = 0;
        }

        const isEveryValueCorrect = Object.values(segment).every(
          (value) => value !== '' && value !== null && value !== undefined,
        );

        if (!isEveryValueCorrect) {
          throw new Error('Invalid transcription details, one or more values are incorrect or empty');
        }

        return partTranscription;
      });

      console.debug('Transcription details parsed.');
      return parsedTranscriptions;
    } catch (err: any) {
      console.error(err);
      throw new Error('Error while parsing transcription: ' + err);
    }
  }

  // Returns the media duration rounded to the nearest minute.
  static async getVideoLength(filePath: string) {
    if (!filePath) throw new Error('File path is required');

    const duration = await VideoUtils.getFileDuration(filePath);
    if (typeof duration !== 'number')
      throw new Error(
        `Error during audio duration calculation in translation service: duration is not a number: ${duration}`,
      );

    return Math.round(duration / 60);
  }

  static async splitAudioIntoBuffers(filePath: string): Promise<Buffer[]> {
    try {
      console.debug('Splitting audio into buffers...');
      const fileSizeLimit = 10 * 1024 * 1024; // 10 MB in bytes
      const fileBuffer = await fsPromises.readFile(filePath);
      const buffers: Buffer[] = [];

      for (let start = 0; start < fileBuffer.length; start += fileSizeLimit) {
        const end = Math.min(start + fileSizeLimit, fileBuffer.length);
        buffers.push(fileBuffer.slice(start, end));
      }

      console.debug('Audio split into buffers.');
      return buffers;
    } catch (error) {
      console.error('Error while reading or splitting the file:', error);
      throw error;
    }
  }
}

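For orientation, a hedged sketch of how these helpers could be wired together at the start of a run. The actual orchestration lives in src/core/index.ts, which is not shown in this part of the dump; prepareDubbingRun is an illustrative name and the import path depends on where the caller lives.

// Illustrative only (not taken from src/core/index.ts).
import { Helpers } from '../utils/helpers';

async function prepareDubbingRun() {
  // Fails fast if the input folder or required environment variables are missing.
  await Helpers.verifyPrerequisitesForDubbing();

  // Resolve the media file to process and branch on its type.
  const inputPath = await Helpers.getInputFilePath();
  const mediaType = Helpers.getFileType(inputPath); // 'audio' or 'video' (throws on unsupported extensions)

  console.debug(`Processing ${mediaType} file at ${inputPath}`);
  return { inputPath, mediaType };
}
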
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
#!/bin/bash

BLUE='\033[0;34m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
BOLD='\033[1m'

# --- Pre-run Checks ---
if [ ! -f ".env" ]; then
  echo -e "${BOLD}${YELLOW}Warning: .env file not found in the project root.${NC}"
  echo -e "${BOLD}Please create a .env file with the required environment variables before running this script.${NC}"
  exit 1
fi
echo -e "${GREEN}.env file check passed.${NC}"

if ! command -v node &> /dev/null; then
  echo -e "${BOLD}Error: Node.js is not installed. Please install it to continue.${NC}"
  exit 1
fi
echo -e "${GREEN}Node.js check passed.${NC}"

if ! command -v bun &> /dev/null; then
  echo -e "${BOLD}Error: Bun is not installed. Please install it to continue.${NC}"
  exit 1
fi
echo -e "${GREEN}Bun check passed.${NC}"

if ! command -v ffmpeg &> /dev/null; then
  echo -e "${BOLD}Error: FFmpeg is not installed. Please install it to continue.${NC}"
  echo -e "${BOLD}See installation instructions at: https://ffmpeg.org/download.html${NC}"
  exit 1
fi
echo -e "${GREEN}FFmpeg check passed.${NC}"

if [ ! -d "node_modules" ]; then
  echo -e "${YELLOW}Dependencies not found. Installing...${NC}"
  bun install
  if [ $? -ne 0 ]; then
    echo -e "${BOLD}Error: Failed to install dependencies with bun.${NC}"
    exit 1
  fi
  echo -e "${GREEN}Dependencies installed successfully.${NC}"
else
  echo -e "${GREEN}Dependencies check passed (node_modules found).${NC}"
fi

echo -e "\n${GREEN}All checks passed. Proceeding with script...${NC}\n"

# --- Script Start ---

clear
echo -e "${BOLD}╔════════════════════════════════════════╗${NC}"
echo -e "${BOLD}║ ${BLUE}Choose the target language ${NC} ${BOLD}║${NC}"
echo -e "${BOLD}╚════════════════════════════════════════╝${NC}"
echo ""

languages=(
  "swedish"
  "korean"
  "ukrainian"
  "greek"
  "japanese"
  "english"
  "american english"
  "russian"
  "hindi"
  "german"
  "danish"
  "bulgarian"
  "czech"
  "polish"
  "slovak"
  "finnish"
  "spanish"
  "croatian"
  "dutch"
  "portuguese"
  "french"
  "malay"
  "italian"
  "romanian"
  "mandarin"
  "tamil"
  "turkish"
  "indonesian"
  "tagalog"
  "arabic"
  "norwegian"
  "vietnamese"
  "hungarian"
  "british english"
  "french canadian"
)

echo -e "${BOLD}Available languages:${NC}\n"

COLUMNS=3
count=${#languages[@]}
rows=$(( (count + COLUMNS - 1) / COLUMNS ))

for (( i=0; i