├── .env.example ├── .eslintrc.js ├── .gitignore ├── .prettierrc ├── .vscode └── settings.json ├── LICENSE ├── NOTICE ├── README.md ├── assets └── How-dubbing-works.png ├── bun.lockb ├── input └── example.mp4 ├── output └── example-result.mp4 ├── package.json ├── src ├── core │ └── index.ts ├── elevenlabs │ └── elevenlabs.ts ├── ffmpeg │ ├── audio-utils.ts │ ├── ffmpegPatch.ts │ └── video-utils.ts ├── lipsync │ └── lipsync.ts ├── llm │ ├── openai.ts │ └── prompt-builder.ts ├── separator │ └── spleeter.ts ├── smart-sync │ └── adaptation.ts ├── speech │ └── speechGenerator.ts ├── subtitles │ └── subtitles-generator.ts ├── transcription │ ├── formatter.ts │ ├── textTranslator.ts │ └── transcriber.ts ├── types │ ├── index.d.ts │ ├── lipsync.d.ts │ ├── speech.d.ts │ └── spleeter.d.ts └── utils │ ├── config.ts │ ├── constants.ts │ └── helpers.ts ├── start.sh ├── temporary-files └── example.txt └── tsconfig.json /.env.example: -------------------------------------------------------------------------------- 1 | PORT=4000 2 | OPENAI_API_KEY=your_openai_api_key_here 3 | GLADIA_API_KEY=your_gladia_api_key_here 4 | ELEVEN_LABS_API_KEY=your_eleven_labs_api_key_here 5 | LALAL_LICENSE_KEY=your_lalal_license_key_here 6 | SYNC_LAB_API_KEY=your_sync_lab_api_key_here 7 | 8 | #AWS (For lipsync) 9 | AWS_S3_REGION=your_aws_s3_region_here 10 | AWS_ACCESS_KEY_ID=your_aws_access_key_id_here 11 | AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here 12 | AWS_BUCKET_NAME=your_aws_bucket_name_here -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | parser: '@typescript-eslint/parser', 3 | parserOptions: { 4 | project: 'tsconfig.json', 5 | tsconfigRootDir: __dirname, 6 | sourceType: 'module', 7 | }, 8 | plugins: ['@typescript-eslint/eslint-plugin'], 9 | extends: ['plugin:@typescript-eslint/recommended', 'plugin:prettier/recommended'], 10 | root: true, 11 | env: { 12 | node: true, 13 | jest: true, 14 | }, 15 | ignorePatterns: ['.eslintrc.js'], 16 | rules: { 17 | '@typescript-eslint/interface-name-prefix': 'off', 18 | '@typescript-eslint/explicit-function-return-type': 'off', 19 | '@typescript-eslint/explicit-module-boundary-types': 'off', 20 | '@typescript-eslint/no-explicit-any': 'off', 21 | '@typescript-eslint/no-floating-promises': 'error', 22 | }, 23 | prettier: { 24 | printWidth: 110, 25 | }, 26 | }; 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore 2 | 3 | # Logs 4 | 5 | logs 6 | _.log 7 | npm-debug.log_ 8 | yarn-debug.log* 9 | yarn-error.log* 10 | lerna-debug.log* 11 | .pnpm-debug.log* 12 | 13 | .vscode 14 | 15 | # Caches 16 | 17 | .cache 18 | 19 | # Diagnostic reports (https://nodejs.org/api/report.html) 20 | 21 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 22 | 23 | # Runtime data 24 | 25 | pids 26 | _.pid 27 | _.seed 28 | *.pid.lock 29 | 30 | # Directory for instrumented libs generated by jscoverage/JSCover 31 | 32 | lib-cov 33 | 34 | # Coverage directory used by tools like istanbul 35 | 36 | coverage 37 | *.lcov 38 | 39 | # nyc test coverage 40 | 41 | .nyc_output 42 | 43 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 44 | 45 | .grunt 46 | 47 | # Bower dependency directory (https://bower.io/) 48 | 49 | 
bower_components 50 | 51 | # node-waf configuration 52 | 53 | .lock-wscript 54 | 55 | # Compiled binary addons (https://nodejs.org/api/addons.html) 56 | 57 | build/Release 58 | 59 | # Dependency directories 60 | 61 | node_modules/ 62 | jspm_packages/ 63 | 64 | # Snowpack dependency directory (https://snowpack.dev/) 65 | 66 | web_modules/ 67 | 68 | # TypeScript cache 69 | 70 | *.tsbuildinfo 71 | 72 | # Optional npm cache directory 73 | 74 | .npm 75 | 76 | # Optional eslint cache 77 | 78 | .eslintcache 79 | 80 | # Optional stylelint cache 81 | 82 | .stylelintcache 83 | 84 | # Microbundle cache 85 | 86 | .rpt2_cache/ 87 | .rts2_cache_cjs/ 88 | .rts2_cache_es/ 89 | .rts2_cache_umd/ 90 | 91 | # Optional REPL history 92 | 93 | .node_repl_history 94 | 95 | # Output of 'npm pack' 96 | 97 | *.tgz 98 | 99 | # Yarn Integrity file 100 | 101 | .yarn-integrity 102 | 103 | # dotenv environment variable files 104 | 105 | .env 106 | .env.development.local 107 | .env.test.local 108 | .env.production.local 109 | .env.local 110 | 111 | # parcel-bundler cache (https://parceljs.org/) 112 | 113 | .parcel-cache 114 | 115 | # Next.js build output 116 | 117 | .next 118 | out 119 | 120 | # Nuxt.js build / generate output 121 | 122 | .nuxt 123 | dist 124 | 125 | # Gatsby files 126 | 127 | # Comment in the public line in if your project uses Gatsby and not Next.js 128 | 129 | # https://nextjs.org/blog/next-9-1#public-directory-support 130 | 131 | # public 132 | 133 | # vuepress build output 134 | 135 | .vuepress/dist 136 | 137 | # vuepress v2.x temp and cache directory 138 | 139 | .temp 140 | 141 | # Docusaurus cache and generated files 142 | 143 | .docusaurus 144 | 145 | # Serverless directories 146 | 147 | .serverless/ 148 | 149 | # FuseBox cache 150 | 151 | .fusebox/ 152 | 153 | # DynamoDB Local files 154 | 155 | .dynamodb/ 156 | 157 | # TernJS port file 158 | 159 | .tern-port 160 | 161 | # Stores VSCode versions used for testing VSCode extensions 162 | 163 | .vscode-test 164 | 165 | # yarn v2 166 | 167 | .yarn/cache 168 | .yarn/unplugged 169 | .yarn/build-state.yml 170 | .yarn/install-state.gz 171 | .pnp.* 172 | 173 | # IntelliJ based IDEs 174 | .idea 175 | 176 | # Finder (MacOS) folder config 177 | .DS_Store 178 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "trailingComma": "all", 4 | "printWidth": 110 5 | } 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "prettier.prettierPath": "./node_modules/prettier" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial 4.0 International License 2 | 3 | This work is licensed under the Creative Commons 4 | Attribution-NonCommercial 4.0 International License. 5 | 6 | To view a copy of this license, visit 7 | https://creativecommons.org/licenses/by-nc/4.0/ or send a 8 | letter to Creative Commons, PO Box 1866, Mountain View, 9 | CA 94042, USA. 
10 | 11 | You are free to: 12 | 13 | - Share — copy and redistribute the material in any medium 14 | or format 15 | - Adapt — remix, transform, and build upon the material 16 | 17 | Under the following terms: 18 | 19 | - Attribution — You must give appropriate credit, provide a 20 | link to the license, and indicate if changes were made. 21 | You may do so in any reasonable manner, but not in any 22 | way that suggests the licensor endorses you or your use. 23 | - NonCommercial — You may not use the material for 24 | commercial purposes. 25 | 26 | No additional restrictions — You may not apply legal terms 27 | or technological measures that legally restrict others from 28 | doing anything the license permits. 29 | 30 | Notices: 31 | You do not have to comply with the license for elements of 32 | the material in the public domain or where your use is 33 | permitted by an applicable exception or limitation. 34 | 35 | No warranties are given. The license may not give you all 36 | of the permissions necessary for your intended use. For 37 | example, other rights such as publicity, privacy, or moral 38 | rights may limit how you use the material. 39 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Dubbing Engine 2 | Copyright (c) 2024 VoiceCheap.ai 3 | 4 | This product includes software developed at VoiceCheap.ai. 5 | 6 | This project is licensed under the Creative Commons 7 | Attribution-NonCommercial 4.0 International License. 8 | For commercial use, please contact: kevin.rousseau@voicecheap.ai 9 | 10 | Third-party dependencies and acknowledgments: 11 | 12 | - TypeScript (Apache License 2.0) 13 | - Bun (MIT License) 14 | - Various API integrations (see README.md for details) 15 | 16 | For more information about licensing, see the LICENSE file. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![How Dubbing Works](./assets/How-dubbing-works.png) 2 | 3 | # Dubbing Engine with Bun and Typescript 4 | 5 | [![Star this repo](https://img.shields.io/github/stars/kevinrss01/dubbing-engine?style=social)](https://github.com/kevinrss01/dubbing-engine) 6 | [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc/4.0/) 7 | 8 | ## 🌐 Demo 9 | 10 | ### Original video 11 | 12 | https://github.com/user-attachments/assets/73a22695-9457-4c10-8782-c663dae249f3 13 | 14 | ### Translated video 15 | 16 | https://github.com/user-attachments/assets/a7b07820-a99c-4c95-80f6-e2c76f8d191b 17 | 18 | This AI-powered translation and video dubbing engine can translate audio and video files while cloning the original voices, adding subtitles, and synchronizing lip movements. The engine powers [VoiceCheap.ai](https://voicecheap.ai). 19 | 20 | ## ✨ Features 21 | 22 | - Voice cloning & generation 23 | - Automatic language detection 24 | - Speech adaptation for natural timing (SmartSync) 25 | - Background audio separation 26 | - Subtitle generation 27 | - Lip synchronization 28 | - Supports 35 languages 29 | 30 | ## 🧠 How It Works 31 | 32 | The dubbing process follows these steps: 33 | 34 | 1. **Configuration**: Select target language and options 35 | 2. 
**Transcription & Analysis**: 36 | - Identify source language 37 | - Transcribe audio 38 | - Generate context summary 39 | - Perform speaker diarization (identify different speakers) 40 | 3. **Translation**: 41 | - Format speech segments 42 | - Translate with LLM contextual awareness 43 | 4. **Audio Processing**: 44 | - Separate voices and background audio 45 | - Measure audio levels 46 | - Create timeline for each speaker 47 | 5. **Voice Generation**: 48 | - Clone each speaker's voice 49 | - Apply SmartSync adaptation to match timing 50 | - Adjust speed if necessary 51 | 6. **Final Assembly**: 52 | - Concatenate translated segments 53 | - Adjust audio levels and equalize 54 | - Merge translated voices with background audio 55 | - Add subtitles 56 | - Apply lip synchronization 57 | 58 | ### SmartSync Adaptation 59 | 60 | SmartSync adapts the speaker's speech based on language and speaking speed to match the original timing as closely as possible. When a literal translation would run too long, it intelligently reformulates sentences to maintain natural pacing and synchronization with the original speech. 61 | 62 | ## 🚀 Getting Started 63 | 64 | ### Prerequisites 65 | 66 | Before launching the project, make sure you have the following software installed: 67 | 68 | - **Node.js**: [Download Node.js](https://nodejs.org/) 69 | - **Bun**: JavaScript runtime & toolkit 70 | - **FFmpeg**: Audio/video processing tool 71 | - **API Keys**: For various services (see below) 72 | 73 | #### How to Install Required Software 74 | 75 | **Node.js** 76 | 77 | - **Windows / macOS / Linux**: Download and install from [https://nodejs.org/](https://nodejs.org/) 78 | 79 | **Bun** 80 | 81 | - **macOS / Linux / Windows (WSL)**: 82 | ```bash 83 | curl -fsSL https://bun.sh/install | bash 84 | ``` 85 | For more details, see [Bun's official install guide](https://bun.sh/docs/installation). 86 | 87 | **FFmpeg** 88 | 89 | - **macOS**: Install via Homebrew: 90 | ```bash 91 | brew install ffmpeg 92 | ``` 93 | - **Windows**: Download the latest build from [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html), extract, and add the `bin` folder to your PATH. 94 | - **Linux**: Install via package manager (e.g. Ubuntu/Debian): 95 | ```bash 96 | sudo apt update && sudo apt install ffmpeg 97 | ``` 98 | For other distributions, see [FFmpeg's official download page](https://ffmpeg.org/download.html). 99 | 100 | #### API Keys Required 101 | 102 | You will need API keys from the following services: 103 | 104 | - **OpenAI**: [Get your API key here](https://platform.openai.com/account/api-keys) 105 | - **Gladia**: [Sign up and get your API key here](https://app.gladia.io/) 106 | - **Eleven Labs**: [Sign up and get your API key here](https://elevenlabs.io/) 107 | - **Lalal.ai**: [Sign up and get your license key here](https://www.lalal.ai/) 108 | - **SyncLab**: [Sign up and get your API key here](https://synclab.ai/) 109 | - **Note**: SyncLab requires a subscription. To add lipsync to videos longer than 5 minutes, you must have a "Scale" plan. 110 | - **AWS (for lipsync)**: Create an account at [AWS](https://aws.amazon.com/) and generate S3 credentials if you want to use the lipsync feature. 
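In addition to the service keys above, the dubbing run itself is controlled by a few optional environment variables read in `src/core/index.ts`. The defaults shown below are taken from that file; the inline comments are explanatory notes, and `start.sh` normally guides you through these choices, so setting them by hand (for example in your `.env`) is optional:

```
TARGET_LANGUAGE=english      # language to dub into
DEBUG_MODE=no                # yes|no, verbose logging
NUM_SPEAKERS=auto-detect     # auto-detect, or 1 to 10
APPLY_LIPSYNC=no             # yes|no, requires SyncLab and AWS keys
ACTIVATE_SUBTITLE=yes        # yes|no, burns subtitles into the output video
```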
111 | 112 | Create a `.env` file based on the `.env.example` and fill in your API keys: 113 | 114 | ``` 115 | PORT=4000 116 | OPENAI_API_KEY=your_openai_api_key_here 117 | GLADIA_API_KEY=your_gladia_api_key_here 118 | ELEVEN_LABS_API_KEY=your_eleven_labs_api_key_here 119 | LALAL_LICENSE_KEY=your_lalal_license_key_here 120 | SYNC_LAB_API_KEY=your_sync_lab_api_key_here 121 | 122 | #AWS (For lipsync) 123 | AWS_S3_REGION=your_aws_s3_region_here 124 | AWS_ACCESS_KEY_ID=your_aws_access_key_id_here 125 | AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here 126 | AWS_BUCKET_NAME=your_aws_bucket_name_here 127 | ``` 128 | 129 | > **Note**: AWS credentials are only required for the lipsync feature. Users need a "Scale" subscription for SyncLab to add lipsync to videos longer than 5 minutes. 130 | 131 | > **Important**: It is mandatory to add your own API keys in the `.env` file for all services (excluding the SyncLab API key, which is optional). Without these keys, you will not be able to start the project. 132 | 133 | ### Installation & Usage 134 | 135 | 1. Clone the repository 136 | 2. Create and configure your `.env` file with the necessary API keys 137 | 3. Run the start script: 138 | 139 | ```bash 140 | ./start.sh 141 | ``` 142 | 143 | The script will: 144 | 145 | - Check for required dependencies 146 | - Verify environment variables 147 | - Install necessary packages 148 | - Guide you through the dubbing process 149 | 150 | ## 🛠️ Technology 151 | 152 | - **TypeScript**: Core programming language 153 | - **Bun**: JavaScript runtime and toolkit 154 | - **OpenAI**: Translation and text adaptation 155 | - **Gladia**: Audio transcription 156 | - **Eleven Labs**: Voice cloning and speech generation 157 | - **Lalal.ai**: Audio separation 158 | - **SyncLab**: Lip synchronization 159 | 160 | ## 🔤 Supported Languages 161 | 162 | The engine supports all these languages: 163 | 164 | | Accepted Input Language | Output Language | 165 | | ----------------------- | ------------------------------------------ | 166 | | Afrikaans | | 167 | | Albanian | | 168 | | Amharic | | 169 | | Arabic | Arabic | 170 | | Armenian | | 171 | | Azerbaijani | | 172 | | Bashkir | | 173 | | Belarusian | | 174 | | Bengali | | 175 | | Bosnian | | 176 | | Breton | | 177 | | Bulgarian | Bulgarian | 178 | | Burmese | | 179 | | Catalan | | 180 | | Chinese | Mandarin | 181 | | Croatian | Croatian | 182 | | Czech | Czech | 183 | | Danish | Danish | 184 | | Dutch | Dutch | 185 | | English | English, American English, British English | 186 | | Estonian | | 187 | | Finnish | Finnish | 188 | | French | French, French Canadian | 189 | | Galician | | 190 | | Georgian | | 191 | | German | German | 192 | | Greek | Greek | 193 | | Gujarati | | 194 | | Haitian | | 195 | | Hausa | | 196 | | Hebrew | | 197 | | Hindi | Hindi | 198 | | Hungarian | Hungarian | 199 | | Icelandic | | 200 | | Indonesian | Indonesian | 201 | | Italian | Italian | 202 | | Japanese | Japanese | 203 | | Javanese | | 204 | | Kannada | | 205 | | Kazakh | | 206 | | Korean | Korean | 207 | | Lao | | 208 | | Latvian | | 209 | | Lingala | | 210 | | Lithuanian | | 211 | | Luxembourgish | | 212 | | Macedonian | | 213 | | Malagasy | | 214 | | Malay | Malay | 215 | | Malayalam | | 216 | | Marathi | | 217 | | Moldavian | | 218 | | Moldovan | | 219 | | Mongolian | | 220 | | Nepali | | 221 | | Norwegian | Norwegian | 222 | | Occitan | | 223 | | Panjabi | | 224 | | Pashto | | 225 | | Persian | | 226 | | Polish | Polish | 227 | | Portuguese | Portuguese | 228 | | Pushto | | 229 | | Romanian | 
Romanian |
230 | | Russian | Russian |
231 | | Serbian | |
232 | | Sindhi | |
233 | | Sinhala | |
234 | | Slovak | Slovak |
235 | | Slovenian | |
236 | | Somali | |
237 | | Spanish | Spanish |
238 | | Sundanese | |
239 | | Swahili | |
240 | | Swedish | Swedish |
241 | | Tagalog | Tagalog |
242 | | Tamil | Tamil |
243 | | Turkish | Turkish |
244 | | Ukrainian | Ukrainian |
245 | | Urdu | |
246 | | Uzbek | |
247 | | Valencian | |
248 | | Vietnamese | Vietnamese |
249 | | Welsh | |
250 | | Yiddish | |
251 | | Yoruba | |
252 |
253 | ## 🤝 Contributing
254 |
255 | Contributions are welcome! Feel free to:
256 |
257 | - Star this repository to show support
258 | - Open issues for bugs or feature requests
259 | - Submit pull requests to improve the codebase
260 |
261 | ## ⚠️ Requirements
262 |
263 | For optimal performance and to use all features:
264 |
265 | - Ensure FFmpeg is properly installed
266 | - Configure all API keys
267 | - For lipsync features, AWS S3 credentials are required
268 | - SyncLab "Scale" subscription for longer videos
269 |
270 | ## 📄 License
271 |
272 | This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License.
273 | Personal and non-commercial use only.
274 | **Commercial use / SaaS / API integrations require a separate license — contact kevin.rousseau@voicecheap.ai** to access an enhanced API.
275 |
276 | View the full license at https://creativecommons.org/licenses/by-nc/4.0/
277 |
278 | ## 📊 Translation Quality & Model Options
279 |
280 | **The current stack (OpenAI + Gladia + ElevenLabs + Lalal.ai) comes from months of benchmarking; it’s still the most accurate & stable combo I’ve found, even if it costs more.**
281 |
282 | The quality of translations can be increased, depending on your needs and budget, by changing the AI models used:
283 |
284 | - **Translation Models**: You can instead use reasoning models such as o3-mini, or newer models such as o4-mini.
285 | - **Adaptation Quality**: For models that support reasoning effort (o1, o3-mini, o3, o1-Pro), you can increase the `reasoning_effort` parameter from `medium` to `high`.
286 |
287 | These options allow you to balance cost versus quality based on your specific requirements.
288 |
289 | ## 🏆 Smarter Models
290 |
291 | You can leverage models with superior performance on the [MMLU-Pro benchmark](https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro) for enhanced translation quality. Avoid using DeepL, as it lacks comprehensive context handling and instruction adherence.
292 |
293 | ## 🔧 Alternative Open-Source Models
294 |
295 | To reduce external API dependencies, consider using open-source alternatives:
296 |
297 | - **Transcription**: Whisper
298 | - **Text-to-Speech**: `hexgrad/Kokoro-82M`, Orpheus Speech from Canopy, SESAME models
299 | - **Translation & Adaptation**: LLAMA (see the sketch at the end of this README)
300 | - **Multi-language Voice Cloning**: _TBD_
301 | - **Lip Synchronization**: Wav2Lip
302 |
303 | ---
304 |
305 | If you find this project helpful, please consider giving it a ⭐ to show support!
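As a concrete illustration of the open-source route above: because `src/llm/openai.ts` already talks to OpenAI through the official SDK, the least invasive way to try a LLAMA-family model for translation and adaptation is an OpenAI-compatible local server. The sketch below is a hypothetical example and is not part of this repository; the local URL and the model tag are placeholders for whatever your own server (for instance Ollama's OpenAI-compatible endpoint) exposes.

```typescript
// Hypothetical sketch - not used anywhere in this repository.
// Assumes a locally running OpenAI-compatible server (e.g. Ollama) and a
// Llama model tag that you have pulled locally; adjust both to your setup.
import OpenAI from 'openai';

const localClient = new OpenAI({
  baseURL: 'http://localhost:11434/v1', // placeholder: your local endpoint
  apiKey: 'unused', // most local servers do not check the key
});

export async function translateLocally(text: string, targetLanguage: string): Promise<string> {
  const completion = await localClient.chat.completions.create({
    model: 'llama3.1', // placeholder: the model your server exposes
    messages: [
      {
        role: 'system',
        content: `Translate the user's message into ${targetLanguage}. Return only the translation.`,
      },
      { role: 'user', content: text },
    ],
  });

  return completion.choices[0]?.message.content ?? '';
}
```

Quality and timing adaptation will differ from the benchmarked OpenAI setup, so treat this as a starting point rather than a drop-in replacement.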
306 | -------------------------------------------------------------------------------- /assets/How-dubbing-works.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/assets/How-dubbing-works.png -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/bun.lockb -------------------------------------------------------------------------------- /input/example.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/input/example.mp4 -------------------------------------------------------------------------------- /output/example-result.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevinrss01/dubbing-engine/9bbf48ec16169e189bb893dd98f6576087e62c7d/output/example-result.mp4 -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dubbing-engine-bun", 3 | "module": "index.ts", 4 | "type": "module", 5 | "devDependencies": { 6 | "@types/bun": "latest", 7 | "@types/fluent-ffmpeg": "^2.1.27", 8 | "@typescript-eslint/eslint-plugin": "^8.29.1", 9 | "@typescript-eslint/parser": "^8.29.1", 10 | "axios": "^1.8.4", 11 | "elevenlabs": "^1.56.1", 12 | "eslint": "^9.24.0", 13 | "eslint-config-prettier": "^10.1.2", 14 | "eslint-plugin-prettier": "^5.2.6", 15 | "ffprobe-static": "^3.1.0", 16 | "fluent-ffmpeg": "^2.1.3", 17 | "form-data": "^4.0.2", 18 | "openai": "^4.90.0", 19 | "prettier": "^3.5.3", 20 | "tmp-promise": "^3.0.3" 21 | }, 22 | "peerDependencies": { 23 | "typescript": "^5.0.0" 24 | }, 25 | "dependencies": { 26 | "@aws-sdk/client-s3": "^3.787.0", 27 | "@types/ffprobe-static": "^2.0.3", 28 | "@types/qs": "^6.9.18", 29 | "qs": "^6.14.0" 30 | } 31 | } -------------------------------------------------------------------------------- /src/core/index.ts: -------------------------------------------------------------------------------- 1 | import { SubtitlesGenerator } from './../subtitles/subtitles-generator'; 2 | import { AudioUtils } from '../ffmpeg/audio-utils'; 3 | import { Helpers } from '../utils/helpers'; 4 | import { Transcriber } from '../transcription/transcriber'; 5 | import type { AllowedLanguages, AudioOriginalLangAllowed, TranscriptionDataTypes } from '../types'; 6 | import { Formatter } from '../transcription/formatter'; 7 | import { TextTranslator } from '../transcription/textTranslator'; 8 | import { Spleeter } from '../separator/spleeter'; 9 | import { SpeechGenerator } from '../speech/speechGenerator'; 10 | import { Adaptation } from '../smart-sync/adaptation'; 11 | import { VideoUtils } from '../ffmpeg/video-utils'; 12 | import fsPromises from 'fs/promises'; 13 | import fs from 'fs'; 14 | import { Lipsync } from '../lipsync/lipsync'; 15 | import crypto from 'crypto'; 16 | 17 | export type DebugMode = 'yes' | 'no'; 18 | export type NumberOfSpeakers = 'auto-detect' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | '10'; 19 | export type ActivateLipSync = 'yes' | 'no'; 20 | export type ActivateSubtitle = 
'yes' | 'no'; 21 | 22 | export const translate = async () => { 23 | const targetLanguage = (process.env.TARGET_LANGUAGE || 'english') as AllowedLanguages; 24 | const debugMode: DebugMode = (process.env.DEBUG_MODE as DebugMode) || 'no'; 25 | const numberOfSpeakers: NumberOfSpeakers = (process.env.NUM_SPEAKERS as NumberOfSpeakers) || 'auto-detect'; 26 | const activateLipSync: ActivateLipSync = (process.env.APPLY_LIPSYNC as ActivateLipSync) || 'no'; 27 | const activateSubtitle: ActivateSubtitle = (process.env.ACTIVATE_SUBTITLE as ActivateSubtitle) || 'yes'; 28 | 29 | let clonedVoicesIdsToDelete: string[] = []; 30 | 31 | const transcriptionData: TranscriptionDataTypes = { 32 | summary: null, 33 | formattedSegments: [], 34 | detectedAudioLanguage: null, 35 | }; 36 | 37 | if (debugMode === 'no') { 38 | console.debug = () => {}; 39 | console.info('Dubbing Started successfully with the following parameters:'); 40 | console.info('Target Language: ', targetLanguage); 41 | console.info('Debug Mode: ', debugMode); 42 | console.info('Number of Speakers: ', numberOfSpeakers); 43 | console.info('Activate Lip Sync: ', activateLipSync); 44 | console.info('Activate Subtitle: ', activateSubtitle); 45 | } 46 | 47 | Helpers.verifyPrerequisitesForDubbing(); 48 | 49 | let inputFilePath = ''; 50 | let videoPathWithoutAudio = null; 51 | let audioPathWithoutVideo = null; 52 | let backgroundAudio = null; 53 | let vocalsIsolated = null; 54 | 55 | try { 56 | inputFilePath = await Helpers.getAllInputFilePaths(); 57 | const fileType = Helpers.getFileType(inputFilePath); 58 | 59 | if (fileType === 'video') { 60 | const { videoPath, audioPath } = await AudioUtils.separateAudioAndVideo(inputFilePath); 61 | videoPathWithoutAudio = videoPath; 62 | audioPathWithoutVideo = audioPath; 63 | } else { 64 | const audioPathCopy = `temporary-files/original-audio-${crypto.randomUUID()}.wav`; 65 | await fsPromises.copyFile(inputFilePath, audioPathCopy); 66 | audioPathWithoutVideo = audioPathCopy; 67 | } 68 | 69 | const transcription = await Transcriber.transcribeAudio({ 70 | audioPath: audioPathWithoutVideo, 71 | numberOfSpeakers, 72 | }); 73 | 74 | transcriptionData.detectedAudioLanguage = transcription.result.transcription 75 | .languages[0] as AudioOriginalLangAllowed; 76 | 77 | const transcriptionSummary = transcription.result.summarization.results; 78 | 79 | const formattedTranscription = Formatter.formatTranscription( 80 | transcription, 81 | transcriptionData.detectedAudioLanguage, 82 | ); 83 | 84 | const translatedTranscription = await TextTranslator.translateTranscriptionInTargetLanguage({ 85 | transcription: formattedTranscription, 86 | targetLanguage, 87 | originLanguage: transcriptionData.detectedAudioLanguage, 88 | transcriptionSummary: transcriptionSummary || '', 89 | }); 90 | 91 | const verifiedTranscription = Helpers.parseAndVerifyTranscriptionDetails( 92 | JSON.stringify(translatedTranscription), 93 | ); 94 | 95 | ({ backgroundAudio, vocalsIsolated } = await Spleeter.getSeparateAudio(audioPathWithoutVideo)); 96 | const isolatedVocalsAverageDecibel = await AudioUtils.getAverageDecibel(vocalsIsolated); 97 | 98 | const { allResultsSorted, clonedVoicesIds } = await SpeechGenerator.getSpeechArrayFromTranscriptions({ 99 | segments: verifiedTranscription, 100 | targetLanguage, 101 | isolatedVocalsPath: vocalsIsolated, 102 | }); 103 | 104 | clonedVoicesIdsToDelete = Object.values(clonedVoicesIds); 105 | 106 | const speechWithDuration = await SpeechGenerator.getEachSpeechDuration({ 107 | speechArray: allResultsSorted, 108 | 
transcriptions: verifiedTranscription, 109 | }); 110 | 111 | const speechesWithoutSilence = 112 | await SpeechGenerator.removeStartAndEndSilenceFromAllAudio(speechWithDuration); 113 | 114 | const adaptedSpeeches = await Adaptation.compareAndAdjustSpeeches({ 115 | transcriptions: verifiedTranscription, 116 | speeches: speechesWithoutSilence, 117 | clonedVoicesIds, 118 | originalLanguage: transcriptionData.detectedAudioLanguage, 119 | targetLanguage, 120 | transcriptionSummary, 121 | }); 122 | 123 | const finalVoicesAudioTrack = 124 | await SpeechGenerator.createAndAssembleSeparateAudioTracksEachSpeaker(adaptedSpeeches); 125 | 126 | const equalizedAudio = await AudioUtils.startEqualizeAudio(finalVoicesAudioTrack); 127 | 128 | await AudioUtils.adjustAudioToDecibel(equalizedAudio, isolatedVocalsAverageDecibel); 129 | 130 | const mergedAudio = await SpeechGenerator.overlayAudioAndBackgroundMusic(equalizedAudio, backgroundAudio); 131 | 132 | let finalContent = 133 | fileType === 'audio' 134 | ? mergedAudio 135 | : await VideoUtils.getAudioMergeWithVideo(videoPathWithoutAudio!, mergedAudio); 136 | 137 | if (fileType === 'video' && activateSubtitle === 'yes') { 138 | const filePathVideoSubtitles = await SubtitlesGenerator.addSubtitlesInVideo({ 139 | transcriptionData: verifiedTranscription, 140 | initialVideoPath: finalContent, 141 | lang: targetLanguage, 142 | }); 143 | 144 | finalContent = filePathVideoSubtitles; 145 | } 146 | 147 | if (fileType === 'video' && activateLipSync === 'yes') { 148 | const lipSyncedVideoUrl = await Lipsync.processLipSyncWithAwsUpload({ 149 | localAudioPath: mergedAudio, 150 | localVideoPath: finalContent, 151 | }); 152 | 153 | const lipSyncedVideo = await fetch(lipSyncedVideoUrl).then((res) => res.arrayBuffer()); 154 | const lipSyncedVideoBuffer = Buffer.from(lipSyncedVideo); 155 | const newFilePath = `output/result-${crypto.randomUUID()}.mp4`; 156 | await fsPromises.writeFile(newFilePath, lipSyncedVideoBuffer); 157 | 158 | finalContent = newFilePath; 159 | } 160 | 161 | if (fileType === 'video') { 162 | if (fs.existsSync(mergedAudio)) await fsPromises.unlink(mergedAudio); 163 | } 164 | 165 | console.info('Translation completed successfully, you can now find your video in the output folder.'); 166 | } catch (error) { 167 | if (error instanceof Error) { 168 | console.error('Error:', error.message); 169 | } else { 170 | console.error('Error:', error); 171 | } 172 | } finally { 173 | if (videoPathWithoutAudio && fs.existsSync(videoPathWithoutAudio)) 174 | await fsPromises.unlink(videoPathWithoutAudio); 175 | if (audioPathWithoutVideo && fs.existsSync(audioPathWithoutVideo)) 176 | await fsPromises.unlink(audioPathWithoutVideo); 177 | if (backgroundAudio && fs.existsSync(backgroundAudio)) await fsPromises.unlink(backgroundAudio); 178 | if (vocalsIsolated && fs.existsSync(vocalsIsolated)) await fsPromises.unlink(vocalsIsolated); 179 | } 180 | }; 181 | 182 | translate(); 183 | -------------------------------------------------------------------------------- /src/elevenlabs/elevenlabs.ts: -------------------------------------------------------------------------------- 1 | import fsPromise from 'fs/promises'; 2 | import axios from 'axios'; 3 | import * as crypto from 'crypto'; 4 | import { ElevenLabsClient } from 'elevenlabs'; 5 | import FormData from 'form-data'; 6 | import fs from 'fs'; 7 | import type { AllowedLanguages } from '../types/index'; 8 | import { Readable } from 'stream'; 9 | import { AudioUtils } from '../ffmpeg/audio-utils'; 10 | interface LabelPerLanguage { 11 
| [key: string]: { 12 | accent: string; 13 | langue: string; 14 | language: string; 15 | }; 16 | } 17 | 18 | interface SettingsElevenLabs { 19 | text: string; 20 | model_id: 'eleven_monolingual_v2' | 'eleven_multilingual_v2'; 21 | output_format: 22 | | 'mp3_22050_32' 23 | | 'mp3_44100_32' 24 | | 'mp3_44100_64' 25 | | 'mp3_44100_96' 26 | | 'mp3_44100_128' 27 | | 'mp3_44100_192' 28 | | 'pcm_16000' 29 | | 'pcm_22050' 30 | | 'pcm_24000' 31 | | 'pcm_44100' 32 | | 'ulaw_8000'; 33 | voice_settings: { 34 | similarity_boost: number; 35 | stability: number; 36 | use_speaker_boost: boolean; 37 | speed?: number; //max 1.2 min 0.8 38 | }; 39 | previous_text?: string; 40 | next_text?: string; 41 | labels?: { 42 | accent: string; 43 | langue: string; 44 | language: string; 45 | }; 46 | previous_request_ids?: PreviousRequestIdsEL; 47 | } 48 | 49 | //Max 3 previous request ids 50 | export type PreviousRequestIdsEL = string[]; 51 | /* 52 | 53 | 54 | 55 | 56 | **Stability 57 | *The stability slider determines how stable the voice is and the randomness between each generation. 58 | *Lowering this slider introduces a broader emotional range for the voice. 59 | *As mentioned before, this is also influenced heavily by the original voice. 60 | *Setting the slider too low may result in odd performances that are overly 61 | *random and cause the character to speak too quickly. 62 | *On the other hand, setting it too high can lead to a monotonous voice with limited emotion. 63 | 64 | 65 | **Similarity 66 | The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. 67 | If the original audio is of poor quality and the similarity slider is set too high, the AI may reproduce artifacts or background noise when trying to mimic the voice if those were present in the original recording. 68 | */ 69 | 70 | /* 71 | 72 | **Speaker Boost 73 | This is another setting that was introduced in the new models. 74 | The setting itself is quite self-explanatory – it boosts the similarity to the original speaker. 75 | However, using this setting requires a slightly higher computational load, which in turn increases latency. 76 | The differences introduced by this setting are generally rather subtle. 
77 | 78 | */ 79 | 80 | export type OutputFormat = 81 | | 'mp3_22050_32' 82 | | 'mp3_44100_32' 83 | | 'mp3_44100_64' 84 | | 'mp3_44100_96' 85 | | 'mp3_44100_128' 86 | | 'mp3_44100_192' 87 | | 'pcm_8000' 88 | | 'pcm_16000' 89 | | 'pcm_22050' 90 | | 'pcm_24000' 91 | | 'pcm_44100' 92 | | 'ulaw_8000' 93 | | 'alaw_8000' 94 | | 'opus_48000_32' 95 | | 'opus_48000_64' 96 | | 'opus_48000_96' 97 | | 'opus_48000_128' 98 | | 'opus_48000_192'; 99 | 100 | export class ElevenLabsService { 101 | elevenLabsApiKey: string | undefined; 102 | elevenLabsBaseUrl = 'https://api.elevenlabs.io/v1'; 103 | elevenLabsClient: ElevenLabsClient; 104 | 105 | constructor() { 106 | this.elevenLabsApiKey = process.env.ELEVEN_LABS_API_KEY; 107 | if (!this.elevenLabsApiKey) { 108 | throw new Error('ELEVEN_LABS_API_KEY is not defined'); 109 | } 110 | this.elevenLabsClient = new ElevenLabsClient({ 111 | apiKey: this.elevenLabsApiKey, 112 | }); 113 | } 114 | 115 | getLabels(targetLanguage: AllowedLanguages): 116 | | { 117 | accent: string; 118 | langue: string; 119 | language: string; 120 | } 121 | | undefined { 122 | const labelsPerLanguage: LabelPerLanguage = { 123 | french: { accent: 'french', langue: 'french', language: 'french' }, 124 | 'british english': { 125 | accent: 'british', 126 | langue: 'english', 127 | language: 'english', 128 | }, 129 | english: { 130 | accent: 'american', 131 | langue: 'english', 132 | language: 'english', 133 | }, 134 | 'french canadian': { 135 | accent: 'canadian', 136 | langue: 'french', 137 | language: 'french', 138 | }, 139 | vietnamese: { 140 | accent: 'vietnamese', 141 | langue: 'vietnamese', 142 | language: 'vietnamese', 143 | }, 144 | }; 145 | 146 | if (!labelsPerLanguage[targetLanguage]) { 147 | return undefined; 148 | } else { 149 | return labelsPerLanguage[targetLanguage]; 150 | } 151 | } 152 | 153 | // In the `cloneVoice` method of the `ElevenLabsService` class 154 | 155 | generateShortId(length: number): string { 156 | const characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'; 157 | let result = ''; 158 | const charactersLength = characters.length; 159 | for (let i = 0; i < length; i++) { 160 | result += characters.charAt(Math.floor(Math.random() * charactersLength)); 161 | } 162 | return result; 163 | } 164 | 165 | async cloneVoice( 166 | baseAudio: Buffer[], 167 | voiceName: string, 168 | totalDuration: number, 169 | ): Promise<{ voice_id: string }> { 170 | console.debug('Cloning voice...'); 171 | const maxDuration = 44 * 60; // 44 minutes in seconds 172 | const maxBufferSize = 10 * 1024 * 1024; // 10MB in bytes 173 | 174 | let processedAudio = baseAudio; 175 | 176 | let concatenatedBuffer = this.concatenateAudioBuffers(baseAudio); 177 | 178 | // Trim the audio buffer if it exceeds 44 minutes 179 | if (totalDuration > maxDuration) { 180 | concatenatedBuffer = await AudioUtils.trimAudioBuffer(concatenatedBuffer, maxDuration); 181 | 182 | processedAudio = this.splitBufferIntoChunks(concatenatedBuffer, maxBufferSize); 183 | } 184 | 185 | // Split the buffer into chunks not exceeding 10MB 186 | const uuid = crypto.randomUUID(); 187 | const shortId = this.generateShortId(6); 188 | const url = `${this.elevenLabsBaseUrl}/voices/add`; 189 | 190 | const formData = new FormData(); 191 | formData.append('name', `custom-voice-${shortId}`); 192 | formData.append('description', voiceName); 193 | 194 | processedAudio.forEach((audioBuffer, index) => { 195 | formData.append('files', audioBuffer, { 196 | filename: `${uuid}-${index}.mp3`, 197 | contentType: 'audio/mp3', 198 
| }); 199 | }); 200 | 201 | try { 202 | const response = await axios.post(url, formData, { 203 | headers: { 204 | ...formData.getHeaders(), 205 | 'xi-api-key': this.elevenLabsApiKey, 206 | }, 207 | }); 208 | console.debug('One Voice cloned.'); 209 | return response.data; 210 | } catch (error: any) { 211 | console.error('Error in voice cloning:', error.response.data); 212 | if (error.response.data?.detail?.message?.includes('corrupted')) { 213 | throw new Error('Error during voice cloning, audio file is corrupted.'); 214 | } 215 | throw new Error('Error during voice cloning'); 216 | } 217 | } 218 | 219 | private splitBufferIntoChunks(buffer: Buffer, maxChunkSize: number): Buffer[] { 220 | const chunks: Buffer[] = []; 221 | let start = 0; 222 | 223 | while (start < buffer.length) { 224 | const end = Math.min(buffer.length, start + maxChunkSize); 225 | chunks.push(buffer.slice(start, end)); 226 | start = end; 227 | } 228 | 229 | return chunks; 230 | } 231 | 232 | /** 233 | * Concatenates an array of audio buffers into a single buffer 234 | * @param audioBuffers Array of audio buffers to concatenate 235 | * @returns A single concatenated buffer 236 | */ 237 | concatenateAudioBuffers(audioBuffers: Buffer[]): Buffer { 238 | // Validate input 239 | if (!audioBuffers || !Array.isArray(audioBuffers) || audioBuffers.length === 0) { 240 | throw new Error('Invalid input: audioBuffers must be a non-empty array of Buffer objects'); 241 | } 242 | 243 | // Check if all elements are Buffer instances 244 | for (const buffer of audioBuffers) { 245 | if (!(buffer instanceof Buffer)) { 246 | throw new Error('Invalid input: all elements in audioBuffers must be Buffer instances'); 247 | } 248 | } 249 | 250 | // Concatenate all buffers into a single buffer 251 | return Buffer.concat(audioBuffers); 252 | } 253 | 254 | async generateAudioFile({ 255 | text, 256 | modelId, 257 | voiceId, 258 | previousText, 259 | nextText, 260 | targetLanguage, 261 | speedFactor, 262 | }: { 263 | text: string; 264 | modelId: 'eleven_monolingual_v2' | 'eleven_multilingual_v2'; 265 | voiceId: string; 266 | previousText?: string; 267 | nextText?: string; 268 | targetLanguage?: AllowedLanguages; 269 | speedFactor?: number; 270 | }): Promise<{ 271 | response: Buffer; 272 | requestId: string; 273 | }> { 274 | const outputFormat: OutputFormat = 'mp3_44100_128'; 275 | 276 | const settingsElevenLabs: SettingsElevenLabs = { 277 | text: text, 278 | model_id: modelId, 279 | labels: targetLanguage ? this.getLabels(targetLanguage) : undefined, 280 | voice_settings: { 281 | similarity_boost: 0.85, 282 | stability: 0.5, 283 | use_speaker_boost: true, 284 | }, 285 | output_format: outputFormat, 286 | //! MP3 with 192kbps bitrate requires you to be subscribed to Creator tier or above. PCM with 44.1kHz sample rate requires you to be subscribed to Pro tier or above. 
287 | //output_format: 'pcm_44100', 288 | }; 289 | 290 | if (previousText) settingsElevenLabs.previous_text = previousText + ' '; 291 | if (nextText) settingsElevenLabs.next_text = ' ' + nextText; 292 | if (speedFactor) settingsElevenLabs.voice_settings.speed = Number(speedFactor.toFixed(2)); 293 | 294 | // Maximum 3 tries 295 | const maxAttempts = 3; 296 | let attempt = 0; 297 | 298 | while (attempt < maxAttempts) { 299 | try { 300 | const res = await this.elevenLabsClient.textToSpeech.convert(voiceId, settingsElevenLabs); 301 | 302 | console.debug(`Speech 11labs generated on attempt ${attempt + 1}.`); 303 | 304 | async function readableToBuffer(readable: Readable): Promise { 305 | const chunks: Buffer[] = []; 306 | 307 | for await (const chunk of readable) { 308 | chunks.push(Buffer.from(chunk)); 309 | } 310 | 311 | return Buffer.concat(chunks); 312 | } 313 | 314 | const buffer = await readableToBuffer(res); 315 | 316 | const audioBuffer = 317 | outputFormat === 'mp3_22050_32' ? buffer : await AudioUtils.convertPCMBufferToWav(buffer); 318 | 319 | return { 320 | response: audioBuffer, 321 | requestId: crypto.randomUUID(), 322 | }; 323 | } catch (error: any) { 324 | console.error(`ERROR IN AUDIO GENERATION (attempt ${attempt + 1}):`, error); 325 | 326 | if (error.toString().includes('Status code: 401')) { 327 | throw new Error( 328 | 'The voice you are trying to translate cannot be cloned, because it is a protected voice.', 329 | ); 330 | } 331 | 332 | attempt++; 333 | 334 | if (attempt < maxAttempts) { 335 | console.debug('Waiting 10 seconds before next attempt...'); 336 | await new Promise((resolve) => setTimeout(resolve, 10000)); 337 | } else { 338 | throw new Error('Error during audio generation after multiple attempts'); 339 | } 340 | } 341 | } 342 | 343 | // In theory, we should never reach here, but just in case: 344 | throw new Error('Error during audio generation after multiple attempts'); 345 | } 346 | 347 | async isolateVoiceFromAudio(audioFilePath: string) { 348 | try { 349 | console.debug('Isolating voice from audio....'); 350 | 351 | const url = `${this.elevenLabsBaseUrl}/audio-isolation/stream`; 352 | const formData = new FormData(); 353 | formData.append('audio', fs.createReadStream(audioFilePath)); 354 | 355 | const response = await axios.post(url, formData, { 356 | headers: { 357 | ...formData.getHeaders(), 358 | 'xi-api-key': this.elevenLabsApiKey, 359 | }, 360 | responseType: 'arraybuffer', 361 | }); 362 | 363 | console.debug('Voice isolated successfully from audio.'); 364 | 365 | const vocalIsolatedBuffer = Buffer.from(response.data); 366 | const outputFilePath = audioFilePath.includes('.wav') 367 | ? audioFilePath.replace('.wav', '-vocal.wav') 368 | : audioFilePath.replace('.mp3', '-vocal.mp3'); 369 | 370 | await fsPromise.writeFile(outputFilePath, vocalIsolatedBuffer); 371 | 372 | return outputFilePath; 373 | } catch (err: any) { 374 | console.error('Error in isolateVoiceFromAudio:', err); 375 | throw new Error('Error during voice isolation'); 376 | } 377 | } 378 | } 379 | -------------------------------------------------------------------------------- /src/ffmpeg/ffmpegPatch.ts: -------------------------------------------------------------------------------- 1 | import ffmpeg from 'fluent-ffmpeg'; 2 | 3 | /** 4 | * Applies a temporary workaround to add the 'lavfi' format. 5 | * This patch adds 'lavfi' to the available formats returned by ffmpeg. 6 | * 7 | * @param command - An instance of ffmpeg.FfmpegCommand to patch. 
8 | * @returns The patched ffmpeg command instance. 9 | */ 10 | export function applyLavfiWorkaround( 11 | command: ffmpeg.FfmpegCommand, 12 | ): ffmpeg.FfmpegCommand { 13 | // Save the original availableFormats function. 14 | const originalAvailableFormats = command.availableFormats; 15 | 16 | // Override availableFormats to inject the 'lavfi' format. 17 | command.availableFormats = (callback: (err: any, data: any) => void) => { 18 | originalAvailableFormats.call(command, (err: any, data: any) => { 19 | // If lavfi is not present, add it. 20 | if (!data.lavfi) { 21 | data.lavfi = { 22 | canDemux: true, // lavfi can be used as input 23 | canMux: false, // lavfi cannot be used as output 24 | description: 'Libavfilter virtual input device', 25 | }; 26 | } 27 | callback(err, data); 28 | }); 29 | }; 30 | 31 | return command; 32 | } 33 | -------------------------------------------------------------------------------- /src/ffmpeg/video-utils.ts: -------------------------------------------------------------------------------- 1 | import ffmpeg from 'fluent-ffmpeg'; 2 | import fs from 'fs'; 3 | import type { Readable } from 'stream'; 4 | import path from 'path'; 5 | import crypto from 'crypto'; 6 | import { createReadStream } from 'fs'; 7 | import { promisify } from 'util'; 8 | 9 | export class VideoUtils { 10 | static async getFileDuration(filePath: string): Promise { 11 | return new Promise((resolve, reject) => { 12 | if (!filePath) { 13 | console.error('No file path provided'); 14 | return reject(new Error('No file path provided')); 15 | } 16 | 17 | if (!fs.existsSync(filePath)) { 18 | console.error(`File not found: ${filePath}`); 19 | return reject(new Error('File not found or inaccessible')); 20 | } 21 | 22 | try { 23 | ffmpeg.ffprobe(filePath, (err, metadata) => { 24 | if (err) { 25 | console.error('Error while getting file duration:', err, metadata); 26 | 27 | const errorMessage = err.message?.toLowerCase() || ''; 28 | if (errorMessage.includes('invalid data') || errorMessage.includes('unsupported format')) { 29 | return reject(new Error('Invalid or unsupported media format')); 30 | } 31 | if (errorMessage.includes('permission denied')) { 32 | return reject(new Error('Permission denied to access file')); 33 | } 34 | 35 | return reject(new Error('Failed to process media file')); 36 | } 37 | 38 | if (!metadata?.format?.duration) { 39 | console.error('No duration found in metadata:', { 40 | filePath, 41 | metadata: metadata?.format, 42 | }); 43 | return reject(new Error('Could not determine media duration')); 44 | } 45 | 46 | const duration = metadata.format.duration; 47 | if (typeof duration !== 'number' || isNaN(duration) || duration <= 0) { 48 | console.error('Invalid duration value:', duration); 49 | console.error('metadata of the file:', metadata); 50 | } 51 | 52 | resolve(duration); 53 | }); 54 | } catch (error) { 55 | console.error('Unexpected error in getFileDuration:', error); 56 | reject(new Error('Internal server error while processing media')); 57 | } 58 | }); 59 | } 60 | 61 | static async getAudioMergeWithVideo(videoPath: string, audioPath: string): Promise { 62 | console.debug('Merging audio and video...'); 63 | let filePath = ''; 64 | try { 65 | const outputPath = path.join(`output/result-${crypto.randomUUID()}.mp4`); 66 | const contentLength = await this.getFileDuration(audioPath); 67 | 68 | if (typeof contentLength !== 'number') 69 | throw new Error( 70 | `Error during audio duration when merging audio and video: duration is not a number: ${contentLength}`, 71 | ); 72 | 73 | 
filePath = await this.mergeAudioAndVideo({ 74 | videoPath, 75 | audioPath, 76 | outputPath, 77 | }); 78 | 79 | console.debug('Audio and video merged.'); 80 | 81 | return filePath; 82 | } catch (e) { 83 | console.error(e); 84 | throw new Error('Error while merging audio and video'); 85 | } 86 | } 87 | 88 | static mergeAudioAndVideo = async ({ 89 | videoPath, 90 | audioPath, 91 | outputPath, 92 | }: { 93 | videoPath: string; 94 | audioPath: string; 95 | outputPath: string; 96 | }): Promise => { 97 | console.debug('Merging audio and video...'); 98 | 99 | const fileExtension = path.extname(videoPath).substring(1).toLowerCase(); 100 | 101 | // Helper to probe the audio track 102 | const ffprobePromise = promisify(ffmpeg.ffprobe); 103 | const audioMetadata = (await ffprobePromise(audioPath)) as { 104 | streams: Array<{ codec_type: string; codec_name: string }>; 105 | }; 106 | const audioStreamIndex = audioMetadata.streams.findIndex((stream) => stream.codec_type === 'audio'); 107 | if (audioStreamIndex === -1) { 108 | throw new Error('No valid audio track found in the provided audio file'); 109 | } 110 | 111 | const videoMetadata = (await ffprobePromise(videoPath)) as { 112 | streams: Array<{ codec_type: string; codec_name: string }>; 113 | }; 114 | 115 | const videoStreamIndex = videoMetadata.streams.findIndex((stream) => stream.codec_type === 'video'); 116 | 117 | if (videoStreamIndex === -1) { 118 | throw new Error('No valid video track found in the provided video file'); 119 | } 120 | 121 | const isAAC = audioMetadata.streams.some( 122 | (stream) => stream.codec_type === 'audio' && stream.codec_name === 'aac', 123 | ); 124 | 125 | return new Promise((resolve, reject) => { 126 | const command = ffmpeg() 127 | .input(videoPath) 128 | .input(audioPath) 129 | // English comment: Map video from the first input, audio from the second 130 | .outputOptions([ 131 | // Map the correct video track from the 1st input 132 | `-map 0:${videoStreamIndex}`, 133 | // Map the correct audio track from the 2nd input 134 | `-map 1:${audioStreamIndex}`, 135 | 136 | // Always copy video to avoid re-encoding (faster + no quality loss) 137 | '-c:v copy', 138 | 139 | // If audio is already AAC, copy it; otherwise encode to AAC 140 | isAAC ? 
'-c:a copy' : '-c:a aac', 141 | 142 | // Only apply bitrate if we are encoding 143 | // (this will be ignored if we're copying) 144 | '-b:a 320k', 145 | '-ar 48000', 146 | 147 | // Enable faststart for quick playback start in MP4 148 | '-movflags +faststart', 149 | 150 | // Use all available CPU threads for any encoding 151 | '-threads 0', 152 | ]) 153 | .format(fileExtension) 154 | .output(outputPath) 155 | .on('error', (err) => { 156 | console.error('Error merging audio/video:', err); 157 | reject(err); 158 | }) 159 | .on('stderr', (line) => { 160 | if (line.toLowerCase().includes('error')) { 161 | console.error('FFmpeg error:', line); 162 | } 163 | }) 164 | .on('end', () => { 165 | console.debug('Merging succeeded with minimal re-encoding.'); 166 | resolve(outputPath); 167 | }); 168 | 169 | command.run(); 170 | }); 171 | }; 172 | 173 | static addSubtitles = async ({ 174 | videoPath, 175 | srtFilePath, 176 | outputFilePath, 177 | }: { 178 | videoPath: string; 179 | srtFilePath: string; 180 | outputFilePath: string; 181 | }) => { 182 | if (!fs.existsSync(srtFilePath)) { 183 | throw new Error('Srt file does not exist'); 184 | } 185 | 186 | return new Promise((resolve, reject) => { 187 | // Get input file info 188 | ffmpeg.ffprobe(videoPath, (err, metadata) => { 189 | if (err) { 190 | console.error('Error probing video file:', err); 191 | return reject(err); 192 | } 193 | 194 | // Check if we're dealing with an HEVC/H.265 video 195 | const videoStream = metadata.streams.find((stream) => stream.codec_type === 'video'); 196 | const isHEVC = 197 | videoStream && videoStream.codec_name && videoStream.codec_name.toLowerCase().includes('hevc'); 198 | const is10bit = videoStream && videoStream.pix_fmt && videoStream.pix_fmt.includes('10le'); 199 | 200 | console.debug( 201 | `Video info: codec=${videoStream?.codec_name}, pixel format=${videoStream?.pix_fmt}, isHEVC=${isHEVC}, is10bit=${is10bit}`, 202 | ); 203 | 204 | let command = ffmpeg(videoPath); 205 | 206 | // Add subtitles filter with compatible font 207 | const subtitlesFilter = `subtitles=${srtFilePath}:force_style='FontName=DejaVu'`; 208 | 209 | if (isHEVC || is10bit) { 210 | // For HEVC/10-bit videos that need browser compatibility: 211 | console.debug('Converting HEVC/10-bit video to browser-compatible format'); 212 | command = command 213 | .videoCodec('libx264') // Use H.264 which has better browser support 214 | .outputOptions([ 215 | '-vf', 216 | subtitlesFilter, 217 | '-pix_fmt', 218 | 'yuv420p', // Convert to 8-bit color 219 | '-crf', 220 | '18', // High quality 221 | '-preset', 222 | 'medium', // Balance between speed and quality 223 | '-movflags', 224 | '+faststart', // Optimize for web playback 225 | '-c:a', 226 | 'aac', // Convert audio to AAC for compatibility 227 | '-b:a', 228 | '320k', // Good audio quality 229 | ]); 230 | } else { 231 | // For already compatible videos, minimal processing 232 | command = command.videoCodec('libx264').outputOptions([ 233 | '-vf', 234 | subtitlesFilter, 235 | '-pix_fmt', 236 | 'yuv420p', // Ensure 8-bit color 237 | '-c:a', 238 | 'copy', // Copy audio stream 239 | '-movflags', 240 | '+faststart', // Optimize for web playback 241 | ]); 242 | } 243 | 244 | command 245 | .on('start', (commandLine) => { 246 | console.debug('FFmpeg command:', commandLine); 247 | }) 248 | .on('stderr', (stderrLine) => { 249 | if (stderrLine.includes('error')) { 250 | console.error('FFmpeg stderr:', stderrLine); 251 | } 252 | }) 253 | .on('end', () => { 254 | console.debug('Subtitles added successfully'); 255 | 
resolve(outputFilePath); 256 | }) 257 | .on('error', (err) => { 258 | console.error('Error adding subtitles:', err); 259 | reject(err); 260 | }) 261 | .save(outputFilePath); 262 | }); 263 | }); 264 | }; 265 | } 266 | -------------------------------------------------------------------------------- /src/lipsync/lipsync.ts: -------------------------------------------------------------------------------- 1 | import type { AxiosResponse } from 'axios'; 2 | import type { SyncLabInitialResponse, SynclabV2RequestBody } from '../types/lipsync'; 3 | import axios from 'axios'; 4 | import fs from 'fs'; 5 | import { S3Client, PutObjectCommand, HeadObjectCommand, DeleteObjectCommand } from '@aws-sdk/client-s3'; 6 | 7 | export class Lipsync { 8 | static async startLipSync({ audioPath, videoPath }: { audioPath: string; videoPath: string }) { 9 | try { 10 | console.debug('Verifying usage links for lip sync...'); 11 | 12 | const syncLabResponse = await this.sendLipSyncRequest({ 13 | audioUrl: audioPath, 14 | videoUrl: videoPath, 15 | }); 16 | 17 | return syncLabResponse; 18 | } catch (error) { 19 | console.error(error); 20 | throw new Error('Error during lip sync request'); 21 | } 22 | } 23 | 24 | static async sendLipSyncRequest({ 25 | audioUrl, 26 | videoUrl, 27 | }: { 28 | audioUrl: string; 29 | videoUrl: string; 30 | }): Promise { 31 | const url = 'https://api.sync.so/v2/generate'; 32 | const body: SynclabV2RequestBody = { 33 | input: [ 34 | { 35 | type: 'video', 36 | url: videoUrl, 37 | }, 38 | { 39 | type: 'audio', 40 | url: audioUrl, 41 | }, 42 | ], 43 | options: { 44 | output_format: 'mp4', 45 | active_speaker: true, 46 | }, 47 | model: 'lipsync-2', 48 | }; 49 | 50 | const headers = { 51 | accept: 'application/json', 52 | 'x-api-key': process.env.SYNC_LAB_API_KEY, 53 | 'Content-Type': 'application/json', 54 | }; 55 | 56 | try { 57 | const response: AxiosResponse = await axios.post(url, body, { 58 | headers, 59 | }); 60 | 61 | return response.data as SyncLabInitialResponse; 62 | } catch (error: any) { 63 | console.error('Error:', error.response.data); 64 | throw new Error(`Synclabs error: ${error.message}`); 65 | } 66 | } 67 | 68 | static async pollLipSyncResult( 69 | initialResponse: SyncLabInitialResponse, 70 | maxAttempts = 600, 71 | intervalMs = 10000, 72 | ): Promise { 73 | let attempts = 0; 74 | 75 | while (attempts < maxAttempts) { 76 | attempts++; 77 | 78 | try { 79 | const url = `https://api.sync.so/v2/generate/${initialResponse.id}`; 80 | const headers = { 81 | accept: 'application/json', 82 | 'x-api-key': process.env.SYNC_LAB_API_KEY, 83 | }; 84 | 85 | const response = await axios.get(url, { headers }); 86 | const data = response.data; 87 | 88 | if (data.status === 'COMPLETED') { 89 | if (data.outputUrl) { 90 | return data.outputUrl; 91 | } else { 92 | throw new Error('Output URL is missing from completed response'); 93 | } 94 | } else if (['FAILED', 'REJECTED', 'CANCELED', 'TIMED_OUT'].includes(data.status)) { 95 | throw new Error( 96 | `Lipsync generation failed with status: ${data.status}, error: ${data.error || 'Unknown error'}`, 97 | ); 98 | } 99 | 100 | console.debug(`Lipsync job status: ${data.status}. 
Polling again in ${intervalMs / 1000} seconds...`); 101 | await new Promise((resolve) => setTimeout(resolve, intervalMs)); 102 | } catch (error: any) { 103 | console.error('Error polling lipsync result:', error); 104 | throw new Error(`Error polling lipsync result: ${error.message}`); 105 | } 106 | } 107 | 108 | throw new Error(`Lipsync generation timed out after ${maxAttempts} attempts`); 109 | } 110 | 111 | static async startLipSyncAndWaitForResult({ 112 | audioPath, 113 | videoPath, 114 | }: { 115 | audioPath: string; 116 | videoPath: string; 117 | }): Promise { 118 | try { 119 | console.debug('Starting lip sync process...'); 120 | 121 | const initialResponse = await this.sendLipSyncRequest({ 122 | audioUrl: audioPath, 123 | videoUrl: videoPath, 124 | }); 125 | 126 | console.debug(`Lip sync job started with ID: ${initialResponse.id}`); 127 | 128 | const outputUrl = await this.pollLipSyncResult(initialResponse); 129 | 130 | console.debug(`Lip sync completed. Output available at: ${outputUrl}`); 131 | return outputUrl; 132 | } catch (error) { 133 | console.error('Error during lip sync process:', error); 134 | throw new Error( 135 | `Failed to complete lip sync process: ${error instanceof Error ? error.message : String(error)}`, 136 | ); 137 | } 138 | } 139 | 140 | static async processLipSyncWithAwsUpload({ 141 | localVideoPath, 142 | localAudioPath, 143 | }: { 144 | localVideoPath: string; 145 | localAudioPath: string; 146 | }): Promise { 147 | // Check if required environment variables are set 148 | const requiredEnvVars = [ 149 | 'SYNC_LAB_API_KEY', 150 | 'AWS_S3_REGION', 151 | 'AWS_ACCESS_KEY_ID', 152 | 'AWS_SECRET_ACCESS_KEY', 153 | 'AWS_BUCKET_NAME', 154 | ]; 155 | 156 | for (const envVar of requiredEnvVars) { 157 | if (!process.env[envVar]) { 158 | throw new Error(`Missing required environment variable: ${envVar}`); 159 | } 160 | } 161 | 162 | // Check if files exist 163 | if (!fs.existsSync(localVideoPath)) { 164 | throw new Error(`Video file not found at path: ${localVideoPath}`); 165 | } 166 | if (!fs.existsSync(localAudioPath)) { 167 | throw new Error(`Audio file not found at path: ${localAudioPath}`); 168 | } 169 | 170 | // S3 configuration 171 | const s3BucketName = process.env.AWS_BUCKET_NAME || ''; 172 | const s3Region = process.env.AWS_S3_REGION || ''; 173 | 174 | // Create S3 client 175 | const s3client = new S3Client({ 176 | region: s3Region, 177 | }); 178 | 179 | // Store S3 file paths for later cleanup 180 | let videoFileName = ''; 181 | let audioFileName = ''; 182 | 183 | try { 184 | console.debug('Uploading files to AWS S3...'); 185 | 186 | // Generate unique file paths for S3 187 | const timestamp = Date.now(); 188 | videoFileName = `lipsync/video_${timestamp}_${localVideoPath.split('/').pop()}`; 189 | audioFileName = `lipsync/audio_${timestamp}_${localAudioPath.split('/').pop()}`; 190 | 191 | // Read files as buffers 192 | const videoBuffer = fs.readFileSync(localVideoPath); 193 | const audioBuffer = fs.readFileSync(localAudioPath); 194 | 195 | // Upload files to S3 196 | const [videoUrl, audioUrl] = await Promise.all([ 197 | uploadFileToS3(s3client, s3BucketName, s3Region, videoBuffer, videoFileName), 198 | uploadFileToS3(s3client, s3BucketName, s3Region, audioBuffer, audioFileName), 199 | ]); 200 | 201 | console.debug(`Files uploaded successfully. 
Video URL: ${videoUrl}, Audio URL: ${audioUrl}`); 202 | 203 | // Process the lipsync with the public URLs 204 | const lipSyncResultUrl = await this.startLipSyncAndWaitForResult({ 205 | videoPath: videoUrl, 206 | audioPath: audioUrl, 207 | }); 208 | 209 | console.debug(`Lipsync processing complete. Result available at: ${lipSyncResultUrl}`); 210 | 211 | // Clean up local files 212 | try { 213 | fs.unlinkSync(localVideoPath); 214 | fs.unlinkSync(localAudioPath); 215 | console.debug('Local files deleted successfully'); 216 | } catch (deleteError) { 217 | console.warn('Failed to delete local files:', deleteError); 218 | // Continue despite deletion failure 219 | } 220 | 221 | // Clean up S3 files 222 | try { 223 | await Promise.all([ 224 | deleteFileFromS3(s3client, s3BucketName, videoFileName), 225 | deleteFileFromS3(s3client, s3BucketName, audioFileName), 226 | ]); 227 | console.debug('S3 files deleted successfully'); 228 | } catch (deleteError) { 229 | console.warn('Failed to delete S3 files:', deleteError); 230 | // Continue despite deletion failure 231 | } 232 | 233 | return lipSyncResultUrl; 234 | } catch (error) { 235 | console.error('Error in lipsync processing with AWS upload:', error); 236 | 237 | // Attempt to clean up S3 files in case of error 238 | if (videoFileName && audioFileName) { 239 | try { 240 | await Promise.all([ 241 | deleteFileFromS3(s3client, s3BucketName, videoFileName), 242 | deleteFileFromS3(s3client, s3BucketName, audioFileName), 243 | ]); 244 | console.debug('S3 files deleted after error'); 245 | } catch (deleteError) { 246 | console.warn('Failed to delete S3 files after error:', deleteError); 247 | } 248 | } 249 | 250 | throw new Error( 251 | `Failed to process lipsync with AWS: ${error instanceof Error ? error.message : String(error)}`, 252 | ); 253 | } 254 | } 255 | } 256 | 257 | /** 258 | * Helper function to upload a file to S3 and return its public URL 259 | */ 260 | async function uploadFileToS3( 261 | s3client: S3Client, 262 | bucketName: string, 263 | region: string, 264 | fileBuffer: Buffer, 265 | filePath: string, 266 | ): Promise { 267 | // Check if file already exists 268 | try { 269 | await s3client.send( 270 | new HeadObjectCommand({ 271 | Bucket: bucketName, 272 | Key: filePath, 273 | }), 274 | ); 275 | // If no error is thrown, file exists 276 | return `https://${bucketName}.s3.${region}.amazonaws.com/${filePath}`; 277 | } catch (error: unknown) { 278 | // File doesn't exist, continue with upload 279 | } 280 | 281 | // Get expiration date (1 year from now) 282 | const expirationDate = new Date(); 283 | expirationDate.setFullYear(expirationDate.getFullYear() + 1); 284 | 285 | const uploadParams = { 286 | Bucket: bucketName, 287 | Key: filePath.trim(), 288 | Body: fileBuffer, 289 | Metadata: { 290 | 'x-amz-meta-expiration-date': expirationDate.toISOString(), 291 | }, 292 | }; 293 | 294 | try { 295 | const data = await s3client.send(new PutObjectCommand(uploadParams)); 296 | if (!data) { 297 | throw new Error('Error uploading file to AWS S3'); 298 | } 299 | 300 | return `https://${bucketName}.s3.${region}.amazonaws.com/${filePath.trim()}`; 301 | } catch (error: unknown) { 302 | const errorMessage = error instanceof Error ? 
error.message : 'Unknown error'; 303 | throw new Error(`Failed to upload file: ${errorMessage}`); 304 | } 305 | } 306 | 307 | /** 308 | * Helper function to delete a file from S3 309 | */ 310 | async function deleteFileFromS3(s3client: S3Client, bucketName: string, filePath: string): Promise<void> { 311 | try { 312 | await s3client.send( 313 | new DeleteObjectCommand({ 314 | Bucket: bucketName, 315 | Key: filePath, 316 | }), 317 | ); 318 | console.debug(`Successfully deleted file from S3: ${filePath}`); 319 | } catch (error: unknown) { 320 | const errorMessage = error instanceof Error ? error.message : 'Unknown error'; 321 | console.warn(`Failed to delete file from S3: ${filePath} - ${errorMessage}`); 322 | throw new Error(`Failed to delete file from S3: ${errorMessage}`); 323 | } 324 | } 325 | -------------------------------------------------------------------------------- /src/llm/openai.ts: -------------------------------------------------------------------------------- 1 | import OpenAI from 'openai'; 2 | import type { ChatCompletionCreateParamsNonStreaming, ChatCompletionMessageParam } from 'openai/resources'; 3 | 4 | export type OpenAIModel = string; 5 | 6 | export const models = { 7 | gpt4o: 'gpt-4o', 8 | chatgpt4oLatest: 'chatgpt-4o-latest', 9 | gpt4Turbo: 'gpt-4-turbo', 10 | gpt4: 'gpt-4', 11 | gpt3Turbo: 'gpt-3.5-turbo-0125', 12 | gpt3_16k: 'gpt-3.5-turbo-16k', 13 | gpt4oMini: 'gpt-4o-mini', 14 | o1: 'o1', 15 | o1Mini: 'o1-mini', 16 | o3Mini: 'o3-mini', 17 | o1Pro: 'o1-pro', 18 | gpt45Preview: 'gpt-4.5-preview', 19 | gpt4_1: 'gpt-4.1', 20 | o4Mini: 'o4-mini', 21 | o3: 'o3', 22 | }; 23 | 24 | const oModelsWithoutInstructions: OpenAIModel[] = [ 25 | models.o1Mini, 26 | models.o1, 27 | models.o3Mini, 28 | models.o4Mini, 29 | models.o3, 30 | ]; 31 | 32 | const oModelsWithAdjustableReasoningEffort: OpenAIModel[] = [ 33 | models.o1, 34 | models.o3Mini, 35 | models.o1Pro, 36 | models.o4Mini, 37 | models.o3, 38 | ]; 39 | const defaultInstructions = 'You are a helpful assistant.'; 40 | 41 | export const requestToGPT = async ({ 42 | prompt, 43 | maxTokens, 44 | temperature, 45 | responseFormat, 46 | model, 47 | instructions, 48 | topP, 49 | }: { 50 | prompt: string; 51 | maxTokens: number; 52 | temperature: number; 53 | responseFormat: 'text' | 'json_object'; 54 | model: OpenAIModel; 55 | instructions?: string; 56 | topP?: number; 57 | }): Promise<string> => { 58 | const openAi = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); 59 | 60 | if (!openAi.apiKey) { 61 | throw new Error('No API key found for OpenAI'); 62 | } 63 | 64 | const retryDelay = 1000; 65 | let attemptCount = 0; 66 | 67 | if (oModelsWithoutInstructions.includes(model) && instructions) { 68 | prompt = ` 69 | ${instructions} 70 | 71 | ------- 72 | 73 | ${prompt} 74 | `; 75 | } 76 | 77 | const timeoutId = setTimeout(() => { 78 | throw new Error('OpenAI API request timed out'); 79 | }, 90000); 80 | 81 | try { 82 | const messagesArray: ChatCompletionMessageParam[] = instructions 83 | ? 
[ 84 | { role: 'system', content: instructions || defaultInstructions }, 85 | { role: 'user', content: prompt }, 86 | ] 87 | : [{ role: 'user', content: prompt }]; 88 | 89 | const params: ChatCompletionCreateParamsNonStreaming = { 90 | model: model, 91 | messages: messagesArray, 92 | response_format: { type: responseFormat }, 93 | }; 94 | 95 | if (!oModelsWithoutInstructions.includes(model)) { 96 | params.max_tokens = maxTokens; 97 | params.temperature = temperature; 98 | params.top_p = topP || 1; 99 | params.presence_penalty = 0; 100 | params.frequency_penalty = 0; 101 | } 102 | 103 | if (oModelsWithAdjustableReasoningEffort.includes(model)) { 104 | params.reasoning_effort = 'medium'; 105 | } 106 | 107 | const response = await openAi.chat.completions.create(params); 108 | 109 | if (!response.choices[0]?.message?.content) { 110 | throw new Error('No content in response'); 111 | } 112 | 113 | const finalResponse = response.choices[0].message.content; 114 | 115 | if (finalResponse.trim().toLowerCase().replace('.', '') === "sorry i can't help you with that") { 116 | console.error('ChatGPT responded with a generic error'); 117 | throw new Error('Error with OpenAI API'); 118 | } 119 | 120 | clearTimeout(timeoutId); 121 | 122 | return finalResponse; 123 | } catch (error: any) { 124 | console.error('Error with OpenAI API:', error); 125 | 126 | if (attemptCount < 1) { 127 | console.error(`Retrying after ${retryDelay} milliseconds...`); 128 | await new Promise((resolve) => setTimeout(resolve, retryDelay)); 129 | attemptCount++; 130 | 131 | return requestToGPT({ 132 | prompt, 133 | maxTokens, 134 | temperature, 135 | responseFormat, 136 | model, 137 | instructions, 138 | topP, 139 | }); 140 | } else { 141 | console.error('Error with OpenAI after retry'); 142 | throw new Error('Error with OpenAI API'); 143 | } 144 | } 145 | }; 146 | -------------------------------------------------------------------------------- /src/llm/prompt-builder.ts: -------------------------------------------------------------------------------- 1 | import type { AllowedLanguages, CreatePromptArguments } from '../types'; 2 | 3 | export const defaultInstructions = ` 4 | You are a world-renowned professional translator with decades of experience, and you know everything about language, writing, and cultural nuances. 5 | 6 | Your goal: 7 | • Provide the best possible translation from the original language to the target language. 8 | • Preserve the exact meaning, style, tone, and context of the source text. 9 | • Maintain original punctuation, verbal tics, and formatting markers (e.g., “--” or “---”). 10 | • Remain consistent with prior segments (e.g., the same politeness form, references, etc.). 11 | • Do not add or omit information; do not generate commentary or explanations. 12 | • If the segment is already in the target language or contains no translatable content, return it as is. 13 | 14 | Additional guidelines: 15 | 1. **Contextual Consistency** 16 | - You receive three segments for context: the *previous* text, the *text to translate*, and the *next* text. 17 | - Only the middle one should be translated and returned. The other two are for context only. 18 | - If you receive a text that precedes or follows the text you have to translate, you must also base yourself on these texts to choose the correct politeness. Like "Vous" and "Tu" or "Monsieur" and "Mademoiselle", and same for other languages. 19 | 20 | 2. **Politeness & Pronouns** 21 | - Preserve the same level of politeness or pronoun usage across segments. 
For example, if the speaker uses “tu” in French, do not switch it to “vous.” 22 | 23 | 3. **Numbers and Units** 24 | - All numbers must be written out in full words appropriate to the target language (e.g., 1123 → one thousand one hundred twenty-three). 25 | - Units of measurement, and currencies should be expanded into full words and translated if there is an equivalent in the target language (e.g., “km/h” → “kilometers per hour,” “€” → “euros,”). 26 | - Acronyms should be translated if there is an equivalent in the target language (e.g., “SIDA” → “AIDS”), acronyms should not be expanded into full words. 27 | - If an acronym has *no* direct equivalent in the target language, leave it as-is. 28 | 29 | 4. **Verbatim vs. Naturalness** 30 | - Provide a *naturally flowing* translation. Do not introduce major changes in structure or meaning; remain faithful to the original text. 31 | - Keep verbal tics, interjections (e.g., “Oh la la,” “Umm,” “Eh”), or any markers of style or hesitation. 32 | 33 | 5. **Output Format** 34 | - Output **only** the translated text of the middle segment without quotes, titles, or other metadata. 35 | - Do not add additional text, commentary, or formatting beyond the translation itself. 36 | - If you are unsure how to translate a word or phrase, use your best judgment to provide the most statistically probable correct translation. 37 | 38 | 6. **Edge Cases** 39 | - If the source text is partially in the same language as the target, only translate the parts that need translating. 40 | - If it is entirely in the same language, simply return it unchanged. 41 | 42 | Remember: 43 | - Your translation should be culturally appropriate, preserving the intentions and style of the speaker. 44 | - You must not “denature” the text. Maintain verbal tics, punctuation, and overall sentence structure as much as possible, while still ensuring clarity and correctness in the target language. 45 | `; 46 | 47 | export class PromptBuilder { 48 | public static T_V_DistinctionInstruction = 49 | 'When translating, strictly preserve the original text’s level of formality and politeness (including T–V distinctions, formal/informal pronouns, honorifics, and appropriate vocabulary), adapting accurately according to the conventions of each target language. If you receive a text that precedes or follows the text you have to translate, you must also base yourself on these texts to choose the correct politeness.'; 50 | 51 | public static instructionForReformulatedTranscription = ` 52 | Your role here is to reformulate translated dialogues that are too long and don't match the length of the original dialogue. 53 | 54 | You have the expertise to rephrase a text while keeping EXACTLY the same meaning. 55 | 56 | You also know that dubbing adaptation is not just about shortening or lengthening sentences. It requires: 57 | • Understanding natural expressions in the target language. 58 | • Choosing words or structures that match the timing and intensity of the original scene. 59 | • A thorough knowledge of the target language and culture. 60 | • Taking into account context, nuances, and register (formal/informal) as they appear in the scene. 61 | 62 | Think carefully and take your time to respond. 63 | 64 | Here is the workflow context: 65 | 66 | 1. A user sends me a video or audio segment. 67 | 2. I retrieve the transcription of this audio via an API. This transcription is split into small segments. 68 | 3. 
For each segment, I have silence times between words and the total speaking time of that segment in the video. 69 | 4. I translate the segment. 70 | 5. I generate an audio file from the translated segment with a text-to-audio tool. 71 | 6. I try to speed up the audio so it fits into the original speaking time. 72 | 7. If the audio is still too long (requiring an unnatural speed-up), you step in to intelligently rephrase the sentence, making it shorter while preserving meaning, cultural fit, and overall fluency for dubbing. 73 | 74 | Remember: 75 | • You must adapt the text so that it sounds natural in the target language, preserves context, and stays true to the style (politeness or informality) of the original dialogue. 76 | • You may modify words, expressions, or structures as necessary for clarity and naturalness. 77 | • You must handle punctuation carefully to maintain the intended pauses, exclamations, etc. 78 | • If you encounter an extremely short text that cannot reasonably be shortened further, just return it as is. 79 | • Return only the reformulated text, with no extra commentary, headings, or metadata. 80 | • Never replace or remove essential meaning. If you can’t shorten without losing critical information, shorten only minimally or return the original text if that’s more appropriate. 81 | • Numbers must be spelled out in letters. 82 | • Units of measurement, acronyms, and currencies must be written out fully in the target language if applicable. 83 | • ${PromptBuilder.T_V_DistinctionInstruction} 84 | 85 | Take your time to ensure clarity and precision. 86 | `; 87 | 88 | public static instructionForHandlingToShortSpeech = ` 89 | ### Your Tasks 90 | 91 | 1. **Identify if text rewriting is allowed**: 92 | - If rewriting is allowed, you may add or slightly reformulate phrases in a natural way (while preserving meaning) to lengthen the text so that its spoken duration better matches the target duration. 93 | - If rewriting is not allowed, you can **only** insert specific markers for silence (either "--" or "<break time="x.xs" />", depending on the text-to-speech service). 94 | 95 | 2. **Decide when to add silences vs. rewriting**: 96 | - If the **difference** between the original speaking time and the translated speech duration is small to moderate, inserting silences (pauses) is typically sufficient. 97 | - If the difference is large (for example, if you must slow the TTS audio below "0.75x" speed to fit), then rewriting or expanding the text may be more natural than adding very long silences. 98 | 99 | 3. **Placement and distribution of silences**: 100 | - Base your insertion of silences on: 101 | 1. The provided silence times between each original word (highest priority). 102 | 2. Punctuation (commas, periods, semicolons, etc.). 103 | 3. The difference in total duration between the original audio and the TTS-generated audio. 104 | - You must distribute the total required silence ("difference") across the text in a way that sounds natural. 105 | - When using hyphens ("--"), each "--" indicates ~0.6s of silence. 106 | - When using "<break time="x.xs" />", you will specify the time in seconds. 107 | 108 | 4. **Output formatting rules**: 109 | - Return **only** the modified text (translated text) with added silences (and optional rewrites if allowed). 110 | - Do not add extra explanations or metadata in your final output. 111 | - Never put a silence marker at the very end (after the last word). 112 | - Preserve the order of the words and punctuation unless rewriting is explicitly allowed. 
In that case, only do minimal modifications or expansions. 113 | - Use spaces carefully around silence markers (e.g. "word -- word", or "word word"). 114 | 115 | 5. **Important details**: 116 | - This text is part of a larger user-authorized transcription. 117 | - ${PromptBuilder.T_V_DistinctionInstruction} 118 | - Respect the user’s instructions about how many silences to add: “A little less is better than too much.” 119 | - If rewriting is allowed, avoid adding filler words that distort meaning; choose expansions that stay faithful to the original intent. 120 | 121 | You will receive more specific data and parameters in the dynamic prompt below. 122 | `; 123 | 124 | static createPromptToTranslateTranscription(createPromptArguments: CreatePromptArguments) { 125 | return ` 126 | Target language: ${createPromptArguments?.targetLanguage} 127 | Origin language audio: ${createPromptArguments?.originLanguage} 128 | 129 | --- 130 | IMPORTANT INFORMATION: 131 | 132 | - You have three segments: previous, current (to translate), and next. 133 | - Translate ONLY the current text segment. Do not translate or output the previous or next segments. 134 | - If the text to translate is already in the target language or contains no actionable content, return it as is. 135 | - ${this.T_V_DistinctionInstruction} 136 | - Keep “--” or “---” for artificial silences. 137 | - Convert numbers to words. Expand units/acronyms/currencies appropriately in the target language. 138 | - If no direct equivalent exists for an acronym, keep the original acronym. 139 | - Return ONLY the translated text (without quotes, commentary, or additional formatting). 140 | 141 | --- 142 | --- PREVIOUS TEXT IN THE TRANSCRIPTION (SPEAKER ${createPromptArguments?.previousTranscriptionSpeaker}) (context only, do not translate): 143 | ${createPromptArguments?.lastTranscription} 144 | ---END--- 145 | 146 | --- TEXT TO TRANSLATE (SPEAKER ${createPromptArguments?.transcriptionToTranslateSpeaker}): 147 | ${createPromptArguments?.transcriptionToTranslate} 148 | ---END--- 149 | 150 | --- NEXT TEXT IN THE TRANSCRIPTION (SPEAKER ${createPromptArguments?.nextTranscriptionSpeaker}) (context only, do not translate): 151 | ${createPromptArguments?.nextTranscription} 152 | ---END--- 153 | 154 | Some information about the video/audio: 155 | Title: ${createPromptArguments?.videoTitle || ''} 156 | Main category: ${createPromptArguments?.mainCategoryVideo} 157 | Summary of the video transcription to give you a context: ${createPromptArguments?.transcriptionSummary} 158 | `; 159 | } 160 | 161 | static async createPromptForReformulatedTranscription({ 162 | transcriptionToReformulate, 163 | originalTranscription, 164 | targetLanguage, 165 | transcriptionDuration, 166 | translatedSpeechDuration, 167 | difference, 168 | transcriptionSummary, 169 | }: { 170 | transcriptionToReformulate: string; 171 | originalTranscription: string; 172 | targetLanguage: AllowedLanguages | string; 173 | transcriptionDuration: number; 174 | translatedSpeechDuration: number; 175 | difference: string; 176 | transcriptionSummary: string; 177 | }) { 178 | return ` 179 | Reformulate, shorten, and adapt the following text so that it fits perfectly into the original speaking time. 180 | In other words, reduce the word count or syllables without removing essential punctuation. 181 | Your aim is to preserve the original meaning and context while ensuring the dubbed speech duration matches the original timing. 182 | 183 | Length of time to match: ${transcriptionDuration} seconds. 
184 | 185 | ---Original text (untranslated)--- 186 | ${originalTranscription} 187 | ---END--- 188 | 189 | ---Text translated (too long to fit)--- 190 | ${transcriptionToReformulate} 191 | ---END--- 192 | 193 | Duration of the original text: ${transcriptionDuration} seconds. 194 | Duration of the translated text: ${translatedSpeechDuration} seconds. 195 | The text is ${difference} seconds too long; you must rewrite it to make it ${difference} seconds shorter. 196 | 197 | Important details: 198 | - If the text is already very short or cannot be shortened without losing meaning, keep it as is. 199 | - Maintain punctuation, style, and verbal tics. 200 | - Return only the reformulated text in ${targetLanguage.toUpperCase()}, with no extra explanations or formatting. 201 | 202 | RETURN ONLY THE REFORMULATED SHORTENED TEXT TRANSLATED IN ${targetLanguage.toUpperCase()} 203 | 204 | Summary of the video transcription to give you a context: "${transcriptionSummary} 205 | 206 | `; 207 | } 208 | 209 | static createPromptForHandlingToShortSpeech({ 210 | targetLanguage, 211 | orignalLanguage, 212 | transcriptionTranslated, 213 | wordsWithSilences, 214 | originalSegmentDuration, 215 | translatedSpeechDuration, 216 | difference, 217 | isSpeechForElevenLabs, 218 | allowRewrite, 219 | transcriptionSummary, 220 | }: { 221 | targetLanguage: string; 222 | orignalLanguage: string; 223 | transcriptionTranslated: string; 224 | wordsWithSilences: string; 225 | originalSegmentDuration: number; 226 | translatedSpeechDuration: string; 227 | difference: string; 228 | isSpeechForElevenLabs: boolean; 229 | allowRewrite: boolean; 230 | transcriptionSummary: string; 231 | }) { 232 | const adjustedTranslatedSpeechDuration = 233 | Number(difference) > 0.5 234 | ? (Number(translatedSpeechDuration) + 0.4).toFixed(4) 235 | : translatedSpeechDuration; 236 | const adjustedDifference = Number(difference) > 0.5 ? (Number(difference) - 0.4).toFixed(4) : difference; 237 | //I do this because AI have the habits to add too much silences 238 | 239 | if (!isSpeechForElevenLabs) { 240 | return ` 241 | You are receiving the following parameters: 242 | - allowRewrite: ${allowRewrite} 243 | - originalSegmentDuration: ${originalSegmentDuration} seconds 244 | - translatedSpeechDuration: ${adjustedTranslatedSpeechDuration} seconds 245 | - difference: ${adjustedDifference} seconds 246 | - wordsWithSilences: ${wordsWithSilences} 247 | - orignalLanguage: ${orignalLanguage} 248 | - targetLanguage: ${targetLanguage} 249 | - transcriptionTranslated: ${transcriptionTranslated} 250 | 251 | Your job: 252 | 1. If allowRewrite = true and the difference is large, you may add or reformulate words for a more natural length. 253 | - Keep original meaning and style. 254 | - Avoid changing proper nouns or technical terms. 255 | 2. Insert "--" (each equals ~0.600 seconds silence) intelligently: 256 | - Prioritize natural pauses based on punctuation and provided silence times. 257 | - Distribute ${adjustedDifference} seconds of total silence (in increments of 0.6s). 258 | 3. Return ONLY the final text with the inserted silences (and optional minimal rewrites if allowRewrite = true). 259 | 4. Never put silences at the very end. 260 | 5. Do not add extra commentary or headings. 
261 | 262 | ---Text translated in ${targetLanguage} from ${orignalLanguage} THAT YOU MUST RETURN UPDATED: 263 | ${transcriptionTranslated} 264 | ---END--- 265 | 266 | ---Words of the original text separated with silence in each word, here to help you: 267 | ${wordsWithSilences} 268 | ---END--- 269 | 270 | 271 | Remember: "Less is better than too much" for silences. 272 | 273 | Here is a summary of the video transcription to give you a context: "${transcriptionSummary}" 274 | `; 275 | } 276 | 277 | return ` 278 | You are receiving the following parameters: 279 | - allowRewrite: ${allowRewrite} 280 | - originalSegmentDuration: ${originalSegmentDuration} 281 | - translatedSpeechDuration: ${adjustedTranslatedSpeechDuration} 282 | - difference: ${adjustedDifference} 283 | - wordsWithSilences: ${wordsWithSilences} 284 | - orignalLanguage: ${orignalLanguage} 285 | - targetLanguage: ${targetLanguage} 286 | - transcriptionTranslated: ${transcriptionTranslated} 287 | 288 | Your job: 289 | 1. If allowRewrite = true and the difference is large, you may add or reformulate words for a more natural length. 290 | - Keep original meaning and style. 291 | - Avoid changing proper nouns or technical terms. 292 | - Avoid removing words unless removing them makes the sentence sound more natural. 293 | 2. Insert <break time="x.xs" /> tags in strategic places: 294 | - Prioritize natural pauses based on punctuation and based on the provided silence times between each word. 295 | - Silences between words have priority over punctuation. 296 | - Distribute ${adjustedDifference} seconds total across these tags. 297 | - Always put a space between the word, the break tag, and the next word. 298 | 3. For silences ≥ 0.800 seconds: 299 | - Use the <break time="x.xs" /> tag 300 | - Example: <break time="0.8s" /> 301 | 4. For silences < 0.800 seconds: 302 | - Use appropriate punctuation ONLY (comma, period, question mark) 303 | - NEVER use <break> tags for these short silences 304 | - Example: "Hello, how are you?" (comma represents a short pause) 305 | 5. For longer silences (> 1.5 seconds): 306 | - Divide into multiple smaller pauses distributed naturally in the text 307 | - Apply rules 1 & 2 to each divided portion 308 | - Example: A 2.5s silence could become <break time="1.3s" /> + comma + <break time="1.2s" /> 309 | 6. Return ONLY the final text with the inserted breaks (and optional minimal rewrites if allowRewrite = true). 310 | 7. NEVER put a break at the very end. 311 | 8. Do not add extra commentary or headings. 312 | 9. Round silences to the nearest decimal place; for example, <break time="1.04s" /> becomes <break time="1.0s" /> 313 | 314 | 315 | ---Text translated in ${targetLanguage} from ${orignalLanguage} THAT YOU MUST RETURN UPDATED: 316 | ${transcriptionTranslated} 317 | ---END--- 318 | 319 | ---Words of the original text separated with silence in each word, here to help you: 320 | ${wordsWithSilences} 321 | ---END--- 322 | 323 | Remember: "Less is better than too much" for silences. 
324 | 325 | Here is a summary of the video transcription to give you a context: "${transcriptionSummary}" 326 | `; 327 | } 328 | } 329 | -------------------------------------------------------------------------------- /src/separator/spleeter.ts: -------------------------------------------------------------------------------- 1 | import { ElevenLabsService } from './../elevenlabs/elevenlabs'; 2 | import axios from 'axios'; 3 | import FormData from 'form-data'; 4 | import fs from 'fs'; 5 | import fsPromises from 'fs/promises'; 6 | import qs from 'qs'; 7 | import { AudioUtils } from '../ffmpeg/audio-utils'; 8 | 9 | export class Spleeter { 10 | static async getSeparateAudio(audioFilePath: string) { 11 | const filePathMp3 = audioFilePath.replace('.wav', '.mp3'); 12 | try { 13 | await AudioUtils.convertToMp3(audioFilePath, filePathMp3); 14 | 15 | const backgroundAudio = (await this.separateAudioInTwoParts(filePathMp3)).accompaniment; 16 | 17 | const elevenLabsService = new ElevenLabsService(); 18 | const vocalsIsolated = await elevenLabsService.isolateVoiceFromAudio(audioFilePath); 19 | 20 | return { backgroundAudio, vocalsIsolated }; 21 | } catch (error) { 22 | console.error('Error in getSeparateAudio:', error); 23 | if (error instanceof Error) { 24 | throw error; 25 | } else { 26 | throw new Error('Error in getSeparateAudio'); 27 | } 28 | } finally { 29 | if (fs.existsSync(filePathMp3)) { 30 | await fsPromises.unlink(filePathMp3); 31 | } 32 | } 33 | } 34 | 35 | static async separateAudioInTwoParts(filePath: string): Promise<{ 36 | vocals: string; 37 | accompaniment: string; 38 | }> { 39 | console.debug('Separating audio into vocals and accompaniment...'); 40 | const licenseKey = process.env.LALAL_LICENSE_KEY; 41 | const apiUrlBase = 'https://www.lalal.ai/api'; 42 | let fileId: string = ''; 43 | 44 | const checkStatus = async (fileId: string): Promise => { 45 | let isCompleted = false; 46 | let statusData: LalalAPIResponse | null = null; 47 | 48 | while (!isCompleted) { 49 | try { 50 | const data = qs.stringify({ id: fileId }); 51 | const response = await axios.post(`${apiUrlBase}/check/`, data, { 52 | headers: { 53 | Authorization: `license ${licenseKey}`, 54 | 'Content-Type': 'application/x-www-form-urlencoded', 55 | }, 56 | }); 57 | 58 | if (response.data.status === 'success') { 59 | const taskState = response.data.result[fileId]?.task?.state; 60 | if (taskState === 'success') { 61 | isCompleted = true; 62 | statusData = response.data; 63 | } else { 64 | await new Promise((resolve) => setTimeout(resolve, 1000)); 65 | } 66 | } else { 67 | console.error('Error checking status:', response.data.error); 68 | throw new Error(response.data.error || 'Status check failed'); 69 | } 70 | } catch (error) { 71 | console.error('An error occurred while checking status:', error); 72 | throw error; 73 | } 74 | } 75 | 76 | if (!statusData) throw new Error('No status data found'); 77 | return statusData; 78 | }; 79 | 80 | const processAudio = async (filePath: string): Promise => { 81 | try { 82 | const form = new FormData(); 83 | form.append('file', fs.createReadStream(filePath), { 84 | filename: filePath.split('/').pop(), 85 | }); 86 | 87 | // Retry up to 2 additional times if the upload fails 88 | const uploadAttempt = async (maxRetries = 2): Promise => { 89 | let attempts = 0; 90 | let lastError: any; 91 | while (attempts <= maxRetries) { 92 | try { 93 | const uploadResponse = await axios.post(`${apiUrlBase}/upload/`, form, { 94 | headers: { 95 | ...form.getHeaders(), 96 | 'Content-Disposition': 
`attachment; filename=${filePath.split('/').pop()}`, 97 | Authorization: `license ${licenseKey}`, 98 | }, 99 | }); 100 | if (uploadResponse.data.status === 'success') { 101 | return uploadResponse.data; 102 | } else { 103 | lastError = new Error(uploadResponse.data.error || 'Upload failed'); 104 | console.error('Upload failed:', uploadResponse.data.error); 105 | } 106 | } catch (error) { 107 | lastError = error; 108 | console.error('Upload request error:', error); 109 | } 110 | 111 | attempts++; 112 | if (attempts <= maxRetries) { 113 | await new Promise((resolve) => setTimeout(resolve, 1000)); 114 | } 115 | } 116 | 117 | console.error('Upload failed after multiple attempts:', lastError); 118 | throw new Error('Upload failed after multiple attempts.'); 119 | }; 120 | 121 | const uploadResponse = await uploadAttempt(); 122 | 123 | if (!uploadResponse.id) throw new Error('No file ID received from upload'); 124 | fileId = uploadResponse.id; 125 | 126 | interface SplitParams { 127 | id: string; 128 | stem: 129 | | 'vocals' 130 | | 'drum' 131 | | 'bass' 132 | | 'piano' 133 | | 'electric_guitar' 134 | | 'acoustic_guitar' 135 | | 'synthesizer' 136 | | 'voice' 137 | | 'strings' 138 | | 'wind'; 139 | splitter: 'orion' | 'phoenix' | 'perseus'; 140 | filter: 0 | 1 | 2; 141 | } 142 | 143 | const params: SplitParams[] = [ 144 | { 145 | id: fileId, 146 | stem: 'voice', 147 | splitter: 'perseus', 148 | filter: 2, 149 | }, 150 | ]; 151 | 152 | const splitResponse = await axios.post( 153 | `${apiUrlBase}/split/`, 154 | qs.stringify({ params: JSON.stringify(params) }), 155 | { 156 | headers: { 157 | ...form.getHeaders(), 158 | Authorization: `license ${licenseKey}`, 159 | 'Content-Type': 'application/x-www-form-urlencoded', 160 | }, 161 | }, 162 | ); 163 | 164 | if (splitResponse.data.status !== 'success') { 165 | console.error('Split operation failed:', splitResponse.data.error); 166 | throw new Error('Split operation failed.'); 167 | } 168 | 169 | console.debug('Split operation initiated successfully'); 170 | return await checkStatus(fileId); 171 | } catch (error) { 172 | console.error('Process failed:', error); 173 | if (error instanceof Error) { 174 | throw error; 175 | } 176 | throw new Error('Error while processing audio'); 177 | } 178 | }; 179 | 180 | try { 181 | const lalalResponse = await processAudio(filePath); 182 | const vocals = lalalResponse.result[fileId].split.stem_track; 183 | const accompaniment = lalalResponse.result[fileId].split.back_track; 184 | return { vocals, accompaniment }; 185 | } catch (error) { 186 | console.error('separateAudioInTwoParts failed:', error); 187 | throw new Error('Failed to separate audio into two parts.'); 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /src/smart-sync/adaptation.ts: -------------------------------------------------------------------------------- 1 | import { models } from '../llm/openai'; 2 | import { requestToGPT } from '../llm/openai'; 3 | import { PromptBuilder } from '../llm/prompt-builder'; 4 | import type { 5 | AllowedLanguages, 6 | AudioOriginalLangAllowed, 7 | SegmentWitDurationAndOriginalSegment, 8 | } from '../types'; 9 | import type { 10 | CreateLongerSpeechArguments, 11 | CreateShorterSpeechArguments, 12 | SpeechAdjusted, 13 | SpeechResponseWithDuration, 14 | } from '../types/speech'; 15 | import { silenceBetweenSegmentConsideredAsPause } from '../utils/config'; 16 | import { AudioUtils } from '../ffmpeg/audio-utils'; 17 | import { SpeechGenerator } from 
'../speech/speechGenerator'; 18 | import { ElevenLabsService } from '../elevenlabs/elevenlabs'; 19 | import type { Readable } from 'form-data'; 20 | import fs from 'fs'; 21 | import crypto from 'crypto'; 22 | import fsPromises from 'fs/promises'; 23 | 24 | export class Adaptation { 25 | constructor() { 26 | // 27 | } 28 | 29 | static async compareAndAdjustSpeeches({ 30 | transcriptions, 31 | speeches, 32 | clonedVoicesIds, 33 | originalLanguage, 34 | targetLanguage, 35 | transcriptionSummary, 36 | }: { 37 | transcriptions: SegmentWitDurationAndOriginalSegment[]; 38 | speeches: SpeechResponseWithDuration[]; 39 | clonedVoicesIds: { [key: string]: string }; 40 | originalLanguage: AudioOriginalLangAllowed; 41 | targetLanguage: AllowedLanguages; 42 | transcriptionSummary: string; 43 | }): Promise { 44 | console.debug('Comparing speeches, and adjusting length...'); 45 | if (transcriptions.length !== speeches.length) { 46 | console.error('Array length mismatch'); 47 | throw new Error('Array length mismatch'); 48 | } 49 | 50 | const sortedSegments = transcriptions.sort((a, b) => a.index - b.index); 51 | 52 | const maxSpeedFactor = 1.15; 53 | 54 | const minSpeedFactor = 0.9; 55 | 56 | let previousTranscriptionText = ''; 57 | 58 | try { 59 | const adjustments: SpeechAdjusted[] = []; 60 | 61 | for (let index = 0; index < sortedSegments.length; index++) { 62 | let isSpeechModifiedToBeLonger = false; 63 | const transcription = sortedSegments[index]; 64 | const speech = speeches[index]; 65 | let speechBuffer = speech.speech; 66 | 67 | let newSpeechDuration = speech.duration; 68 | 69 | let speedFactor = newSpeechDuration / transcription.duration; 70 | let adjustedSpeedFactor = speedFactor; 71 | let reformulationAttempts = 0; 72 | const clonedVoiceId = clonedVoicesIds[transcription.speaker]; 73 | 74 | let transcriptionText = transcription.transcription; 75 | let nextTranscriptionText = ''; 76 | 77 | //next transcription text 78 | if (index + 1 < sortedSegments.length) { 79 | const silenceBetweenNextTranscription = sortedSegments[index + 1].begin - transcription.end; 80 | 81 | //1 = 1 second 82 | if ( 83 | silenceBetweenNextTranscription > silenceBetweenSegmentConsideredAsPause || 84 | sortedSegments[index + 1].speaker !== transcription.speaker 85 | ) { 86 | nextTranscriptionText = ''; 87 | } else { 88 | nextTranscriptionText = sortedSegments[index + 1].transcription; 89 | } 90 | } 91 | 92 | const activateSmartSync = true; 93 | const smartSyncMustBeTriggered = 94 | activateSmartSync && (speedFactor > maxSpeedFactor || speedFactor < minSpeedFactor); 95 | 96 | while (smartSyncMustBeTriggered && reformulationAttempts < 2) { 97 | if (speedFactor > maxSpeedFactor) { 98 | console.debug(`Too long (speedFactor: ${speedFactor}), reformulation needed`); 99 | 100 | const shorterSpeech = await this.createShorterSpeech({ 101 | translatedTranscription: transcriptionText, 102 | originalTranscription: transcription.originalTranscription, 103 | speechIndex: transcription.index, 104 | speakerIndex: transcription.speaker, 105 | targetLanguage: targetLanguage, 106 | previousText: previousTranscriptionText, 107 | nextText: nextTranscriptionText, 108 | transcriptionDuration: transcription.duration, 109 | translatedSpeechDuration: newSpeechDuration, 110 | difference: (newSpeechDuration - transcription.duration).toFixed(2), 111 | transcriptionSummary, 112 | clonedVoiceId, 113 | }); 114 | 115 | transcriptionText = shorterSpeech.reformulatedText as string; 116 | 117 | speechBuffer = shorterSpeech.speech; 118 | newSpeechDuration = 
shorterSpeech.duration; 119 | } else if (speedFactor < minSpeedFactor) { 120 | console.debug(`Too short (speedFactor: ${speedFactor}), reformulation needed`); 121 | const longerSpeech = await this.createLongerSpeech({ 122 | translatedTranscription: transcriptionText, 123 | speechIndex: transcription.index, 124 | speakerIndex: transcription.speaker, 125 | targetLanguage: targetLanguage, 126 | originalLanguage: originalLanguage, 127 | transcriptionWords: transcription.wordsWithSilence, 128 | previousText: previousTranscriptionText, 129 | nextText: nextTranscriptionText, 130 | originalSegmentDuration: transcription.duration, 131 | translatedSpeechDuration: newSpeechDuration, 132 | difference: (transcription.duration - newSpeechDuration).toFixed(2), 133 | speedFactor: speedFactor, 134 | transcriptionSummary, 135 | clonedVoiceId, 136 | }); 137 | 138 | transcriptionText = longerSpeech.longerText; 139 | 140 | speechBuffer = longerSpeech.speech; 141 | newSpeechDuration = longerSpeech.duration; 142 | isSpeechModifiedToBeLonger = true; 143 | } 144 | 145 | speedFactor = newSpeechDuration / transcription.duration; 146 | 147 | adjustedSpeedFactor = Math.min(Math.max(speedFactor, minSpeedFactor), maxSpeedFactor); 148 | reformulationAttempts++; 149 | 150 | console.debug( 151 | `Reformulation attempt ${reformulationAttempts}: adjustedSpeedFactor = ${adjustedSpeedFactor}`, 152 | ); 153 | } 154 | 155 | previousTranscriptionText = transcriptionText; 156 | 157 | if ( 158 | (speedFactor >= 0.8 && speedFactor <= 0.9 && !isSpeechModifiedToBeLonger) || 159 | (speedFactor >= 1.1 && speedFactor <= 1.2 && !isSpeechModifiedToBeLonger) 160 | ) { 161 | const { newSpeechBuffer, newSpeechDuration } = await this.adjustSpeechSpeedWithElevenLabs({ 162 | speedFactor, 163 | transcriptionText, 164 | voiceId: clonedVoiceId, 165 | }); 166 | 167 | const newSpeedFactor = newSpeechDuration / transcription.duration; 168 | 169 | if (newSpeedFactor > 0.9 && newSpeedFactor < 1.1) { 170 | speechBuffer = newSpeechBuffer; 171 | speedFactor = newSpeedFactor; 172 | } 173 | } 174 | 175 | const adjustedSpeech = await this.adjustSpeechSpeed(speechBuffer, adjustedSpeedFactor); 176 | 177 | const newSpeechDurationAdjusted = await this.getSpeechDuration(adjustedSpeech); 178 | 179 | if (typeof newSpeechDurationAdjusted !== 'number') 180 | throw new Error( 181 | `Error during audio duration calculation in compareAndAdjustSpeeches: duration is not a number: ${newSpeechDurationAdjusted}`, 182 | ); 183 | 184 | adjustments.push({ 185 | speech: adjustedSpeech, 186 | transcriptionDuration: transcription.duration, 187 | end: transcription.end, 188 | begin: transcription.begin, 189 | speaker: transcription.speaker, 190 | speechDuration: newSpeechDurationAdjusted, 191 | }); 192 | } 193 | 194 | return adjustments; 195 | } catch (err: unknown) { 196 | console.error(err); 197 | throw new Error('Error while adjusting speeches'); 198 | } 199 | } 200 | 201 | static async adjustSpeechSpeed(speech: Buffer, speedFactor: number): Promise { 202 | return new Promise((resolve, reject) => { 203 | if (speedFactor < 0.5 || speedFactor > 2.0) { 204 | console.error('Speed factor must be between 0.5 and 2.0'); 205 | reject(new Error('Speed factor must be between 0.5 and 2.0')); 206 | return; 207 | } 208 | 209 | if (speedFactor === 1) { 210 | console.debug('speedFactor is 1'); 211 | resolve(speech); 212 | return; 213 | } 214 | 215 | return AudioUtils.adjustSpeed(speech, speedFactor).then(resolve).catch(reject); 216 | }); 217 | } 218 | 219 | static async 
getSpeechDuration(speech: Readable | Buffer): Promise { 220 | try { 221 | const duration = await AudioUtils.getAudioDurationFromBuffer(speech); 222 | return duration; 223 | } catch (err) { 224 | console.error('Speech duration error : ' + err); 225 | throw new Error('Error while getting speech duration'); 226 | } 227 | } 228 | 229 | static async adjustSpeechSpeedWithElevenLabs({ 230 | speedFactor, 231 | transcriptionText, 232 | voiceId, 233 | }: { 234 | speedFactor: number; 235 | transcriptionText: string; 236 | voiceId: string; 237 | }): Promise<{ newSpeechBuffer: Buffer; newSpeechDuration: number }> { 238 | const elevenLabsService = new ElevenLabsService(); 239 | const elevenLabsResponse = await elevenLabsService.generateAudioFile({ 240 | text: transcriptionText, 241 | voiceId: voiceId, 242 | speedFactor, 243 | modelId: 'eleven_multilingual_v2', 244 | }); 245 | 246 | const buffer = elevenLabsResponse.response; 247 | const newSpeechDuration = await AudioUtils.getAudioDurationFromBuffer(buffer); 248 | 249 | if (typeof newSpeechDuration !== 'number') 250 | throw new Error( 251 | `Error during audio duration calculation in adjustSpeechSpeedWithElevenLabs: duration is not a number: ${newSpeechDuration}`, 252 | ); 253 | 254 | return { newSpeechBuffer: buffer, newSpeechDuration }; 255 | } 256 | 257 | static async createShorterSpeech({ 258 | translatedTranscription, 259 | originalTranscription, 260 | speechIndex, 261 | speakerIndex, 262 | targetLanguage, 263 | previousText, 264 | nextText, 265 | transcriptionDuration, 266 | translatedSpeechDuration, 267 | difference, 268 | transcriptionSummary, 269 | clonedVoiceId, 270 | }: CreateShorterSpeechArguments) { 271 | const reformulatedTranscription = await this.getReformulatedTranscription({ 272 | transcription: translatedTranscription, 273 | originalTranscription, 274 | targetLanguage, 275 | transcriptionDuration, 276 | translatedSpeechDuration, 277 | difference, 278 | transcriptionSummary, 279 | }); 280 | 281 | const speechShortened = await SpeechGenerator.getSpeechFromTTSEngine({ 282 | transcription: reformulatedTranscription as string, 283 | index: speechIndex, 284 | speakerIndex: speakerIndex, 285 | clonedVoiceId: clonedVoiceId, 286 | options: { 287 | previousTranscriptionText: previousText, 288 | nextTranscriptionText: nextText, 289 | }, 290 | targetLanguage, 291 | }); 292 | 293 | const speechBuffer = 294 | speechShortened.speech instanceof Response 295 | ? 
Buffer.from(await speechShortened.speech.arrayBuffer()) 296 | : speechShortened.speech; 297 | 298 | const speechBufferWithoutSilence = await this.removeStartAndEndSilenceFromAudio(speechBuffer); 299 | 300 | const speechDuration = await this.getSpeechDuration(speechBufferWithoutSilence); 301 | 302 | if (typeof speechDuration !== 'number') 303 | throw new Error( 304 | `Error during audio duration calculation in createShorterSpeech: duration is not a number: ${speechDuration}`, 305 | ); 306 | 307 | console.debug('Shorter speech created.'); 308 | 309 | return { 310 | speech: speechBufferWithoutSilence, 311 | duration: speechDuration, 312 | reformulatedText: reformulatedTranscription, 313 | requestId: speechShortened.requestId, 314 | }; 315 | } 316 | 317 | static async removeStartAndEndSilenceFromAudio(speech: Buffer): Promise { 318 | const temporaryInputFile = `temporary-files/input-for-trim-${crypto.randomUUID()}.wav`; 319 | const temporaryOutputFile = `temporary-files/output-for-trim-${crypto.randomUUID()}.wav`; 320 | 321 | try { 322 | await fsPromises.writeFile(temporaryInputFile, speech); 323 | 324 | try { 325 | await AudioUtils.removeStartAndEndSilenceFromAudioWithFFMPEG(temporaryInputFile, temporaryOutputFile); 326 | } catch (ffmpegError: any) { 327 | console.error('FFmpeg error during silence removal:', ffmpegError); 328 | 329 | if (!fs.existsSync(temporaryOutputFile)) { 330 | throw new Error(`FFmpeg failed to process audio: ${ffmpegError.message || 'Unknown error'}`); 331 | } 332 | 333 | console.debug('FFmpeg reported an error but output file exists, attempting to continue'); 334 | } 335 | 336 | if (!fs.existsSync(temporaryOutputFile)) { 337 | throw new Error('Output file was not created during silence removal'); 338 | } 339 | 340 | const stats = await fsPromises.stat(temporaryOutputFile); 341 | if (stats.size === 0) { 342 | throw new Error('Output file is empty after silence removal'); 343 | } 344 | 345 | const bufferNewSpeech = await fsPromises.readFile(temporaryOutputFile); 346 | 347 | return bufferNewSpeech; 348 | } catch (err: any) { 349 | console.error('Error in removeStartAndEndSilenceFromAudio:', err); 350 | throw new Error( 351 | `ERROR while removing start and end silence from audio: ${err.message || 'Unknown error'}`, 352 | ); 353 | } finally { 354 | try { 355 | if (fs.existsSync(temporaryInputFile)) await fsPromises.unlink(temporaryInputFile); 356 | } catch (unlinkError) { 357 | console.error('Error deleting temporary input file:', unlinkError); 358 | } 359 | 360 | try { 361 | if (fs.existsSync(temporaryOutputFile)) await fsPromises.unlink(temporaryOutputFile); 362 | } catch (unlinkError) { 363 | console.error('Error deleting temporary output file:', unlinkError); 364 | } 365 | } 366 | } 367 | 368 | static async requestUpdatedTextToAi({ prompt, instruction }: { prompt: string; instruction: string }) { 369 | try { 370 | const response = await requestToGPT({ 371 | prompt, 372 | maxTokens: 8000, 373 | temperature: 0.5, 374 | instructions: instruction, 375 | responseFormat: 'text', 376 | model: models.o4Mini, 377 | }); 378 | 379 | return response; 380 | } catch (error) { 381 | console.error('Error requesting updated text to AI with fallback (1) :', error); 382 | 383 | throw new Error('Error requesting updated text to AI with fallback (1)'); 384 | } 385 | } 386 | 387 | static async getReformulatedTranscription({ 388 | transcription, 389 | originalTranscription, 390 | targetLanguage, 391 | transcriptionDuration, 392 | translatedSpeechDuration, 393 | difference, 394 | 
transcriptionSummary, 395 | }: { 396 | transcription: string; 397 | originalTranscription: string; 398 | targetLanguage: string; 399 | transcriptionDuration: number; 400 | translatedSpeechDuration: number; 401 | difference: string; 402 | transcriptionSummary: string; 403 | }) { 404 | const params = { 405 | transcriptionToReformulate: transcription, 406 | originalTranscription: originalTranscription, 407 | targetLanguage: targetLanguage, 408 | transcriptionDuration: transcriptionDuration, 409 | translatedSpeechDuration: translatedSpeechDuration, 410 | difference: difference, 411 | transcriptionSummary: transcriptionSummary, 412 | }; 413 | 414 | const promptForLLM = await PromptBuilder.createPromptForReformulatedTranscription(params); 415 | 416 | const instruction = PromptBuilder.instructionForReformulatedTranscription; 417 | 418 | const LLMResponse = await this.requestUpdatedTextToAi({ 419 | prompt: promptForLLM, 420 | instruction, 421 | }); 422 | 423 | return LLMResponse; 424 | } 425 | 426 | static async getLongerText({ 427 | speedFactor, 428 | difference, 429 | targetLanguage, 430 | originalLanguage, 431 | translatedTranscription, 432 | transcriptionWords, 433 | originalSegmentDuration, 434 | translatedSpeechDuration, 435 | transcriptionSummary, 436 | }: { 437 | speedFactor: number; 438 | difference: string; 439 | targetLanguage: string; 440 | originalLanguage: string; 441 | translatedTranscription: string; 442 | transcriptionWords: string; 443 | originalSegmentDuration: number; 444 | translatedSpeechDuration: number; 445 | transcriptionSummary: string; 446 | }) { 447 | const isSpeechForElevenLabs = true; 448 | const isAiAllowedToRewrite = speedFactor < 0.75 || Number(difference) > 2; 449 | 450 | const prompt = PromptBuilder.createPromptForHandlingToShortSpeech({ 451 | targetLanguage: targetLanguage, 452 | orignalLanguage: originalLanguage, 453 | transcriptionTranslated: translatedTranscription, 454 | wordsWithSilences: transcriptionWords, 455 | originalSegmentDuration, 456 | translatedSpeechDuration: translatedSpeechDuration.toFixed(2), 457 | difference, 458 | isSpeechForElevenLabs, 459 | allowRewrite: isAiAllowedToRewrite, 460 | transcriptionSummary, 461 | }); 462 | 463 | const instruction = PromptBuilder.instructionForHandlingToShortSpeech; 464 | 465 | const translatedTextWithSilence = await this.requestUpdatedTextToAi({ 466 | prompt, 467 | instruction, 468 | }); 469 | 470 | return translatedTextWithSilence; 471 | } 472 | 473 | static async createLongerSpeech({ 474 | translatedTranscription, 475 | speechIndex, 476 | speakerIndex, 477 | targetLanguage, 478 | originalLanguage, 479 | transcriptionWords, 480 | nextText, 481 | previousText, 482 | originalSegmentDuration, 483 | translatedSpeechDuration, 484 | difference, 485 | speedFactor, 486 | transcriptionSummary, 487 | clonedVoiceId, 488 | }: CreateLongerSpeechArguments): Promise<{ 489 | speech: Buffer; 490 | duration: number; 491 | requestId: string; 492 | longerText: string; 493 | }> { 494 | const translatedTextWithSilence = await this.getLongerText({ 495 | speedFactor, 496 | difference, 497 | targetLanguage, 498 | originalLanguage, 499 | translatedTranscription, 500 | transcriptionWords, 501 | originalSegmentDuration, 502 | translatedSpeechDuration, 503 | transcriptionSummary, 504 | }); 505 | 506 | const longerSpeech = await SpeechGenerator.getSpeechFromTTSEngine({ 507 | transcription: translatedTextWithSilence as string, 508 | index: speechIndex, 509 | speakerIndex: speakerIndex, 510 | clonedVoiceId, 511 | options: { 512 | 
previousTranscriptionText: previousText, 513 | nextTranscriptionText: nextText, 514 | }, 515 | targetLanguage, 516 | }); 517 | 518 | const speechBuffer = 519 | longerSpeech.speech instanceof Response 520 | ? Buffer.from(await longerSpeech.speech.arrayBuffer()) 521 | : longerSpeech.speech; 522 | 523 | const speechBufferWithoutSilence = await this.removeStartAndEndSilenceFromAudio(speechBuffer); 524 | 525 | const speechDuration = await this.getSpeechDuration(speechBufferWithoutSilence); 526 | 527 | if (typeof speechDuration !== 'number') 528 | throw new Error( 529 | `Error during audio duration calculation in translation service: duration is not a number: ${speechDuration}`, 530 | ); 531 | 532 | return { 533 | speech: speechBufferWithoutSilence, 534 | duration: speechDuration, 535 | requestId: longerSpeech.requestId, 536 | longerText: translatedTextWithSilence, 537 | }; 538 | } 539 | } 540 | -------------------------------------------------------------------------------- /src/speech/speechGenerator.ts: -------------------------------------------------------------------------------- 1 | import type { PreviousRequestIdsEL } from '../elevenlabs/elevenlabs'; 2 | import type { AllowedLanguages, SegmentWitDurationAndOriginalSegment } from '../types'; 3 | import type { SpeechAdjusted, SpeechResponseWithDuration, SpeechResponseWithIndex } from '../types/speech'; 4 | import { maxSimultaneousFetchElevenLabs, silenceBetweenSegmentConsideredAsPause } from '../utils/config'; 5 | import { ElevenLabsService } from '../elevenlabs/elevenlabs'; 6 | import { AudioUtils } from '../ffmpeg/audio-utils'; 7 | import crypto from 'crypto'; 8 | import fs from 'fs'; 9 | import fsPromises from 'fs/promises'; 10 | import { Helpers } from '../utils/helpers'; 11 | import { VideoUtils } from '../ffmpeg/video-utils'; 12 | import type { Readable } from 'stream'; 13 | import * as path from 'path'; 14 | 15 | export class SpeechGenerator { 16 | constructor() { 17 | // 18 | } 19 | 20 | static async getSpeechArrayFromTranscriptions({ 21 | segments, 22 | targetLanguage, 23 | isolatedVocalsPath, 24 | }: { 25 | segments: SegmentWitDurationAndOriginalSegment[]; 26 | isolatedVocalsPath: string; 27 | targetLanguage: AllowedLanguages; 28 | }): Promise<{ 29 | allResultsSorted: SpeechResponseWithIndex[]; 30 | clonedVoicesIds: { [key: string]: string }; 31 | }> { 32 | console.debug('Getting speeches...'); 33 | const maxSimultaneousFetch = maxSimultaneousFetchElevenLabs; 34 | 35 | let allResults: SpeechResponseWithIndex[] = []; 36 | const clonedVoicesIds: { 37 | //speakerIndex/number: clonedVoiceId 38 | [key: string]: string; 39 | } = {}; 40 | 41 | const speakers = this.getNumberSpeakers(segments); 42 | for (const speaker of speakers) { 43 | clonedVoicesIds[speaker] = await this.cloneVideoVoice(isolatedVocalsPath, segments, speaker); 44 | } 45 | 46 | try { 47 | //Voice cloning or custom Voice return only an Array of one Item 48 | const processTranscriptionBatch = async ({ 49 | batch, 50 | previousTranscriptionText, 51 | nextTranscriptionText, 52 | targetLanguage, 53 | }: { 54 | batch: SegmentWitDurationAndOriginalSegment[]; 55 | previousTranscriptionText: string | ''; 56 | nextTranscriptionText: string | ''; 57 | previousRequestIds: PreviousRequestIdsEL; 58 | targetLanguage: AllowedLanguages; 59 | }) => { 60 | const promises = batch.map((transcription) => 61 | this.getSpeechFromTTSEngine({ 62 | transcription: transcription.transcription, 63 | index: transcription.index, 64 | speakerIndex: transcription.speaker, 65 | clonedVoiceId: 
clonedVoicesIds[transcription.speaker], 66 | options: { 67 | previousTranscriptionText, 68 | nextTranscriptionText, 69 | }, 70 | targetLanguage, 71 | }), 72 | ); 73 | 74 | return await Promise.all(promises); 75 | }; 76 | 77 | const pastSpeechIds: PreviousRequestIdsEL = []; 78 | for (let i = 0; i < segments.length; i += maxSimultaneousFetch) { 79 | const batchEndIndex = i + maxSimultaneousFetch; 80 | const nextTranscriptionData = segments[i + 1]; 81 | const transcriptionBatch = segments.slice(i, batchEndIndex); 82 | const previousTranscriptionText = i === 0 ? '' : segments[i - 1]?.transcription; 83 | let nextTranscriptionText = ''; 84 | 85 | if (batchEndIndex < segments.length) { 86 | const silenceBetweenNextTranscription = nextTranscriptionData?.begin - segments[i].end; 87 | 88 | if ( 89 | nextTranscriptionData?.speaker !== segments[i].speaker || 90 | silenceBetweenNextTranscription > silenceBetweenSegmentConsideredAsPause 91 | ) { 92 | nextTranscriptionText = ''; 93 | } else { 94 | nextTranscriptionText = nextTranscriptionData.transcription; 95 | } 96 | } 97 | 98 | const batchResults = await processTranscriptionBatch({ 99 | batch: transcriptionBatch, 100 | previousTranscriptionText: previousTranscriptionText, 101 | nextTranscriptionText: nextTranscriptionText, 102 | previousRequestIds: pastSpeechIds || '', 103 | targetLanguage, 104 | }); 105 | 106 | if (pastSpeechIds.length === 3) pastSpeechIds.shift(); 107 | pastSpeechIds.push(batchResults[0].requestId); 108 | 109 | allResults = allResults.concat(batchResults); 110 | } 111 | console.debug('Speeches got.'); 112 | const allResultsSorted = allResults.sort((a, b) => a.index - b.index); 113 | 114 | return { 115 | allResultsSorted, 116 | clonedVoicesIds, 117 | }; 118 | } catch (err: unknown) { 119 | console.error(err); 120 | if (err instanceof Error) { 121 | throw err; 122 | } 123 | throw new Error('Error while getting speeches'); 124 | } 125 | } 126 | 127 | static async cloneVideoVoice( 128 | vocalsAudioPath: string, 129 | segments: SegmentWitDurationAndOriginalSegment[], 130 | speakerIndex: number, 131 | ) { 132 | console.debug('Cloning video voice...'); 133 | function combineBuffers(buffers: Buffer[]): Buffer { 134 | const totalLength = buffers.reduce((sum, buffer) => sum + buffer.length, 0); 135 | 136 | const combinedBuffer = Buffer.alloc(totalLength); 137 | 138 | let offset = 0; 139 | for (const buffer of buffers) { 140 | buffer.copy(combinedBuffer, offset); 141 | offset += buffer.length; 142 | } 143 | 144 | return combinedBuffer; 145 | } 146 | 147 | const filePath = `temporary-files/audioFromOneSpeaker-${crypto.randomUUID()}.mp3`; 148 | 149 | try { 150 | let audioFromOneSpeakerBuffer = await this.getAudiosSpeakerAndMerge( 151 | segments, 152 | speakerIndex, 153 | vocalsAudioPath, 154 | ); 155 | 156 | fs.writeFileSync(filePath, combineBuffers(audioFromOneSpeakerBuffer)); 157 | console.debug('getting file duration for function cloneVideoVoice'); 158 | const audioDuration = await VideoUtils.getFileDuration(filePath); 159 | 160 | if (typeof audioDuration !== 'number') 161 | throw new Error( 162 | `Error during audio duration when cloning video voice: duration is not a number: ${audioDuration}`, 163 | ); 164 | 165 | if (audioDuration < 90) { 166 | const resultPath = await AudioUtils.duplicateAndConcatenateAudio(filePath, 3, 'mp3'); 167 | 168 | audioFromOneSpeakerBuffer = await Helpers.splitAudioIntoBuffers(resultPath); 169 | 170 | if (fs.existsSync(resultPath)) await fsPromises.unlink(resultPath); 171 | } 172 | 173 | const elevenLabsService = 
new ElevenLabsService(); 174 | const response = await elevenLabsService.cloneVoice( 175 | audioFromOneSpeakerBuffer, 176 | 'speaker-' + speakerIndex, 177 | audioDuration, 178 | ); 179 | 180 | return response.voice_id; 181 | } catch (err) { 182 | console.error(err); 183 | if (err instanceof Error) { 184 | throw err; 185 | } 186 | throw new Error('Error while cloning video voice'); 187 | } finally { 188 | if (fs.existsSync(filePath)) await fsPromises.unlink(filePath); 189 | } 190 | } 191 | 192 | static async getSpeechFromTTSEngine({ 193 | transcription, 194 | index, 195 | speakerIndex, 196 | options, 197 | targetLanguage, 198 | clonedVoiceId, 199 | }: { 200 | transcription: string; 201 | index: number; 202 | speakerIndex: number; 203 | clonedVoiceId: string; 204 | options?: { 205 | previousTranscriptionText: string | ''; 206 | nextTranscriptionText: string | ''; 207 | }; 208 | targetLanguage: AllowedLanguages; 209 | }): Promise { 210 | const elevenLabsService = new ElevenLabsService(); 211 | 212 | const createSpeechWithVoiceCloning = async () => { 213 | try { 214 | return await elevenLabsService.generateAudioFile({ 215 | text: transcription, 216 | modelId: 'eleven_multilingual_v2', 217 | voiceId: clonedVoiceId, 218 | previousText: options?.previousTranscriptionText, 219 | targetLanguage: targetLanguage, 220 | nextText: options?.nextTranscriptionText, 221 | }); 222 | } catch (err) { 223 | console.error(err); 224 | if (err instanceof Error) { 225 | throw err; 226 | } 227 | 228 | throw new Error('Error while getting speech with ElevenLabs'); 229 | } 230 | }; 231 | 232 | const response = await createSpeechWithVoiceCloning(); 233 | 234 | return { 235 | index: index, 236 | speech: response.response, 237 | speaker: speakerIndex, 238 | requestId: response?.requestId, 239 | }; 240 | } 241 | 242 | static async getAudiosSpeakerAndMerge( 243 | segments: SegmentWitDurationAndOriginalSegment[], 244 | speakerIndex: number, 245 | vocalsAudioPath: string, 246 | ): Promise { 247 | console.debug('Getting audios from one speaker...'); 248 | const uuid = crypto.randomUUID(); 249 | const finalAudioPath = `temporary-files/finalAudioPathFromSpeaker-${uuid}.mp3`; 250 | const audioPartsPathFromSpeaker: string[] = []; 251 | 252 | try { 253 | const segmentsFromThisSpeaker = segments.filter((segment) => segment.speaker === speakerIndex); 254 | 255 | for (const segmentWithDuration of segmentsFromThisSpeaker) { 256 | try { 257 | const singleVocalSpeakerPath = await AudioUtils.cutAudioToBufferAtSpecificTime( 258 | vocalsAudioPath, 259 | segmentWithDuration.begin - 0.2, 260 | segmentWithDuration.end + 0.2, 261 | false, 262 | ); 263 | 264 | if (typeof singleVocalSpeakerPath === 'string') { 265 | audioPartsPathFromSpeaker.push(singleVocalSpeakerPath); 266 | } else { 267 | throw new Error('singleVocalSpeakerPath is not type string'); 268 | } 269 | } catch (error) { 270 | for (const path of audioPartsPathFromSpeaker) { 271 | if (fs.existsSync(path)) await fsPromises.unlink(path); 272 | } 273 | throw error; 274 | } 275 | } 276 | 277 | await AudioUtils.concatenateAudio({ 278 | files: audioPartsPathFromSpeaker, 279 | outputPath: finalAudioPath, 280 | outputFormat: 'mp3', 281 | }); 282 | 283 | if (await this.isFileSizeMoreThan10MB(finalAudioPath)) { 284 | return await Helpers.splitAudioIntoBuffers(finalAudioPath); 285 | } else { 286 | const bufferFile = await fsPromises.readFile(finalAudioPath); 287 | return [bufferFile]; 288 | } 289 | } catch (error) { 290 | console.error(error); 291 | if (error instanceof Error) { 292 | throw 
error; 293 | } 294 | throw new Error('Error while getting audio from one speaker.'); 295 | } finally { 296 | if (fs.existsSync(finalAudioPath)) { 297 | try { 298 | await fsPromises.unlink(finalAudioPath); 299 | } catch (e) { 300 | console.error('Error cleaning up finalAudioPath:', e); 301 | } 302 | } 303 | // Use for...of so each unlink is awaited; forEach would not await the async callback 304 | for (const path of audioPartsPathFromSpeaker) { 305 | if (fs.existsSync(path)) { 306 | try { 307 | await fsPromises.unlink(path); 308 | } catch (e) { 309 | console.error(`Error cleaning up temp file ${path}:`, e); 310 | } 311 | } 312 | } 313 | } 314 | } 315 | 316 | static getNumberSpeakers(segments: SegmentWitDurationAndOriginalSegment[]) { 317 | const speakerArray = segments.map((segment) => segment.speaker); 318 | return Array.from(new Set(speakerArray)); 319 | } 320 | 321 | static async isFileSizeMoreThan10MB(filePath: string): Promise<boolean> { 322 | try { 323 | const stats = await fsPromises.stat(filePath); 324 | const fileSizeInBytes = stats.size; 325 | const fileSizeInMegabytes = fileSizeInBytes / (1024 * 1024); 326 | return fileSizeInMegabytes > 10; 327 | } catch (error) { 328 | console.error('Error while checking the file size:', error); 329 | throw error; 330 | } 331 | } 332 | 333 | static async getEachSpeechDuration({ 334 | speechArray, 335 | transcriptions, 336 | }: { 337 | speechArray: SpeechResponseWithIndex[]; 338 | transcriptions: SegmentWitDurationAndOriginalSegment[]; 339 | }): Promise<SpeechResponseWithDuration[]> { 340 | console.debug('Getting speech durations...'); 341 | try { 342 | const speechArraySorted = speechArray.sort((a, b) => a.index - b.index); 343 | 344 | const arraySpeechWithDuration: SpeechResponseWithDuration[] = []; 345 | 346 | for (let i = 0; i < speechArraySorted.length; i++) { 347 | const speech = speechArraySorted[i]; 348 | const audioBuffer = 349 | speech.speech instanceof Response ?
Buffer.from(await speech.speech.arrayBuffer()) : speech.speech; 350 | 351 | console.debug(`Getting initial speech duration for index ${i}`); 352 | 353 | const duration = await this.getSpeechDuration(audioBuffer); 354 | 355 | if (typeof duration !== 'number') { 356 | transcriptions = transcriptions.filter((transcription) => transcription.index !== speech.index); // assign the result, otherwise the filter call is a no-op 357 | continue; 358 | } 359 | 360 | arraySpeechWithDuration.push({ 361 | speech: audioBuffer, 362 | duration, 363 | speechIndex: i, 364 | speaker: speech.speaker, 365 | requestId: speech.requestId, 366 | }); 367 | } 368 | 369 | console.debug('All speech durations retrieved.'); 370 | return arraySpeechWithDuration.sort((a, b) => a.speechIndex - b.speechIndex); 371 | } catch (err: unknown) { 372 | console.error(err); 373 | throw new Error('Error while getting speech durations'); 374 | } 375 | } 376 | 377 | static async getSpeechDuration(speech: Readable | Buffer): Promise<number | undefined> { 378 | try { 379 | return await AudioUtils.getAudioDurationFromBuffer(speech); 380 | } catch (err) { 381 | console.error('Speech duration error : ' + err); 382 | throw new Error('Error while getting speech duration'); 383 | } 384 | } 385 | 386 | static async removeStartAndEndSilenceFromAllAudio(arraySpeeches: SpeechResponseWithDuration[]) { 387 | const results = []; 388 | 389 | for (const speech of arraySpeeches) { 390 | try { 391 | let retries = 0; 392 | const maxRetries = 3; 393 | let newSpeechBuffer: Buffer = speech.speech; 394 | let success = false; 395 | 396 | while (!success && retries < maxRetries) { 397 | try { 398 | const processedBuffer = await this.removeStartAndEndSilenceFromAudio(speech.speech); 399 | newSpeechBuffer = processedBuffer; 400 | success = true; 401 | } catch (error: any) { 402 | retries++; 403 | if (retries >= maxRetries) throw error; // only give up once all retries are exhausted 404 | } 405 | } 406 | 407 | const newSpeechDuration = await this.getSpeechDuration(newSpeechBuffer); 408 | 409 | if (typeof newSpeechDuration !== 'number') { 410 | console.warn( 411 | `Speech duration calculation failed for speech index ${speech.speechIndex}, using original duration`, 412 | ); 413 | results.push({ 414 | speech: speech.speech, // Use original speech buffer 415 | duration: speech.duration, // Use original duration 416 | speechIndex: speech.speechIndex, 417 | speaker: speech.speaker, 418 | requestId: speech.requestId, 419 | }); 420 | continue; 421 | } 422 | 423 | results.push({ 424 | speech: newSpeechBuffer, 425 | duration: newSpeechDuration, 426 | speechIndex: speech.speechIndex, 427 | speaker: speech.speaker, 428 | requestId: speech.requestId, 429 | }); 430 | } catch (error) { 431 | console.error(`Error processing speech at index ${speech.speechIndex}:`, error); 432 | 433 | // Instead of failing the entire batch, keep the original speech 434 | results.push({ 435 | speech: speech.speech, // Use original speech buffer 436 | duration: speech.duration, // Use original duration 437 | speechIndex: speech.speechIndex, 438 | speaker: speech.speaker, 439 | requestId: speech.requestId, 440 | }); 441 | } 442 | } 443 | 444 | return results; 445 | } 446 | 447 | static async removeStartAndEndSilenceFromAudio(speech: Buffer): Promise<Buffer> { 448 | console.debug('Removing start and end silence from audio...'); 449 | const temporaryInputFile = `temporary-files/input-for-trim-${crypto.randomUUID()}.wav`; 450 | const temporaryOutputFile = `temporary-files/output-for-trim-${crypto.randomUUID()}.wav`; 451 | 452 | try { 453 | await fsPromises.writeFile(temporaryInputFile, speech); 454 | 455 | try { 456 | await
AudioUtils.removeStartAndEndSilenceFromAudioWithFFMPEG(temporaryInputFile, temporaryOutputFile); 457 | } catch (ffmpegError: any) { 458 | console.error('FFmpeg error during silence removal:', ffmpegError); 459 | 460 | if (!fs.existsSync(temporaryOutputFile)) { 461 | throw new Error(`FFmpeg failed to process audio: ${ffmpegError.message || 'Unknown error'}`); 462 | } 463 | 464 | console.debug('FFmpeg reported an error but output file exists, attempting to continue'); 465 | } 466 | 467 | if (!fs.existsSync(temporaryOutputFile)) { 468 | throw new Error('Output file was not created during silence removal'); 469 | } 470 | 471 | const stats = await fsPromises.stat(temporaryOutputFile); 472 | if (stats.size === 0) { 473 | throw new Error('Output file is empty after silence removal'); 474 | } 475 | 476 | const bufferNewSpeech = await fsPromises.readFile(temporaryOutputFile); 477 | 478 | console.debug('Start and end silence removed from audio.'); 479 | return bufferNewSpeech; 480 | } catch (err: any) { 481 | console.error('Error in removeStartAndEndSilenceFromAudio:', err); 482 | throw new Error( 483 | `ERROR while removing start and end silence from audio: ${err.message || 'Unknown error'}`, 484 | ); 485 | } finally { 486 | try { 487 | if (fs.existsSync(temporaryInputFile)) await fsPromises.unlink(temporaryInputFile); 488 | } catch (unlinkError) { 489 | console.error('Error deleting temporary input file:', unlinkError); 490 | } 491 | 492 | try { 493 | if (fs.existsSync(temporaryOutputFile)) await fsPromises.unlink(temporaryOutputFile); 494 | } catch (unlinkError) { 495 | console.error('Error deleting temporary output file:', unlinkError); 496 | } 497 | } 498 | } 499 | 500 | static async createAndAssembleSeparateAudioTracksEachSpeaker(clips: SpeechAdjusted[]): Promise { 501 | const numberOfSpeakers = [...new Set(clips.map((clip) => clip.speaker))]; 502 | 503 | if (numberOfSpeakers.length === 1) { 504 | console.debug('starting assemble audio for one speaker'); 505 | const audioFrequency = 44100; 506 | const outputPath = await this.assembleAudio(clips, audioFrequency); 507 | console.debug('assemble audio for one speaker done'); 508 | return outputPath; 509 | } 510 | 511 | console.debug(`starting overlaying audio for ${numberOfSpeakers.length} speakers`); 512 | const timelineForEachSpeaker: string[] = []; 513 | 514 | for (const speaker of numberOfSpeakers) { 515 | console.debug(`starting assemble audio for speaker ${speaker}`); 516 | const speakerClips = clips.filter((clip) => clip.speaker === speaker); 517 | timelineForEachSpeaker.push(await this.assembleAudio(speakerClips, 44100)); 518 | } 519 | 520 | console.debug('assembling audio for all speakers done'); 521 | 522 | const outputPath = `temporary-files/${crypto.randomUUID()}-result-of-overlaying.wav`; 523 | 524 | await AudioUtils.overlayingAudio(outputPath, timelineForEachSpeaker); 525 | 526 | return outputPath; 527 | } 528 | 529 | static async assembleAudio(clips: SpeechAdjusted[], audioFrequency: number) { 530 | console.debug('Assembling audio...'); 531 | let previousEnd = 0; 532 | const tempFiles: string[] = []; 533 | 534 | try { 535 | for (const clip of clips) { 536 | if (clip.begin > previousEnd && parseFloat((clip.begin - previousEnd).toFixed(4)) > 0.001) { 537 | const silenceDuration = (clip.begin - previousEnd).toFixed(4); 538 | const silenceDurationFormatted = parseFloat(silenceDuration); 539 | const silenceFile = await AudioUtils.generateSilence(silenceDurationFormatted, audioFrequency); 540 | tempFiles.push(silenceFile); 541 | } 542 | 
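// Timeline assembly, step by step: each iteration first fills the gap between the previous
// clip's end and the current clip's begin with a generated silence file, then appends the
// speech clip itself, so the concatenated track keeps the original timing. With illustrative
// numbers, previousEnd = 3.2 s and clip.begin = 4.0 s would produce a 0.8 s silence file
// pushed just before the clip; gaps of roughly 1 ms or less are skipped by the check above.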
543 | if (clip.speech) { 544 | const audioFilePath = `temporary-files/${crypto.randomUUID()}-audio.wav`; 545 | await fsPromises.writeFile(audioFilePath, clip.speech); 546 | tempFiles.push(audioFilePath); 547 | } 548 | 549 | previousEnd = clip.begin + clip.speechDuration; 550 | } 551 | 552 | const outputPath = `temporary-files/${crypto.randomUUID()}-for-assemble-audio.wav`; 553 | 554 | const concatenatedAudioPath = await AudioUtils.concatenateAudio({ 555 | files: tempFiles, 556 | outputPath, 557 | outputFormat: 'wav', 558 | }); 559 | 560 | return concatenatedAudioPath; 561 | } catch (err: unknown) { 562 | console.error(err); 563 | throw new Error('Error while assembling audio'); 564 | } 565 | } 566 | 567 | static async overlayAudioAndBackgroundMusic( 568 | voicesAudioPath: string, 569 | backgroundMusicPath: string, 570 | ): Promise { 571 | console.debug('Merging audio and background music...'); 572 | try { 573 | const outputPath = path.join(`output/result-${crypto.randomUUID()}.wav`); 574 | 575 | //!Do not delete this line for the moment 576 | //await this.ffmpegService.amplifyAudio(backgroundMusicPath, 1.5); 577 | 578 | return await AudioUtils.mergeAudioFiles(voicesAudioPath, backgroundMusicPath, outputPath); 579 | } catch (err) { 580 | console.error(err); 581 | throw new Error('Error while merging audio and background music'); 582 | } finally { 583 | if (fs.existsSync(voicesAudioPath)) await fsPromises.unlink(voicesAudioPath); 584 | } 585 | } 586 | } 587 | -------------------------------------------------------------------------------- /src/subtitles/subtitles-generator.ts: -------------------------------------------------------------------------------- 1 | import { VideoUtils } from '../ffmpeg/video-utils'; 2 | import type { AllowedLanguages, SegmentWitDurationAndOriginalSegment } from '../types'; 3 | import { specialLanguagesWithSpecialCharacters } from '../utils/config'; 4 | import fs from 'fs'; 5 | import fsPromises from 'fs/promises'; 6 | import crypto from 'crypto'; 7 | 8 | export class SubtitlesGenerator { 9 | constructor() { 10 | // 11 | } 12 | 13 | static async addSubtitlesInVideo({ 14 | transcriptionData, 15 | initialVideoPath, 16 | lang, 17 | }: { 18 | transcriptionData: SegmentWitDurationAndOriginalSegment[]; 19 | initialVideoPath: string; 20 | lang: AllowedLanguages; 21 | }): Promise { 22 | console.debug('Adding subtitles in video...'); 23 | const maxLengthText = 50; 24 | const srtContent = this.createSrt(transcriptionData, maxLengthText, lang); 25 | const srtFilePath = `temporary-files/subtitles-${crypto.randomUUID()}.srt`; 26 | fs.writeFileSync(srtFilePath, srtContent, 'utf8'); 27 | const outputVideoFilePath = `output/result-${crypto.randomUUID()}.mp4`; 28 | 29 | try { 30 | await VideoUtils.addSubtitles({ 31 | videoPath: initialVideoPath, 32 | srtFilePath: srtFilePath, 33 | outputFilePath: outputVideoFilePath, 34 | }); 35 | 36 | return outputVideoFilePath; 37 | } catch (err) { 38 | console.error(err); 39 | throw new Error('Error while adding subtitles'); 40 | } finally { 41 | if (fs.existsSync(srtFilePath)) await fsPromises.unlink(srtFilePath); 42 | if (fs.existsSync(initialVideoPath)) await fsPromises.unlink(initialVideoPath); 43 | } 44 | } 45 | 46 | static createSrt( 47 | subtitles: SegmentWitDurationAndOriginalSegment[], 48 | maxLength: number, 49 | lang: AllowedLanguages, 50 | ): string { 51 | console.debug('Creating subtitles srt file...'); 52 | let srtIndex = 1; 53 | let srtContent = ''; 54 | 55 | for (const subtitle of subtitles) { 56 | const chunks = 
this.splitTextProportionally(subtitle.transcription, maxLength, lang); 57 | 58 | const totalWords = chunks.reduce((acc, chunk) => acc + chunk.split(' ').length, 0); 59 | 60 | let previousEnd = subtitle.begin; 61 | for (const chunk of chunks) { 62 | const words = chunk.split(' ').length; 63 | const chunkDuration = (subtitle.end - subtitle.begin) * (words / totalWords); 64 | const begin = this.secondsToSrtTime(previousEnd); 65 | const end = this.secondsToSrtTime(previousEnd + chunkDuration); 66 | 67 | srtContent += `${srtIndex}\n${begin} --> ${end}\n${chunk}\n\n`; 68 | srtIndex++; 69 | previousEnd += chunkDuration; 70 | } 71 | } 72 | 73 | console.debug('Subtitles srt file created'); 74 | return srtContent; 75 | } 76 | 77 | static secondsToSrtTime(seconds: number): string { 78 | const date = new Date(0); 79 | date.setSeconds(seconds); 80 | const iso = date.toISOString(); 81 | return iso.substring(11, 23).replace('.', ','); 82 | } 83 | 84 | static ddLineBreaks(text: string): string { 85 | const maxLength = 20; 86 | let result = ''; 87 | let lineLength = 0; 88 | 89 | for (const char of text) { 90 | result += char; 91 | lineLength++; 92 | if (lineLength >= maxLength) { 93 | result += '\n'; 94 | lineLength = 0; 95 | } 96 | } 97 | 98 | return result; 99 | } 100 | 101 | static splitTextProportionally(text: string, maxLength: number, lang: AllowedLanguages): string[] { 102 | const chunks: string[] = []; 103 | let currentChunk = ''; 104 | 105 | if (specialLanguagesWithSpecialCharacters.includes(lang)) { 106 | maxLength = 20; 107 | for (const char of text) { 108 | if ((currentChunk + char).length > maxLength) { 109 | chunks.push(currentChunk); 110 | currentChunk = ''; 111 | } 112 | currentChunk += char; 113 | } 114 | } else { 115 | const words = text.split(' '); 116 | for (const word of words) { 117 | if ((currentChunk + ' ' + word).trim().length > maxLength) { 118 | chunks.push(currentChunk.trim()); 119 | currentChunk = ''; 120 | } 121 | currentChunk += (currentChunk ? 
' ' : '') + word; 122 | } 123 | } 124 | 125 | if (currentChunk) { 126 | chunks.push(currentChunk.trim()); 127 | } 128 | 129 | return chunks; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/transcription/formatter.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | AllowedLanguages, 3 | AudioOriginalLangAllowed, 4 | GladiaResponse, 5 | Result, 6 | SegmentDetail, 7 | SegmentDetailOut, 8 | SegmentDetailOutWithDuration, 9 | Sentence, 10 | Utterance, 11 | Word, 12 | } from '../types/index'; 13 | import { maxCharactersPerSegmentForNonLatinScriptLanguages, threshold } from '../utils/config'; 14 | import { maxCharactersPerSegment } from '../utils/config'; 15 | import { languageCodes, nonLatinScriptLanguages } from '../utils/constants'; 16 | 17 | export class Formatter { 18 | static formatTranscription(transcription: GladiaResponse, detectedLanguage: AudioOriginalLangAllowed) { 19 | const initialFormattedTranscription = this.getDetailsAndFormatTranscription( 20 | transcription.result, 21 | detectedLanguage, 22 | ); 23 | 24 | const mergedSegments = this.mergeSegments(initialFormattedTranscription, threshold); 25 | 26 | const finalTranscription = this.addDurationForEachTranscription(mergedSegments); 27 | 28 | return finalTranscription; 29 | } 30 | 31 | static getDetailsAndFormatTranscription( 32 | transcriptionsData: Result, 33 | detectedLanguage: AudioOriginalLangAllowed, 34 | ) { 35 | const gladiaUtterances = (dataTranscriptionGladia: Result) => { 36 | return dataTranscriptionGladia?.transcription?.utterances; 37 | }; 38 | 39 | let formattedUtterances: { 40 | transcription: string; 41 | begin: number; 42 | end: number; 43 | wordsWithSilence: string; 44 | speaker: number; 45 | channel: number; 46 | confidence: number; 47 | language: string; 48 | }[] = []; 49 | 50 | const splittedUtterances = this.splitTooLongUtterances( 51 | gladiaUtterances(transcriptionsData) as Utterance[], 52 | ); 53 | 54 | formattedUtterances = splittedUtterances.map((part) => ({ 55 | transcription: part.text, 56 | begin: Number(part.start.toFixed(3)), 57 | end: Number(part.end.toFixed(3)), 58 | wordsWithSilence: this.addTimesInText(part.words), 59 | speaker: part?.speaker || 0, 60 | channel: part?.channel || 0, 61 | confidence: part.confidence, 62 | language: detectedLanguage, 63 | })); 64 | 65 | return formattedUtterances; 66 | } 67 | 68 | static splitTooLongUtterances(transcriptions: Utterance[]) { 69 | const maxCharactersPerSegment = 500; 70 | const adjustedTranscription: Utterance[] = []; 71 | 72 | transcriptions.forEach((transcription) => { 73 | if (transcription.text.length > maxCharactersPerSegment) { 74 | const splittedTranscription = this.splitSegment(transcription) as Utterance[]; 75 | adjustedTranscription.push(...splittedTranscription); 76 | } else { 77 | adjustedTranscription.push(transcription); 78 | } 79 | }); 80 | 81 | return adjustedTranscription; 82 | } 83 | 84 | static splitSegment(obj: Sentence | Utterance, maxSentenceLength: number = 500): Sentence[] | Utterance[] { 85 | const words = obj.words; 86 | const chunks: (Sentence | Utterance)[] = []; 87 | 88 | let currentChunkWords: Word[] = []; 89 | let currentSentenceLength = 0; 90 | 91 | const isSentence = 'sentence' in obj; 92 | const textKey = isSentence ? 
'sentence' : 'text'; 93 | 94 | for (let i = 0; i < words.length; i++) { 95 | const word = words[i]; 96 | const wordLength = word.word.length; 97 | 98 | if (currentSentenceLength + wordLength > maxSentenceLength && currentChunkWords.length > 0) { 99 | const sentence = currentChunkWords.map((w) => w.word).join(''); 100 | const start = currentChunkWords[0].start; 101 | const end = currentChunkWords[currentChunkWords.length - 1].end; 102 | const confidence = 103 | currentChunkWords.reduce((sum, w) => sum + w.confidence, 0) / currentChunkWords.length; 104 | 105 | const newSegment = { 106 | words: currentChunkWords, 107 | language: obj.language, 108 | start: start, 109 | end: end, 110 | speaker: obj?.speaker || 0, 111 | confidence: confidence, 112 | channel: obj?.channel || 0, 113 | [textKey]: sentence, 114 | }; 115 | 116 | //@ts-ignore 117 | chunks.push(newSegment as Sentence | Utterance); 118 | 119 | currentChunkWords = []; 120 | currentSentenceLength = 0; 121 | } 122 | 123 | currentChunkWords.push(word); 124 | currentSentenceLength += wordLength; 125 | } 126 | 127 | if (currentChunkWords.length > 0) { 128 | const sentence = currentChunkWords.map((w) => w.word).join(''); 129 | const start = currentChunkWords[0].start; 130 | const end = currentChunkWords[currentChunkWords.length - 1].end; 131 | const confidence = 132 | currentChunkWords.reduce((sum, w) => sum + w.confidence, 0) / currentChunkWords.length; 133 | 134 | const newSegment = { 135 | words: currentChunkWords, 136 | language: obj.language, 137 | start: start, 138 | end: end, 139 | speaker: obj?.speaker || 0, 140 | confidence: confidence, 141 | channel: obj?.channel || 0, 142 | [textKey]: sentence, 143 | }; 144 | 145 | //@ts-ignore 146 | chunks.push(newSegment as Sentence | Utterance); 147 | } 148 | 149 | return chunks as Sentence[] | Utterance[]; 150 | } 151 | 152 | private static addTimesInText(words: Word[]) { 153 | let enhancedText = ''; 154 | 155 | words.forEach((word, index) => { 156 | const timeBetweenNextWord = 157 | index !== words.length - 1 ? (words[index + 1].start - word.end).toString() : ''; 158 | 159 | enhancedText += word.word.trim() + (timeBetweenNextWord ? `<${timeBetweenNextWord.slice(0, 5)}s>` : ''); 160 | }); 161 | 162 | return enhancedText; 163 | } 164 | 165 | static mergeSegments(segments: SegmentDetail[], timeThreshold: number): SegmentDetailOut[] { 166 | console.debug('Merging segments...'); 167 | const mergedSegments = this.mergeUnderCondition(segments, timeThreshold); 168 | 169 | return mergedSegments; 170 | } 171 | 172 | static getMaxCharactersPerSegment(language: string): number { 173 | const languageCode = languageCodes[language as keyof typeof languageCodes]?.toLowerCase(); 174 | return nonLatinScriptLanguages.includes(languageCode as AllowedLanguages) 175 | ? 
maxCharactersPerSegmentForNonLatinScriptLanguages 176 | : maxCharactersPerSegment; 177 | } 178 | 179 | static mergeUnderCondition(segments: SegmentDetail[], timeThreshold: number) { 180 | // If any merged segment ends up longer than 4000 characters, an error is thrown; the merge should then be retried with a smaller timeThreshold 181 | 182 | const getMergedTranscription = () => { 183 | const mergedSegments: SegmentDetailOut[] = []; 184 | let currentSegment = segments[0]; 185 | let mergedPartIndex = 0; 186 | 187 | if (segments.length === 0) throw new Error('No transcription found in the response.'); 188 | 189 | for (let i = 1; i < segments.length; i++) { 190 | const nextSegment = segments[i]; 191 | const maxCharactersPerSegment = this.getMaxCharactersPerSegment(nextSegment.language); 192 | 193 | // Check if the start of the next segment is close to the end of the current segment 194 | const difference = nextSegment.begin - currentSegment.end; 195 | 196 | if ( 197 | difference <= timeThreshold && 198 | currentSegment.speaker === nextSegment.speaker && 199 | currentSegment.transcription.length + nextSegment.transcription.length < maxCharactersPerSegment 200 | ) { 201 | // Merge segments if close enough 202 | currentSegment = { 203 | ...currentSegment, 204 | transcription: currentSegment.transcription + ' ' + nextSegment.transcription, 205 | end: nextSegment.end, 206 | //*To get words with low confidence, simply add low confidence words in an array 207 | wordsWithSilence: currentSegment.wordsWithSilence.concat(nextSegment.wordsWithSilence), 208 | }; 209 | } else { 210 | // Add the current segment to the array and move on to the next one 211 | mergedSegments.push({ 212 | ...currentSegment, 213 | index: mergedPartIndex, 214 | }); 215 | currentSegment = nextSegment; 216 | mergedPartIndex++; 217 | } 218 | } 219 | 220 | mergedSegments.push({ 221 | ...currentSegment, 222 | index: mergedPartIndex, 223 | }); 224 | 225 | return mergedSegments; 226 | }; 227 | 228 | const finalMergedTranscriptions = getMergedTranscription(); 229 | 230 | const isEverySegmentsLessThan4000 = finalMergedTranscriptions.every( 231 | (transcription) => transcription.transcription.length < 4000, 232 | ); 233 | if (!isEverySegmentsLessThan4000) { 234 | console.error('Error while merging transcriptions: One of the transcriptions is too long (>4000)'); 235 | //Throw an error if the transcription is too long 236 | throw new Error('One of the transcriptions is too long (>4000)'); 237 | } else { 238 | return finalMergedTranscriptions; 239 | } 240 | } 241 | 242 | static addDurationForEachTranscription(transcription: SegmentDetail[]): SegmentDetailOutWithDuration[] { 243 | return transcription.map((part, index) => { 244 | const duration = part.end - part.begin; 245 | return { 246 | ...part, 247 | duration: Number(duration.toFixed(3)), 248 | index, 249 | }; 250 | }); 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/transcription/textTranslator.ts: -------------------------------------------------------------------------------- 1 | import { models, requestToGPT } from '../llm/openai'; 2 | import type { OpenAIModel } from '../llm/openai'; 3 | import { PromptBuilder } from '../llm/prompt-builder'; 4 | import { defaultInstructions } from '../llm/prompt-builder'; 5 | import type { 6 | AllowedLanguages, 7 | AudioOriginalLangAllowed, 8 | CreatePromptArguments, 9 | SegmentDetailOutWithDuration, 10 | SegmentWitDurationAndOriginalSegment, 11 | } from '../types'; 12 | 13 | export class TextTranslator { 14 | static async
translateTranscriptionInTargetLanguage({ 15 | transcription, 16 | targetLanguage, 17 | originLanguage, 18 | transcriptionSummary, 19 | }: { 20 | transcription: SegmentDetailOutWithDuration[]; 21 | targetLanguage: AllowedLanguages; 22 | originLanguage: AudioOriginalLangAllowed; 23 | transcriptionSummary: string; 24 | }) { 25 | const translatedTranscription = await this.translateTranscription({ 26 | transcription, 27 | targetLanguage, 28 | originLanguage, 29 | transcriptionSummary, 30 | }); 31 | 32 | return translatedTranscription; 33 | } 34 | 35 | static async translateTranscription({ 36 | transcription, 37 | targetLanguage, 38 | originLanguage, 39 | transcriptionSummary, 40 | }: { 41 | transcription: SegmentDetailOutWithDuration[]; 42 | targetLanguage: AllowedLanguages; 43 | originLanguage: string; 44 | transcriptionSummary: string; 45 | }) { 46 | console.debug('Translating transcription...'); 47 | const maxSimultaneousTranslation = 10; 48 | let translationPromises: Promise[] = []; 49 | const transcriptionTranslated: SegmentWitDurationAndOriginalSegment[] = []; 50 | const deepCopyTranscriptions = ( 51 | JSON.parse(JSON.stringify(transcription)) as SegmentWitDurationAndOriginalSegment[] 52 | ).sort((a, b) => a.index - b.index) as SegmentWitDurationAndOriginalSegment[]; 53 | 54 | try { 55 | for (let i = 0; i < deepCopyTranscriptions.length; i++) { 56 | // Skip for first transcription to avoid undefined reference 57 | const lastTranscription = i !== 0 ? deepCopyTranscriptions[i - 1].transcription : ''; 58 | 59 | const actualTranscription = deepCopyTranscriptions[i].transcription; 60 | 61 | deepCopyTranscriptions[i].transcription = actualTranscription; 62 | 63 | const actualTranscriptionSpeaker = deepCopyTranscriptions[i].speaker?.toString() || '0'; 64 | 65 | const nextTranscriptionSpeaker = 66 | i !== deepCopyTranscriptions.length - 1 67 | ? deepCopyTranscriptions[i + 1].speaker?.toString() || '0' 68 | : ''; 69 | 70 | const nextTranscription = 71 | i !== deepCopyTranscriptions.length - 1 ? deepCopyTranscriptions[i + 1].transcription || '' : ''; 72 | 73 | const lastTranscriptionSpeaker = lastTranscription 74 | ? 
deepCopyTranscriptions[i - 1].speaker?.toString() || '0' 75 | : ''; 76 | 77 | const translationPromise = this.getTranslationPromise({ 78 | actualTranscription, 79 | lastTranscription, 80 | targetLanguage: targetLanguage, 81 | transcriptionLanguage: originLanguage, 82 | actualTranscriptionSpeaker, 83 | nextTranscriptionSpeaker, 84 | nextTranscription, 85 | lastTranscriptionSpeaker, 86 | transcriptionSummary, 87 | }); 88 | 89 | translationPromises.push(translationPromise); 90 | 91 | // Resolve translations in batches or at the last item 92 | if ( 93 | translationPromises.length === maxSimultaneousTranslation || 94 | i === deepCopyTranscriptions.length - 1 95 | ) { 96 | const translations: string[] = await Promise.all(translationPromises); 97 | for (let j = 0; j < translations.length; j++) { 98 | const transcriptionToUpdate = deepCopyTranscriptions[transcriptionTranslated.length]; 99 | transcriptionToUpdate.originalTranscription = transcriptionToUpdate.transcription; // capture the source text before it is overwritten (indexing with j would point at the wrong segment after the first batch) 100 | transcriptionToUpdate.transcription = translations[j]; 101 | transcriptionToUpdate.language = targetLanguage; 102 | 103 | transcriptionTranslated.push(transcriptionToUpdate); 104 | } 105 | translationPromises = []; 106 | } 107 | } 108 | 109 | console.debug('Transcription translated.'); 110 | return transcriptionTranslated; 111 | } catch (error: unknown) { 112 | console.error(error); 113 | throw new Error('Error while translating transcription'); 114 | } 115 | } 116 | 117 | static async getTranslationPromise({ 118 | actualTranscription, 119 | lastTranscription, 120 | targetLanguage, 121 | transcriptionLanguage, 122 | nextTranscriptionSpeaker, 123 | nextTranscription, 124 | lastTranscriptionSpeaker, 125 | actualTranscriptionSpeaker, 126 | transcriptionSummary, 127 | }: { 128 | actualTranscription: string; 129 | lastTranscription: string; 130 | targetLanguage: AllowedLanguages; 131 | transcriptionLanguage: string; 132 | actualTranscriptionSpeaker: string; 133 | nextTranscriptionSpeaker?: string; 134 | nextTranscription?: string; 135 | lastTranscriptionSpeaker?: string; 136 | transcriptionSummary: string; 137 | }) { 138 | const maxAttempts = 3; 139 | let textTranslated = ''; 140 | let attempts = 0; 141 | 142 | do { 143 | textTranslated = await this.getTranslationPromiseFromAI({ 144 | actualTranscription, 145 | lastTranscription, 146 | targetLanguage, 147 | transcriptionLanguage, 148 | nextTranscription: nextTranscription || '', 149 | nextTranscriptionSpeaker: nextTranscriptionSpeaker || '', 150 | lastTranscriptionSpeaker: lastTranscriptionSpeaker || '', 151 | actualTranscriptionSpeaker, 152 | transcriptionSummary, 153 | }); 154 | attempts++; 155 | } while (textTranslated === actualTranscription && attempts < maxAttempts); 156 | 157 | return textTranslated; 158 | } 159 | 160 | static async getTranslationPromiseFromAI({ 161 | actualTranscription, 162 | lastTranscription, 163 | targetLanguage, 164 | transcriptionLanguage, 165 | nextTranscriptionSpeaker, 166 | nextTranscription, 167 | lastTranscriptionSpeaker, 168 | actualTranscriptionSpeaker, 169 | transcriptionSummary, 170 | }: { 171 | actualTranscription: string; 172 | lastTranscription: string; 173 | targetLanguage: AllowedLanguages; 174 | transcriptionLanguage: string; 175 | nextTranscription?: string; 176 | nextTranscriptionSpeaker?: string; 177 | lastTranscriptionSpeaker?: string; 178 | actualTranscriptionSpeaker: string; 179 | transcriptionSummary: string; 180 | }) { 181 | const promptSettings: CreatePromptArguments = { 182 | transcriptionToTranslate:
actualTranscription, 183 | lastTranscription: lastTranscription, 184 | targetLanguage: targetLanguage, 185 | originLanguage: transcriptionLanguage, 186 | mainCategoryVideo: '', 187 | nextTranscription: nextTranscription || '', 188 | nextTranscriptionSpeaker: nextTranscriptionSpeaker || '', 189 | previousTranscriptionSpeaker: lastTranscriptionSpeaker || '', 190 | transcriptionToTranslateSpeaker: actualTranscriptionSpeaker || '', 191 | transcriptionSummary: transcriptionSummary, 192 | }; 193 | 194 | const prompt = PromptBuilder.createPromptToTranslateTranscription(promptSettings); 195 | 196 | return this.translateWithLLM({ 197 | prompt, 198 | instruction: defaultInstructions, 199 | temperature: 0.5, 200 | }); 201 | } 202 | 203 | static async translateWithLLM({ 204 | prompt, 205 | temperature, 206 | instruction, 207 | responseFormat = 'text', 208 | }: { 209 | prompt: string; 210 | temperature: number; 211 | instruction: string; 212 | responseFormat?: 'text' | 'json'; 213 | }) { 214 | let model: OpenAIModel = models.gpt4_1; 215 | 216 | try { 217 | return await requestToGPT({ 218 | prompt, 219 | temperature, 220 | instructions: instruction, 221 | model, 222 | maxTokens: 8192, 223 | responseFormat: responseFormat === 'json' ? 'json_object' : 'text', 224 | }); 225 | } catch (error) { 226 | console.error(error); 227 | throw new Error('Error while translating transcription'); 228 | } 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /src/transcription/transcriber.ts: -------------------------------------------------------------------------------- 1 | import type { GladiaRequestBody, GladiaResponse } from '../types'; 2 | import axios from 'axios'; 3 | import fs from 'fs'; 4 | import FormData from 'form-data'; 5 | import fsPromise from 'fs/promises'; 6 | 7 | const baseUrlGladia = 'https://api.gladia.io/v2/pre-recorded/'; 8 | 9 | interface AudioUploadResponse { 10 | audio_url: string; 11 | audio_metadata: { 12 | id: string; 13 | filename: string; 14 | source: string; 15 | extension: string; 16 | size: number; 17 | audio_duration: number; 18 | number_of_channels: number; 19 | }; 20 | } 21 | 22 | export class Transcriber { 23 | static async transcribeAudio({ 24 | audioPath, 25 | numberOfSpeakers, 26 | }: { 27 | audioPath: string; 28 | numberOfSpeakers: string; 29 | }) { 30 | try { 31 | const speakerNumber = 32 | numberOfSpeakers !== 'auto-detect' && numberOfSpeakers !== undefined 33 | ? 
parseInt(numberOfSpeakers) 34 | : numberOfSpeakers; 35 | 36 | const audioUrl = await this.uploadAudioFile(audioPath); 37 | 38 | const transcription = await this.getGladiaTranscription({ 39 | fileUrl: audioUrl, 40 | numberOfSpeakers: speakerNumber, 41 | }); 42 | 43 | return transcription; 44 | } catch (error) { 45 | if (error instanceof Error) { 46 | throw new Error(error.message); 47 | } else { 48 | throw new Error('Error in transcribeAudio: ' + error); 49 | } 50 | } 51 | } 52 | 53 | static async getGladiaTranscription({ 54 | fileUrl, 55 | numberOfSpeakers, 56 | }: { 57 | fileUrl: string; 58 | numberOfSpeakers: number | 'auto-detect'; 59 | }): Promise { 60 | try { 61 | const requestData: GladiaRequestBody = { 62 | audio_url: fileUrl, 63 | detect_language: true, 64 | diarization: true, 65 | sentences: true, 66 | name_consistency: true, 67 | punctuation_enhanced: true, 68 | summarization: true, 69 | }; 70 | 71 | if (numberOfSpeakers !== 'auto-detect' && numberOfSpeakers !== undefined && numberOfSpeakers !== 0) { 72 | requestData.diarization_config = { 73 | number_of_speakers: numberOfSpeakers || 1, 74 | max_speakers: numberOfSpeakers || 1, 75 | }; 76 | } 77 | 78 | const headers = { 79 | 'x-gladia-key': process.env.GLADIA_API_KEY, 80 | 'Content-Type': 'application/json', 81 | }; 82 | 83 | console.debug('- Sending initial request to Gladia API...'); 84 | const initialResponse: any = await this.makeFetchRequest(baseUrlGladia, { 85 | method: 'POST', 86 | headers, 87 | body: JSON.stringify(requestData), 88 | }); 89 | 90 | if (!initialResponse.id) { 91 | throw new Error('Error with gladia initialization'); 92 | } 93 | 94 | const response = await this.pollForResult(initialResponse.id, headers); 95 | 96 | return response; 97 | } catch (error) { 98 | console.error('Error in Gladia transcription:', error); 99 | throw new Error('Error in Gladia transcription'); 100 | } 101 | } 102 | 103 | static async pollForResult(transcriptionId: string, headers: any): Promise { 104 | const pollUrl = `${baseUrlGladia}${transcriptionId}`; 105 | 106 | while (true) { 107 | const pollResponse: any = await this.makeFetchRequest(pollUrl, { 108 | method: 'GET', 109 | headers, 110 | }); 111 | 112 | if (pollResponse.status === 'done') { 113 | return pollResponse; 114 | } else if (pollResponse.status === 'error') { 115 | throw new Error(`Gladia transcription failed: ${pollResponse.error}`); 116 | } 117 | 118 | await new Promise((resolve) => setTimeout(resolve, 1000)); 119 | } 120 | } 121 | 122 | static async makeFetchRequest(url: string, options: any) { 123 | const response = await fetch(url, options); 124 | if (!response.ok) { 125 | throw new Error(`Gladia API error: ${response.statusText}`); 126 | } 127 | return response.json(); 128 | } 129 | 130 | static async uploadAudioFile(filePath: string): Promise { 131 | const apiKey = process.env.GLADIA_API_KEY; 132 | if (!apiKey) { 133 | throw new Error('Missing GLADIA_API_KEY environment variable.'); 134 | } 135 | 136 | try { 137 | console.debug('Uploading audio file to Gladia API...'); 138 | 139 | const form = new FormData(); 140 | const fileStream = fs.createReadStream(filePath); 141 | const filename = filePath.split('/').pop() || 'audio.mp3'; 142 | 143 | form.append('audio', fileStream, filename); 144 | 145 | const response = await axios.post('https://api.gladia.io/v2/upload', form, { 146 | headers: { 147 | 'x-gladia-key': apiKey, 148 | ...form.getHeaders(), 149 | }, 150 | }); 151 | 152 | const data = response.data as AudioUploadResponse; 153 | 154 | if (!data.audio_url) { 155 | 
console.error('Error uploading audio file to Gladia API: ', data); 156 | throw new Error('Error uploading audio file to Gladia API'); 157 | } 158 | 159 | console.debug('File uploaded to Gladia API'); 160 | 161 | return data.audio_url; 162 | } catch (error: any) { 163 | console.error('Error uploading audio file:', error.response?.data || error.message); 164 | throw new Error(`Upload failed: ${error.message}`); 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/types/index.d.ts: -------------------------------------------------------------------------------- 1 | export interface TranscriptionDataTypes { 2 | summary: SegmentDetailOutWithDuration | null; 3 | formattedSegments: string[]; 4 | detectedAudioLanguage: AudioOriginalLangAllowed | null; 5 | } 6 | 7 | export interface GladiaResponse { 8 | id: string; 9 | request_id: string; 10 | kind: string; 11 | status: string; 12 | created_at: string; 13 | completed_at: string; 14 | file: GladiaFile; 15 | request_params: RequestParams; 16 | result: Result; 17 | //Custom, not natively from Gladia 18 | original_audio_path: string; 19 | error_code?: string; 20 | } 21 | 22 | export interface Metadata { 23 | audio_duration: number; 24 | number_of_distinct_channels: number; 25 | billing_time: number; 26 | transcription_time: number; 27 | } 28 | 29 | export interface Word { 30 | word: string; 31 | start: number; 32 | end: number; 33 | confidence: number; 34 | } 35 | 36 | export interface Utterance { 37 | text: string; 38 | language: string; 39 | start: number; 40 | end: number; 41 | confidence: number; 42 | channel: number; 43 | speaker: number; 44 | words: Word[]; 45 | } 46 | 47 | export interface Sentence { 48 | sentence: string; 49 | language: string; 50 | start: number; 51 | end: number; 52 | confidence: number; 53 | channel: number; 54 | speaker: number; 55 | words: Word[]; 56 | } 57 | 58 | export interface SegmentDetail { 59 | transcription: string; 60 | language: string; 61 | begin: number; 62 | end: number; 63 | speaker: number; 64 | channel: number; 65 | confidence: number; 66 | wordsWithSilence: string; 67 | } 68 | 69 | export interface SegmentWitDurationAndOriginalSegment extends SegmentDetail { 70 | duration: number; 71 | index: number; 72 | originalTranscription: string; 73 | } 74 | 75 | export interface SegmentDetailOut extends SegmentDetail { 76 | index: number; 77 | } 78 | 79 | export interface SegmentDetailOutWithDuration extends SegmentDetailOut { 80 | duration: number; 81 | } 82 | 83 | export interface Result { 84 | metadata: Metadata; 85 | summarization: { 86 | success: boolean; 87 | is_empty: boolean; 88 | results: string; 89 | exec_time: number; 90 | error: string | null; 91 | }; 92 | transcription: Transcription; 93 | } 94 | 95 | export interface Transcription { 96 | languages: string[]; 97 | full_transcript: string; 98 | utterances: Utterance[]; 99 | sentences: Sentence[]; 100 | } 101 | 102 | export interface CreatePromptArguments { 103 | transcriptionToTranslate: string; 104 | lastTranscription: string; 105 | targetLanguage: string; 106 | originLanguage: string; 107 | mainCategoryVideo: string; 108 | nextTranscription?: string; 109 | transcriptionToTranslateSpeaker: string; 110 | previousTranscriptionSpeaker?: string; 111 | nextTranscriptionSpeaker?: string; 112 | videoTitle?: string; 113 | transcriptionSummary?: string; 114 | } 115 | 116 | export interface GladiaRequestBody { 117 | /** Context to feed the transcription model with for possible better performance */ 
118 | context_prompt?: string; 119 | 120 | /** Enable diarization enhanced for this audio */ 121 | diarization_enhanced?: boolean; 122 | 123 | /** Specific vocabulary list to feed the transcription model with */ 124 | custom_vocabulary?: string[]; 125 | 126 | /** Detect the language from the given audio */ 127 | detect_language?: boolean; 128 | 129 | /** Detect multiple languages in the given audio */ 130 | enable_code_switching?: boolean; 131 | 132 | /** Specify the configuration for code switching */ 133 | code_switching_config?: { 134 | // Specific details are not provided 135 | }; 136 | 137 | /** Set the spoken language for the given audio (ISO 639 standard) */ 138 | language?: keyof typeof languageCodes; 139 | 140 | /** Enable punctuation enhanced for this audio */ 141 | punctuation_enhanced?: boolean; 142 | 143 | /** Callback URL we will do a POST request to with the result of the transcription */ 144 | callback_url?: string; 145 | 146 | /** Enable subtitles generation for this transcription */ 147 | subtitles?: boolean; 148 | 149 | /** Configuration for subtitles generation if subtitles is enabled */ 150 | subtitles_config?: { 151 | // Specific details are not provided 152 | }; 153 | 154 | /** Enable speaker recognition (diarization) for this audio */ 155 | diarization?: boolean; 156 | 157 | /** Speaker recognition configuration, if diarization is enabled */ 158 | diarization_config?: { 159 | // Specific details are not provided 160 | }; 161 | 162 | /** Enable translation for this audio */ 163 | translation?: boolean; 164 | 165 | /** Translation configuration, if translation is enabled */ 166 | translation_config?: { 167 | // Specific details are not provided 168 | }; 169 | 170 | /** Enable summarization for this audio */ 171 | summarization?: boolean; 172 | 173 | /** Summarization configuration, if summarization is enabled */ 174 | summarization_config?: { 175 | // Specific details are not provided 176 | }; 177 | 178 | /** Enable moderation for this audio */ 179 | moderation?: boolean; 180 | 181 | /** Enable named entity recognition for this audio */ 182 | named_entity_recognition?: boolean; 183 | 184 | /** Enable chapterization for this audio */ 185 | chapterization?: boolean; 186 | 187 | /** Enable names consistency for this audio */ 188 | name_consistency?: boolean; 189 | 190 | /** Enable custom spelling for this audio */ 191 | custom_spelling?: boolean; 192 | 193 | /** Custom spelling configuration, if custom_spelling is enabled */ 194 | custom_spelling_config?: { 195 | // Specific details are not provided 196 | }; 197 | 198 | /** Enable structured data extraction for this audio */ 199 | structured_data_extraction?: boolean; 200 | 201 | /** Structured data extraction configuration, if structured_data_extraction is enabled */ 202 | structured_data_extraction_config?: { 203 | // Specific details are not provided 204 | }; 205 | 206 | /** Enable sentiment analysis for this audio */ 207 | sentiment_analysis?: boolean; 208 | 209 | /** Enable audio to llm processing for this audio */ 210 | audio_to_llm?: boolean; 211 | 212 | /** Audio to llm configuration, if audio_to_llm is enabled */ 213 | audio_to_llm_config?: { 214 | // Specific details are not provided 215 | }; 216 | 217 | /** Custom metadata you can attach to this transcription */ 218 | custom_metadata?: Record<string, unknown>; 219 | 220 | /** Enable sentences for this audio */ 221 | sentences?: boolean; 222 | 223 | /** Allows to change the output display_mode for
this audio. The output will be reordered, creating new utterances when speakers overlapped */ 224 | display_mode?: boolean; 225 | 226 | /** URL to a Gladia file or to an external audio or video file */ 227 | audio_url: string; 228 | } 229 | 230 | export type AllowedLanguages = 231 | | 'swedish' 232 | | 'korean' 233 | | 'ukrainian' 234 | | 'greek' 235 | | 'japanese' 236 | | 'english' 237 | | 'american english' 238 | | 'russian' 239 | | 'hindi' 240 | | 'german' 241 | | 'danish' 242 | | 'bulgarian' 243 | | 'czech' 244 | | 'polish' 245 | | 'slovak' 246 | | 'finnish' 247 | | 'spanish' 248 | | 'croatian' 249 | | 'dutch' 250 | | 'portuguese' 251 | | 'french' 252 | | 'malay' 253 | | 'italian' 254 | | 'romanian' 255 | | 'mandarin' 256 | | 'tamil' 257 | | 'turkish' 258 | | 'indonesian' 259 | | 'tagalog' 260 | | 'arabic' 261 | | 'estonian' 262 | | 'norwegian' 263 | | 'vietnamese' 264 | | 'hungarian' 265 | | 'british english' 266 | | 'french canadian'; 267 | 268 | export type AudioOriginalLangAllowed = 269 | | 'af' 270 | | 'sq' 271 | | 'am' 272 | | 'ar' 273 | | 'hy' 274 | | 'as' 275 | | 'ast' 276 | | 'az' 277 | | 'ba' 278 | | 'eu' 279 | | 'be' 280 | | 'bn' 281 | | 'bs' 282 | | 'br' 283 | | 'bg' 284 | | 'my' 285 | | 'ca' 286 | | 'ceb' 287 | | 'zh' 288 | | 'hr' 289 | | 'cs' 290 | | 'da' 291 | | 'nl' 292 | | 'en' 293 | | 'et' 294 | | 'at' 295 | | 'fo' 296 | | 'fi' 297 | | 'fr' 298 | | 'fy' 299 | | 'ff' 300 | | 'gd' 301 | | 'gl' 302 | | 'lg' 303 | | 'ka' 304 | | 'de' 305 | | 'el' 306 | | 'gu' 307 | | 'ht' 308 | | 'ha' 309 | | 'haw' 310 | | 'he' 311 | | 'hi' 312 | | 'hu' 313 | | 'is' 314 | | 'ig' 315 | | 'ilo' 316 | | 'id' 317 | | 'ga' 318 | | 'it' 319 | | 'ja' 320 | | 'jv' 321 | | 'kn' 322 | | 'kk' 323 | | 'km' 324 | | 'ko' 325 | | 'lo' 326 | | 'la' 327 | | 'lv' 328 | | 'lb' 329 | | 'ln' 330 | | 'lt' 331 | | 'mk' 332 | | 'mg' 333 | | 'ms' 334 | | 'ml' 335 | | 'mt' 336 | | 'mi' 337 | | 'mr' 338 | | 'mo' 339 | | 'mn' 340 | | 'ne' 341 | | 'no' 342 | | 'nn' 343 | | 'oc' 344 | | 'or' 345 | | 'pa' 346 | | 'ps' 347 | | 'fa' 348 | | 'pl' 349 | | 'pt' 350 | | 'ro' 351 | | 'ru' 352 | | 'sa' 353 | | 'sr' 354 | | 'sn' 355 | | 'sd' 356 | | 'si' 357 | | 'sk' 358 | | 'sl' 359 | | 'so' 360 | | 'es' 361 | | 'su' 362 | | 'sw' 363 | | 'ss' 364 | | 'sv' 365 | | 'tl' 366 | | 'tg' 367 | | 'ta' 368 | | 'tt' 369 | | 'te' 370 | | 'th' 371 | | 'bo' 372 | | 'tn' 373 | | 'tr' 374 | | 'tk' 375 | | 'uk' 376 | | 'ur' 377 | | 'uz' 378 | | 'vi' 379 | | 'cy' 380 | | 'wo' 381 | | 'xh' 382 | | 'yi' 383 | | 'yo' 384 | | 'zu'; 385 | -------------------------------------------------------------------------------- /src/types/lipsync.d.ts: -------------------------------------------------------------------------------- 1 | interface LipSyncResult { 2 | id: string; 3 | createdAt: string; 4 | status: StatusSyncLab; 5 | model: string; 6 | input: string; 7 | webhookUrl: string; 8 | options: { 9 | output_format: string; 10 | }; 11 | outputUrl: string; 12 | error: null | string; 13 | } 14 | 15 | export interface LipSyncResponse { 16 | id: string; 17 | createdAt: string; 18 | status: StatusSyncLab; 19 | model: string; 20 | input: string; 21 | webhookUrl: string; 22 | options: { 23 | output_format: string; 24 | }; 25 | outputUrl: string; 26 | error: null | string; 27 | } 28 | 29 | export interface SyncLabInitialResponse { 30 | id: string; 31 | createdAt: string; 32 | status: 'PENDING'; 33 | videoUrl: string | null; 34 | originalVideoUrl: string; 35 | originalAudioUrl: string; 36 | synergize: boolean; 37 | creditsDeducted: number | null; 38 | webhookUrl: 
string; 39 | errorMessage: string | null; 40 | message: string; 41 | } 42 | 43 | export interface SynclabInput { 44 | type: 'video' | 'audio'; 45 | url: string; 46 | segments_secs?: number[][]; 47 | segments_frames?: number[][]; 48 | } 49 | 50 | export interface SynclabOptions { 51 | output_format: 'mp4'; 52 | active_speaker?: boolean; 53 | } 54 | 55 | export interface SynclabV2RequestBody { 56 | model: string; 57 | input: SynclabInput[]; 58 | options: SynclabOptions; 59 | webhookUrl?: string; 60 | } 61 | 62 | export interface SynclabRequestBody { 63 | audioUrl: string; 64 | videoUrl: string; 65 | model: string; 66 | webhookUrl?: string; 67 | synergize?: boolean; 68 | pads?: number; 69 | maxCredits?: number; 70 | } 71 | 72 | export type StatusSyncLab = 'PENDING' | 'PROCESSING' | 'COMPLETED' | 'FAILED' | 'REJECTED' | 'CANCELED'; 73 | -------------------------------------------------------------------------------- /src/types/speech.d.ts: -------------------------------------------------------------------------------- 1 | export interface SpeechResponseWithIndex { 2 | speech: Response | Buffer; 3 | index: number; 4 | speaker: number; 5 | requestId: string; 6 | } 7 | 8 | export interface SpeechResponseWithDuration { 9 | speech: Buffer; 10 | duration: number; 11 | speechIndex: number; 12 | speaker: number; 13 | requestId: string; 14 | } 15 | 16 | export interface SpeechAdjusted { 17 | speech: Buffer | undefined; 18 | transcriptionDuration: number; 19 | end: number; 20 | begin: number; 21 | speaker: number; 22 | speechDuration: number; 23 | } 24 | 25 | export interface CreateLongerSpeechArguments { 26 | translatedTranscription: string; 27 | speechIndex: number; 28 | speakerIndex: number; 29 | targetLanguage: AllowedLanguages; 30 | originalLanguage: string; 31 | transcriptionWords: string; 32 | previousText: string; 33 | nextText: string; 34 | originalSegmentDuration: number; 35 | translatedSpeechDuration: number; 36 | difference: string; 37 | speedFactor: number; 38 | transcriptionSummary: string; 39 | clonedVoiceId: string; 40 | } 41 | 42 | export interface CreateShorterSpeechArguments { 43 | translatedTranscription: string; 44 | originalTranscription: string; 45 | speechIndex: number; 46 | speakerIndex: number; 47 | targetLanguage: AllowedLanguages; 48 | previousText: string; 49 | nextText: string; 50 | transcriptionDuration: number; 51 | translatedSpeechDuration: number; 52 | difference: string; 53 | transcriptionSummary: string; 54 | clonedVoiceId: string; 55 | } 56 | -------------------------------------------------------------------------------- /src/types/spleeter.d.ts: -------------------------------------------------------------------------------- 1 | interface LalalAPIResponse { 2 | status: 'success' | 'error'; 3 | result: Result; 4 | } 5 | 6 | interface Result { 7 | [key: string]: SplitDetail; 8 | archive: SplitDetail; 9 | batch: SplitDetail; 10 | } 11 | 12 | interface SplitDetail { 13 | status: 'success' | 'error'; 14 | name?: string; 15 | size?: number; 16 | duration?: number; 17 | stem?: string; 18 | splitter?: 'orion' | 'phoenix'; 19 | preview?: Preview | null; 20 | split?: any; 21 | player?: Player | null; 22 | task?: TaskDetail; 23 | error?: string; 24 | } 25 | 26 | interface Preview { 27 | duration: number; 28 | stem_track: string; 29 | stem_track_size: number; 30 | back_track: string; 31 | back_track_size: number; 32 | } 33 | 34 | interface Player { 35 | stem_track: string; 36 | stem_track_size: number; 37 | back_track: string; 38 | back_track_size: number; 39 | } 40 | 41 | 
interface TaskDetail { 42 | id: string[]; 43 | state: 'success' | 'error' | 'progress' | 'cancelled'; 44 | progress?: number; 45 | split_id?: string; 46 | error?: string; 47 | } 48 | 49 | interface ApiUploadResponse { 50 | status: 'success' | 'error'; 51 | id?: string; 52 | size?: number; 53 | duration?: number; 54 | expires?: number; 55 | error?: string; 56 | } 57 | -------------------------------------------------------------------------------- /src/utils/config.ts: -------------------------------------------------------------------------------- 1 | import type { AllowedLanguages } from '../types'; 2 | 3 | export const threshold = 0.7; // 0.8 seconds 4 | export const maxCharactersPerSegment = 350; 5 | export const maxCharactersPerSegmentForNonLatinScriptLanguages = 175; 6 | export const maxSimultaneousFetchElevenLabs = 1; 7 | export const maxSimultaneousFetchOpenAI = process.env.NODE_ENV === 'production' ? 4 : 10; 8 | export const silenceBetweenSegmentConsideredAsPause = 0.5; 9 | 10 | export const specialLanguagesWithSpecialCharacters: AllowedLanguages[] = ['mandarin', 'japanese', 'korean']; 11 | -------------------------------------------------------------------------------- /src/utils/constants.ts: -------------------------------------------------------------------------------- 1 | import type { AudioOriginalLangAllowed } from '../types/index'; 2 | 3 | export const audioExtensions = ['.mp3', '.wav', '.ogg', '.aac', '.flac', '.m4a', '.wma']; 4 | 5 | export const videoExtensions = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.wmv', '.flv', '.m4v']; 6 | 7 | export const allowedExtensions = [...audioExtensions, ...videoExtensions]; 8 | 9 | type LanguageObject = { 10 | [K in AudioOriginalLangAllowed]?: string; 11 | }; 12 | 13 | export const languageCodes: LanguageObject = { 14 | af: 'Afrikaans', 15 | sq: 'Albanian', 16 | am: 'Amharic', 17 | ar: 'Arabic', 18 | hy: 'Armenian', 19 | as: 'Assamese', 20 | ast: 'Asturian', 21 | az: 'Azerbaijani', 22 | ba: 'Bashkir', 23 | eu: 'Basque', 24 | be: 'Belarusian', 25 | bn: 'Bengali', 26 | bs: 'Bosnian', 27 | br: 'Breton', 28 | bg: 'Bulgarian', 29 | my: 'Burmese', 30 | ca: 'Catalan', 31 | ceb: 'Cebuano', 32 | zh: 'Mandarin', 33 | hr: 'Croatian', 34 | cs: 'Czech', 35 | da: 'Danish', 36 | nl: 'Dutch', 37 | en: 'English', 38 | at: 'estonian', 39 | et: 'estonian', 40 | fo: 'Faroese', 41 | fi: 'Finnish', 42 | fr: 'French', 43 | fy: 'Western Frisian', 44 | ff: 'Fulah', 45 | gd: 'Gaelic', 46 | gl: 'Galician', 47 | lg: 'Ganda', 48 | ka: 'Georgian', 49 | de: 'German', 50 | el: 'Greek', 51 | gu: 'Gujarati', 52 | ht: 'Haitian Creole', 53 | ha: 'Hausa', 54 | haw: 'Hawaiian', 55 | he: 'Hebrew', 56 | hi: 'Hindi', 57 | hu: 'Hungarian', 58 | is: 'Icelandic', 59 | ig: 'Igbo', 60 | ilo: 'Iloko', 61 | id: 'Indonesian', 62 | ga: 'Irish', 63 | it: 'Italian', 64 | ja: 'Japanese', 65 | jv: 'Javanese', 66 | kn: 'Kannada', 67 | kk: 'Kazakh', 68 | km: 'Khmer', 69 | ko: 'Korean', 70 | lo: 'Lao', 71 | la: 'Latin', 72 | lv: 'Latvian', 73 | lb: 'Luxembourgish', 74 | ln: 'Lingala', 75 | lt: 'Lithuanian', 76 | mk: 'Macedonian', 77 | mg: 'Malagasy', 78 | ms: 'Malay', 79 | ml: 'Malayalam', 80 | mt: 'Maltese', 81 | mi: 'Maori', 82 | mr: 'Marathi', 83 | mo: 'Moldovan', 84 | mn: 'Mongolian', 85 | ne: 'Nepali', 86 | no: 'Norwegian', 87 | nn: 'Nynorsk', 88 | oc: 'Occitan', 89 | or: 'Oriya', 90 | pa: 'Punjabi', 91 | ps: 'Pashto', 92 | fa: 'Persian', 93 | pl: 'Polish', 94 | pt: 'Portuguese', 95 | ro: 'Romanian', 96 | ru: 'Russian', 97 | sa: 'Sanskrit', 98 | sr: 'Serbian', 99 | sn: 'Shona', 100 | 
--------------------------------------------------------------------------------
/src/utils/constants.ts:
--------------------------------------------------------------------------------
import type { AudioOriginalLangAllowed } from '../types/index';

export const audioExtensions = ['.mp3', '.wav', '.ogg', '.aac', '.flac', '.m4a', '.wma'];

export const videoExtensions = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.wmv', '.flv', '.m4v'];

export const allowedExtensions = [...audioExtensions, ...videoExtensions];

type LanguageObject = {
  [K in AudioOriginalLangAllowed]?: string;
};

export const languageCodes: LanguageObject = {
  af: 'Afrikaans',
  sq: 'Albanian',
  am: 'Amharic',
  ar: 'Arabic',
  hy: 'Armenian',
  as: 'Assamese',
  ast: 'Asturian',
  az: 'Azerbaijani',
  ba: 'Bashkir',
  eu: 'Basque',
  be: 'Belarusian',
  bn: 'Bengali',
  bs: 'Bosnian',
  br: 'Breton',
  bg: 'Bulgarian',
  my: 'Burmese',
  ca: 'Catalan',
  ceb: 'Cebuano',
  zh: 'Mandarin',
  hr: 'Croatian',
  cs: 'Czech',
  da: 'Danish',
  nl: 'Dutch',
  en: 'English',
  at: 'Estonian', // note: 'at' is not an ISO 639-1 language code; 'et' below is the standard code for Estonian
  et: 'Estonian',
  fo: 'Faroese',
  fi: 'Finnish',
  fr: 'French',
  fy: 'Western Frisian',
  ff: 'Fulah',
  gd: 'Gaelic',
  gl: 'Galician',
  lg: 'Ganda',
  ka: 'Georgian',
  de: 'German',
  el: 'Greek',
  gu: 'Gujarati',
  ht: 'Haitian Creole',
  ha: 'Hausa',
  haw: 'Hawaiian',
  he: 'Hebrew',
  hi: 'Hindi',
  hu: 'Hungarian',
  is: 'Icelandic',
  ig: 'Igbo',
  ilo: 'Iloko',
  id: 'Indonesian',
  ga: 'Irish',
  it: 'Italian',
  ja: 'Japanese',
  jv: 'Javanese',
  kn: 'Kannada',
  kk: 'Kazakh',
  km: 'Khmer',
  ko: 'Korean',
  lo: 'Lao',
  la: 'Latin',
  lv: 'Latvian',
  lb: 'Luxembourgish',
  ln: 'Lingala',
  lt: 'Lithuanian',
  mk: 'Macedonian',
  mg: 'Malagasy',
  ms: 'Malay',
  ml: 'Malayalam',
  mt: 'Maltese',
  mi: 'Maori',
  mr: 'Marathi',
  mo: 'Moldovan',
  mn: 'Mongolian',
  ne: 'Nepali',
  no: 'Norwegian',
  nn: 'Nynorsk',
  oc: 'Occitan',
  or: 'Oriya',
  pa: 'Punjabi',
  ps: 'Pashto',
  fa: 'Persian',
  pl: 'Polish',
  pt: 'Portuguese',
  ro: 'Romanian',
  ru: 'Russian',
  sa: 'Sanskrit',
  sr: 'Serbian',
  sn: 'Shona',
  sd: 'Sindhi',
  si: 'Sinhala',
  sk: 'Slovak',
  sl: 'Slovenian',
  so: 'Somali',
  es: 'Spanish',
  su: 'Sundanese',
  sw: 'Swahili',
  ss: 'Swati',
  sv: 'Swedish',
  tl: 'Tagalog',
  tg: 'Tajik',
  ta: 'Tamil',
  tt: 'Tatar',
  te: 'Telugu',
  th: 'Thai',
  bo: 'Tibetan',
  tn: 'Tswana',
  tr: 'Turkish',
  tk: 'Turkmen',
  uk: 'Ukrainian',
  ur: 'Urdu',
  uz: 'Uzbek',
  vi: 'Vietnamese',
  cy: 'Welsh',
  wo: 'Wolof',
  xh: 'Xhosa',
  yi: 'Yiddish',
  yo: 'Yoruba',
  zu: 'Zulu',
};

export const nonLatinScriptLanguages: string[] = [
  'ar', // Arabic
  'am', // Amharic
  'as', // Assamese
  'bn', // Bengali
  'my', // Burmese
  'zh', // Mandarin
  'gu', // Gujarati
  'he', // Hebrew
  'hi', // Hindi
  'ja', // Japanese
  'kn', // Kannada
  'kk', // Kazakh
  'km', // Khmer
  'ko', // Korean
  'lo', // Lao
  'ml', // Malayalam
  'mr', // Marathi
  'mn', // Mongolian
  'ne', // Nepali
  'or', // Oriya
  'pa', // Punjabi
  'ps', // Pashto
  'fa', // Persian
  'sa', // Sanskrit
  'sd', // Sindhi
  'si', // Sinhala
  'ta', // Tamil
  'te', // Telugu
  'th', // Thai
  'bo', // Tibetan
  'ur', // Urdu
  'yi', // Yiddish
];

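A hedged example of how these tables might be combined with the limits from src/utils/config.ts: look up the display name for a detected language code and pick the per-segment character budget depending on whether the language uses a non-Latin script. segmentLimitFor is an illustrative helper, not part of this repository, and the real call sites are not shown in this part of the dump.

// Illustrative only (not part of the repository).
import { languageCodes, nonLatinScriptLanguages } from './constants';
import { maxCharactersPerSegment, maxCharactersPerSegmentForNonLatinScriptLanguages } from './config';

function segmentLimitFor(langCode: string): { name: string | undefined; limit: number } {
  const name = languageCodes[langCode as keyof typeof languageCodes];
  // Non-Latin scripts get the smaller character budget defined in config.ts.
  const limit = nonLatinScriptLanguages.includes(langCode)
    ? maxCharactersPerSegmentForNonLatinScriptLanguages
    : maxCharactersPerSegment;
  return { name, limit };
}

// segmentLimitFor('ja') -> { name: 'Japanese', limit: 175 }
// segmentLimitFor('fr') -> { name: 'French', limit: 350 }
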
--------------------------------------------------------------------------------
/src/utils/helpers.ts:
--------------------------------------------------------------------------------
import * as fs from 'fs';
import * as path from 'path';
import { allowedExtensions, audioExtensions, videoExtensions } from './constants';
import type { SegmentWitDurationAndOriginalSegment } from '../types';
import { VideoUtils } from '../ffmpeg/video-utils';
import fsPromises from 'fs/promises';

export class Helpers {
  static async verifyPrerequisitesForDubbing() {
    console.debug('Verifying prerequisites for dubbing...');
    const inputDir = path.join(process.cwd(), 'input');
    let foundInputFile = false;

    try {
      const files = await fs.promises.readdir(inputDir);
      for (const file of files) {
        const ext = path.extname(file).toLowerCase();
        if (allowedExtensions.includes(ext)) {
          foundInputFile = true;
          break;
        }
      }
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        throw new Error("Input directory 'input' not found at the project root.");
      }
      throw new Error(`Error reading input directory: ${error.message}`);
    }

    if (!foundInputFile) {
      throw new Error(
        `No valid video or audio file found in the 'input' directory. Allowed extensions: ${allowedExtensions.join(', ')}`,
      );
    }

    const numberOfSpeakers = process.env.NUM_SPEAKERS;
    const applyLipsync = process.env.APPLY_LIPSYNC;
    const targetLanguage = process.env.TARGET_LANGUAGE;
    const syncLabApiKey = process.env.SYNC_LAB_API_KEY;

    if (!numberOfSpeakers) {
      throw new Error('Environment variable NUM_SPEAKERS is missing.');
    }

    if (applyLipsync !== 'yes' && applyLipsync !== 'no') {
      throw new Error("Environment variable APPLY_LIPSYNC must be either 'yes' or 'no'.");
    }

    if (!targetLanguage) {
      throw new Error('Environment variable TARGET_LANGUAGE is missing.');
    }

    if (applyLipsync === 'yes' && !syncLabApiKey) {
      throw new Error("Environment variable SYNC_LAB_API_KEY is required when APPLY_LIPSYNC is 'yes'.");
    }

    console.debug('Prerequisites verified successfully.');
  }

  static async getInputFilePath(): Promise<string> {
    const inputDir = path.join(process.cwd(), 'input');

    try {
      const files = await fs.promises.readdir(inputDir);

      for (const file of files) {
        const ext = path.extname(file).toLowerCase();
        if (allowedExtensions.includes(ext)) {
          return path.join(inputDir, file);
        }
      }

      throw new Error(
        `No valid media file found in the input directory. Allowed extensions: ${allowedExtensions.join(', ')}`,
      );
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        throw new Error("Input directory 'input' not found at the project root.");
      }
      throw error;
    }
  }

  // NOTE: despite its name, this currently mirrors getInputFilePath and returns
  // only the first matching file found in the input directory.
  static async getAllInputFilePaths(): Promise<string> {
    console.debug('Getting all input file paths...');
    const inputDir = path.join(process.cwd(), 'input');

    try {
      const files = await fs.promises.readdir(inputDir);

      for (const file of files) {
        const ext = path.extname(file).toLowerCase();
        if (allowedExtensions.includes(ext)) {
          return path.join(inputDir, file);
        }
      }

      throw new Error(
        `No valid media file found in the input directory. Allowed extensions: ${allowedExtensions.join(', ')}`,
      );
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        throw new Error("Input directory 'input' not found at the project root.");
      }
      throw error;
    }
  }

  static getFileType(filePath: string): 'audio' | 'video' | null {
    const ext = path.extname(filePath).toLowerCase();

    if (audioExtensions.includes(ext)) {
      return 'audio';
    } else if (videoExtensions.includes(ext)) {
      return 'video';
    } else {
      throw new Error(`Unsupported file type: ${ext}`);
    }
  }

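  /**
   * Parses and validates transcription details before they are used downstream.
   * Accepts either a JSON string or an already-parsed array of segments:
   * - strips the `wordsWithSilence` field before validating the remaining values,
   * - defaults a missing `channel` to 0,
   * - throws if any remaining segment value is empty, null or undefined.
   */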
  static parseAndVerifyTranscriptionDetails(
    transcriptionDetails: string,
  ): SegmentWitDurationAndOriginalSegment[] {
    try {
      let parsedTranscriptions =
        typeof transcriptionDetails === 'string'
          ? (JSON.parse(transcriptionDetails) as SegmentWitDurationAndOriginalSegment[])
          : (transcriptionDetails as SegmentWitDurationAndOriginalSegment[]);

      parsedTranscriptions = parsedTranscriptions.map((partTranscription) => {
        // eslint-disable-next-line @typescript-eslint/no-unused-vars
        const { wordsWithSilence, ...rest } = partTranscription;
        const segment = rest;
        if (!partTranscription.channel) {
          partTranscription.channel = 0;
        }

        const isEveryValueCorrect = Object.values(segment).every(
          (value) => value !== '' && value !== null && value !== undefined,
        );

        if (!isEveryValueCorrect) {
          throw new Error('Invalid transcription details, one or more values are incorrect or empty');
        }

        return partTranscription;
      });

      console.debug('Transcription details parsed.');
      return parsedTranscriptions;
    } catch (err: any) {
      console.error(err);
      throw new Error('Error while parsing transcription: ' + err);
    }
  }

  // Returns the media duration rounded to the nearest minute.
  static async getVideoLength(filePath: string) {
    if (!filePath) throw new Error('File path is required');

    const duration = await VideoUtils.getFileDuration(filePath);
    if (typeof duration !== 'number')
      throw new Error(
        `Error during audio duration calculation in translation service: duration is not a number: ${duration}`,
      );

    return Math.round(duration / 60);
  }

  static async splitAudioIntoBuffers(filePath: string): Promise<Buffer[]> {
    try {
      console.debug('Splitting audio into buffers...');
      const fileSizeLimit = 10 * 1024 * 1024; // 10 MB in bytes
      const fileBuffer = await fsPromises.readFile(filePath);
      const buffers: Buffer[] = [];

      for (let start = 0; start < fileBuffer.length; start += fileSizeLimit) {
        const end = Math.min(start + fileSizeLimit, fileBuffer.length);
        buffers.push(fileBuffer.slice(start, end));
      }

      console.debug('Audio split into buffers.');
      return buffers;
    } catch (error) {
      console.error('Error while reading or splitting the file:', error);
      throw error;
    }
  }
}

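For orientation, a hedged sketch of how these helpers could be wired together at the start of a run. The actual orchestration lives in src/core/index.ts, which is not shown in this part of the dump; prepareDubbingRun is an illustrative name and the import path depends on where the caller lives.

// Illustrative only (not taken from src/core/index.ts).
import { Helpers } from '../utils/helpers';

async function prepareDubbingRun() {
  // Fails fast if the input folder or required environment variables are missing.
  await Helpers.verifyPrerequisitesForDubbing();

  // Resolve the media file to process and branch on its type.
  const inputPath = await Helpers.getInputFilePath();
  const mediaType = Helpers.getFileType(inputPath); // 'audio' or 'video' (throws on unsupported extensions)

  console.debug(`Processing ${mediaType} file at ${inputPath}`);
  return { inputPath, mediaType };
}
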
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
#!/bin/bash

BLUE='\033[0;34m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
BOLD='\033[1m'

# --- Pre-run Checks ---
if [ ! -f ".env" ]; then
  echo -e "${BOLD}${YELLOW}Warning: .env file not found in the project root.${NC}"
  echo -e "${BOLD}Please create a .env file with the required environment variables before running this script.${NC}"
  exit 1
fi
echo -e "${GREEN}.env file check passed.${NC}"

if ! command -v node &> /dev/null; then
  echo -e "${BOLD}Error: Node.js is not installed. Please install it to continue.${NC}"
  exit 1
fi
echo -e "${GREEN}Node.js check passed.${NC}"

if ! command -v bun &> /dev/null; then
  echo -e "${BOLD}Error: Bun is not installed. Please install it to continue.${NC}"
  exit 1
fi
echo -e "${GREEN}Bun check passed.${NC}"

if ! command -v ffmpeg &> /dev/null; then
  echo -e "${BOLD}Error: FFmpeg is not installed. Please install it to continue.${NC}"
  echo -e "${BOLD}See installation instructions at: https://ffmpeg.org/download.html${NC}"
  exit 1
fi
echo -e "${GREEN}FFmpeg check passed.${NC}"

if [ ! -d "node_modules" ]; then
  echo -e "${YELLOW}Dependencies not found. Installing...${NC}"
  bun install
  if [ $? -ne 0 ]; then
    echo -e "${BOLD}Error: Failed to install dependencies with bun.${NC}"
    exit 1
  fi
  echo -e "${GREEN}Dependencies installed successfully.${NC}"
else
  echo -e "${GREEN}Dependencies check passed (node_modules found).${NC}"
fi

echo -e "\n${GREEN}All checks passed. Proceeding with script...${NC}\n"

# --- Script Start ---

clear
echo -e "${BOLD}╔════════════════════════════════════════╗${NC}"
echo -e "${BOLD}║ ${BLUE}Choose the target language ${NC} ${BOLD}║${NC}"
echo -e "${BOLD}╚════════════════════════════════════════╝${NC}"
echo ""

languages=(
  "swedish"
  "korean"
  "ukrainian"
  "greek"
  "japanese"
  "english"
  "american english"
  "russian"
  "hindi"
  "german"
  "danish"
  "bulgarian"
  "czech"
  "polish"
  "slovak"
  "finnish"
  "spanish"
  "croatian"
  "dutch"
  "portuguese"
  "french"
  "malay"
  "italian"
  "romanian"
  "mandarin"
  "tamil"
  "turkish"
  "indonesian"
  "tagalog"
  "arabic"
  "norwegian"
  "vietnamese"
  "hungarian"
  "british english"
  "french canadian"
)

echo -e "${BOLD}Available languages:${NC}\n"

COLUMNS=3
count=${#languages[@]}
rows=$(( (count + COLUMNS - 1) / COLUMNS ))

for (( i=0; i