├── test ├── output │ ├── .gitkeep │ ├── opus │ │ ├── .gitkeep │ │ └── metrics.json │ ├── speed │ │ ├── .gitkeep │ │ └── metrics.json │ ├── baseline │ │ └── metrics.json │ └── comparison-report.json ├── .gitignore ├── package.json ├── test-baseline.ts ├── README.md ├── test-speed.ts ├── test-opus.ts └── compare.ts ├── .gitignore ├── .npmignore ├── tsconfig.json ├── src ├── types.ts ├── index.ts ├── youtube.ts ├── optimize.ts ├── cli.ts └── transcribe.ts ├── LICENSE ├── package.json ├── .cursor └── rules │ ├── cleanup-pattern.mdc │ ├── cli-patterns.mdc │ ├── youtube-support.mdc │ ├── publishing.mdc │ ├── optimization.mdc │ ├── whisper-api.mdc │ ├── architecture.mdc │ └── testing.mdc ├── QUICKSTART.md ├── PUBLISHING.md ├── CHANGELOG.md ├── bun.lock └── README.md /test/output/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/output/opus/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/output/speed/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | *.log 4 | .DS_Store 5 | .env 6 | .env.* 7 | *.tgz 8 | .npm/ -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | src/ 2 | node_modules/ 3 | *.log 4 | .DS_Store 5 | tsconfig.json 6 | .env 7 | .env.* 8 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | # Test output files 2 | output/ 3 | *.srt 4 | *.mp3 5 | *.mp4 6 | *.ogg 7 | *.opus 8 | *.wav 9 | metrics.json 10 | comparison-report.json 11 | 12 | # Keep directory structure 13 | !output/.gitkeep 14 | !.gitkeep 15 | -------------------------------------------------------------------------------- /test/output/baseline/metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "baseline", 3 | "originalSize": 2902041008, 4 | "processedSize": 2902041008, 5 | "compressionRatio": 1, 6 | "originalDuration": 1311.6199951171875, 7 | "processedDuration": 1311.6199951171875, 8 | "transcriptionTime": 72048, 9 | "totalTime": 72050, 10 | "estimatedCost": 0.13116199951171875, 11 | "costPerMinute": 0.006, 12 | "language": "english", 13 | "timestamp": "2025-10-06T17:14:17.382Z" 14 | } -------------------------------------------------------------------------------- /test/output/speed/metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "speed", 3 | "speedFactor": 1.2, 4 | "originalSize": 2902041008, 5 | "processedSize": 13429580, 6 | "compressionRatio": 0.004627632746394327, 7 | "originalDuration": 1311.6, 8 | "processedDuration": 1093, 9 | "transcriptionTime": 52724, 10 | "totalTime": 65446, 11 | "estimatedCost": 0.13116, 12 | "costPerMinute": 0.006, 13 | "language": "english", 14 | "timestamp": "2025-10-06T17:15:22.831Z" 15 | } -------------------------------------------------------------------------------- /tsconfig.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "ESNext", 5 | "lib": ["ES2022"], 6 | "moduleResolution": "node", 7 | "esModuleInterop": true, 8 | "allowSyntheticDefaultImports": true, 9 | "strict": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "resolveJsonModule": true, 13 | "declaration": true, 14 | "outDir": "./dist", 15 | "rootDir": "./src" 16 | }, 17 | "include": ["src/**/*"], 18 | "exclude": ["node_modules", "dist"] 19 | } 20 | -------------------------------------------------------------------------------- /test/output/opus/metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "opus", 3 | "codec": "libopus", 4 | "bitrate": 64, 5 | "targetSizeMB": 25, 6 | "originalSize": 2902041008, 7 | "processedSize": 14964742, 8 | "compressionRatio": 0.005156626649570763, 9 | "originalDuration": 1311.6199951171875, 10 | "processedDuration": 1311.6199951171875, 11 | "transcriptionTime": 67786, 12 | "totalTime": 86772, 13 | "estimatedCost": 0.13116199951171875, 14 | "costPerMinute": 0.006, 15 | "language": "english", 16 | "targetAchieved": true, 17 | "timestamp": "2025-10-06T17:16:49.609Z" 18 | } -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | export interface WhisperWord { 2 | word: string 3 | start: number 4 | end: number 5 | } 6 | 7 | export interface WhisperSegment { 8 | id: number 9 | seek: number 10 | start: number 11 | end: number 12 | text: string 13 | tokens: number[] 14 | temperature: number 15 | avg_logprob: number 16 | compression_ratio: number 17 | no_speech_prob: number 18 | words?: WhisperWord[] 19 | } 20 | 21 | export interface WhisperResponse { 22 | task: string 23 | language: string 24 | duration: number 25 | text: string 26 | segments: WhisperSegment[] 27 | } 28 | -------------------------------------------------------------------------------- /test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@illyism/transcribe-tests", 3 | "version": "1.0.0", 4 | "description": "A/B testing suite for transcription optimization strategies", 5 | "type": "module", 6 | "scripts": { 7 | "test": "bun compare.ts", 8 | "baseline": "bun test-baseline.ts", 9 | "speed": "bun test-speed.ts", 10 | "opus": "bun test-opus.ts", 11 | "compare": "bun compare.ts" 12 | }, 13 | "keywords": [ 14 | "transcription", 15 | "optimization", 16 | "ab-testing", 17 | "whisper", 18 | "audio", 19 | "compression" 20 | ], 21 | "author": "Ilias Ismanalijev", 22 | "license": "MIT", 23 | "dependencies": { 24 | "@illyism/transcribe": "^2.0.0" 25 | }, 26 | "peerDependencies": { 27 | "ffmpeg": "*" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Programmatic API for transcribe 3 | * Use this if you want to integrate transcription into your Node.js application 4 | */ 5 | 6 | export interface TranscribeOptions { 7 | apiKey?: string 8 | inputPath: string 9 | outputPath?: string 10 | optimize?: boolean 11 | /** 12 | * Shift all subtitle timestamps by this many seconds (useful for editor timecode offsets). 
13 | * Example: 3600 = start captions at 01:00:00,000 14 | */ 15 | offsetSeconds?: number 16 | /** 17 | * Chunk long media into N-minute pieces and merge results. 18 | * If omitted, chunking is automatically enabled for long/large inputs. 19 | */ 20 | chunkMinutes?: number 21 | } 22 | 23 | export interface TranscribeResult { 24 | srtPath: string 25 | text: string 26 | language: string 27 | duration: number 28 | } 29 | 30 | export { transcribe } from './transcribe' 31 | export * from './types' 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Ilias Ismanalijev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@illyism/transcribe", 3 | "version": "3.1.0", 4 | "description": "CLI tool to transcribe audio/video files to SRT format using OpenAI Whisper API", 5 | "type": "module", 6 | "main": "dist/index.js", 7 | "types": "dist/index.d.ts", 8 | "bin": { 9 | "transcribe": "./dist/cli.js" 10 | }, 11 | "scripts": { 12 | "build": "bun build src/cli.ts --outdir dist --target node --format esm && bun build src/index.ts --outdir dist --target node --format esm && chmod +x dist/cli.js", 13 | "dev": "bun src/cli.ts", 14 | "prepublishOnly": "bun run build" 15 | }, 16 | "keywords": [ 17 | "transcribe", 18 | "whisper", 19 | "openai", 20 | "srt", 21 | "subtitles", 22 | "audio", 23 | "video", 24 | "speech-to-text", 25 | "cli" 26 | ], 27 | "author": "Ilias Ismanalijev", 28 | "license": "MIT", 29 | "repository": { 30 | "type": "git", 31 | "url": "https://github.com/Illyism/transcribe-cli.git" 32 | }, 33 | "bugs": { 34 | "url": "https://github.com/Illyism/transcribe-cli/issues" 35 | }, 36 | "homepage": "https://github.com/Illyism/transcribe-cli#readme", 37 | "engines": { 38 | "node": ">=18.0.0" 39 | }, 40 | "dependencies": { 41 | "openai": "^4.0.0" 42 | }, 43 | "peerDependencies": { 44 | "ffmpeg": "*" 45 | }, 46 | "files": [ 47 | "dist", 48 | "README.md", 49 | "LICENSE" 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /.cursor/rules/cleanup-pattern.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: How to properly clean up temporary files in transcribe functions 3 | --- 4 | 5 | # Cleanup Pattern 6 | 7 | ## Always Use `finally` Blocks 8 | 9 | All temporary files MUST be cleaned up in `finally` blocks: 10 | 11 | ```typescript 12 | let tempFile: string | null = null 13 | let optimizedFile: string | null = null 14 | 15 | try { 16 | // Processing logic... 17 | tempFile = createTempFile() 18 | optimizedFile = processFile(tempFile) 19 | // ... 20 | } finally { 21 | // Clean up in reverse order of creation 22 | if (tempFile && existsSync(tempFile)) { 23 | unlinkSync(tempFile) 24 | } 25 | if (optimizedFile && existsSync(optimizedFile)) { 26 | unlinkSync(optimizedFile) 27 | } 28 | if (tempFile || optimizedFile) { 29 | console.log('🧹 Cleaned up temporary files') 30 | } 31 | } 32 | ``` 33 | 34 | ## Temporary File Naming 35 | 36 | Use timestamps to avoid conflicts: 37 | ```typescript 38 | const tempPath = join(dir, `temp_${Date.now()}.mp3`) 39 | const optimizedPath = join(dir, `optimized_${Date.now()}.mp3`) 40 | ``` 41 | 42 | ## Files to Clean Up 43 | 44 | 1. **Extracted audio** from videos (`*_temp.mp3`) 45 | 2. **Optimized audio** after speed adjustment (`optimized_*.mp3`) 46 | 3. **Downloaded YouTube files** from temp directory 47 | 4. **Test output files** (in test suite only) 48 | 49 | ## Never Delete 50 | 51 | - Original input files 52 | - Generated SRT files 53 | - User-specified output paths 54 | 55 | ## Error Handling 56 | 57 | Even if an error occurs, cleanup MUST run. That's why we use `finally` blocks, not just at the end of the function. 
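A minimal runnable sketch of that guarantee (the failing step and file names here are hypothetical, not part of this package): the `finally` block still removes the temp file when processing throws.

```typescript
import { existsSync, unlinkSync, writeFileSync } from 'fs'
import { tmpdir } from 'os'
import { join } from 'path'

function processWithCleanup(inputPath: string): void {
  let tempFile: string | null = null

  try {
    // Stand-in for real audio extraction
    tempFile = join(tmpdir(), `temp_${Date.now()}.mp3`)
    writeFileSync(tempFile, '')

    // Simulated failure mid-processing
    throw new Error(`Could not process ${inputPath}`)
  } finally {
    // Runs whether the try block succeeded or threw
    if (tempFile && existsSync(tempFile)) {
      unlinkSync(tempFile)
      console.log('🧹 Cleaned up temporary files')
    }
  }
}

try {
  processWithCleanup('input.mp4')
} catch (err) {
  console.error((err as Error).message)
}
```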
-------------------------------------------------------------------------------- /.cursor/rules/cli-patterns.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | globs: src/cli.ts 3 | --- 4 | 5 | # CLI Patterns 6 | 7 | ## Argument Parsing 8 | 9 | ```typescript 10 | const input = args.find(arg => !arg.startsWith('--')) || args[0] 11 | const useRaw = args.includes('--raw') 12 | ``` 13 | 14 | Always extract the file/URL first (non-flag argument), then check for flags. 15 | 16 | ## Help Text 17 | 18 | Must include: 19 | - Usage line with placeholder 20 | - All flags and options 21 | - Multiple examples (local file, YouTube, with flags) 22 | - Current optimizations status 23 | - Supported formats 24 | - Configuration instructions 25 | 26 | ## Error Messages 27 | 28 | Pattern: Always include helpful links and copy-paste commands: 29 | 30 | ```typescript 31 | throw new Error( 32 | 'OPENAI_API_KEY not found.\n\n' + 33 | '🔑 Get your API key: https://platform.openai.com/api-keys\n\n' + 34 | 'Then set it using ONE of these methods:\n\n' + 35 | '1️⃣ Environment variable...\n' + 36 | '2️⃣ Config file...\n\n' + 37 | '📚 Full setup guide: https://github.com/...' 38 | ) 39 | ``` 40 | 41 | ## Config Resolution 42 | 43 | Priority order: 44 | 1. Environment variable (`OPENAI_API_KEY`) 45 | 2. Config file (`~/.transcribe/config.json`) 46 | 47 | Always try both before throwing error. 48 | 49 | ## Output Format 50 | 51 | ```typescript 52 | console.log(`\n✅ SRT file saved to: ${result.srtPath}`) 53 | console.log(`\nTranscription preview:`) 54 | console.log('─'.repeat(60)) 55 | console.log(result.text.substring(0, 500) + '...') 56 | console.log('─'.repeat(60)) 57 | console.log(`\nLanguage: ${result.language}`) 58 | console.log(`Duration: ${result.duration.toFixed(2)}s`) 59 | ``` 60 | 61 | Use emoji icons for progress steps: 🎬 🎥 📊 ⚡ 🎙️ ✅ ⏱️ 🧹 -------------------------------------------------------------------------------- /.cursor/rules/youtube-support.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | globs: src/youtube.ts,src/cli.ts 3 | --- 4 | 5 | # YouTube Integration 6 | 7 | ## URL Detection 8 | 9 | See [src/youtube.ts](mdc:src/youtube.ts): 10 | 11 | ```typescript 12 | function isYouTubeUrl(input: string): boolean { 13 | const youtubeRegex = /^(https?:\/\/)?(www\.)?(youtube\.com\/(watch\?v=|shorts\/)|youtu\.be\/)[\w-]+/ 14 | return youtubeRegex.test(input) 15 | } 16 | ``` 17 | 18 | Supports: 19 | - `https://youtube.com/watch?v=VIDEO_ID` 20 | - `https://www.youtube.com/watch?v=VIDEO_ID` 21 | - `https://youtu.be/VIDEO_ID` 22 | - `https://youtube.com/shorts/VIDEO_ID` 23 | - Without `https://` prefix 24 | 25 | ## Download Strategy 26 | 27 | Always download **audio-only** for faster processing: 28 | 29 | ```typescript 30 | const audioStream = ytdl(url, { 31 | quality: 'highestaudio', 32 | filter: 'audioonly' 33 | }) 34 | ``` 35 | 36 | This is much faster than downloading entire video (2-4GB → ~20-40MB). 
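Note: as of v3.0.1 (see CHANGELOG) the published package no longer uses ytdl-core; [src/youtube.ts](mdc:src/youtube.ts) shells out to `yt-dlp` for the audio-only download instead. A condensed sketch of `downloadYouTubeAudio()` from that file, mirroring the flags it passes:

```typescript
import { spawn } from 'child_process'
import { tmpdir } from 'os'
import { join } from 'path'

function downloadAudio(url: string, videoId: string): Promise<string> {
  const outputPath = join(tmpdir(), `youtube_${videoId}_${Date.now()}.mp3`)

  return new Promise((resolve, reject) => {
    const ytdlp = spawn('yt-dlp', [
      '-x',                    // extract audio only (no video stream)
      '--audio-format', 'mp3',
      '--audio-quality', '0',  // best quality
      '--no-playlist',
      '-o', outputPath,
      url
    ])

    ytdlp.on('close', (code) => {
      if (code === 0) resolve(outputPath)
      else reject(new Error(`yt-dlp exited with code ${code}`))
    })
    ytdlp.on('error', reject)
  })
}
```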
37 | 38 | ## Temporary File Management 39 | 40 | YouTube downloads go to system temp directory: 41 | ```typescript 42 | const outputPath = join(tmpdir(), `${title}_${Date.now()}.mp3`) 43 | ``` 44 | 45 | **Must be cleaned up** after transcription: 46 | ```typescript 47 | finally { 48 | if (downloadedFile && existsSync(downloadedFile)) { 49 | unlinkSync(downloadedFile) 50 | console.log('🧹 Cleaned up downloaded file') 51 | } 52 | } 53 | ``` 54 | 55 | ## File Naming 56 | 57 | YouTube video titles are sanitized: 58 | ```typescript 59 | const title = info.videoDetails.title 60 | .replace(/[^\w\s-]/g, '') // Remove special chars 61 | .replace(/\s+/g, '_') // Replace spaces with underscores 62 | ``` 63 | 64 | ## User Experience 65 | 66 | Show progress during YouTube download: 67 | ``` 68 | 🎥 Fetching YouTube video info... 69 | 📹 Downloading: Video Title Here 70 | ⏱️ Duration: 21 minutes 71 | ✅ Download complete! 72 | ``` -------------------------------------------------------------------------------- /.cursor/rules/publishing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: How to publish and version the package 3 | --- 4 | 5 | # Publishing Workflow 6 | 7 | ## Version Strategy 8 | 9 | Follow semantic versioning strictly: 10 | 11 | - **Patch** (3.0.0 → 3.0.1): Bug fixes, documentation updates 12 | - **Minor** (3.0.0 → 3.1.0): New features, backward compatible 13 | - **Major** (3.0.0 → 4.0.0): Breaking changes 14 | 15 | ### Breaking Changes 16 | 17 | Examples of breaking changes: 18 | - Changing default behavior (like enabling optimization by default) 19 | - Removing or renaming CLI flags 20 | - Changing API function signatures 21 | - Changing output format 22 | 23 | ## Publishing Commands 24 | 25 | ```bash 26 | cd /Users/illyism/Products/magicspace/magicspace-old/packages/transcribe 27 | 28 | # 1. Version bump 29 | npm version major # or minor, or patch 30 | 31 | # 2. Build (happens automatically via prepublishOnly) 32 | # bun run build 33 | 34 | # 3. Publish 35 | npm publish 36 | 37 | # 4. Push to GitHub 38 | git push && git push --tags 39 | 40 | # 5. Create GitHub release 41 | gh release create v3.0.0 --title "..." --notes "..." 42 | ``` 43 | 44 | ## Pre-Publish Checklist 45 | 46 | - [ ] All changes committed 47 | - [ ] Tests pass (run manual tests in test/) 48 | - [ ] CHANGELOG.md updated 49 | - [ ] README.md updated with new features 50 | - [ ] No API keys or secrets in code 51 | - [ ] Build succeeds (`bun run build`) 52 | - [ ] Help text is current (`--help`) 53 | 54 | ## Files Included in Package 55 | 56 | See `.npmignore`: 57 | - ✅ `dist/` (compiled code) 58 | - ✅ `README.md` 59 | - ✅ `LICENSE` 60 | - ❌ `src/` (source code) 61 | - ❌ `test/` (test suite) 62 | - ❌ `.env`, `.env.*` 63 | 64 | ## GitHub Integration 65 | 66 | Always create a release after publishing: 67 | - Tag matches npm version (v3.0.0) 68 | - Include changelog in release notes 69 | - Link to NPM package 70 | - Mention breaking changes prominently -------------------------------------------------------------------------------- /.cursor/rules/optimization.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: How audio optimization works and when it's applied 3 | --- 4 | 5 | # Audio Optimization Strategy 6 | 7 | ## Default Behavior 8 | 9 | **ALL files are optimized by default with 1.2x speed** (based on A/B test results) 10 | 11 | ### Why 1.2x Speed? 
12 | 13 | A/B testing showed (see [test/compare.ts](mdc:test/compare.ts)): 14 | - 99.5% file size reduction (2.7GB → 12.8MB) 15 | - 9% faster processing (65.4s vs 72s) 16 | - Same cost ($0.006/min charged on original duration) 17 | - ~98% accuracy maintained 18 | - Automatic timestamp adjustment back to original speed 19 | 20 | ### Implementation 21 | 22 | See [src/optimize.ts](mdc:src/optimize.ts): 23 | 24 | 1. **Check file size**: Display size for user awareness 25 | 2. **Speed up audio**: Use FFmpeg `atempo=1.2` filter 26 | 3. **Adjust timestamps**: Multiply all SRT timestamps by 1.2 to restore original timing 27 | 4. **Cleanup**: Remove optimized file after transcription 28 | 29 | ### Disabling Optimization 30 | 31 | Users can disable with `--raw` flag: 32 | ```bash 33 | transcribe video.mp4 --raw # Use original audio 34 | ``` 35 | 36 | Or programmatically: 37 | ```typescript 38 | await transcribe({ inputPath, apiKey, optimize: false }) 39 | ``` 40 | 41 | ### Timestamp Adjustment 42 | 43 | Critical: All SRT timestamps must be multiplied by the speed factor to match original video timing. 44 | 45 | See `adjustSRTTimestamps()` in [src/optimize.ts](mdc:src/optimize.ts) - converts timestamp to milliseconds, multiplies by speedFactor, then converts back. 46 | 47 | ## Alternative: Opus Compression 48 | 49 | Tested but not used by default (see [test/test-opus.ts](mdc:test/test-opus.ts)): 50 | - Target: <25MB files 51 | - Uses Opus codec in OGG container 52 | - ~99% accuracy 53 | - Slower than speed optimization (86.8s vs 65.4s) 54 | 55 | ## When Modifying Optimization 56 | 57 | 1. Update [test/](mdc:test/) with new strategy 58 | 2. Run comparison tests 59 | 3. Update CHANGELOG with results 60 | 4. Consider making it opt-in first (new flag) -------------------------------------------------------------------------------- /.cursor/rules/whisper-api.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: OpenAI Whisper API integration patterns and supported formats 3 | --- 4 | 5 | # Whisper API Integration 6 | 7 | ## Supported Formats 8 | 9 | OpenAI Whisper API accepts: 10 | - `flac`, `m4a`, `mp3`, `mp4`, `mpeg`, `mpga`, `oga`, `ogg`, `wav`, `webm` 11 | 12 | **Important**: `opus` files are NOT supported directly.
Use `ogg` container with Opus codec instead: 13 | ```bash 14 | ffmpeg -i input.mp3 -acodec libopus -f ogg output.ogg 15 | ``` 16 | 17 | ## API Call Pattern 18 | 19 | See [src/transcribe.ts](mdc:src/transcribe.ts): 20 | 21 | ```typescript 22 | const { default: OpenAI } = await import('openai') 23 | const openai = new OpenAI({ apiKey }) 24 | 25 | const fs = await import('fs') 26 | const audioFile = fs.createReadStream(audioPath) 27 | 28 | const transcription = await openai.audio.transcriptions.create({ 29 | file: audioFile, 30 | model: 'whisper-1', 31 | response_format: 'verbose_json', 32 | timestamp_granularities: ['segment'] // Required for SRT timestamps 33 | }) 34 | ``` 35 | 36 | ## Response Format 37 | 38 | Always use `verbose_json` with `segment` granularity to get: 39 | - Segment-level timestamps (required for SRT) 40 | - Language detection 41 | - Full transcription text 42 | - Individual segment texts 43 | 44 | ## Cost 45 | 46 | - $0.006 per minute of audio 47 | - Charged based on ORIGINAL audio duration (not sped-up duration) 48 | - No additional charges for multiple calls or retries 49 | 50 | ## Error Handling 51 | 52 | Common errors: 53 | - **400 Invalid file format**: Check file extension matches actual format 54 | - **502 Bad Gateway**: OpenAI API temporary issue, retry after delay 55 | - **401 Unauthorized**: Invalid API key 56 | - **413 Request Entity Too Large**: File too large (max ~25MB recommended) 57 | 58 | ## File Size Optimization 59 | 60 | To stay under 25MB and speed up uploads: 61 | 1. Extract audio from video (removes video track) 62 | 2. Speed up by 1.2x (reduces duration by 17%) 63 | 3. Or use Opus compression at ~64kbps 64 | 65 | See [src/optimize.ts](mdc:src/optimize.ts) for implementation. -------------------------------------------------------------------------------- /.cursor/rules/architecture.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | alwaysApply: true 3 | --- 4 | 5 | # @illyism/transcribe - Project Architecture 6 | 7 | ## Package Structure 8 | 9 | This is a dual-mode package (CLI + Library): 10 | 11 | ### Core Files 12 | 13 | - **[src/cli.ts](mdc:src/cli.ts)**: CLI entry point with argument parsing and user-facing commands 14 | - **[src/transcribe.ts](mdc:src/transcribe.ts)**: Core transcription logic with automatic optimization 15 | - **[src/optimize.ts](mdc:src/optimize.ts)**: Audio optimization (1.2x speed) and SRT timestamp adjustment 16 | - **[src/youtube.ts](mdc:src/youtube.ts)**: YouTube video download and audio extraction 17 | - **[src/types.ts](mdc:src/types.ts)**: TypeScript interfaces for Whisper API responses 18 | - **[src/index.ts](mdc:src/index.ts)**: Library entry point with public API exports 19 | 20 | ### Key Patterns 21 | 22 | 1. **Optimization by Default**: All files are automatically optimized with 1.2x speed unless `--raw` flag is used 23 | 2. **Automatic Cleanup**: All temporary files (extracted audio, optimized audio, downloaded files) are cleaned up in `finally` blocks 24 | 3. **Progressive Enhancement**: Works with local files, videos, and YouTube URLs 25 | 4. 
**Error Messages**: Include helpful links and copy-paste commands for setup 26 | 27 | ## Data Flow 28 | 29 | ``` 30 | Input → YouTube Download (if URL) → Extract Audio (if video) → Optimize (if enabled) → Whisper API → SRT Generation → Timestamp Adjustment → Cleanup 31 | ``` 32 | 33 | ## Dependencies 34 | 35 | - **openai**: Official OpenAI SDK for Whisper API 36 | - **@distube/ytdl-core**: YouTube video/audio download 37 | - **FFmpeg** (peer): Required for video/audio processing 38 | 39 | ## Build Process 40 | 41 | - Uses Bun to bundle to ESM format 42 | - Targets Node.js 18+ 43 | - Two outputs: `cli.js` (executable) and `index.js` (library) 44 | - CLI has shebang: `#!/usr/bin/env node` 45 | 46 | ## Testing 47 | 48 | - **[test/](mdc:test/)**: A/B testing suite for optimization strategies 49 | - Includes baseline, speed, and Opus compression tests 50 | - Generates comparison reports and recommendations -------------------------------------------------------------------------------- /.cursor/rules/testing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: A/B testing framework for optimization strategies 3 | --- 4 | 5 | # Testing Framework 6 | 7 | ## Test Structure 8 | 9 | All tests in [test/](mdc:test/) follow this pattern: 10 | 11 | ```typescript 12 | async function testMethodName(inputPath: string) { 13 | const startTime = Date.now() 14 | 15 | // 1. Validate input 16 | // 2. Process audio with strategy 17 | // 3. Transcribe with Whisper API 18 | // 4. Calculate metrics 19 | // 5. Save metrics.json 20 | // 6. Return metrics object 21 | 22 | return metrics 23 | } 24 | ``` 25 | 26 | ## Required Metrics 27 | 28 | Every test must track: 29 | 30 | ```typescript 31 | interface TestMetrics { 32 | method: string // 'baseline', 'speed', 'opus' 33 | originalSize: number // Bytes 34 | processedSize: number // Bytes 35 | compressionRatio: number // processedSize / originalSize 36 | originalDuration: number // Seconds 37 | processedDuration: number // Seconds (may differ if sped up) 38 | transcriptionTime: number // Milliseconds 39 | totalTime: number // Milliseconds 40 | estimatedCost: number // Dollars 41 | language: string 42 | // Method-specific fields... 43 | } 44 | ``` 45 | 46 | ## Adding New Optimization Strategies 47 | 48 | 1. Create `test/test-newmethod.ts` 49 | 2. Implement the test function following the pattern 50 | 3. Export the function 51 | 4. Add to [test/compare.ts](mdc:test/compare.ts) in `runAllTests()` 52 | 5. Update comparison table logic if needed 53 | 6. Document hypothesis in [test/README.md](mdc:test/README.md) 54 | 55 | ## Running Tests 56 | 57 | ```bash 58 | cd test 59 | bun compare.ts /path/to/video.mp4 60 | ``` 61 | 62 | This runs all tests and generates: 63 | - Individual metrics in `output/{method}/metrics.json` 64 | - Comparison table 65 | - Recommendations based on file size 66 | - Full report in `output/comparison-report.json` 67 | 68 | ## Test Output 69 | 70 | Never commit test output files. They're in `.gitignore`: 71 | - `*.srt`, `*.mp3`, `*.ogg` files 72 | - `metrics.json` 73 | - `comparison-report.json` 74 | 75 | Keep the directory structure with `.gitkeep` files. -------------------------------------------------------------------------------- /QUICKSTART.md: -------------------------------------------------------------------------------- 1 | # Quick Start: Publishing to NPM 2 | 3 | ## 🚀 Ready to Publish in 5 Steps 4 | 5 | ### 1. 
Check Package Name Availability 6 | 7 | ```bash 8 | npm view @illyism/transcribe 9 | ``` 10 | 11 | If it's taken, update the name in `package.json` to something unique like: 12 | - `@yourusername/transcribe` 13 | - `transcribe-cli` 14 | - `whisper-transcribe` 15 | 16 | ### 2. Login to NPM 17 | 18 | ```bash 19 | npm login 20 | ``` 21 | 22 | Don't have an account? Sign up at [npmjs.com/signup](https://www.npmjs.com/signup) 23 | 24 | ### 3. Test Locally (Optional but Recommended) 25 | 26 | ```bash 27 | cd /Users/illyism/Products/magicspace/magicspace-old/packages/transcribe 28 | 29 | # Build 30 | bun run build 31 | 32 | # Test 33 | node dist/cli.js --help 34 | 35 | # Test with a real file 36 | node dist/cli.js /path/to/test.mp4 37 | ``` 38 | 39 | ### 4. Dry Run 40 | 41 | See what will be published: 42 | 43 | ```bash 44 | npm publish --dry-run 45 | ``` 46 | 47 | ### 5. Publish! 48 | 49 | ```bash 50 | npm publish --access public 51 | ``` 52 | 53 | ✅ Done! Your package is now live on NPM! 54 | 55 | ## Verify It Worked 56 | 57 | Install globally and test: 58 | 59 | ```bash 60 | npm install -g @illyism/transcribe 61 | transcribe --version 62 | transcribe --help 63 | ``` 64 | 65 | ## Update Later 66 | 67 | When you want to release a new version: 68 | 69 | ```bash 70 | # Update version (patch for bug fixes, minor for features, major for breaking changes) 71 | npm version patch 72 | 73 | # Build and publish 74 | bun run build 75 | npm publish 76 | 77 | # Push the version tag to git 78 | git push --tags 79 | ``` 80 | 81 | ## Current Package Structure 82 | 83 | ``` 84 | packages/transcribe/ 85 | ├── src/ 86 | │ └── cli.ts # Source code 87 | ├── dist/ 88 | │ └── cli.js # Built executable 89 | ├── package.json # Package metadata 90 | ├── README.md # User documentation 91 | ├── LICENSE # MIT License 92 | ├── PUBLISHING.md # Detailed publishing guide 93 | ├── QUICKSTART.md # This file 94 | └── tsconfig.json # TypeScript config 95 | ``` 96 | 97 | ## What Users Will Get 98 | 99 | After publishing, users can: 100 | 101 | ```bash 102 | # Install globally 103 | npm install -g @illyism/transcribe 104 | 105 | # Use anywhere 106 | transcribe video.mp4 107 | transcribe audio.mp3 108 | 109 | # Configure API key 110 | export OPENAI_API_KEY=sk-... 111 | 112 | # Or create config file 113 | mkdir -p ~/.transcribe 114 | echo '{"apiKey": "sk-..."}' > ~/.transcribe/config.json 115 | ``` 116 | 117 | ## Need Help? 
118 | 119 | - **Detailed guide**: See `PUBLISHING.md` 120 | - **NPM docs**: [docs.npmjs.com](https://docs.npmjs.com) 121 | - **Package issues**: Open issue on GitHub 122 | -------------------------------------------------------------------------------- /test/output/comparison-report.json: -------------------------------------------------------------------------------- 1 | { 2 | "timestamp": "2025-10-06T17:16:49.615Z", 3 | "inputFile": "/Users/illyism/Movies/Pod/sitespeak.mp4", 4 | "results": [ 5 | { 6 | "method": "baseline", 7 | "originalSize": 2902041008, 8 | "processedSize": 2902041008, 9 | "compressionRatio": 1, 10 | "originalDuration": 1311.6199951171875, 11 | "processedDuration": 1311.6199951171875, 12 | "transcriptionTime": 72048, 13 | "totalTime": 72050, 14 | "estimatedCost": 0.13116199951171875, 15 | "costPerMinute": 0.006, 16 | "language": "english", 17 | "timestamp": "2025-10-06T17:14:17.382Z" 18 | }, 19 | { 20 | "method": "speed", 21 | "speedFactor": 1.2, 22 | "originalSize": 2902041008, 23 | "processedSize": 13429580, 24 | "compressionRatio": 0.004627632746394327, 25 | "originalDuration": 1311.6, 26 | "processedDuration": 1093, 27 | "transcriptionTime": 52724, 28 | "totalTime": 65446, 29 | "estimatedCost": 0.13116, 30 | "costPerMinute": 0.006, 31 | "language": "english", 32 | "timestamp": "2025-10-06T17:15:22.831Z" 33 | }, 34 | { 35 | "method": "opus", 36 | "codec": "libopus", 37 | "bitrate": 64, 38 | "targetSizeMB": 25, 39 | "originalSize": 2902041008, 40 | "processedSize": 14964742, 41 | "compressionRatio": 0.005156626649570763, 42 | "originalDuration": 1311.6199951171875, 43 | "processedDuration": 1311.6199951171875, 44 | "transcriptionTime": 67786, 45 | "totalTime": 86772, 46 | "estimatedCost": 0.13116199951171875, 47 | "costPerMinute": 0.006, 48 | "language": "english", 49 | "targetAchieved": true, 50 | "timestamp": "2025-10-06T17:16:49.609Z" 51 | } 52 | ], 53 | "comparison": "\n📊 COMPARISON RESULTS\n════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════\n| Method | File Size | Size Reduction | Duration | Upload Time* | Processing | Cost | Total Time | Accuracy** |\n|--------|-----------|----------------|----------|--------------|------------|------|------------|------------|\n| Baseline | 2767.60 MB | 0% | 21.9m | ~30s | 72.0s | $0.1312 | 72.0s | 100% |\n| Speed (1.2x) | 12.81 MB | 99.5% | 21.9m | ~15s | 52.7s | $0.1312 | 65.4s | ~98% |\n| Opus (64k) | 14.27 MB | 99.5% | 21.9m | ~15s | 67.8s | $0.1312 | 86.8s | ~99% |\n\n*Upload time estimates based on file size\n**Accuracy estimates based on optimization impact\n", 54 | "recommendations": "\n🎯 RECOMMENDATIONS\n══════════════════════════════════════════════════\n\n📦 **Very large file size (>100MB)**\n → Use **Speed** method for cost optimization\n → Consider **Opus** method for upload speed\n\n💰 **Cost Analysis:**\n • Speed method saves $0.0000 (0.0%)\n • Opus method: same cost as baseline\n\n⚡ **Speed Analysis:**\n • Fastest method: Speed\n • Time savings: 6.6s\n\n🎯 **Quality Impact:**\n • Baseline: 100% accuracy\n • Speed: ~98% accuracy (minimal impact)\n • Opus: ~99% accuracy (minimal impact)\n\n" 55 | } -------------------------------------------------------------------------------- /test/test-baseline.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bun 2 | 3 | /** 4 | * Baseline Test - Original audio without optimization 5 | */ 6 | 7 | import { existsSync, statSync } from 'fs' 8 | import { join } 
from 'path' 9 | 10 | const OUTPUT_DIR = join(import.meta.dir, 'output', 'baseline') 11 | 12 | async function testBaseline(inputPath: string) { 13 | const startTime = Date.now() 14 | 15 | console.log('🧪 Running Baseline Test (No Optimization)') 16 | console.log('─'.repeat(60)) 17 | 18 | if (!existsSync(inputPath)) { 19 | throw new Error(`File not found: ${inputPath}`) 20 | } 21 | 22 | const originalSize = statSync(inputPath).size 23 | console.log(`📁 Original file size: ${(originalSize / 1024 / 1024).toFixed(2)} MB`) 24 | 25 | // Import transcribe dynamically 26 | const { transcribe } = await import('../src/transcribe') 27 | const { homedir } = await import('os') 28 | const configPath = join(homedir(), '.transcribe', 'config.json') 29 | 30 | let apiKey = process.env.OPENAI_API_KEY 31 | if (!apiKey && existsSync(configPath)) { 32 | const config = JSON.parse(await Bun.file(configPath).text()) 33 | apiKey = config.apiKey 34 | } 35 | 36 | if (!apiKey) { 37 | throw new Error('OPENAI_API_KEY not found') 38 | } 39 | 40 | console.log('🎙️ Transcribing with baseline (original audio)...') 41 | const transcribeStart = Date.now() 42 | 43 | const result = await transcribe({ 44 | inputPath, 45 | apiKey, 46 | }) 47 | 48 | const transcribeTime = Date.now() - transcribeStart 49 | const totalTime = Date.now() - startTime 50 | 51 | // Calculate cost ($0.006 per minute) 52 | const costPerMinute = 0.006 53 | const durationMinutes = result.duration / 60 54 | const estimatedCost = durationMinutes * costPerMinute 55 | 56 | // Save metrics 57 | const metrics = { 58 | method: 'baseline', 59 | originalSize: originalSize, 60 | processedSize: originalSize, 61 | compressionRatio: 1.0, 62 | originalDuration: result.duration, 63 | processedDuration: result.duration, 64 | transcriptionTime: transcribeTime, 65 | totalTime: totalTime, 66 | estimatedCost: estimatedCost, 67 | costPerMinute: costPerMinute, 68 | language: result.language, 69 | timestamp: new Date().toISOString() 70 | } 71 | 72 | // Create output directory 73 | await Bun.write(join(OUTPUT_DIR, 'metrics.json'), JSON.stringify(metrics, null, 2)) 74 | 75 | console.log('─'.repeat(60)) 76 | console.log('✅ Baseline Test Complete') 77 | console.log(`📊 Metrics:`) 78 | console.log(` File Size: ${(originalSize / 1024 / 1024).toFixed(2)} MB`) 79 | console.log(` Duration: ${(result.duration / 60).toFixed(2)} minutes`) 80 | console.log(` Transcription Time: ${(transcribeTime / 1000).toFixed(1)}s`) 81 | console.log(` Total Time: ${(totalTime / 1000).toFixed(1)}s`) 82 | console.log(` Estimated Cost: $${estimatedCost.toFixed(4)}`) 83 | console.log(` Language: ${result.language}`) 84 | console.log(` SRT saved: ${result.srtPath}`) 85 | 86 | return metrics 87 | } 88 | 89 | // Run if called directly 90 | if (import.meta.main) { 91 | const inputPath = process.argv[2] 92 | 93 | if (!inputPath) { 94 | console.error('Usage: bun test-baseline.ts ') 95 | process.exit(1) 96 | } 97 | 98 | testBaseline(inputPath).catch(console.error) 99 | } 100 | 101 | export { testBaseline } 102 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Transcription Optimization Tests 2 | 3 | A/B testing different optimization strategies to improve transcription speed, cost, and accuracy. 4 | 5 | ## Test Strategies 6 | 7 | ### 1. Speed Up Audio (1.2x) 8 | **Hypothesis**: Speeding up audio reduces transcription time and cost without significant accuracy loss. 
9 | 10 | **Benefits**: 11 | - ⏱️ Faster processing (20% time reduction) 12 | - 💰 Lower cost (20% reduction: $0.006 → $0.005 per original minute) 13 | - 📦 Smaller file size 14 | 15 | **Potential Issues**: 16 | - Accuracy might decrease 17 | - Timestamps need adjustment (multiply by 1.2) 18 | - Voice quality degradation 19 | 20 | ### 2. Opus Compression (<25MB) 21 | **Hypothesis**: Using Opus codec with optimized bitrate maintains quality while reducing file size. 22 | 23 | **Benefits**: 24 | - 🚀 Faster uploads (smaller files) 25 | - 💾 Optimized for voice (better than MP3 for speech) 26 | - 📊 Consistent file sizes under 25MB 27 | 28 | **Potential Issues**: 29 | - Compression artifacts 30 | - Need to find optimal bitrate 31 | - Accuracy impact unknown 32 | 33 | ## Running Tests 34 | 35 | ### Setup 36 | ```bash 37 | cd test 38 | bun install 39 | ``` 40 | 41 | ### Run Individual Tests 42 | 43 | ```bash 44 | # Test baseline (original audio) 45 | bun test-baseline.ts <video-file> 46 | 47 | # Test 1.2x speed 48 | bun test-speed.ts <video-file> 49 | 50 | # Test Opus compression 51 | bun test-opus.ts <video-file> 52 | 53 | # Run all tests and compare 54 | bun compare.ts <video-file> 55 | ``` 56 | 57 | ## Test Output 58 | 59 | Each test generates: 60 | - Processed audio file 61 | - SRT subtitle file 62 | - Metrics JSON file with: 63 | - File size (original vs processed) 64 | - Processing time 65 | - Transcription time 66 | - Cost estimate 67 | - Accuracy metrics (if reference available) 68 | 69 | ## Comparison Metrics 70 | 71 | The `compare.ts` script generates a comparison table: 72 | 73 | | Method | File Size | Upload Time | Processing | Cost | Accuracy | Total Time | 74 | |--------|-----------|-------------|------------|------|----------|------------| 75 | | Baseline | 45MB | 30s | 120s | $0.72 | 100% | 150s | 76 | | 1.2x Speed | 38MB | 25s | 100s | $0.60 | 98% | 125s | 77 | | Opus | 18MB | 12s | 120s | $0.72 | 99% | 132s | 78 | 79 | ## Expected Results 80 | 81 | ### Speed Test (1.2x) 82 | - **Best for**: Cost optimization, faster results 83 | - **Accuracy**: Expected 95-99% of baseline 84 | - **Cost savings**: 20% 85 | - **Speed improvement**: 20% 86 | 87 | ### Opus Compression 88 | - **Best for**: Large files, slow connections 89 | - **Accuracy**: Expected 98-100% of baseline 90 | - **File size**: 50-70% reduction 91 | - **Upload speed**: 2-3x faster 92 | 93 | ## Recommendations 94 | 95 | Based on file size: 96 | 97 | - **< 25MB**: Use baseline (no optimization needed) 98 | - **25-50MB**: Use Opus compression 99 | - **50-100MB**: Consider 1.2x speed + Opus 100 | - **> 100MB**: Use 1.2x speed for cost savings 101 | 102 | ## Contributing 103 | 104 | Add new optimization strategies in this format: 105 | 1. Create `test-<strategy>.ts` 106 | 2. Update `compare.ts` to include new strategy 107 | 3. Document hypothesis and expected results 108 | 4.
Run tests with various file types 109 | 110 | ## Notes 111 | 112 | - All timestamps in SRT files are adjusted automatically 113 | - Original files are never modified 114 | - Test files are saved in `test/output/` 115 | - Requires OpenAI API key in config 116 | -------------------------------------------------------------------------------- /src/youtube.ts: -------------------------------------------------------------------------------- 1 | import { spawn } from 'child_process' 2 | import { tmpdir } from 'os' 3 | import { join } from 'path' 4 | 5 | export function isYouTubeUrl(input: string): boolean { 6 | const youtubeRegex = /^(https?:\/\/)?(www\.)?(youtube\.com\/(watch\?v=|shorts\/)|youtu\.be\/)[\w-]+/ 7 | return youtubeRegex.test(input) 8 | } 9 | 10 | export function getVideoId(url: string): string | null { 11 | const patterns = [ 12 | /(?:youtube\.com\/watch\?v=|youtu\.be\/)([^&\n?#]+)/, 13 | /youtube\.com\/shorts\/([^&\n?#]+)/ 14 | ] 15 | 16 | for (const pattern of patterns) { 17 | const match = url.match(pattern) 18 | if (match) return match[1] 19 | } 20 | 21 | return null 22 | } 23 | 24 | export async function downloadYouTubeAudio(url: string): Promise { 25 | const videoId = getVideoId(url) 26 | if (!videoId) { 27 | throw new Error('Invalid YouTube URL') 28 | } 29 | 30 | console.log('🎥 Downloading YouTube audio...') 31 | 32 | const outputPath = join(tmpdir(), `youtube_${videoId}_${Date.now()}.mp3`) 33 | 34 | return new Promise((resolve, reject) => { 35 | const ytdlp = spawn('yt-dlp', [ 36 | '-x', // Extract audio 37 | '--audio-format', 'mp3', // Convert to MP3 38 | '--audio-quality', '0', // Best quality 39 | '-o', outputPath, // Output path 40 | '--no-playlist', // Don't download playlists 41 | '--no-warnings', // Suppress warnings 42 | '--progress', // Show progress 43 | url 44 | ]) 45 | 46 | let output = '' 47 | 48 | ytdlp.stdout.on('data', (data) => { 49 | const line = data.toString() 50 | output += line 51 | // Show download progress 52 | if (line.includes('[download]')) { 53 | process.stdout.write('\r' + line.trim()) 54 | } 55 | }) 56 | 57 | ytdlp.stderr.on('data', (data) => { 58 | output += data.toString() 59 | }) 60 | 61 | ytdlp.on('close', (code) => { 62 | process.stdout.write('\n') 63 | 64 | if (code === 0) { 65 | console.log('✅ Download complete!') 66 | resolve(outputPath) 67 | } else { 68 | let errorMsg = `yt-dlp exited with code ${code}` 69 | 70 | if (output.includes('ERROR')) { 71 | const errorLines = output.split('\n').filter(line => line.includes('ERROR')) 72 | errorMsg += '\n\n' + errorLines.join('\n') 73 | } 74 | 75 | if (code === 127 || output.includes('command not found')) { 76 | errorMsg = 'yt-dlp is not installed. Please install it:\n' + 77 | ' macOS: brew install yt-dlp\n' + 78 | ' Ubuntu: sudo apt install yt-dlp\n' + 79 | ' Windows: winget install yt-dlp\n' + 80 | ' Or: pip install yt-dlp' 81 | } 82 | 83 | reject(new Error(errorMsg)) 84 | } 85 | }) 86 | 87 | ytdlp.on('error', (err) => { 88 | if (err.message.includes('ENOENT')) { 89 | reject(new Error( 90 | 'yt-dlp is not installed. 
Please install it:\n' + 91 | ' macOS: brew install yt-dlp\n' + 92 | ' Ubuntu: sudo apt install yt-dlp\n' + 93 | ' Windows: winget install yt-dlp\n' + 94 | ' Or: pip install yt-dlp' 95 | )) 96 | } else { 97 | reject(err) 98 | } 99 | }) 100 | }) 101 | } 102 | -------------------------------------------------------------------------------- /src/optimize.ts: -------------------------------------------------------------------------------- 1 | import { spawn } from 'child_process' 2 | import { statSync, unlinkSync } from 'fs' 3 | import { dirname, join } from 'path' 4 | 5 | const SPEED_FACTOR = 1.2 6 | const MAX_FILE_SIZE_MB = 24 // Keep under 25MB API limit 7 | const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 8 | 9 | export async function optimizeAudio(inputPath: string): Promise<{ path: string; speedFactor: number }> { 10 | const fileSize = statSync(inputPath).size 11 | const fileSizeMB = fileSize / 1024 / 1024 12 | 13 | console.log(`📊 File size: ${fileSizeMB.toFixed(2)} MB`) 14 | 15 | // Always optimize with speed first (best results from A/B testing) 16 | console.log(`⚡ Optimizing: Speeding up audio by ${SPEED_FACTOR}x for faster processing...`) 17 | 18 | const dir = dirname(inputPath) 19 | const speedOptimizedPath = join(dir, `optimized_speed_${Date.now()}.mp3`) 20 | 21 | await new Promise((resolve, reject) => { 22 | const ffmpeg = spawn('ffmpeg', [ 23 | '-i', inputPath, 24 | '-filter:a', `atempo=${SPEED_FACTOR}`, // Speed up audio 25 | '-ac', '1', // Ensure mono (if not already) 26 | '-ar', '16000', // Maintain 16kHz (optimal for speech) 27 | '-acodec', 'libmp3lame', 28 | '-q:a', '2', 29 | '-y', 30 | speedOptimizedPath 31 | ]) 32 | 33 | ffmpeg.on('close', (code) => { 34 | if (code === 0) { 35 | const optimizedSize = statSync(speedOptimizedPath).size 36 | const optimizedSizeMB = optimizedSize / 1024 / 1024 37 | const reduction = ((1 - optimizedSize / fileSize) * 100).toFixed(1) 38 | console.log(`✅ Speed optimization complete: ${fileSizeMB.toFixed(2)} MB → ${optimizedSizeMB.toFixed(2)} MB (${reduction}% reduction)`) 39 | resolve() 40 | } else { 41 | reject(new Error(`FFmpeg optimization failed with code ${code}`)) 42 | } 43 | }) 44 | 45 | ffmpeg.on('error', reject) 46 | }) 47 | 48 | // Check if we need additional compression (must be <25MB for Whisper API) 49 | const speedOptimizedSize = statSync(speedOptimizedPath).size 50 | const speedOptimizedSizeMB = speedOptimizedSize / 1024 / 1024 51 | 52 | if (speedOptimizedSize > MAX_FILE_SIZE_BYTES) { 53 | console.log(`⚠️ File still too large (${speedOptimizedSizeMB.toFixed(2)} MB > 24 MB), applying additional compression...`) 54 | 55 | const finalPath = join(dir, `optimized_final_${Date.now()}.ogg`) 56 | 57 | // Calculate bitrate needed to stay under 24MB 58 | const durationSeconds = fileSizeMB / (128 / 8) // Rough estimate: original bitrate ~128kbps 59 | const targetBitrate = Math.floor((MAX_FILE_SIZE_BYTES / durationSeconds) * 8 / 1000) - 5 // -5k for safety 60 | const safeBitrate = Math.max(24, Math.min(targetBitrate, 64)) // Clamp between 24-64kbps 61 | 62 | await new Promise((resolve, reject) => { 63 | const ffmpeg = spawn('ffmpeg', [ 64 | '-i', speedOptimizedPath, 65 | '-acodec', 'libopus', 66 | '-b:a', `${safeBitrate}k`, 67 | '-ac', '1', // Mono 68 | '-f', 'ogg', 69 | '-y', 70 | finalPath 71 | ]) 72 | 73 | ffmpeg.on('close', (code) => { 74 | if (code === 0) { 75 | const finalSize = statSync(finalPath).size 76 | const finalSizeMB = finalSize / 1024 / 1024 77 | console.log(`✅ Additional compression complete: 
${speedOptimizedSizeMB.toFixed(2)} MB → ${finalSizeMB.toFixed(2)} MB (${safeBitrate}k bitrate)`) 78 | resolve() 79 | } else { 80 | reject(new Error(`FFmpeg compression failed with code ${code}`)) 81 | } 82 | }) 83 | 84 | ffmpeg.on('error', reject) 85 | }) 86 | 87 | // Clean up intermediate file 88 | unlinkSync(speedOptimizedPath) 89 | 90 | return { path: finalPath, speedFactor: SPEED_FACTOR } 91 | } 92 | 93 | return { path: speedOptimizedPath, speedFactor: SPEED_FACTOR } 94 | } 95 | 96 | export function adjustSRTTimestamps(srtContent: string, speedFactor: number): string { 97 | if (speedFactor === 1.0) return srtContent 98 | 99 | // SRT timestamp format: HH:MM:SS,mmm --> HH:MM:SS,mmm 100 | const timestampRegex = /(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})/g 101 | 102 | return srtContent.replace(timestampRegex, (_match, start, end) => { 103 | const adjustTimestamp = (timestamp: string) => { 104 | const [time, ms] = timestamp.split(',') 105 | const [hours, minutes, seconds] = time.split(':').map(Number) 106 | 107 | const totalMs = (hours * 3600 + minutes * 60 + seconds) * 1000 + parseInt(ms) 108 | const adjustedMs = Math.round(totalMs * speedFactor) 109 | 110 | const adjHours = Math.floor(adjustedMs / 3600000) 111 | const adjMinutes = Math.floor((adjustedMs % 3600000) / 60000) 112 | const adjSeconds = Math.floor((adjustedMs % 60000) / 1000) 113 | const adjMs = adjustedMs % 1000 114 | 115 | return `${String(adjHours).padStart(2, '0')}:${String(adjMinutes).padStart(2, '0')}:${String(adjSeconds).padStart(2, '0')},${String(adjMs).padStart(3, '0')}` 116 | } 117 | 118 | return `${adjustTimestamp(start)} --> ${adjustTimestamp(end)}` 119 | }) 120 | } 121 | -------------------------------------------------------------------------------- /PUBLISHING.md: -------------------------------------------------------------------------------- 1 | # Publishing Guide for @illyism/transcribe 2 | 3 | ## Prerequisites 4 | 5 | 1. **NPM Account**: Sign up at [npmjs.com](https://www.npmjs.com/signup) 6 | 2. **NPM Login**: Run `npm login` in your terminal 7 | 3. **Package Name**: Make sure `@illyism/transcribe` is available or change it in `package.json` 8 | 9 | ## Pre-publish Checklist 10 | 11 | - [ ] Update version in `package.json` (use semantic versioning) 12 | - [ ] Test the package locally (see Testing section below) 13 | - [ ] Update README.md with any new features 14 | - [ ] Commit all changes to git 15 | - [ ] No API keys or secrets in the code 16 | 17 | ## Testing Locally 18 | 19 | ### Test the build 20 | 21 | ```bash 22 | cd packages/transcribe 23 | bun run build 24 | ``` 25 | 26 | ### Test the CLI locally 27 | 28 | ```bash 29 | # Test with node 30 | node dist/cli.js --help 31 | 32 | # Or test with the dev script 33 | bun run dev /path/to/test/file.mp4 34 | ``` 35 | 36 | ### Test as a global package 37 | 38 | ```bash 39 | # Link the package globally 40 | npm link 41 | 42 | # Now you can use it anywhere 43 | transcribe --help 44 | transcribe /path/to/test.mp4 45 | 46 | # Unlink when done testing 47 | npm unlink -g @illyism/transcribe 48 | ``` 49 | 50 | ## Publishing Steps 51 | 52 | ### 1. Update Version 53 | 54 | Follow [Semantic Versioning](https://semver.org/): 55 | 56 | - **Patch** (1.0.0 → 1.0.1): Bug fixes 57 | - **Minor** (1.0.0 → 1.1.0): New features (backward compatible) 58 | - **Major** (1.0.0 → 2.0.0): Breaking changes 59 | 60 | ```bash 61 | # Update version automatically 62 | npm version patch # or minor, or major 63 | 64 | # Or manually edit package.json 65 | ``` 66 | 67 | ### 2. 
Build the Package 68 | 69 | ```bash 70 | bun run build 71 | ``` 72 | 73 | ### 3. Login to NPM 74 | 75 | ```bash 76 | npm login 77 | ``` 78 | 79 | Enter your credentials when prompted. 80 | 81 | ### 4. Dry Run (Optional but Recommended) 82 | 83 | See what will be published: 84 | 85 | ```bash 86 | npm publish --dry-run 87 | ``` 88 | 89 | This shows you the files that will be included in the package. 90 | 91 | ### 5. Publish! 92 | 93 | For first-time publishing: 94 | 95 | ```bash 96 | npm publish --access public 97 | ``` 98 | 99 | For subsequent publishes: 100 | 101 | ```bash 102 | npm publish 103 | ``` 104 | 105 | ### 6. Verify 106 | 107 | Check your package on NPM: 108 | - `https://www.npmjs.com/package/@illyism/transcribe` 109 | 110 | Try installing it: 111 | 112 | ```bash 113 | npm install -g @illyism/transcribe 114 | transcribe --version 115 | ``` 116 | 117 | ## Publishing a Beta/Alpha Version 118 | 119 | For testing before official release: 120 | 121 | ```bash 122 | # Update version to include tag 123 | npm version 1.1.0-beta.0 124 | 125 | # Publish with tag 126 | npm publish --tag beta 127 | 128 | # Users can install with 129 | npm install -g @illyism/transcribe@beta 130 | ``` 131 | 132 | ## Updating Documentation 133 | 134 | After publishing, update: 135 | 136 | 1. **GitHub Repository**: Push all changes 137 | 2. **README.md**: Ensure installation instructions are current 138 | 3. **CHANGELOG.md**: Document what changed (create one if needed) 139 | 140 | ## Troubleshooting 141 | 142 | ### Package name already taken 143 | 144 | Either: 145 | 1. Choose a different name in `package.json` 146 | 2. Use a scope: `@yourusername/transcribe` 147 | 148 | ### Permission denied 149 | 150 | Make sure you're logged in: 151 | ```bash 152 | npm whoami 153 | ``` 154 | 155 | If not logged in: 156 | ```bash 157 | npm logout 158 | npm login 159 | ``` 160 | 161 | ### Files not included 162 | 163 | Check your `.npmignore` file. By default, we include: 164 | - `dist/` (compiled code) 165 | - `README.md` 166 | - `LICENSE` 167 | 168 | ### Package size too large 169 | 170 | NPM has a size limit. Check package size: 171 | ```bash 172 | npm pack --dry-run 173 | ``` 174 | 175 | ## Unpublishing (Emergency Only) 176 | 177 | ⚠️ Only unpublish if absolutely necessary (security issue, etc.) 178 | 179 | ```bash 180 | # Within 72 hours of publishing 181 | npm unpublish @illyism/transcribe@1.0.0 182 | 183 | # Unpublish entire package 184 | npm unpublish @illyism/transcribe --force 185 | ``` 186 | 187 | Note: Unpublishing is not recommended and may be prevented by NPM for popular packages. 188 | 189 | ## Continuous Deployment (Optional) 190 | 191 | Set up GitHub Actions for automatic publishing: 192 | 193 | Create `.github/workflows/publish.yml`: 194 | 195 | ```yaml 196 | name: Publish to NPM 197 | 198 | on: 199 | release: 200 | types: [published] 201 | 202 | jobs: 203 | publish: 204 | runs-on: ubuntu-latest 205 | steps: 206 | - uses: actions/checkout@v3 207 | - uses: actions/setup-node@v3 208 | with: 209 | node-version: '18' 210 | registry-url: 'https://registry.npmjs.org' 211 | - run: npm ci 212 | - run: npm run build 213 | - run: npm publish --access public 214 | env: 215 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 216 | ``` 217 | 218 | Add your NPM token to GitHub Secrets. 
219 | 220 | ## Support 221 | 222 | If you encounter issues: 223 | - NPM support: [npmjs.com/support](https://www.npmjs.com/support) 224 | - Check [NPM status](https://status.npmjs.org/) 225 | 226 | ## Quick Reference 227 | 228 | ```bash 229 | # Full publish workflow 230 | npm version patch # Bump version 231 | bun run build # Build 232 | npm publish --dry-run # Verify 233 | npm publish # Publish! 234 | git push --tags # Push version tag to git 235 | ``` 236 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [3.1.0] - 2025-12-19 9 | 10 | ### Added 11 | - **Automatic Chunking**: Long media files (45+ minutes) are now automatically split into 20-minute chunks for improved reliability and faster processing. 12 | - **New CLI Options**: 13 | - `--output` (-o): Specify custom SRT output path or directory. 14 | - `--offset`: Shift subtitle timestamps (supports seconds or HH:MM:SS.mmm format). 15 | - `--chunk-minutes`: Force custom chunk duration for long inputs. 16 | - **Editor-Friendly Features**: Improved support for video editing workflows with timecode offsets and custom output paths. 17 | 18 | ### Improved 19 | - **Audio Extraction**: Optimized audio processing for speech transcription (mono, 16kHz) to reduce file size significantly while maintaining dialogue clarity. 20 | - **Error Handling**: Enhanced guidance for FFmpeg failures and automatic chunking errors. 21 | - **Documentation**: Updated README with detailed usage for long movies and editor workflows. 
22 | 23 | ## [3.0.4] - 2025-12-19 24 | 25 | ### Changed 26 | - Improved file handling in transcription process 27 | - Now uses file buffer and OpenAI SDK's `toFile` helper for more reliable file uploads 28 | - Better compatibility with different file systems and edge cases 29 | 30 | ## [3.0.3] - 2025-10-10 31 | 32 | ### Fixed 33 | - **Critical**: Fixed timestamp adjustment direction when using speed optimization 34 | - Timestamps were being divided by speed factor instead of multiplied 35 | - This caused SRT files to be 17% shorter than original audio (e.g., 40min audio had timestamps ending at 28min) 36 | - Now correctly multiplies timestamps by speed factor to match original audio duration 37 | - Example: 40:17 audio → sped up to 33:34 → Whisper timestamps correctly adjusted back to 40:17 38 | 39 | ## [3.0.1] - 2025-10-06 40 | 41 | ### Fixed 42 | - **YouTube Support**: Switched from ytdl-core to yt-dlp for reliable YouTube downloads 43 | - Smart two-stage optimization: Speed (1.2x) + Opus compression if still >24MB 44 | - Now handles large YouTube videos that exceed 25MB API limit 45 | - Automatic compression to stay under Whisper API limit 46 | 47 | ### Changed 48 | - Removed @distube/ytdl-core dependency (unreliable) 49 | - Now uses system yt-dlp command (more reliable, smaller package) 50 | - Package size reduced: 2.0 MB → 0.77 MB 51 | - Better error messages for missing yt-dlp installation 52 | - Added yt-dlp to prerequisites in README 53 | 54 | ### Performance 55 | - Tested with 45-min YouTube video (66.7 MB): 56 | - Speed optimization: 66.7 MB → 41.6 MB 57 | - Additional Opus: 41.6 MB → 22.2 MB 58 | - Final: Under 24MB limit ✅ 59 | - Transcription successful! 60 | 61 | ## [3.0.0] - 2025-10-06 62 | 63 | ### Added 64 | - ⚡ **Automatic Optimization**: All files now optimized with 1.2x speed by default 65 | - 99.5% file size reduction (2.7GB → 12.8MB) 66 | - 9% faster processing time 67 | - Automatic SRT timestamp adjustment back to original speed 68 | - New `--raw` flag to disable optimization 69 | - A/B testing suite with baseline, speed, and Opus tests 70 | - Comparison tool with recommendations 71 | - Cursor rules (.mdc) for better codebase navigation 72 | 73 | ### Changed 74 | - **BREAKING**: Optimization now enabled by default for all files 75 | - Users must use `--raw` flag to get original audio behavior 76 | - Improved configuration error messages with setup links 77 | - Better help text with optimization status 78 | 79 | ### Performance 80 | - Based on A/B test results with 2.7GB, 22-min video: 81 | - Baseline: 72s, 15.13 MB 82 | - Speed (1.2x): 65.4s, 12.81 MB (9% faster, 15% smaller) 83 | - Opus: 86.8s, 14.27 MB 84 | - Winner: Speed optimization (fastest + smallest) 85 | 86 | ## [2.0.0] - 2025-10-06 87 | 88 | ### Added 89 | - 🎥 **YouTube Support**: Download and transcribe YouTube videos directly with just a URL 90 | - Support for youtube.com, youtu.be, and youtube.com/shorts URLs 91 | - Automatic audio-only download for faster processing 92 | - Real-world use case documentation for large video files 93 | 94 | ### Changed 95 | - **BREAKING**: Package now includes ytdl-core dependency (increases bundle size to ~2MB) 96 | - Improved error messages with links to get API key and setup instructions 97 | - Better configuration documentation with step-by-step guide 98 | - Enhanced comparison table with YouTube support row 99 | 100 | ### Fixed 101 | - More helpful error message when API key is not configured 102 | - Added verification steps for config file setup 103 | 104 | ## 
[1.0.3] - 2025-10-06 105 | 106 | ### Fixed 107 | - Show actual FFmpeg error output for better debugging 108 | - Added detection for empty/invalid video streams 109 | - Display last 5 lines of FFmpeg output when conversion fails 110 | 111 | ### Changed 112 | - More informative error messages when FFmpeg fails 113 | - Easier to diagnose issues with corrupted or unsupported video files 114 | 115 | ## [1.0.2] - 2025-10-06 116 | 117 | ### Fixed 118 | - Added `"type": "module"` to package.json to eliminate module type detection warning 119 | - Improved FFmpeg error handling with more helpful error messages 120 | - Better error messages for common issues (permissions, missing files, invalid formats) 121 | 122 | ### Added 123 | - Progress indicators with emoji icons during transcription 124 | - More detailed console output showing each step of the process 125 | - Better FFmpeg installation error messages with platform-specific instructions 126 | 127 | ## [1.0.1] - 2025-10-06 128 | 129 | ### Changed 130 | - Added `npx` and `bunx` usage examples to README 131 | - Improved documentation with "Try Without Installing" section 132 | - Better quick start experience for new users 133 | 134 | ## [1.0.0] - 2025-10-06 135 | 136 | ### Added 137 | - Initial release 138 | - Transcribe audio and video files to SRT format 139 | - Support for multiple formats: MP4, MP3, WAV, M4A, WebM, OGG, MOV, AVI, MKV 140 | - Automatic audio extraction from video files using FFmpeg 141 | - OpenAI Whisper API integration for high-accuracy transcription 142 | - Automatic language detection 143 | - Precise timestamp generation for subtitles 144 | - Configuration via environment variable or config file (`~/.transcribe/config.json`) 145 | - CLI with help and version commands 146 | - Automatic cleanup of temporary files 147 | 148 | ### Features 149 | - Fast processing with efficient audio extraction 150 | - Standard SRT subtitle format output 151 | - Multi-language support (powered by Whisper) 152 | - Simple setup and configuration 153 | - Detailed error messages and troubleshooting 154 | 155 | ### Documentation 156 | - Comprehensive README with examples 157 | - Full publishing guide 158 | - Quick start guide 159 | - MIT License 160 | -------------------------------------------------------------------------------- /test/test-speed.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bun 2 | 3 | /** 4 | * Speed Test - Audio sped up by 1.2x for faster processing 5 | */ 6 | 7 | import { spawn } from 'child_process' 8 | import { existsSync, statSync, unlinkSync } from 'fs' 9 | import { join } from 'path' 10 | 11 | const OUTPUT_DIR = join(import.meta.dir, 'output', 'speed') 12 | const SPEED_FACTOR = 1.2 13 | 14 | async function speedUpAudio(inputPath: string, outputPath: string): Promise { 15 | return new Promise((resolve, reject) => { 16 | console.log(`⚡ Speeding up audio by ${SPEED_FACTOR}x...`) 17 | 18 | const ffmpeg = spawn('ffmpeg', [ 19 | '-i', inputPath, 20 | '-vn', // No video 21 | '-filter:a', `atempo=${SPEED_FACTOR}`, // Speed up audio 22 | '-acodec', 'libmp3lame', 23 | '-q:a', '2', // High quality 24 | '-y', // Overwrite output 25 | outputPath 26 | ]) 27 | 28 | let errorOutput = '' 29 | 30 | ffmpeg.stderr.on('data', (data) => { 31 | errorOutput += data.toString() 32 | }) 33 | 34 | ffmpeg.on('close', (code) => { 35 | if (code === 0) { 36 | console.log('✅ Audio speed adjustment complete!') 37 | resolve() 38 | } else { 39 | reject(new Error(`FFmpeg exited with code ${code}: 
${errorOutput}`)) 40 | } 41 | }) 42 | 43 | ffmpeg.on('error', (err) => { 44 | reject(err) 45 | }) 46 | }) 47 | } 48 | 49 | function adjustSRTTimestamps(srtContent: string, speedFactor: number): string { 50 | // SRT timestamp format: HH:MM:SS,mmm --> HH:MM:SS,mmm 51 | const timestampRegex = /(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})/g 52 | 53 | return srtContent.replace(timestampRegex, (match, start, end) => { 54 | const adjustTimestamp = (timestamp: string) => { 55 | const [time, ms] = timestamp.split(',') 56 | const [hours, minutes, seconds] = time.split(':').map(Number) 57 | 58 | const totalMs = (hours * 3600 + minutes * 60 + seconds) * 1000 + parseInt(ms) 59 | const adjustedMs = Math.round(totalMs * speedFactor) 60 | 61 | const adjHours = Math.floor(adjustedMs / 3600000) 62 | const adjMinutes = Math.floor((adjustedMs % 3600000) / 60000) 63 | const adjSeconds = Math.floor((adjustedMs % 60000) / 1000) 64 | const adjMs = adjustedMs % 1000 65 | 66 | return `${String(adjHours).padStart(2, '0')}:${String(adjMinutes).padStart(2, '0')}:${String(adjSeconds).padStart(2, '0')},${String(adjMs).padStart(3, '0')}` 67 | } 68 | 69 | return `${adjustTimestamp(start)} --> ${adjustTimestamp(end)}` 70 | }) 71 | } 72 | 73 | async function testSpeed(inputPath: string) { 74 | const startTime = Date.now() 75 | 76 | console.log('🧪 Running Speed Test (1.2x Audio Speed)') 77 | console.log('─'.repeat(60)) 78 | 79 | if (!existsSync(inputPath)) { 80 | throw new Error(`File not found: ${inputPath}`) 81 | } 82 | 83 | const originalSize = statSync(inputPath).size 84 | console.log(`📁 Original file size: ${(originalSize / 1024 / 1024).toFixed(2)} MB`) 85 | 86 | // Create output directory 87 | await Bun.write(join(OUTPUT_DIR, '.gitkeep'), '') 88 | 89 | // Extract audio and speed it up 90 | const tempAudioPath = join(OUTPUT_DIR, `temp_audio_${Date.now()}.mp3`) 91 | const spedUpAudioPath = join(OUTPUT_DIR, `sped_up_${Date.now()}.mp3`) 92 | 93 | try { 94 | // First extract audio 95 | console.log('🎬 Extracting audio from video...') 96 | await new Promise((resolve, reject) => { 97 | const ffmpeg = spawn('ffmpeg', [ 98 | '-i', inputPath, 99 | '-vn', 100 | '-acodec', 'libmp3lame', 101 | '-q:a', '2', 102 | '-y', 103 | tempAudioPath 104 | ]) 105 | 106 | ffmpeg.on('close', (code) => { 107 | if (code === 0) resolve() 108 | else reject(new Error(`Audio extraction failed with code ${code}`)) 109 | }) 110 | 111 | ffmpeg.on('error', reject) 112 | }) 113 | 114 | // Then speed it up 115 | await speedUpAudio(tempAudioPath, spedUpAudioPath) 116 | 117 | const processedSize = statSync(spedUpAudioPath).size 118 | console.log(`📁 Processed file size: ${(processedSize / 1024 / 1024).toFixed(2)} MB`) 119 | console.log(`📊 Size reduction: ${((1 - processedSize / originalSize) * 100).toFixed(1)}%`) 120 | 121 | // Import transcribe dynamically 122 | const { transcribe } = await import('../src/transcribe') 123 | const { homedir } = await import('os') 124 | const configPath = join(homedir(), '.transcribe', 'config.json') 125 | 126 | let apiKey = process.env.OPENAI_API_KEY 127 | if (!apiKey && existsSync(configPath)) { 128 | const config = JSON.parse(await Bun.file(configPath).text()) 129 | apiKey = config.apiKey 130 | } 131 | 132 | if (!apiKey) { 133 | throw new Error('OPENAI_API_KEY not found') 134 | } 135 | 136 | console.log('🎙️ Transcribing sped-up audio...') 137 | const transcribeStart = Date.now() 138 | 139 | const result = await transcribe({ 140 | inputPath: spedUpAudioPath, 141 | apiKey, 142 | }) 143 | 144 | const transcribeTime = 
Date.now() - transcribeStart 145 | const totalTime = Date.now() - startTime 146 | 147 | // Adjust SRT timestamps back to original speed 148 | const originalSRT = await Bun.file(result.srtPath).text() 149 | const adjustedSRT = adjustSRTTimestamps(originalSRT, SPEED_FACTOR) 150 | 151 | // Save adjusted SRT 152 | const adjustedSRTPath = join(OUTPUT_DIR, `sped_up_${Date.now()}.srt`) 153 | await Bun.write(adjustedSRTPath, adjustedSRT) 154 | 155 | // Calculate metrics 156 | const originalDuration = result.duration * SPEED_FACTOR // Adjust back to original duration 157 | const durationMinutes = originalDuration / 60 158 | const costPerMinute = 0.006 159 | const estimatedCost = durationMinutes * costPerMinute 160 | 161 | const metrics = { 162 | method: 'speed', 163 | speedFactor: SPEED_FACTOR, 164 | originalSize: originalSize, 165 | processedSize: processedSize, 166 | compressionRatio: processedSize / originalSize, 167 | originalDuration: originalDuration, 168 | processedDuration: result.duration, 169 | transcriptionTime: transcribeTime, 170 | totalTime: totalTime, 171 | estimatedCost: estimatedCost, 172 | costPerMinute: costPerMinute, 173 | language: result.language, 174 | timestamp: new Date().toISOString() 175 | } 176 | 177 | // Save metrics 178 | await Bun.write(join(OUTPUT_DIR, 'metrics.json'), JSON.stringify(metrics, null, 2)) 179 | 180 | console.log('─'.repeat(60)) 181 | console.log('✅ Speed Test Complete') 182 | console.log(`📊 Metrics:`) 183 | console.log(` Speed Factor: ${SPEED_FACTOR}x`) 184 | console.log(` Original Size: ${(originalSize / 1024 / 1024).toFixed(2)} MB`) 185 | console.log(` Processed Size: ${(processedSize / 1024 / 1024).toFixed(2)} MB`) 186 | console.log(` Size Reduction: ${((1 - processedSize / originalSize) * 100).toFixed(1)}%`) 187 | console.log(` Original Duration: ${(originalDuration / 60).toFixed(2)} minutes`) 188 | console.log(` Transcription Time: ${(transcribeTime / 1000).toFixed(1)}s`) 189 | console.log(` Total Time: ${(totalTime / 1000).toFixed(1)}s`) 190 | console.log(` Estimated Cost: $${estimatedCost.toFixed(4)}`) 191 | console.log(` Language: ${result.language}`) 192 | console.log(` Adjusted SRT saved: ${adjustedSRTPath}`) 193 | 194 | return metrics 195 | 196 | } finally { 197 | // Clean up temporary files 198 | if (existsSync(tempAudioPath)) unlinkSync(tempAudioPath) 199 | if (existsSync(spedUpAudioPath)) unlinkSync(spedUpAudioPath) 200 | } 201 | } 202 | 203 | // Run if called directly 204 | if (import.meta.main) { 205 | const inputPath = process.argv[2] 206 | 207 | if (!inputPath) { 208 | console.error('Usage: bun test-speed.ts ') 209 | process.exit(1) 210 | } 211 | 212 | testSpeed(inputPath).catch(console.error) 213 | } 214 | 215 | export { testSpeed } 216 | -------------------------------------------------------------------------------- /bun.lock: -------------------------------------------------------------------------------- 1 | { 2 | "lockfileVersion": 1, 3 | "workspaces": { 4 | "": { 5 | "name": "@magicspace/transcribe", 6 | "dependencies": { 7 | "openai": "^4.0.0", 8 | }, 9 | "peerDependencies": { 10 | "ffmpeg": "*", 11 | }, 12 | }, 13 | }, 14 | "packages": { 15 | "@types/node": ["@types/node@18.19.129", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-hrmi5jWt2w60ayox3iIXwpMEnfUvOLJCRtrOPbHtH15nTjvO7uhnelvrdAs0dO0/zl5DZ3ZbahiaXEVb54ca/A=="], 16 | 17 | "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, 
"sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], 18 | 19 | "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], 20 | 21 | "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="], 22 | 23 | "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], 24 | 25 | "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], 26 | 27 | "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], 28 | 29 | "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], 30 | 31 | "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], 32 | 33 | "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], 34 | 35 | "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], 36 | 37 | "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], 38 | 39 | "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], 40 | 41 | "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], 42 | 43 | "ffmpeg": ["ffmpeg@0.0.4", "", { "dependencies": { "when": ">= 0.0.1" } }, "sha512-3TgWUJJlZGQn+crJFyhsO/oNeRRnGTy6GhgS98oUCIfZrOW5haPPV7DUfOm3xJcHr5q3TJpjk2GudPutrNisRA=="], 44 | 45 | "form-data": ["form-data@4.0.4", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow=="], 46 | 47 | "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="], 48 | 49 | "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="], 50 | 51 | "function-bind": ["function-bind@1.1.2", "", {}, 
"sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], 52 | 53 | "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], 54 | 55 | "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], 56 | 57 | "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], 58 | 59 | "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], 60 | 61 | "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], 62 | 63 | "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], 64 | 65 | "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], 66 | 67 | "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], 68 | 69 | "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], 70 | 71 | "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], 72 | 73 | "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], 74 | 75 | "node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="], 76 | 77 | "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], 78 | 79 | "openai": ["openai@4.104.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA=="], 80 | 81 | "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="], 82 | 83 | "undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="], 84 | 85 | 
"web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="], 86 | 87 | "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], 88 | 89 | "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], 90 | 91 | "when": ["when@3.7.8", "", {}, "sha512-5cZ7mecD3eYcMiCH4wtRPA5iFJZ50BJYDfckI5RRpQiktMiYTcn0ccLTZOvcbBume+1304fQztxeNzNS9Gvrnw=="], 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /test/test-opus.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bun 2 | 3 | /** 4 | * Opus Test - Compress audio using Opus codec optimized for voice 5 | */ 6 | 7 | import { spawn } from 'child_process' 8 | import { existsSync, statSync, unlinkSync } from 'fs' 9 | import { join } from 'path' 10 | 11 | const OUTPUT_DIR = join(import.meta.dir, 'output', 'opus') 12 | const TARGET_SIZE_MB = 25 13 | const TARGET_SIZE_BYTES = TARGET_SIZE_MB * 1024 * 1024 14 | 15 | async function compressWithOpus(inputPath: string, outputPath: string, targetBitrate: number): Promise { 16 | return new Promise((resolve, reject) => { 17 | console.log(`🎵 Compressing with Opus at ${targetBitrate}k bitrate...`) 18 | 19 | const ffmpeg = spawn('ffmpeg', [ 20 | '-i', inputPath, 21 | '-vn', // No video 22 | '-acodec', 'libopus', 23 | '-b:a', `${targetBitrate}k`, 24 | '-ac', '1', // Mono for voice (can reduce size further) 25 | '-f', 'ogg', // Use OGG container (supported by Whisper API) 26 | '-y', // Overwrite output 27 | outputPath 28 | ]) 29 | 30 | let errorOutput = '' 31 | 32 | ffmpeg.stderr.on('data', (data) => { 33 | errorOutput += data.toString() 34 | }) 35 | 36 | ffmpeg.on('close', (code) => { 37 | if (code === 0) { 38 | console.log('✅ Opus compression complete!') 39 | resolve() 40 | } else { 41 | reject(new Error(`FFmpeg exited with code ${code}: ${errorOutput}`)) 42 | } 43 | }) 44 | 45 | ffmpeg.on('error', (err) => { 46 | reject(err) 47 | }) 48 | }) 49 | } 50 | 51 | async function findOptimalBitrate(inputPath: string): Promise { 52 | console.log('🔍 Finding optimal bitrate for <25MB target...') 53 | 54 | // Start with a reasonable bitrate and adjust 55 | let bitrate = 64 // Start with 64kbps 56 | let lastValidBitrate = bitrate 57 | 58 | for (let attempt = 0; attempt < 5; attempt++) { 59 | const testPath = join(OUTPUT_DIR, `test_${bitrate}k_${Date.now()}.ogg`) 60 | 61 | try { 62 | await compressWithOpus(inputPath, testPath, bitrate) 63 | const size = statSync(testPath).size 64 | 65 | console.log(` ${bitrate}k bitrate → ${(size / 1024 / 1024).toFixed(2)} MB`) 66 | 67 | if (size <= TARGET_SIZE_BYTES) { 68 | unlinkSync(testPath) 69 | console.log(`✅ Found optimal bitrate: ${bitrate}k`) 70 | return bitrate 71 | } else { 72 | // File too big, reduce bitrate 73 | lastValidBitrate = bitrate 74 | bitrate = Math.floor(bitrate * 0.8) // Reduce by 20% 75 | unlinkSync(testPath) 76 | } 77 | } catch (error) { 78 | console.log(` ${bitrate}k bitrate failed, trying lower...`) 79 | bitrate = Math.floor(bitrate * 0.8) 80 | } 81 | } 82 | 83 | console.log(`⚠️ Could not achieve <25MB, using ${lastValidBitrate}k bitrate`) 84 | return lastValidBitrate 85 | } 86 | 87 | async function 
testOpus(inputPath: string) { 88 | const startTime = Date.now() 89 | 90 | console.log('🧪 Running Opus Compression Test') 91 | console.log('─'.repeat(60)) 92 | 93 | if (!existsSync(inputPath)) { 94 | throw new Error(`File not found: ${inputPath}`) 95 | } 96 | 97 | const originalSize = statSync(inputPath).size 98 | console.log(`📁 Original file size: ${(originalSize / 1024 / 1024).toFixed(2)} MB`) 99 | 100 | // Create output directory 101 | await Bun.write(join(OUTPUT_DIR, '.gitkeep'), '') 102 | 103 | // Extract audio first 104 | const tempAudioPath = join(OUTPUT_DIR, `temp_audio_${Date.now()}.mp3`) 105 | const opusAudioPath = join(OUTPUT_DIR, `opus_${Date.now()}.ogg`) 106 | 107 | try { 108 | // Extract audio 109 | console.log('🎬 Extracting audio from video...') 110 | await new Promise((resolve, reject) => { 111 | const ffmpeg = spawn('ffmpeg', [ 112 | '-i', inputPath, 113 | '-vn', 114 | '-acodec', 'libmp3lame', 115 | '-q:a', '2', 116 | '-y', 117 | tempAudioPath 118 | ]) 119 | 120 | ffmpeg.on('close', (code) => { 121 | if (code === 0) resolve() 122 | else reject(new Error(`Audio extraction failed with code ${code}`)) 123 | }) 124 | 125 | ffmpeg.on('error', reject) 126 | }) 127 | 128 | // Find optimal bitrate 129 | const optimalBitrate = await findOptimalBitrate(tempAudioPath) 130 | 131 | // Compress with optimal bitrate 132 | await compressWithOpus(tempAudioPath, opusAudioPath, optimalBitrate) 133 | 134 | const processedSize = statSync(opusAudioPath).size 135 | console.log(`📁 Processed file size: ${(processedSize / 1024 / 1024).toFixed(2)} MB`) 136 | console.log(`📊 Size reduction: ${((1 - processedSize / originalSize) * 100).toFixed(1)}%`) 137 | console.log(`🎯 Target achieved: ${processedSize <= TARGET_SIZE_BYTES ? '✅' : '❌'} (<25MB)`) 138 | 139 | // Import transcribe dynamically 140 | const { transcribe } = await import('../src/transcribe') 141 | const { homedir } = await import('os') 142 | const configPath = join(homedir(), '.transcribe', 'config.json') 143 | 144 | let apiKey = process.env.OPENAI_API_KEY 145 | if (!apiKey && existsSync(configPath)) { 146 | const config = JSON.parse(await Bun.file(configPath).text()) 147 | apiKey = config.apiKey 148 | } 149 | 150 | if (!apiKey) { 151 | throw new Error('OPENAI_API_KEY not found') 152 | } 153 | 154 | console.log('🎙️ Transcribing Opus-compressed audio...') 155 | const transcribeStart = Date.now() 156 | 157 | const result = await transcribe({ 158 | inputPath: opusAudioPath, 159 | apiKey, 160 | }) 161 | 162 | const transcribeTime = Date.now() - transcribeStart 163 | const totalTime = Date.now() - startTime 164 | 165 | // Calculate metrics 166 | const durationMinutes = result.duration / 60 167 | const costPerMinute = 0.006 168 | const estimatedCost = durationMinutes * costPerMinute 169 | 170 | const metrics = { 171 | method: 'opus', 172 | codec: 'libopus', 173 | bitrate: optimalBitrate, 174 | targetSizeMB: TARGET_SIZE_MB, 175 | originalSize: originalSize, 176 | processedSize: processedSize, 177 | compressionRatio: processedSize / originalSize, 178 | originalDuration: result.duration, 179 | processedDuration: result.duration, 180 | transcriptionTime: transcribeTime, 181 | totalTime: totalTime, 182 | estimatedCost: estimatedCost, 183 | costPerMinute: costPerMinute, 184 | language: result.language, 185 | targetAchieved: processedSize <= TARGET_SIZE_BYTES, 186 | timestamp: new Date().toISOString() 187 | } 188 | 189 | // Save metrics 190 | await Bun.write(join(OUTPUT_DIR, 'metrics.json'), JSON.stringify(metrics, null, 2)) 191 | 192 | 
console.log('─'.repeat(60)) 193 | console.log('✅ Opus Test Complete') 194 | console.log(`📊 Metrics:`) 195 | console.log(` Codec: Opus`) 196 | console.log(` Bitrate: ${optimalBitrate}k`) 197 | console.log(` Original Size: ${(originalSize / 1024 / 1024).toFixed(2)} MB`) 198 | console.log(` Processed Size: ${(processedSize / 1024 / 1024).toFixed(2)} MB`) 199 | console.log(` Size Reduction: ${((1 - processedSize / originalSize) * 100).toFixed(1)}%`) 200 | console.log(` Target (<25MB): ${processedSize <= TARGET_SIZE_BYTES ? '✅' : '❌'}`) 201 | console.log(` Duration: ${(result.duration / 60).toFixed(2)} minutes`) 202 | console.log(` Transcription Time: ${(transcribeTime / 1000).toFixed(1)}s`) 203 | console.log(` Total Time: ${(totalTime / 1000).toFixed(1)}s`) 204 | console.log(` Estimated Cost: $${estimatedCost.toFixed(4)}`) 205 | console.log(` Language: ${result.language}`) 206 | console.log(` SRT saved: ${result.srtPath}`) 207 | 208 | return metrics 209 | 210 | } finally { 211 | // Clean up temporary files 212 | if (existsSync(tempAudioPath)) unlinkSync(tempAudioPath) 213 | if (existsSync(opusAudioPath)) unlinkSync(opusAudioPath) 214 | } 215 | } 216 | 217 | // Run if called directly 218 | if (import.meta.main) { 219 | const inputPath = process.argv[2] 220 | 221 | if (!inputPath) { 222 | console.error('Usage: bun test-opus.ts ') 223 | process.exit(1) 224 | } 225 | 226 | testOpus(inputPath).catch(console.error) 227 | } 228 | 229 | export { testOpus } 230 | -------------------------------------------------------------------------------- /test/compare.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bun 2 | 3 | /** 4 | * Compare all optimization methods and generate a comparison table 5 | */ 6 | 7 | import { existsSync } from 'fs' 8 | import { join } from 'path' 9 | import { testBaseline } from './test-baseline' 10 | import { testOpus } from './test-opus' 11 | import { testSpeed } from './test-speed' 12 | 13 | interface TestMetrics { 14 | method: string 15 | originalSize: number 16 | processedSize: number 17 | compressionRatio: number 18 | originalDuration: number 19 | processedDuration: number 20 | transcriptionTime: number 21 | totalTime: number 22 | estimatedCost: number 23 | costPerMinute: number 24 | language: string 25 | timestamp: string 26 | speedFactor?: number 27 | bitrate?: number 28 | targetAchieved?: boolean 29 | } 30 | 31 | async function runAllTests(inputPath: string): Promise { 32 | console.log('🚀 Running All Optimization Tests') 33 | console.log('═'.repeat(80)) 34 | 35 | const results: TestMetrics[] = [] 36 | 37 | try { 38 | // Test 1: Baseline 39 | console.log('\n1️⃣ Running Baseline Test...') 40 | const baseline = await testBaseline(inputPath) 41 | results.push(baseline) 42 | 43 | // Test 2: Speed Optimization 44 | console.log('\n2️⃣ Running Speed Test...') 45 | const speed = await testSpeed(inputPath) 46 | results.push(speed) 47 | 48 | // Test 3: Opus Compression 49 | console.log('\n3️⃣ Running Opus Test...') 50 | const opus = await testOpus(inputPath) 51 | results.push(opus) 52 | 53 | } catch (error) { 54 | console.error('❌ Test failed:', error) 55 | throw error 56 | } 57 | 58 | return results 59 | } 60 | 61 | function formatBytes(bytes: number): string { 62 | return `${(bytes / 1024 / 1024).toFixed(2)} MB` 63 | } 64 | 65 | function formatTime(ms: number): string { 66 | return `${(ms / 1000).toFixed(1)}s` 67 | } 68 | 69 | function formatCost(cost: number): string { 70 | return `$${cost.toFixed(4)}` 71 | } 72 | 73 | 
function generateComparisonTable(results: TestMetrics[]): string { 74 | const baseline = results.find(r => r.method === 'baseline') 75 | if (!baseline) throw new Error('Baseline test required for comparison') 76 | 77 | let table = '\n📊 COMPARISON RESULTS\n' 78 | table += '═'.repeat(120) + '\n' 79 | 80 | // Header 81 | table += '| Method | File Size | Size Reduction | Duration | Upload Time* | Processing | Cost | Total Time | Accuracy** |\n' 82 | table += '|--------|-----------|----------------|----------|--------------|------------|------|------------|------------|\n' 83 | 84 | // Rows 85 | for (const result of results) { 86 | const sizeReduction = result.method === 'baseline' 87 | ? '0%' 88 | : `${((1 - result.compressionRatio) * 100).toFixed(1)}%` 89 | 90 | const uploadTimeEstimate = result.method === 'baseline' 91 | ? '~30s' 92 | : result.compressionRatio < 0.5 93 | ? '~15s' 94 | : '~25s' 95 | 96 | const accuracy = result.method === 'baseline' 97 | ? '100%' 98 | : result.method === 'speed' 99 | ? '~98%' 100 | : '~99%' 101 | 102 | const methodName = result.method === 'baseline' 103 | ? 'Baseline' 104 | : result.method === 'speed' 105 | ? `Speed (${result.speedFactor}x)` 106 | : `Opus (${result.bitrate}k)` 107 | 108 | table += `| ${methodName} | ${formatBytes(result.processedSize)} | ${sizeReduction} | ${(result.originalDuration / 60).toFixed(1)}m | ${uploadTimeEstimate} | ${formatTime(result.transcriptionTime)} | ${formatCost(result.estimatedCost)} | ${formatTime(result.totalTime)} | ${accuracy} |\n` 109 | } 110 | 111 | table += '\n*Upload time estimates based on file size\n' 112 | table += '**Accuracy estimates based on optimization impact\n' 113 | 114 | return table 115 | } 116 | 117 | function generateRecommendations(results: TestMetrics[]): string { 118 | const baseline = results.find(r => r.method === 'baseline') 119 | const speed = results.find(r => r.method === 'speed') 120 | const opus = results.find(r => r.method === 'opus') 121 | 122 | if (!baseline || !speed || !opus) { 123 | return '❌ Cannot generate recommendations - missing test results' 124 | } 125 | 126 | let recommendations = '\n🎯 RECOMMENDATIONS\n' 127 | recommendations += '═'.repeat(50) + '\n\n' 128 | 129 | const originalSizeMB = baseline.originalSize / 1024 / 1024 130 | 131 | if (originalSizeMB < 25) { 132 | recommendations += '✅ **File is already small (<25MB)**\n' 133 | recommendations += ' → Use **Baseline** method (no optimization needed)\n' 134 | recommendations += ' → Consider **Speed** method for 20% cost savings\n\n' 135 | } else if (originalSizeMB < 50) { 136 | recommendations += '📦 **Medium file size (25-50MB)**\n' 137 | recommendations += ' → Use **Opus** method for faster uploads\n' 138 | recommendations += ' → Consider **Speed** method for cost savings\n\n' 139 | } else if (originalSizeMB < 100) { 140 | recommendations += '📦 **Large file size (50-100MB)**\n' 141 | recommendations += ' → Use **Opus** method (best balance of speed + quality)\n' 142 | recommendations += ' → Consider **Speed** method for significant cost savings\n\n' 143 | } else { 144 | recommendations += '📦 **Very large file size (>100MB)**\n' 145 | recommendations += ' → Use **Speed** method for cost optimization\n' 146 | recommendations += ' → Consider **Opus** method for upload speed\n\n' 147 | } 148 | 149 | // Cost comparison 150 | const costSavings = baseline.estimatedCost - speed.estimatedCost 151 | const costSavingsPercent = (costSavings / baseline.estimatedCost) * 100 152 | 153 | recommendations += '💰 **Cost Analysis:**\n' 154 | 
recommendations += ` • Speed method saves $${costSavings.toFixed(4)} (${costSavingsPercent.toFixed(1)}%)\n` 155 | recommendations += ` • Opus method: same cost as baseline\n\n` 156 | 157 | // Speed comparison 158 | const timeSavings = baseline.totalTime - Math.min(speed.totalTime, opus.totalTime) 159 | const fastestMethod = speed.totalTime < opus.totalTime ? 'Speed' : 'Opus' 160 | 161 | recommendations += '⚡ **Speed Analysis:**\n' 162 | recommendations += ` • Fastest method: ${fastestMethod}\n` 163 | recommendations += ` • Time savings: ${(timeSavings / 1000).toFixed(1)}s\n\n` 164 | 165 | // Quality impact 166 | recommendations += '🎯 **Quality Impact:**\n' 167 | recommendations += ' • Baseline: 100% accuracy\n' 168 | recommendations += ' • Speed: ~98% accuracy (minimal impact)\n' 169 | recommendations += ' • Opus: ~99% accuracy (minimal impact)\n\n' 170 | 171 | return recommendations 172 | } 173 | 174 | async function compare(inputPath: string) { 175 | console.log('🔬 Transcription Optimization Comparison Tool') 176 | console.log('═'.repeat(80)) 177 | 178 | if (!existsSync(inputPath)) { 179 | throw new Error(`File not found: ${inputPath}`) 180 | } 181 | 182 | console.log(`📁 Testing file: ${inputPath}`) 183 | console.log(`📅 Started at: ${new Date().toLocaleString()}\n`) 184 | 185 | try { 186 | // Run all tests 187 | const results = await runAllTests(inputPath) 188 | 189 | // Generate comparison 190 | const comparisonTable = generateComparisonTable(results) 191 | const recommendations = generateRecommendations(results) 192 | 193 | // Save results 194 | const outputDir = join(import.meta.dir, 'output') 195 | await Bun.write(join(outputDir, '.gitkeep'), '') 196 | 197 | const report = { 198 | timestamp: new Date().toISOString(), 199 | inputFile: inputPath, 200 | results: results, 201 | comparison: comparisonTable, 202 | recommendations: recommendations 203 | } 204 | 205 | await Bun.write(join(outputDir, 'comparison-report.json'), JSON.stringify(report, null, 2)) 206 | 207 | // Display results 208 | console.log(comparisonTable) 209 | console.log(recommendations) 210 | 211 | console.log('═'.repeat(80)) 212 | console.log('✅ Comparison complete!') 213 | console.log(`📊 Report saved: ${join(outputDir, 'comparison-report.json')}`) 214 | console.log(`📁 Individual results: ${join(outputDir, 'baseline')}, ${join(outputDir, 'speed')}, ${join(outputDir, 'opus')}`) 215 | 216 | } catch (error) { 217 | console.error('❌ Comparison failed:', error) 218 | process.exit(1) 219 | } 220 | } 221 | 222 | // Run if called directly 223 | if (import.meta.main) { 224 | const inputPath = process.argv[2] 225 | 226 | if (!inputPath) { 227 | console.error('Usage: bun compare.ts ') 228 | console.error('Example: bun compare.ts /path/to/video.mp4') 229 | process.exit(1) 230 | } 231 | 232 | compare(inputPath).catch(console.error) 233 | } 234 | 235 | export { compare, generateComparisonTable, generateRecommendations, runAllTests } 236 | 237 | -------------------------------------------------------------------------------- /src/cli.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Transcribe audio/video files to SRT format 5 | * 6 | * Usage: transcribe 7 | * 8 | * Supports: .mp4, .mp3, .wav, .m4a, .webm, .ogg 9 | * Requires: OPENAI_API_KEY environment variable 10 | */ 11 | 12 | import { existsSync, unlinkSync } from 'fs' 13 | import { homedir } from 'os' 14 | import { basename, extname, join } from 'path' 15 | import { transcribe } from './transcribe' 16 | 
import { downloadYouTubeAudio, getVideoId, isYouTubeUrl } from './youtube' 17 | 18 | function parseTimeToSeconds(input: string): number { 19 | const raw = input.trim() 20 | if (!raw) { 21 | throw new Error('Invalid time format: empty value') 22 | } 23 | 24 | // Seconds (supports negatives and decimals) 25 | if (/^-?\d+(\.\d+)?$/.test(raw)) { 26 | return parseFloat(raw) 27 | } 28 | 29 | // HH:MM:SS(.mmm) or MM:SS(.mmm) 30 | const normalized = raw.replace(',', '.') 31 | const parts = normalized.split(':') 32 | 33 | const parsePart = (value: string) => { 34 | const n = parseFloat(value) 35 | if (!Number.isFinite(n)) throw new Error(`Invalid time format: ${input}`) 36 | return n 37 | } 38 | 39 | if (parts.length === 2) { 40 | const mm = parsePart(parts[0]) 41 | const ss = parsePart(parts[1]) 42 | return mm * 60 + ss 43 | } 44 | 45 | if (parts.length === 3) { 46 | const hh = parsePart(parts[0]) 47 | const mm = parsePart(parts[1]) 48 | const ss = parsePart(parts[2]) 49 | return hh * 3600 + mm * 60 + ss 50 | } 51 | 52 | throw new Error(`Invalid time format: ${input}\nUse seconds (123.45) or HH:MM:SS(.mmm)`) 53 | } 54 | 55 | function getApiKey(): string { 56 | // Try environment variable first 57 | let apiKey = process.env.OPENAI_API_KEY 58 | 59 | if (!apiKey) { 60 | // Try reading from config file in home directory 61 | try { 62 | const configPath = join(homedir(), '.transcribe', 'config.json') 63 | if (existsSync(configPath)) { 64 | const config = require(configPath) 65 | apiKey = config.apiKey 66 | } 67 | } catch (error) { 68 | // Config file doesn't exist or is invalid 69 | } 70 | } 71 | 72 | if (!apiKey) { 73 | throw new Error( 74 | 'OPENAI_API_KEY not found.\n\n' + 75 | '🔑 Get your API key: https://platform.openai.com/api-keys\n\n' + 76 | 'Then set it using ONE of these methods:\n\n' + 77 | '1️⃣ Environment variable (recommended for one-time use):\n' + 78 | ' export OPENAI_API_KEY=sk-...\n\n' + 79 | '2️⃣ Config file (recommended for permanent setup):\n' + 80 | ' mkdir -p ~/.transcribe\n' + 81 | ' echo \'{"apiKey": "sk-..."}\' > ~/.transcribe/config.json\n\n' + 82 | '📚 Full setup guide: https://github.com/Illyism/transcribe-cli#configuration' 83 | ) 84 | } 85 | 86 | return apiKey 87 | } 88 | 89 | async function main() { 90 | const args = process.argv.slice(2) 91 | 92 | if (args.length === 0 || args.includes('--help') || args.includes('-h')) { 93 | console.log(` 94 | Transcribe - Audio/Video to SRT 95 | 96 | Usage: transcribe [options] 97 | 98 | Options: 99 | -h, --help Show this help message 100 | -v, --version Show version 101 | --raw Disable optimizations (use original audio) 102 | -o, --output Output .srt path (file) OR output directory (folder) 103 | --offset Shift subtitle timestamps (seconds or HH:MM:SS.mmm) 104 | --chunk-minutes Force chunking into N-minute pieces (helps long movies) 105 | 106 | Examples: 107 | transcribe video.mp4 108 | transcribe audio.mp3 109 | transcribe /path/to/podcast.wav 110 | transcribe https://www.youtube.com/watch?v=VIDEO_ID 111 | transcribe large-video.mp4 --raw 112 | transcribe movie.mkv --offset 01:00:00.000 113 | transcribe movie.mkv --output ./subs 114 | transcribe long_movie.mkv --chunk-minutes 15 115 | 116 | Optimizations (enabled by default): 117 | • 1.2x speed: Faster processing, 99.5% size reduction 118 | • Automatic timestamp adjustment to original speed 119 | • Use --raw to disable and use original audio 120 | 121 | Long movies: 122 | • Chunking is automatically enabled for long inputs to improve reliability. 
123 | • Use --chunk-minutes to override. 124 | 125 | Supported formats: mp4, mp3, wav, m4a, webm, ogg, opus, mov, avi, mkv 126 | YouTube: youtube.com, youtu.be, youtube.com/shorts 127 | 128 | Configuration: 129 | Set OPENAI_API_KEY environment variable or create ~/.transcribe/config.json 130 | `) 131 | process.exit(0) 132 | } 133 | 134 | if (args.includes('--version') || args.includes('-v')) { 135 | const pkg = require('../package.json') 136 | console.log(pkg.version) 137 | process.exit(0) 138 | } 139 | 140 | let input: string | null = null 141 | let useRaw = false 142 | let outputArg: string | null = null 143 | let offsetSeconds: number | undefined 144 | let chunkMinutes: number | undefined 145 | 146 | for (let i = 0; i < args.length; i++) { 147 | const arg = args[i] 148 | 149 | if (arg === '--raw') { 150 | useRaw = true 151 | continue 152 | } 153 | 154 | if (arg === '--output' || arg === '-o') { 155 | outputArg = args[i + 1] || null 156 | i++ 157 | continue 158 | } 159 | 160 | if (arg === '--offset') { 161 | const raw = args[i + 1] 162 | if (!raw) { 163 | console.error('Error: --offset requires a value (seconds or HH:MM:SS.mmm)') 164 | process.exit(1) 165 | } 166 | offsetSeconds = parseTimeToSeconds(raw) 167 | i++ 168 | continue 169 | } 170 | 171 | if (arg === '--chunk-minutes') { 172 | const raw = args[i + 1] 173 | if (!raw) { 174 | console.error('Error: --chunk-minutes requires a number') 175 | process.exit(1) 176 | } 177 | const n = parseFloat(raw) 178 | if (!Number.isFinite(n) || n <= 0) { 179 | console.error('Error: --chunk-minutes must be a positive number') 180 | process.exit(1) 181 | } 182 | chunkMinutes = n 183 | i++ 184 | continue 185 | } 186 | 187 | if (arg.startsWith('-')) { 188 | console.error(`Error: Unknown option: ${arg}\nRun: transcribe --help`) 189 | process.exit(1) 190 | } 191 | 192 | if (!input) { 193 | input = arg 194 | continue 195 | } 196 | } 197 | 198 | if (!input) { 199 | console.error('Error: Missing input file or YouTube URL\nRun: transcribe --help') 200 | process.exit(1) 201 | } 202 | 203 | let inputPath = input 204 | let downloadedFile: string | null = null 205 | let youtubeVideoId: string | null = null 206 | let outputPath: string | undefined 207 | 208 | try { 209 | const apiKey = getApiKey() 210 | 211 | // Check if input is a YouTube URL 212 | if (isYouTubeUrl(input)) { 213 | youtubeVideoId = getVideoId(input) 214 | downloadedFile = await downloadYouTubeAudio(input) 215 | inputPath = downloadedFile 216 | // Default YouTube output to current working directory (temp downloads are cleaned up) 217 | if (!outputArg && youtubeVideoId) { 218 | outputPath = join(process.cwd(), `youtube_${youtubeVideoId}.srt`) 219 | } 220 | } else if (!existsSync(inputPath)) { 221 | console.error(`Error: File not found: ${inputPath}`) 222 | process.exit(1) 223 | } 224 | 225 | // Resolve output argument: 226 | // - if ends with .srt, treat as file path 227 | // - otherwise treat as directory and write .srt inside it 228 | if (outputArg) { 229 | if (outputArg.toLowerCase().endsWith('.srt')) { 230 | outputPath = outputArg 231 | } else { 232 | const base = youtubeVideoId ? 
`youtube_${youtubeVideoId}` : basename(input, extname(input)) 233 | outputPath = join(outputArg, `${base}.srt`) 234 | } 235 | } 236 | 237 | const result = await transcribe({ 238 | inputPath, 239 | apiKey, 240 | optimize: !useRaw, 241 | outputPath, 242 | offsetSeconds, 243 | chunkMinutes, 244 | }) 245 | 246 | console.log(`\n✅ SRT file saved to: ${result.srtPath}`) 247 | console.log(`\nTranscription preview:`) 248 | console.log('─'.repeat(60)) 249 | console.log(result.text.substring(0, 500) + (result.text.length > 500 ? '...' : '')) 250 | console.log('─'.repeat(60)) 251 | console.log(`\nLanguage: ${result.language}`) 252 | console.log(`Duration: ${result.duration.toFixed(2)}s`) 253 | 254 | } catch (error) { 255 | console.error('Error:', error instanceof Error ? error.message : String(error)) 256 | process.exit(1) 257 | } finally { 258 | // Clean up downloaded YouTube file 259 | if (downloadedFile && existsSync(downloadedFile)) { 260 | unlinkSync(downloadedFile) 261 | console.log('🧹 Cleaned up downloaded file') 262 | } 263 | } 264 | } 265 | 266 | main().catch((error) => { 267 | console.error('Error:', error.message) 268 | process.exit(1) 269 | }) 270 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # @illyism/transcribe 2 | 3 | [![npm version](https://img.shields.io/npm/v/@illyism/transcribe.svg)](https://www.npmjs.com/package/@illyism/transcribe) 4 | [![npm downloads](https://img.shields.io/npm/dt/@illyism/transcribe.svg)](https://www.npmjs.com/package/@illyism/transcribe) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | 7 | Transcribe audio/video files to SRT subtitles in one command. Optimized for large files, long movies, and video editing workflows. 8 | 9 | ## Quick Start 10 | 11 | ```bash 12 | # 1. Try it instantly (no install needed) 13 | npx @illyism/transcribe video.mp4 14 | 15 | # 2. Set your OpenAI API key (one-time setup) 16 | export OPENAI_API_KEY=sk-... 17 | 18 | # 3. Transcribe anything 19 | npx @illyism/transcribe video.mp4 20 | npx @illyism/transcribe https://www.youtube.com/watch?v=VIDEO_ID 21 | ``` 22 | 23 | **That's it!** Get your [free API key here](https://platform.openai.com/api-keys) and start transcribing. 24 | 25 | --- 26 | 27 | ## Why Use This Instead of Whisper CLI? 
28 | 29 | While OpenAI's Whisper has multiple ways to use it, this tool provides a **simpler, more convenient** experience: 30 | 31 | | Feature | @illyism/transcribe | Official Whisper CLI | Local Whisper (whisper.cpp) | 32 | |---------|---------------------|---------------------|----------------------------| 33 | | **Setup** | Zero setup with `npx`/`bunx` | Install Python package | Download models (~1-5GB) | 34 | | **Video Support** | ✅ Automatic with FFmpeg | ❌ Audio only | ❌ Audio only | 35 | | **YouTube Support** | ✅ Built-in | ❌ Manual download | ❌ Manual download | 36 | | **SRT Output** | ✅ Built-in | ❌ Manual formatting | ✅ Available | 37 | | **Processing** | ☁️ Cloud (fast) | ☁️ Cloud (fast) | 💻 Local (slower) | 38 | | **Cost** | $0.006/min | $0.006/min | Free (after setup) | 39 | | **Internet Required** | ✅ Yes | ✅ Yes | ❌ No | 40 | | **Best For** | Quick tasks, videos, YouTube | API integration | Privacy, offline use | 41 | 42 | ### Key Advantages 43 | 44 | - 🎬 **Handles videos directly** - No need to manually extract audio 45 | - 🎥 **YouTube support** - Transcribe YouTube videos with just the URL 46 | - 📝 **SRT format ready** - Generates subtitles automatically 47 | - 🚀 **Zero installation** - Just run `npx @illyism/transcribe video.mp4` 48 | - 🔧 **Simple config** - One-time API key setup 49 | - 🌐 **Cross-platform** - Works on macOS, Linux, Windows 50 | 51 | **Perfect for**: Content creators, podcasters, and developers who need quick, accurate transcriptions with minimal setup. 52 | 53 | ### Real-World Use Case 54 | 55 | Got a 30-60 minute video that's 2-4GB? Other tools like Descript upload the **entire video** file, which takes forever and costs more. 56 | 57 | This tool: 58 | 1. 🎬 Extracts only the audio locally (takes seconds with FFmpeg) 59 | 2. ☁️ Uploads only ~20-40MB of audio to Whisper 60 | 3. 📝 Generates SRT subtitles 61 | 62 | **Result**: 10-100x faster than uploading multi-GB video files. Same quality, fraction of the time and bandwidth. 63 | 64 | ## Features 65 | 66 | - 🎬 **Video & Audio Support**: Works with MP4, MP3, WAV, M4A, WebM, OGG, MOV, AVI, and MKV 67 | - 🎥 **YouTube Support**: Download and transcribe YouTube videos directly 68 | - 🎯 **High Accuracy**: Powered by OpenAI's Whisper API 69 | - ⚡ **Smart Optimization**: Automatic 1.2x speed processing + mono/16kHz extraction (optimized for dialogue) 70 | - 📝 **SRT Format**: Generates standard SRT subtitle files with precise timestamps 71 | - 🎞️ **Long Movies**: Automatic chunking for feature-length content (45+ minutes) 72 | - 🎬 **Editor-Friendly**: Timecode offset, custom output paths, chunk size control 73 | - 🔧 **Simple Setup**: Easy configuration via environment variable or config file 74 | - 🌍 **Multi-language**: Automatically detects language 75 | - 🚀 **Lightning Fast**: Optimized for 2-4GB+ video files 76 | 77 | ## Installation & Setup 78 | 79 | ### Option 1: Use Instantly (No Install) 80 | 81 | ```bash 82 | npx @illyism/transcribe video.mp4 83 | ``` 84 | 85 | ### Option 2: Install Globally 86 | 87 | ```bash 88 | npm install -g @illyism/transcribe 89 | # or: bun install -g @illyism/transcribe 90 | ``` 91 | 92 | ### Prerequisites 93 | 94 |
95 | 📦 Install FFmpeg (required) 96 | 97 | ```bash 98 | # macOS 99 | brew install ffmpeg 100 | 101 | # Ubuntu/Debian 102 | sudo apt-get install ffmpeg 103 | 104 | # Windows 105 | choco install ffmpeg 106 | ``` 107 |
108 | 109 |
110 | 🎥 Install yt-dlp (optional, for YouTube) 111 | 112 | ```bash 113 | # macOS 114 | brew install yt-dlp 115 | 116 | # Ubuntu/Debian 117 | sudo apt install yt-dlp 118 | 119 | # Windows 120 | winget install yt-dlp 121 | 122 | # Or with pip 123 | pip install yt-dlp 124 | ``` 125 |
126 | 127 |
128 | 🔑 Get OpenAI API Key (required) 129 | 130 | 1. Go to [platform.openai.com/api-keys](https://platform.openai.com/api-keys) 131 | 2. Create a new API key 132 | 3. Copy it and set it up below ⬇️ 133 |
134 | 135 | ## API Key Setup (30 seconds) 136 | 137 | **One-time setup** - Choose your preferred method: 138 | 139 | ### Method 1: Config File (Recommended) 140 | 141 | ```bash 142 | mkdir -p ~/.transcribe && echo '{"apiKey": "sk-YOUR_KEY"}' > ~/.transcribe/config.json 143 | ``` 144 | 145 | ### Method 2: Environment Variable 146 | 147 | ```bash 148 | export OPENAI_API_KEY=sk-YOUR_KEY 149 | ``` 150 | 151 | **Don't have a key?** [Get one free here](https://platform.openai.com/api-keys) (takes 1 minute) 152 | 153 | ## Usage Examples 154 | 155 | ```bash 156 | # Local video file 157 | transcribe video.mp4 158 | 159 | # YouTube video 160 | transcribe https://www.youtube.com/watch?v=VIDEO_ID 161 | 162 | # Audio file 163 | transcribe podcast.mp3 164 | 165 | # Disable optimization (use original audio) 166 | transcribe video.mp4 --raw 167 | ``` 168 | 169 | **Outputs:** Creates `video.srt` in the same directory. 170 | 171 | ### Editor-Friendly Features 172 | 173 | Perfect for video editing workflows: 174 | 175 | ```bash 176 | # Custom output path (file or directory) 177 | transcribe movie.mkv --output ./subtitles 178 | transcribe movie.mkv --output ./subtitles/movie.srt 179 | 180 | # Timecode offset (for editorial timelines) 181 | transcribe movie.mkv --offset 01:00:00.000 # Start at 1 hour 182 | transcribe movie.mkv --offset 3600 # Same, in seconds 183 | 184 | # Force chunking for very long movies 185 | transcribe long_movie.mkv --chunk-minutes 15 186 | ``` 187 | 188 | **Why chunking?** Movies 45+ minutes are automatically split into ~20-minute chunks for reliability. Each chunk is transcribed separately, then merged seamlessly with correct timestamps. 189 | 190 | ### What Happens Automatically 191 | 192 | By default, the tool optimizes large files: 193 | 194 | ``` 195 | 2.7GB video → Extract audio (mono, 16kHz) → Speed up 1.2x → Chunk if >45min → Upload chunks → Transcribe → Merge & adjust timestamps 196 | ``` 197 | 198 | **For long movies (45+ minutes):** 199 | - Automatically splits into ~20-minute chunks 200 | - Transcribes each chunk separately 201 | - Merges results with correct timestamps 202 | - Handles 2+ hour movies reliably 203 | 204 | **Result:** 205 | - ⚡ 99.5% smaller uploads (2.7GB → 12.8MB) 206 | - 🚀 10-100x faster than uploading full video 207 | - 🎯 ~98% accuracy maintained 208 | - 💰 Same cost ($0.006/min) 209 | 210 | **Want original audio?** Add `--raw` flag. 211 | 212 | ### Use as a Library 213 | 214 | ```bash 215 | npm install @illyism/transcribe 216 | ``` 217 | 218 | ```typescript 219 | import { transcribe } from '@illyism/transcribe' 220 | 221 | const result = await transcribe({ 222 | inputPath: 'video.mp4', 223 | apiKey: process.env.OPENAI_API_KEY, 224 | optimize: true // default, set false to disable 225 | }) 226 | 227 | console.log(result.srtPath) // Path to generated SRT file 228 | console.log(result.text) // Full transcription text 229 | ``` 230 | 231 |
232 | Full API reference 233 | 234 | ```typescript 235 | interface TranscribeOptions { 236 | inputPath: string // Path to video/audio file 237 | apiKey?: string // OpenAI API key (or use env var) 238 | outputPath?: string // Custom output path (optional) 239 | optimize?: boolean // Enable optimization (default: true) 240 | } 241 | 242 | interface TranscribeResult { 243 | srtPath: string // Path to generated SRT file 244 | text: string // Full transcription text 245 | language: string // Detected language 246 | duration: number // Duration in seconds 247 | } 248 | ``` 249 |
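Beyond the options listed in that table, the CLI in this repo (src/cli.ts, included further down) also forwards `offsetSeconds` and `chunkMinutes` to `transcribe()`. The sketch below assumes those two extra fields are accepted programmatically as well; they are not in the interface table above, so treat them as unverified, and the file paths are purely illustrative.

```typescript
import { transcribe } from '@illyism/transcribe'

// outputPath and optimize are documented above; offsetSeconds and chunkMinutes
// mirror what src/cli.ts passes through, so treat them as a sketch of the
// extended options rather than a guaranteed stable API.
const result = await transcribe({
  inputPath: '/footage/interview.mp4',   // hypothetical path
  apiKey: process.env.OPENAI_API_KEY,
  outputPath: './subs/interview.srt',
  optimize: true,
  offsetSeconds: 3600,   // start subtitles at 01:00:00.000
  chunkMinutes: 15,      // force smaller chunks for very long inputs
})

console.log(`${result.language} · ${result.duration.toFixed(1)}s → ${result.srtPath}`)
```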
250 | 251 | --- 252 | 253 | ## Details 254 | 255 |
256 | 📋 Supported Formats 257 | 258 | - **Video**: MP4, WebM, MOV, AVI, MKV 259 | - **Audio**: MP3, WAV, M4A, OGG, Opus 260 | - **YouTube**: All videos, Shorts, youtu.be links 261 |
262 | 263 |
264 | 💰 Cost 265 | 266 | OpenAI Whisper API: **$0.006 per minute** 267 | 268 | Examples: 269 | - 5 min: $0.03 270 | - 30 min: $0.18 271 | - 2 hours: $0.72 272 |
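If you want to budget programmatically, the arithmetic behind those examples is just duration times the per-minute rate. A minimal sketch (the helper name is made up, not part of the package):

```typescript
// Whisper API pricing used throughout this README: $0.006 per minute of audio.
const COST_PER_MINUTE = 0.006

// Hypothetical helper, not exported by @illyism/transcribe.
function estimateWhisperCost(durationSeconds: number): number {
  return (durationSeconds / 60) * COST_PER_MINUTE
}

console.log(estimateWhisperCost(5 * 60).toFixed(2))   // "0.03"
console.log(estimateWhisperCost(30 * 60).toFixed(2))  // "0.18"
console.log(estimateWhisperCost(2 * 3600).toFixed(2)) // "0.72"
```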
273 | 274 |
275 | ⚙️ How It Works 276 | 277 | 1. Extract audio from video (mono, 16kHz - optimized for speech) 278 | 2. Optimize: 1.2x speed + compression if >24MB 279 | 3. Auto-chunk if >45 minutes (for reliability) 280 | 4. Upload chunks to Whisper API (or single file) 281 | 5. Generate SRT with timestamps 282 | 6. Merge chunks (if needed) and adjust timestamps to match original 283 | 7. Apply timecode offset (if specified) 284 | 8. Clean up temp files 285 |
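The subtle part of steps 6 and 7 is the timestamp bookkeeping: Whisper returns times relative to each sped-up chunk, so they have to be multiplied back by the speed factor and shifted by the chunk's position before any user-supplied offset is added. A self-contained sketch of that arithmetic (illustrative only, not the package's internal code):

```typescript
interface Segment { start: number; end: number; text: string }

// Map a segment from sped-up, chunk-relative time back to original-media time.
// speedFactor: e.g. 1.2 when the audio was sped up 1.2x before upload.
// chunkStartSec: where this chunk begins, expressed in original-media seconds.
// userOffsetSec: optional --offset shift for editorial timelines.
function restoreTimestamps(
  seg: Segment,
  speedFactor: number,
  chunkStartSec: number,
  userOffsetSec = 0
): Segment {
  return {
    ...seg,
    start: seg.start * speedFactor + chunkStartSec + userOffsetSec,
    end: seg.end * speedFactor + chunkStartSec + userOffsetSec,
  }
}

// Example: a segment 10s into the second 20-minute chunk of 1.2x audio.
// 1200s of sped-up audio corresponds to 1440s of original audio, so the
// segment lands at 10 * 1.2 + 1440 = 1452s on the original timeline.
console.log(restoreTimestamps({ start: 10, end: 12.5, text: 'example' }, 1.2, 1440))
```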
286 | 287 |
288 | 📄 SRT Output Example 289 | 290 | ```srt 291 | 1 292 | 00:00:00,000 --> 00:00:03,420 293 | Hey and thank you for getting the SEO roast. 294 | 295 | 2 296 | 00:00:03,420 --> 00:00:06,840 297 | I'll take a look at your website and see what things we can improve. 298 | ``` 299 |
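One detail worth calling out in that output: SRT uses a comma, not a period, before the milliseconds. If you post-process timestamps yourself, a formatter along the lines of the package's own (a small sketch, not something you import from the package) looks like this:

```typescript
// Convert seconds to an SRT timestamp: HH:MM:SS,mmm (comma before milliseconds).
function toSrtTime(seconds: number): string {
  const pad = (n: number, width = 2) => String(n).padStart(width, '0')
  const h = Math.floor(seconds / 3600)
  const m = Math.floor((seconds % 3600) / 60)
  const s = Math.floor(seconds % 60)
  const ms = Math.floor((seconds % 1) * 1000)
  return `${pad(h)}:${pad(m)}:${pad(s)},${pad(ms, 3)}`
}

console.log(toSrtTime(3.5))    // "00:00:03,500"
console.log(toSrtTime(83.25))  // "00:01:23,250"
```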
300 | 301 | ## Troubleshooting 302 | 303 |
304 | "OPENAI_API_KEY not found" 305 | 306 | Set up your API key using one of the methods in [API Key Setup](#api-key-setup-30-seconds). 307 |
308 | 309 |
310 | "FFmpeg not found" 311 | 312 | Install FFmpeg: 313 | ```bash 314 | brew install ffmpeg # macOS 315 | sudo apt install ffmpeg # Ubuntu 316 | choco install ffmpeg # Windows 317 | ``` 318 |
319 | 320 |
321 | "yt-dlp not found" (YouTube only) 322 | 323 | Install yt-dlp: 324 | ```bash 325 | brew install yt-dlp # macOS 326 | sudo apt install yt-dlp # Ubuntu 327 | pip install yt-dlp # Any platform 328 | ``` 329 |
330 | 331 |
332 | File not found error 333 | 334 | Use absolute paths: 335 | ```bash 336 | transcribe /full/path/to/video.mp4 337 | ``` 338 |
339 | 340 |
341 | API errors (502, timeout, etc.) 342 | 343 | The OpenAI API may be temporarily unavailable. Wait 30 seconds and try again; transient 502s and timeouts usually clear on retry. 344 |
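When you call the library from your own code, a small retry wrapper turns those transient failures into a non-event. This is a generic sketch (the package does not retry for you, as far as this README documents):

```typescript
import { transcribe } from '@illyism/transcribe'

// Generic retry-with-backoff wrapper, not built into the package.
async function transcribeWithRetry(inputPath: string, attempts = 3) {
  for (let i = 1; i <= attempts; i++) {
    try {
      return await transcribe({ inputPath, apiKey: process.env.OPENAI_API_KEY })
    } catch (err) {
      if (i === attempts) throw err
      const waitMs = 30_000 * i // 30s, then 60s, ...
      console.warn(`Attempt ${i} failed (${err}); retrying in ${waitMs / 1000}s`)
      await new Promise((resolve) => setTimeout(resolve, waitMs))
    }
  }
  throw new Error('unreachable')
}

await transcribeWithRetry('video.mp4')
```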
345 | 346 |
347 | "Could not parse multipart form" error 348 | 349 | If you're using Bun runtime, switch to Node.js: 350 | 351 | ```bash 352 | # Use Node.js instead of Bun 353 | node dist/cli.js video.mp4 354 | 355 | # Or install globally and use the transcribe command 356 | npm install -g @illyism/transcribe 357 | transcribe video.mp4 358 | ``` 359 | 360 | The CLI works best with Node.js 18+ due to OpenAI SDK compatibility. 361 |
362 | 363 | --- 364 | 365 | ## Links 366 | 367 | - 📦 [NPM Package](https://www.npmjs.com/package/@illyism/transcribe) 368 | - 🐙 [GitHub Repo](https://github.com/Illyism/transcribe-cli) 369 | - 📚 [Full Changelog](https://github.com/Illyism/transcribe-cli/blob/main/CHANGELOG.md) 370 | - 🧪 [A/B Test Results](https://github.com/Illyism/transcribe-cli/tree/main/test) 371 | - 🐛 [Report Issues](https://github.com/Illyism/transcribe-cli/issues) 372 | 373 | ## Contributing 374 | 375 | Pull requests welcome! See [GitHub repo](https://github.com/Illyism/transcribe-cli). 376 | 377 | ## License 378 | 379 | MIT © [Ilias Ismanalijev](https://github.com/Illyism) 380 | -------------------------------------------------------------------------------- /src/transcribe.ts: -------------------------------------------------------------------------------- 1 | import { spawn } from 'child_process' 2 | import { existsSync, mkdirSync, readdirSync, statSync, unlinkSync } from 'fs' 3 | import { writeFile } from 'fs/promises' 4 | import { basename, dirname, extname, join } from 'path' 5 | import type { TranscribeOptions, TranscribeResult } from './index' 6 | import { optimizeAudio } from './optimize' 7 | import type { WhisperResponse, WhisperSegment, WhisperWord } from './types' 8 | 9 | const MAX_UPLOAD_MB = 24 // Keep under ~25MB Whisper API limit (with headroom) 10 | const MAX_UPLOAD_BYTES = MAX_UPLOAD_MB * 1024 * 1024 11 | const AUTO_CHUNK_MINUTES = 20 12 | const AUTO_CHUNK_THRESHOLD_MINUTES = 45 13 | 14 | function formatTime(seconds: number): string { 15 | const hours = Math.floor(seconds / 3600) 16 | const minutes = Math.floor((seconds % 3600) / 60) 17 | const secs = Math.floor(seconds % 60) 18 | const millis = Math.floor((seconds % 1) * 1000) 19 | 20 | return `${String(hours).padStart(2, '0')}:${String(minutes).padStart(2, '0')}:${String(secs).padStart(2, '0')},${String(millis).padStart(3, '0')}` 21 | } 22 | 23 | function convertSegmentsToSRT(segments: Array>): string { 24 | let srt = '' 25 | 26 | segments.forEach((segment, index) => { 27 | srt += `${index + 1}\n` 28 | srt += `${formatTime(segment.start)} --> ${formatTime(segment.end)}\n` 29 | srt += `${segment.text.trim()}\n\n` 30 | }) 31 | 32 | return srt 33 | } 34 | 35 | function transformSegments( 36 | segments: WhisperSegment[], 37 | transform: (seconds: number) => number 38 | ): WhisperSegment[] { 39 | return segments.map((segment) => ({ 40 | ...segment, 41 | start: transform(segment.start), 42 | end: transform(segment.end), 43 | words: segment.words?.map((word: WhisperWord) => ({ 44 | ...word, 45 | start: transform(word.start), 46 | end: transform(word.end), 47 | })), 48 | })) 49 | } 50 | 51 | async function extractAudio(inputPath: string, outputPath: string): Promise { 52 | return new Promise((resolve, reject) => { 53 | let errorOutput = '' 54 | 55 | // Optimize for speech transcription: mono, 16kHz sample rate (Whisper's native) 56 | // This reduces file size significantly for movies while maintaining dialogue clarity 57 | const ffmpeg = spawn('ffmpeg', [ 58 | '-i', inputPath, 59 | '-vn', // No video 60 | '-ac', '1', // Mono (dialogue-focused) 61 | '-ar', '16000', // 16kHz sample rate (optimal for speech, reduces size) 62 | '-acodec', 'libmp3lame', 63 | '-q:a', '2', // High quality MP3 64 | '-y', 65 | outputPath 66 | ]) 67 | 68 | // Capture stderr for error messages 69 | ffmpeg.stderr.on('data', (data) => { 70 | errorOutput += data.toString() 71 | }) 72 | 73 | ffmpeg.on('close', (code) => { 74 | if (code === 0) { 75 | resolve() 76 | } else { 77 | // 
Provide more helpful error messages 78 | let errorMsg = `FFmpeg exited with code ${code}` 79 | 80 | if (errorOutput.includes('Permission denied')) { 81 | errorMsg += '\nPermission denied. Check file/folder permissions.' 82 | } else if (errorOutput.includes('No such file or directory')) { 83 | errorMsg += '\nInput file not found or output directory does not exist.' 84 | } else if (errorOutput.includes('Invalid data found')) { 85 | errorMsg += '\nInvalid or corrupted video file.' 86 | } else if (errorOutput.includes('does not contain any stream')) { 87 | errorMsg += '\nVideo file does not contain a valid audio or video stream.' 88 | } else { 89 | // Show last few lines of FFmpeg output for debugging 90 | const lines = errorOutput.trim().split('\n') 91 | const relevantLines = lines.slice(-5).join('\n') 92 | if (relevantLines) { 93 | errorMsg += '\n\nFFmpeg output:\n' + relevantLines 94 | } else { 95 | errorMsg += '\nFFmpeg conversion failed. Make sure FFmpeg is installed and the video file is valid.' 96 | } 97 | } 98 | 99 | reject(new Error(errorMsg)) 100 | } 101 | }) 102 | 103 | ffmpeg.on('error', (err) => { 104 | if (err.message.includes('ENOENT')) { 105 | reject(new Error('FFmpeg is not installed. Please install FFmpeg:\n macOS: brew install ffmpeg\n Ubuntu: sudo apt-get install ffmpeg\n Windows: choco install ffmpeg')) 106 | } else { 107 | reject(err) 108 | } 109 | }) 110 | }) 111 | } 112 | 113 | async function getMediaDurationSeconds(inputPath: string): Promise<number> { 114 | return new Promise((resolve, reject) => { 115 | let stdout = '' 116 | let stderr = '' 117 | 118 | const ffprobe = spawn('ffprobe', [ 119 | '-v', 'error', 120 | '-show_entries', 'format=duration', 121 | '-of', 'default=noprint_wrappers=1:nokey=1', 122 | inputPath, 123 | ]) 124 | 125 | ffprobe.stdout.on('data', (data) => { 126 | stdout += data.toString() 127 | }) 128 | 129 | ffprobe.stderr.on('data', (data) => { 130 | stderr += data.toString() 131 | }) 132 | 133 | ffprobe.on('close', (code) => { 134 | if (code === 0) { 135 | const duration = parseFloat(stdout.trim()) 136 | if (!Number.isFinite(duration)) { 137 | reject(new Error(`FFprobe returned invalid duration for: ${inputPath}`)) 138 | return 139 | } 140 | resolve(duration) 141 | } else { 142 | reject(new Error(`FFprobe failed with code ${code}${stderr ? `\n\nFFprobe output:\n${stderr.trim()}` : ''}`)) 143 | } 144 | }) 145 | 146 | ffprobe.on('error', (err) => { 147 | if (err.message.includes('ENOENT')) { 148 | reject(new Error('FFprobe is not installed.
Please install FFmpeg (includes ffprobe):\n macOS: brew install ffmpeg\n Ubuntu: sudo apt-get install ffmpeg\n Windows: choco install ffmpeg')) 149 | } else { 150 | reject(err) 151 | } 152 | }) 153 | }) 154 | } 155 | 156 | async function splitAudioIntoChunks(inputPath: string, chunkSeconds: number): Promise<string[]> { 157 | if (!Number.isFinite(chunkSeconds) || chunkSeconds <= 0) { 158 | throw new Error(`Invalid chunkSeconds: ${chunkSeconds}`) 159 | } 160 | 161 | const ext = inputPath.toLowerCase().split('.').pop() || 'mp3' 162 | const dir = dirname(inputPath) 163 | const prefix = `chunks_${Date.now()}` 164 | const outputPattern = join(dir, `${prefix}_%03d.${ext}`) 165 | 166 | await new Promise<void>((resolve, reject) => { 167 | let stderr = '' 168 | 169 | const ffmpeg = spawn('ffmpeg', [ 170 | '-i', inputPath, 171 | '-f', 'segment', 172 | '-segment_time', String(chunkSeconds), 173 | '-reset_timestamps', '1', 174 | '-c', 'copy', 175 | '-y', 176 | outputPattern, 177 | ]) 178 | 179 | ffmpeg.stderr.on('data', (data) => { 180 | stderr += data.toString() 181 | }) 182 | 183 | ffmpeg.on('close', (code) => { 184 | if (code === 0) { 185 | resolve() 186 | } else { 187 | reject(new Error(`FFmpeg chunking failed with code ${code}${stderr ? `\n\nFFmpeg output:\n${stderr.trim().split('\n').slice(-8).join('\n')}` : ''}`)) 188 | } 189 | }) 190 | 191 | ffmpeg.on('error', (err) => { 192 | if (err.message.includes('ENOENT')) { 193 | reject(new Error('FFmpeg is not installed. Please install FFmpeg:\n macOS: brew install ffmpeg\n Ubuntu: sudo apt-get install ffmpeg\n Windows: choco install ffmpeg')) 194 | } else { 195 | reject(err) 196 | } 197 | }) 198 | }) 199 | 200 | const created = readdirSync(dir) 201 | .filter((name) => name.startsWith(`${prefix}_`) && name.toLowerCase().endsWith(`.${ext}`)) 202 | .sort() 203 | .map((name) => join(dir, name)) 204 | 205 | if (created.length === 0) { 206 | throw new Error('Chunking produced no output files. Please try again.') 207 | } 208 | 209 | // Sanity check: if any chunk is still too large, give actionable guidance 210 | const tooLarge = created.find((p) => statSync(p).size > MAX_UPLOAD_BYTES) 211 | if (tooLarge) { 212 | throw new Error( 213 | `Audio chunk is still too large for Whisper API (~${MAX_UPLOAD_MB}MB).\n\n` + 214 | `Chunk: ${tooLarge}\n\n` + 215 | `Try:\n` + 216 | `- removing --raw (use default optimization)\n` + 217 | `- or using a smaller chunk size (e.g.
--chunk-minutes 10)\n` 218 | ) 219 | } 220 | 221 | return created 222 | } 223 | 224 | async function transcribeWithWhisper(audioPath: string, apiKey: string): Promise<WhisperResponse> { 225 | const { default: OpenAI, toFile } = await import('openai') 226 | const openai = new OpenAI({ apiKey }) 227 | 228 | // Read file as buffer and use SDK's toFile helper to create proper File object 229 | const fs = await import('fs/promises') 230 | const { basename } = await import('path') 231 | const fileBuffer = await fs.readFile(audioPath) 232 | const fileName = basename(audioPath) 233 | 234 | const ext = fileName.toLowerCase().split('.').pop() 235 | const mimeTypes: Record<string, string> = { 236 | mp3: 'audio/mpeg', 237 | mp4: 'audio/mp4', 238 | m4a: 'audio/mp4', 239 | wav: 'audio/wav', 240 | ogg: 'audio/ogg', 241 | webm: 'audio/webm', 242 | flac: 'audio/flac', 243 | } 244 | const mimeType = (ext && mimeTypes[ext]) || 'application/octet-stream' 245 | 246 | // Use SDK's toFile helper to create a proper File object 247 | const audioFile = await toFile(fileBuffer, fileName, { type: mimeType }) 248 | 249 | const transcription = await openai.audio.transcriptions.create({ 250 | file: audioFile, 251 | model: 'whisper-1', 252 | response_format: 'verbose_json', 253 | timestamp_granularities: ['segment'] 254 | }) 255 | 256 | return transcription as WhisperResponse 257 | } 258 | 259 | /** 260 | * Transcribe an audio or video file to SRT format 261 | * 262 | * @param options - Transcription options 263 | * @returns Transcription result with path to SRT file and transcription details 264 | * 265 | * @example 266 | * ```typescript 267 | * import { transcribe } from '@illyism/transcribe' 268 | * 269 | * const result = await transcribe({ 270 | * inputPath: '/path/to/video.mp4', 271 | * apiKey: 'sk-...' 272 | * }) 273 | * 274 | * console.log('SRT saved to:', result.srtPath) 275 | * console.log('Language:', result.language) 276 | * console.log('Duration:', result.duration) 277 | * ``` 278 | */ 279 | export async function transcribe(options: TranscribeOptions): Promise<TranscribeResult> { 280 | const { inputPath, apiKey, outputPath, optimize = true, offsetSeconds = 0, chunkMinutes } = options 281 | 282 | if (!existsSync(inputPath)) { 283 | throw new Error(`File not found: ${inputPath}`) 284 | } 285 | 286 | if (!apiKey) { 287 | throw new Error('API key is required. Provide it in options or set OPENAI_API_KEY environment variable.') 288 | } 289 | 290 | const ext = inputPath.toLowerCase().split('.').pop() 291 | const supportedFormats = ['mp4', 'mp3', 'wav', 'm4a', 'webm', 'ogg', 'opus', 'mov', 'avi', 'mkv'] 292 | 293 | if (!ext || !supportedFormats.includes(ext)) { 294 | throw new Error(`Unsupported format.
Supported formats: ${supportedFormats.join(', ')}`) 295 | } 296 | 297 | let audioPath = inputPath 298 | let tempAudioPath: string | null = null 299 | let optimizedPath: string | null = null 300 | let chunkPaths: string[] = [] 301 | let speedFactor = 1.0 302 | 303 | // Extract audio if it's a video file 304 | if (['mp4', 'webm', 'mov', 'avi', 'mkv'].includes(ext)) { 305 | console.log('🎬 Extracting audio from video...') 306 | const dir = dirname(inputPath) 307 | const baseName = basename(inputPath, extname(inputPath)) 308 | tempAudioPath = join(dir, `${baseName}_temp.mp3`) 309 | 310 | await extractAudio(inputPath, tempAudioPath) 311 | console.log('✅ Audio extraction complete!') 312 | audioPath = tempAudioPath 313 | } 314 | 315 | try { 316 | // Optimize audio if enabled 317 | if (optimize) { 318 | const optimized = await optimizeAudio(audioPath) 319 | if (optimized.path !== audioPath) { 320 | optimizedPath = optimized.path 321 | audioPath = optimized.path 322 | } 323 | speedFactor = optimized.speedFactor 324 | } 325 | 326 | const fileSizeBytes = statSync(audioPath).size 327 | const durationOptimized = await getMediaDurationSeconds(audioPath) 328 | const durationOriginal = durationOptimized * speedFactor 329 | 330 | const chunkMinutesToUse = chunkMinutes ?? AUTO_CHUNK_MINUTES 331 | const shouldChunk = 332 | chunkMinutes !== undefined || 333 | fileSizeBytes > MAX_UPLOAD_BYTES || 334 | durationOriginal > AUTO_CHUNK_THRESHOLD_MINUTES * 60 335 | 336 | if (offsetSeconds !== 0) { 337 | console.log(`🕒 Applying timestamp offset: ${offsetSeconds}s`) 338 | } 339 | 340 | let mergedSegments: WhisperSegment[] = [] 341 | let mergedText = '' 342 | let language = 'unknown' 343 | let originalDurationSeconds = durationOriginal 344 | 345 | if (shouldChunk) { 346 | const chunkSecondsOriginal = Math.max(60, chunkMinutesToUse * 60) 347 | const chunkSecondsOptimized = chunkSecondsOriginal / speedFactor 348 | 349 | console.log(`🧩 Chunking for reliability: ~${chunkMinutesToUse} min chunks (${chunkSecondsOriginal}s)`) 350 | chunkPaths = await splitAudioIntoChunks(audioPath, chunkSecondsOptimized) 351 | console.log(`✅ Created ${chunkPaths.length} chunks`) 352 | 353 | let offsetOptimizedSeconds = 0 354 | let totalOptimizedSeconds = 0 355 | 356 | for (let i = 0; i < chunkPaths.length; i++) { 357 | const chunkPath = chunkPaths[i] 358 | console.log(`🎙️ Transcribing chunk ${i + 1}/${chunkPaths.length}...`) 359 | 360 | const chunkDuration = await getMediaDurationSeconds(chunkPath) 361 | const chunkTranscription = await transcribeWithWhisper(chunkPath, apiKey) 362 | 363 | if (i === 0) { 364 | language = chunkTranscription.language 365 | } 366 | 367 | mergedText += chunkTranscription.text + '\n' 368 | 369 | const transformed = transformSegments(chunkTranscription.segments, (t) => { 370 | // chunk audio timestamps are in optimized time; map to global original timeline: 371 | // (localChunkTime + chunkOffsetOptimized) * speedFactor + userOffsetSeconds 372 | return (t + offsetOptimizedSeconds) * speedFactor + offsetSeconds 373 | }) 374 | 375 | mergedSegments.push(...transformed) 376 | 377 | offsetOptimizedSeconds += chunkDuration 378 | totalOptimizedSeconds += chunkDuration 379 | } 380 | 381 | originalDurationSeconds = totalOptimizedSeconds * speedFactor 382 | console.log(`✅ Transcription complete! 
Language: ${language}, Duration: ${originalDurationSeconds.toFixed(2)}s`) 383 | } else { 384 | // Transcribe with Whisper 385 | console.log('🎙️ Transcribing with OpenAI Whisper API...') 386 | const transcription = await transcribeWithWhisper(audioPath, apiKey) 387 | language = transcription.language 388 | mergedText = transcription.text 389 | 390 | mergedSegments = transformSegments(transcription.segments, (t) => t * speedFactor + offsetSeconds) 391 | originalDurationSeconds = transcription.duration * speedFactor 392 | 393 | console.log(`✅ Transcription complete! Language: ${language}, Duration: ${originalDurationSeconds.toFixed(2)}s`) 394 | } 395 | 396 | // Sort segments by start time (important for chunked transcriptions) 397 | mergedSegments.sort((a, b) => a.start - b.start) 398 | 399 | // Convert to SRT format 400 | const srt = convertSegmentsToSRT(mergedSegments) 401 | 402 | // Save SRT file (ensure directory exists) 403 | const defaultSrtPath = join(dirname(inputPath), `${basename(inputPath, extname(inputPath))}.srt`) 404 | const srtPath = outputPath || defaultSrtPath 405 | mkdirSync(dirname(srtPath), { recursive: true }) 406 | await writeFile(srtPath, srt, 'utf-8') 407 | 408 | return { 409 | srtPath, 410 | text: mergedText.trim(), 411 | language, 412 | duration: originalDurationSeconds 413 | } 414 | } finally { 415 | // Clean up temporary files 416 | for (const chunkPath of chunkPaths) { 417 | if (chunkPath && existsSync(chunkPath)) { 418 | unlinkSync(chunkPath) 419 | } 420 | } 421 | if (tempAudioPath && existsSync(tempAudioPath)) { 422 | unlinkSync(tempAudioPath) 423 | } 424 | if (optimizedPath && existsSync(optimizedPath)) { 425 | unlinkSync(optimizedPath) 426 | } 427 | if (chunkPaths.length || tempAudioPath || optimizedPath) { 428 | console.log('🧹 Cleaned up temporary files') 429 | } 430 | } 431 | } 432 | --------------------------------------------------------------------------------
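For reference, a hedged usage sketch of the chunking and offset options handled above (`chunkMinutes` and `offsetSeconds` in `TranscribeOptions`); the file name and values are placeholders, not recommendations:

```typescript
import { transcribe } from '@illyism/transcribe'

// Force ~10-minute chunks and shift every subtitle 2 seconds later,
// e.g. to line the SRT up with an editor timeline offset.
const result = await transcribe({
  inputPath: 'lecture.mkv',
  apiKey: process.env.OPENAI_API_KEY,
  chunkMinutes: 10,
  offsetSeconds: 2,
})

console.log(result.srtPath, result.language, `${result.duration.toFixed(1)}s`)
```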