├── .runpod
│   ├── handler.py
│   ├── tests.json
│   └── hub.json
├── .gitignore
├── THIRD_PARTY_NOTICES.md
├── test_input.json
├── examples
│   └── infer.mjs
├── LICENSE
├── Dockerfile
├── infer.py
└── README.md

/.runpod/handler.py:
--------------------------------------------------------------------------------
infer.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
package.json
package-lock.json

node_modules
--------------------------------------------------------------------------------
/THIRD_PARTY_NOTICES.md:
--------------------------------------------------------------------------------
Portions of this project use pyannote's speaker diarization pipeline (https://huggingface.co/pyannote/speaker-diarization-3.1), which is licensed under the MIT license.
--------------------------------------------------------------------------------
/test_input.json:
--------------------------------------------------------------------------------
{ "input" : { "model": "ivrit-ai/whisper-large-v3-turbo-ct2", "streaming": false, "transcribe_args": { "url" : "https://github.com/metaldaniel/HebrewASR-Comparison/raw/main/HaTankistiot_n12-mp3.mp3", "language": "he", "diarize": true, "verbose": false } } }
--------------------------------------------------------------------------------
/.runpod/tests.json:
--------------------------------------------------------------------------------
{
  "tests": [
  ],
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "env": [
      {
        "key": "UNUSED_ENV_KEY_HERE",
        "value": "UNUSED_ENV_VALUE_HERE"
      }
    ],
    "allowedCudaVersions": [
      "12.7"
    ]
  }
}
--------------------------------------------------------------------------------
/.runpod/hub.json:
--------------------------------------------------------------------------------
{
  "title": "ivrit.ai speech-to-text inference",
  "description": "Hebrew speech-to-text based on ivrit.ai's latest Whisper model",
  "type": "serverless",
  "category": "audio",
  "iconUrl": "https://transcribe.ivrit.ai/static/favicon.png",
  "config": {
    "runsOn": "GPU",
    "containerDiskInGb": 20,
    "gpuIds": "AMPERE_16",
    "gpuCount": 1,
    "presets": [],
    "env": []
  }
}
--------------------------------------------------------------------------------
/examples/infer.mjs:
--------------------------------------------------------------------------------
// Simple reference code for running inference from JavaScript

import runpodSdk from "runpod-sdk";

const { RUNPOD_API_KEY, ENDPOINT_ID } = process.env;

const runpod = runpodSdk(RUNPOD_API_KEY);
const endpoint = runpod.endpoint(ENDPOINT_ID);

// The payload mirrors test_input.json: the handler in infer.py expects a
// model name plus a transcribe_args object holding either a url or a blob.
const result = await endpoint.runSync({ "input" : {
  "model" : "ivrit-ai/whisper-large-v3-turbo-ct2",
  "streaming" : false,
  "transcribe_args" : {
    "url" : "https://github.com/metaldaniel/HebrewASR-Comparison/raw/main/HaTankistiot_n12-mp3.mp3",
    "language" : "he"
  }
} });

console.log(result);
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 ivrit.ai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Base image with Python, PyTorch and CUDA preinstalled
FROM pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime

# Define your working directory
WORKDIR /

# Configure LD_LIBRARY_PATH so the pip-installed cuDNN and cuBLAS libraries are found
ENV LD_LIBRARY_PATH="/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/lib:/opt/conda/lib/python3.11/site-packages/nvidia/cublas/lib"

# Install relevant packages
RUN apt-get update && apt-get install -y ffmpeg

# Install python packages
RUN pip3 install ivrit[all]==0.1.8 torch==2.4.1 huggingface-hub==0.36.0 runpod

# Pre-download the models into the image, since infer.py loads them with local_files_only=True
RUN python3 -c 'import faster_whisper; m = faster_whisper.WhisperModel("ivrit-ai/whisper-large-v3-turbo-ct2")'
RUN python3 -c 'import faster_whisper; m = faster_whisper.WhisperModel("ivrit-ai/yi-whisper-large-v3-turbo-ct2")'
RUN python3 -c 'import faster_whisper; m = faster_whisper.WhisperModel("large-v3-turbo")'
RUN python3 -c 'import pyannote.audio; p = pyannote.audio.Pipeline.from_pretrained("ivrit-ai/pyannote-speaker-diarization-3.1")'
RUN python3 -c 'from speechbrain.inference.speaker import EncoderClassifier; EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")'

# Add your file
ADD infer.py .

# Call your file when your container starts
CMD [ "python", "-u", "/infer.py" ]
--------------------------------------------------------------------------------
/infer.py:
--------------------------------------------------------------------------------
import dataclasses
import types

import ivrit
import runpod

# Maximum size for grouped arrays (in characters).
# This ensures we are below the maximum size of an item in a RunPod stream.
MAX_RUNPOD_STREAM_ELEMENT_SIZE = 500000

# Global variable tracking the currently loaded model
current_model = None

def transcribe(job):
    engine = job['input'].get('engine', 'faster-whisper')
    model_name = job['input'].get('model', None)
    is_streaming = job['input'].get('streaming', False)

    if engine not in ['faster-whisper', 'stable-whisper']:
        yield { "error" : f"engine should be 'faster-whisper' or 'stable-whisper', but is {engine} instead." }
        return

    if not model_name:
        yield { "error" : "Model not provided." }
        return
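
    # Expected job['input'] schema, derived from the checks in this handler and from test_input.json:
    #   engine: 'faster-whisper' (default) or 'stable-whisper'
    #   model: required model name, e.g. 'ivrit-ai/whisper-large-v3-turbo-ct2'
    #   streaming: if True, stream result groups as they are produced (default False)
    #   api_key: optional caller-supplied API key
    #   transcribe_args: dict containing either 'url' or 'blob', plus transcription options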

    # Get the API key from the job input
    api_key = job['input'].get('api_key', None)

    # Extract transcribe_args from job input
    transcribe_args = job['input'].get('transcribe_args', None)

    # Validate that transcribe_args contains either blob or url
    if not transcribe_args:
        yield { "error" : "transcribe_args field not provided." }
        return

    if not ('blob' in transcribe_args or 'url' in transcribe_args):
        yield { "error" : "transcribe_args must contain either 'blob' or 'url' field." }
        return

    stream_gen = transcribe_core(engine, model_name, transcribe_args)

    if is_streaming:
        for entry in stream_gen:
            yield entry
    else:
        result = [entry for entry in stream_gen]
        yield { 'result' : result }

def transcribe_core(engine, model_name, transcribe_args):
    print('Transcribing...')

    global current_model

    different_model = (not current_model) or (current_model.engine != engine or current_model.model != model_name)

    if different_model:
        print(f'Loading new model: {engine} with {model_name}')
        current_model = ivrit.load_model(engine=engine, model=model_name, local_files_only=True)
    else:
        print(f'Reusing existing model: {engine} with {model_name}')

    diarize = transcribe_args.get('diarize', False)

    if diarize:
        # With diarization, transcribe returns a full result dict rather than a stream
        res = current_model.transcribe(**transcribe_args)

        segs = res['segments']
    else:
        transcribe_args['stream'] = True
        segs = current_model.transcribe(**transcribe_args)

    # Check if segs is a generator
    if isinstance(segs, types.GeneratorType):
        # For generators, yield results one by one as an array of one value
        for s in segs:
            yield [dataclasses.asdict(s)]
    else:
        # For non-generators, group multiple consecutive members into larger arrays,
        # ensuring their total size stays below MAX_RUNPOD_STREAM_ELEMENT_SIZE
        current_group = []
        current_size = 0

        for s in segs:
            seg_dict = dataclasses.asdict(s)
            seg_size = len(str(seg_dict))

            # If adding this segment would exceed the max size, yield the current group
            if current_group and (current_size + seg_size > MAX_RUNPOD_STREAM_ELEMENT_SIZE):
                yield current_group
                current_group = []
                current_size = 0

            # Add segment to current group
            current_group.append(seg_dict)
            current_size += seg_size

        # Yield any remaining segments in the final group
        if current_group:
            yield current_group

runpod.serverless.start({"handler": transcribe, "return_aggregate_stream": True})
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# runpod-serverless

[![RunPod](https://api.runpod.io/badge/ivrit-ai/runpod-serverless)](https://www.runpod.io/console/hub/ivrit-ai/runpod-serverless)

A template for quickly deploying an ivrit.ai speech-to-text API.

Note: if you register at [runpod.io](https://runpod.io), please consider using our [referral link](https://runpod.io/?ref=06octndf).
It provides us with credits, which we can then use to provide better services.

## Description

This project provides a serverless solution for transcribing Hebrew audio files. It leverages runpod.io's infrastructure to process audio files efficiently and return transcriptions.
It is part of the [ivrit.ai](https://ivrit.ai) non-profit project.

## API: easy deployment through the RunPod Hub

If you simply want to use our models via an API, quick deployment is available via the RunPod Hub.

1. Open this template on the hub by clicking [here](https://www.runpod.io/console/hub/ivrit-ai/runpod-serverless).
2. Click the "Deploy" button and create the endpoint.
3. Follow the instructions under the [Usage](#usage) section.

## Contents

- `Dockerfile`: Used to create the Docker image for the serverless function.
- `infer.py`: The main script that handles the transcription process, placed inside the Docker image.

## Setting up your inference endpoint

1. Log in to [runpod.io](https://runpod.io).
2. Choose Menu->Serverless.
3. Choose New Endpoint.
4. Select the desired worker configuration.
   - You can choose the cheapest worker (16GB GPU, $0.00016/second as of August 1st, 2024).
   - Active workers can be 0; max workers is 1 or more.
   - GPUs/worker should be set to 1.
   - Container image should be set to **yairlifshitz/whisper-runpod-serverless:latest**, or your own Docker image (built from the `Dockerfile` in this repository).
   - Container disk should have at least 20 GB.
5. Click Deploy.

## Usage

Once deployed on runpod.io, you can transcribe Hebrew audio either by providing a URL to transcribe (this easily supports payloads larger than 1 GB, depending on the Docker image's free disk space and timeout settings) or by uploading a file directly (up to ~5-10 MB).

### Using the endpoint

Use the `ivrit` Python package.

```python
import ivrit

model = ivrit.load_model(engine="runpod", model="ivrit-ai/whisper-large-v3-turbo-ct2", api_key="<RunPod API key>", endpoint_id="<endpoint ID>")

# Local file transcription (up to ~5MB)
result = model.transcribe(path="<audio file path>", language="he")

# URL-based transcription
result = model.transcribe(url="<audio URL>", language="he")

# Print resulting text
print(result['text'])

# Iterate over segments
for segment in result['segments']:
    print(segment)
```

Supported models are **ivrit-ai/whisper-large-v3-turbo-ct2**, **ivrit-ai/yi-whisper-large-v3-turbo-ct2** and **large-v3-turbo**, the models pre-downloaded into the Docker image (see the `Dockerfile`).

#### Output Options

The `transcribe()` method accepts an `output_options` parameter (dictionary) to control the detail level of the output:

```python
result = model.transcribe(
    path="<audio file path>",
    language="he",
    output_options={
        "word_timestamps": False,  # Disable word-level timestamps
        "extra_data": False        # Disable extra metadata
    }
)
```

Setting both `word_timestamps` and `extra_data` to `False` significantly reduces the output length. This is important in non-streaming mode, as it minimizes the response payload size and improves performance.
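
#### Calling the endpoint directly

You can also skip the `ivrit` package and call the endpoint through RunPod's HTTP API, using the same input schema the handler in `infer.py` expects: a required `model`, optional `engine` and `streaming` fields, and a `transcribe_args` object with either a `url` or a `blob`. The snippet below is a minimal sketch using the `requests` library; the environment variable names and the audio URL are placeholder assumptions, so adapt them to your setup.

```python
# Minimal sketch: call RunPod's synchronous /runsync endpoint with the input
# schema from test_input.json plus the required "model" field.
# Assumes RUNPOD_API_KEY and ENDPOINT_ID are set in the environment.
import os

import requests

response = requests.post(
    f"https://api.runpod.ai/v2/{os.environ['ENDPOINT_ID']}/runsync",
    headers={"Authorization": f"Bearer {os.environ['RUNPOD_API_KEY']}"},
    json={
        "input": {
            "model": "ivrit-ai/whisper-large-v3-turbo-ct2",
            "streaming": False,
            "transcribe_args": {
                "url": "<audio URL>",
                "language": "he",
            },
        }
    },
    timeout=600,
)
response.raise_for_status()
print(response.json())
```

For long audio files, consider RunPod's asynchronous `/run` endpoint and polling `/status/<job id>` instead, since a synchronous `/runsync` call can time out before transcription finishes.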

#### Diarization (Speaker Identification)

For diarization (identifying different speakers), use the `stable-whisper` core engine:

```python
import ivrit

model = ivrit.load_model(
    engine="runpod",
    model="ivrit-ai/whisper-large-v3-turbo-ct2",
    api_key="<RunPod API key>",
    endpoint_id="<endpoint ID>",
    core_engine="stable-whisper"
)

# Transcribe with diarization enabled
result = model.transcribe(
    path="<audio file path>",
    language="he",
    diarize=True
)

# Results will include speaker labels in segments
for segment in result['segments']:
    print(f"Speakers {segment['speakers']}: {segment['text']}")
```

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.
Patreon link: [here](https://www.patreon.com/ivrit_ai).

## License

Our code and model are released under the MIT license.

## Acknowledgements

- [Our long list of data contributors](https://www.ivrit.ai/en/credits)
- Our data annotation volunteers!
--------------------------------------------------------------------------------