├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── Apps
│   ├── AIAttendant
│   │   ├── AIAActor.py
│   │   ├── AIAAppConfig.py
│   │   ├── AIAProfile.py
│   │   └── AIASession.py
│   └── LiveTranslator
│       ├── LTActor.py
│       ├── LTAppConfig.py
│       ├── LTProfile.py
│       └── LTSession.py
├── Cluster
│   ├── InfernBatchedWorker.py
│   ├── InfernBenchActor.py
│   ├── InfernLLMActor.py
│   ├── InfernLLMWorker.py
│   ├── InfernRTPActor.py
│   ├── InfernSIPActor.py
│   ├── InfernSTTActor.py
│   ├── InfernSTTWorker.py
│   ├── InfernTTSActor.py
│   ├── InfernTTSWorker.py
│   ├── LLMSession.py
│   ├── RemoteRTPGen.py
│   ├── RemoteTTSSession.py
│   ├── STTSession.py
│   └── TTSSession.py
├── Core
│   ├── AStreamMarkers.py
│   ├── AudioChunk.py
│   ├── Codecs
│   │   ├── G711.py
│   │   ├── G722.py
│   │   └── GenCodec.py
│   ├── ConfigValidators.py
│   ├── Exceptions
│   │   └── InfernSessNotFoundErr.py
│   ├── InfernConfig.py
│   ├── InfernWrkThread.py
│   ├── OutputMuxer.py
│   ├── T2T
│   │   ├── NumbersToWords.py
│   │   └── Translator.py
│   └── VAD
│       ├── SileroVAD.py
│       ├── SileroVADUtils.py
│       └── ZlibVAD.py
├── HelloSippyTTSRT
│   ├── HelloSippyRT.py
│   ├── HelloSippyRTPipe.py
│   └── HelloSippyRTPipeTest.py
├── Infernos.py
├── LICENSE
├── README.md
├── RTP
│   ├── AudioInput.py
│   ├── InfernRTPConf.py
│   ├── InfernRTPEPoint.py
│   ├── InfernRTPIngest.py
│   ├── RTPOutputWorker.py
│   └── RTPParams.py
├── SIP
│   ├── InfernSIP.py
│   ├── InfernSIPConf.py
│   ├── InfernSIPProfile.py
│   ├── InfernUA.py
│   ├── InfernUAC.py
│   ├── InfernUAS.py
│   ├── RemoteSession.py
│   └── SipSessInfo.py
├── config.yaml
├── config
│   └── InfernGlobals.py
├── docker
│   ├── Dockerfile
│   ├── install_conda.sh
│   ├── install_hw.sh
│   ├── install_requirements.sh
│   ├── intel-ray.diff
│   └── setup_conda.sh
├── examples
│   ├── ai_attendant.yaml
│   ├── llm_test.py
│   ├── sippylabs.txt
│   └── voice_ass.py
├── requirements.txt
├── safetorch
│   └── InfernTorcher.py
└── utils
    └── tts.py

/.github/workflows/main.yml:
--------------------------------------------------------------------------------
# This is a basic workflow to help you get started with Actions

name: Build & Publish

# Controls when the action will run.
on:
  # Triggers the workflow on all push or pull request events
  push:
  pull_request:

  release:
    types: [created]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

  schedule:
    - cron: "0 0 * * *"

# added using https://github.com/step-security/secure-repo
permissions:
  contents: read

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  Docker:
    name: Build&Push to DockerHub
    if: (github.event_name == 'push' || github.event_name == 'pull_request')
    runs-on: [self-hosted, linux, x64]
    strategy:
      matrix:
        infer-hw: ['nvidia', 'intel']
    env:
      DOCKER_REPO: 'sippylabs/infernos'
      BASE_IMAGE: 'ubuntu:24.10'
      PYTHON_VER: '3.11'
      CONDA_MAINENV: 'Infernos'
      INFER_HW: ${{ matrix.infer-hw }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_REPO }}
          tags: |
            type=schedule
            type=ref,event=branch,prefix=${{ env.INFER_HW }}-
            type=ref,event=tag,prefix=${{ env.INFER_HW }}-
            type=ref,event=pr,prefix=${{ env.INFER_HW }}-
            type=raw,value=${{ env.INFER_HW }}-latest,enable={{is_default_branch}}
            type=sha

      - name: Get branch name
        run: echo "GIT_BRANCH=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_ENV

      - name: Build Docker image
        uses: docker/build-push-action@v6
        env:
          CACHE_SPEC: "type=registry,ref=${{ env.DOCKER_REPO }}:${{ env.INFER_HW }}-${{ env.GIT_BRANCH }}-buildcache"
        with:
          context: .
          file: ./docker/Dockerfile
          push: true
          build-args: |
            BASE_IMAGE=${{ env.BASE_IMAGE }}
            PYTHON_VER=${{ env.PYTHON_VER }}
            CONDA_MAINENV=${{ env.CONDA_MAINENV }}
            INFER_HW=${{ env.INFER_HW }}
          tags: |
            ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: ${{ env.CACHE_SPEC }}
          cache-to: ${{ env.CACHE_SPEC }},mode=max
          #cache-from: type=gha
          #cache-to: type=registry,ref=${{ env.DOCKER_REPO }}:${{ env.INFER_HW }}-buildcache,mode=max
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
*.wav
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIAActor.py:
--------------------------------------------------------------------------------
from typing import Dict, Optional, List, Union
from uuid import UUID
from functools import partial

from ray import ray
import nltk
from tensorboardX import SummaryWriter

from config.InfernGlobals import InfernGlobals as IG
from Cluster.InfernSIPActor import InfernSIPActor
from Cluster.InfernTTSActor import InfernTTSActor
from Cluster.InfernSTTActor import InfernSTTActor
from Cluster.InfernLLMActor import InfernLLMActor
from Cluster.STTSession import STTResult, STTSentinel
from Cluster.LLMSession import LLMResult
from SIP.RemoteSession import RemoteSessionOffer
from Core.T2T.NumbersToWords import NumbersToWords
from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr

from .AIASession import AIASession
from ..LiveTranslator.LTActor import ntw_filter

class AIASessNotFoundErr(InfernSessNotFoundErr): pass

@ray.remote(resources={"ai_attendant": 1})
class AIAActor():
    sessions: Dict[UUID, AIASession]
    thumbstones: List[UUID]
    translator: callable
    nstts: int = 0
    def __init__(self):
        self.stt_out_lang = 'en'

    def start(self, aia_prof: 'AIAProfile', sip_actr:InfernSIPActor):
        self.aia_prof = aia_prof
        self.tts_lang = aia_prof.tts_lang
        self.stt_lang = aia_prof.stt_lang
        nltk.download('punkt')
        nltk.download('punkt_tab')
        self.aia_actr = ray.get_runtime_context().current_actor
        self.sip_actr = sip_actr
        self.tts_actr = InfernTTSActor.remote()
        self.stt_actr = InfernSTTActor.remote()
        self.llm_actr = InfernLLMActor.remote()
        futs = [self.stt_actr.start.remote(), self.tts_actr.start.remote(lang=self.tts_lang, output_sr=8000),
                self.llm_actr.start.remote()]
        if self.stt_out_lang == self.tts_lang:
            self.translator = ntw_filter
        else:
            flt = partial(ntw_filter, obj=NumbersToWords(self.tts_lang))
            self.translator = IG.get_translator(self.stt_out_lang, self.tts_lang, filter=flt).translate
        self.swriter = SummaryWriter()
        ray.get(futs)
        self.sessions = {}
        self.thumbstones = []

    def new_sip_session_received(self, new_sess:RemoteSessionOffer):
        aia_sess = AIASession(self, new_sess, self.aia_prof.llm_prompt)
        print(f'{aia_sess=}')
        self.sessions[aia_sess.id] = aia_sess

    def sess_term(self, sess_id:UUID, sip_sess_id:UUID, relaxed:bool=False):
        try:
            self._get_session(sess_id).sess_term(sip_sess_id)
        except AIASessNotFoundErr:
            if not relaxed: raise
            return
        del self.sessions[sess_id]
        self.thumbstones.append(sess_id)
        if len(self.thumbstones) > 100:
            self.thumbstones = self.thumbstones[-100:]

    def text_in(self, sess_id:UUID, result:Union[STTResult,STTSentinel]):
        if isinstance(result, STTResult):
            self.swriter.add_scalar(f'stt/inf_time', result.inf_time, self.nstts)
            self.nstts += 1
        self._get_session(sess_id).text_in(result)

    def text_out(self, sess_id:UUID, result:LLMResult):
        try:
            self._get_session(sess_id).text_out(result)
        except AIASessNotFoundErr:
            if not sess_id in self.thumbstones: raise

    def tts_say_done(self, sess_id:UUID):
        self._get_session(sess_id).tts_say_done()

    def _get_session(self, sess_id:UUID):
        try: return self.sessions[sess_id]
        except KeyError: raise AIASessNotFoundErr(f'No AIA session with id {sess_id}')
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIAAppConfig.py:
--------------------------------------------------------------------------------
from .AIAProfile import AIAProfile

class AIAAppConfig():
    schema: dict = {
        'ai_attendant': {
            'type': 'dict',
            'schema': {
                **AIAProfile.schema,
            }
        },
    }
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIAProfile.py:
--------------------------------------------------------------------------------
import ray
from typing import Optional

from Cluster.InfernSIPActor import InfernSIPActor

from .AIAActor import AIAActor


class AIAProfile():
    schema: dict = {
        'profiles': {
            'type': 'dict',
            'keysrules': {'type': 'string'},
            'valuesrules': {
                'type': 'dict',
                'schema': {
                    'tts_lang': {'type': 'string'},
                    'stt_lang': {'type': 'string'},
                    'llm_prompt': {'type': 'string'},
                }
            }
        }
    }
    stt_lang: str = 'en'
    tts_lang: str = 'en'
    llm_prompt: str
    actor: Optional[AIAActor] = None

    def __init__(self, name, conf):
        self.name = name
        self.tts_lang = conf['tts_lang']
        self.stt_lang = conf['stt_lang']
        self.llm_prompt = open(conf['llm_prompt']).read()

    def finalize(self, iconf:'InfernConfig'):
        pass

    def getActor(self, iconf:'InfernConfig', sip_act:InfernSIPActor):
        if self.actor is None:
            self.actor = AIAActor.remote()
            ray.get(self.actor.start.remote(self, sip_act))
        return self.actor
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIASession.py:
--------------------------------------------------------------------------------
from typing import Tuple, List, Optional, Dict, Union
from uuid import UUID, uuid4
from functools import partial
import ray

from nltk.tokenize import sent_tokenize

from Cluster.TTSSession import TTSRequest
from Cluster.STTSession import STTRequest, STTResult, STTSentinel
from Cluster.LLMSession import LLMRequest, LLMResult, LLMSessionParams
from Cluster.RemoteTTSSession import RemoteTTSSession
from Cluster.InfernRTPActor import InfernRTPActor, RTPSessNotFoundErr
from Core.T2T.NumbersToWords import NumbersToWords
from RTP.AudioInput import AudioInput
from SIP.RemoteSession import RemoteSessionOffer, RemoteSessionAccept
from Core.T2T.Translator import Translator
from Core.AudioChunk import AudioChunk
from ..LiveTranslator.LTSession import _sess_term, TTSProxy
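
# Per-call pipeline, as wired up below: inbound RTP audio -> STTProxy (VAD-gated
# chunks to the STT actor), transcribed text -> AIAActor.text_in -> LLM session,
# and LLM output sentences -> TTS -> RTP soundout back to the caller.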

class STTProxy(AudioInput):
    from time import monotonic
    last_chunk_time: Optional[float] = None
    debug = True
    stt_do: callable
    stt_done: callable
    def __init__(self, stt_actr, stt_lang, stt_sess_id, stt_done):
        self.stt_do = partial(stt_actr.stt_session_soundin.remote, sess_id=stt_sess_id)
        self.lang, self.stt_done = stt_lang, stt_done

    def audio_in(self, chunk:AudioChunk):
        if self.last_chunk_time is None:
            return
        if chunk.active:
            self.last_chunk_time = None
            return
        if self.monotonic() - self.last_chunk_time < 2.0:
            return
        def stt_done(result:STTSentinel):
            print(f'STTProxy: {result=}')
            self.stt_done(result=result)
        self.last_chunk_time = None
        sreq = STTSentinel('flush', stt_done)
        self.stt_do(req=sreq)

    # This method runs in the context of the inbound RTP Actor
    def vad_chunk_in(self, chunk:AudioChunk):
        self.last_chunk_time = self.monotonic()
        if self.debug:
            print(f'STTProxy: VAD: {len(chunk.audio)=} {chunk.track_id=}')
        def stt_done(result:STTResult):
            print(f'STTProxy: {result=}')
            self.stt_done(result=result)
        sreq = STTRequest(chunk, stt_done, self.lang)
        sreq.mode = 'translate'
        self.stt_do(req=sreq)

class AIASession():
    debug = False
    id: UUID
    stt_sess_id: UUID
    rtp_sess_id: UUID
    llm_sess_id: UUID
    last_llm_req_id: UUID
    rtp_actr: InfernRTPActor
    tts_sess: RemoteTTSSession
    say_buffer: List[TTSRequest]
    translator: Optional[Translator]
    stt_sess_term: callable
    text_in_buffer: List[str]
    saying: UUID

    def __init__(self, aiaa:'AIAActor', new_sess:RemoteSessionOffer, llm_prompt:str):
        self.id = uuid4()
        self.say_buffer = []
        sess_term_alice = partial(_sess_term, sterm=aiaa.aia_actr.sess_term.remote, sess_id=self.id, sip_sess_id=new_sess.sip_sess_id)
        self.tts_say_done_cb = partial(aiaa.aia_actr.tts_say_done.remote, sess_id=self.id)
        amsg = RemoteSessionAccept(disc_cb=sess_term_alice, auto_answer=True)
        try:
            rtp_alice = ray.get(new_sess.accept(msg=amsg))
        except KeyError:
            print(f'Failed to accept {new_sess.sip_sess_id=}')
            return
        self.rtp_actr, self.rtp_sess_id = rtp_alice
        stt_sess = aiaa.stt_actr.new_stt_session.remote(keep_context=True)
        llmp = LLMSessionParams(llm_prompt)
        llm_sess = aiaa.llm_actr.new_llm_session.remote(llmp)
        self.tts_sess = RemoteTTSSession(aiaa.tts_actr)
        self.stt_sess_id, self.llm_sess_id = ray.get([stt_sess, llm_sess])
        self.stt_sess_term = partial(aiaa.stt_actr.stt_session_end.remote, self.stt_sess_id)
        self.llm_sess_term = partial(aiaa.llm_actr.llm_session_end.remote, self.llm_sess_id)
        self.translator = aiaa.translator
        text_cb = partial(aiaa.aia_actr.text_in.remote, sess_id=self.id)
        vad_handler = STTProxy(aiaa.stt_actr, aiaa.stt_lang, self.stt_sess_id, text_cb)
        try:
            ray.get(self.rtp_actr.rtp_session_connect.remote(self.rtp_sess_id, vad_handler))
        except RTPSessNotFoundErr:
            print(f'RTPSessNotFoundErr: {self.rtp_sess_id=}')
            sess_term_alice()
            return
        soundout = partial(self.rtp_actr.rtp_session_soundout.remote, self.rtp_sess_id)
        tts_soundout = TTSProxy(soundout)
        self.tts_sess.start(tts_soundout)
        self.speaker = ray.get(aiaa.tts_actr.get_rand_voice_id.remote())
        self.speaker = 6852
        self.llm_text_cb = partial(aiaa.aia_actr.text_out.remote, sess_id=self.id)
        self.llm_session_textin = partial(aiaa.llm_actr.llm_session_textin.remote, sess_id=self.llm_sess_id)
        self.llm_session_context_add = partial(aiaa.llm_actr.llm_session_context_add.remote,
                                               sess_id=self.llm_sess_id)
        si = new_sess.sess_info
        self.n2w = NumbersToWords()
        self.text_in_buffer = []
        self.text_to_llm(f'')
        print(f'Agent {self.speaker} at your service.')

    def text_to_llm(self, text:str):
        req = LLMRequest(text, self.llm_text_cb)
        req.auto_ctx_add = False
        self.llm_session_textin(req=req)
        self.last_llm_req_id = req.id

    def text_in(self, result:Union[STTResult,STTSentinel]):
        if isinstance(result, STTResult):
            if self.debug:
                print(f'STT: "{result.text=}" {result.no_speech_prob=}')
            nsp = result.no_speech_prob
            if nsp > STTRequest.max_ns_prob or len(result.text) == 0:
                if result.duration < 5.0:
                    return
                text = f''
            else:
                text = result.text
            self.text_in_buffer.append(text)
            if len(self.say_buffer) > 0:
                self.say_buffer = self.say_buffer[:1]
                if self.saying is not None:
                    self.llm_session_context_add(content='', role='user')
                    self.tts_sess.stop_saying(self.saying)
                    self.saying = None
            return
        if len(self.text_in_buffer) == 0:
            return
        text = ' '.join(self.text_in_buffer)
        self.text_in_buffer = []
        self.text_to_llm(text)
        return

    def text_out(self, result:LLMResult):
        if self.debug: print(f'text_out({result.text=})')
        if result.req_id != self.last_llm_req_id:
            print(f'LLMResult for old req_id: {result.req_id}')
            return
        if result.text == '':
            print(f'LLMResult: nothing to say')
            return
        text = sent_tokenize(result.text)
        out_sents = [text.pop(0),]
        for t in text:
            if len(out_sents[-1]) + len(t) < 128 or out_sents[-1].endswith(' i.e.'):
                out_sents[-1] += ' ' + t
            else:
                out_sents.append(t)
        for t in out_sents:
            self.tts_say(t)

    def _tts_say(self, tr:TTSRequest):
        self.saying = self.tts_sess.say(tr)
        self.llm_session_context_add(content=tr.text[0], role='assistant')

    def tts_say(self, text):
        if self.debug: print(f'tts_say({text=})')
        text = self.n2w(text)
        tts_req = TTSRequest([text,], done_cb=self.tts_say_done_cb, speaker_id=self.speaker)
        self.say_buffer.append(tts_req)
        if len(self.say_buffer) > 1:
            return
        self._tts_say(tts_req)

    def tts_say_done(self):
        if self.debug: print(f'tts_say_done()')
        tbuf = self.say_buffer
        tbuf.pop(0)
        if len(tbuf) > 0:
            self._tts_say(tbuf[0])
            return
        self.saying = None

    def sess_term(self, _):
        self.stt_sess_term()
        self.tts_sess.end()
        self.llm_sess_term()
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTActor.py:
--------------------------------------------------------------------------------
from typing import Dict, Optional, List
from uuid import UUID
from functools import partial

from ray import ray
import nltk
from tensorboardX import SummaryWriter

from config.InfernGlobals import InfernGlobals as IG
from Cluster.InfernSIPActor import InfernSIPActor
from Cluster.InfernTTSActor import InfernTTSActor
from Cluster.InfernSTTActor import InfernSTTActor
from Cluster.STTSession import STTResult
from SIP.RemoteSession import RemoteSessionOffer
from Core.T2T.NumbersToWords import NumbersToWords
from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr

from .LTSession import LTSession, VADSignals

def ntw_filter(text, from_code=None, to_code=None, tr=lambda x:x, obj=NumbersToWords()):
    print(f'ntw_filter({from_code=}, {to_code=}, {text=})')
    return obj(tr(text))

class LTSessNotFoundErr(InfernSessNotFoundErr): pass

@ray.remote(resources={"live_translator": 1})
class LTActor():
    sessions: Dict[UUID, LTSession]
    vds: Optional[VADSignals]=None
    translators: List[callable]
    nstts: int = 0
    def __init__(self):
        self.stt_out_langs = ('en', 'en')

    def start(self, lt_prof: 'LTProfile', sip_actr:InfernSIPActor):
        self.lt_prof = lt_prof
        self.tts_langs = lt_prof.tts_langs
        self.stt_langs = lt_prof.stt_langs
        nltk.download('punkt')
        self.lt_actr = ray.get_runtime_context().current_actor
        self.sip_actr = sip_actr
        self.tts_actrs = dict((l, InfernTTSActor.remote()) for l in self.tts_langs)
        self.stt_actr = InfernSTTActor.remote()
        futs = [_a.start.remote(**_k) for _a, _k in ((self.stt_actr, {}),) +
                tuple((a, {'lang':l, 'output_sr':8000}) for l, a in self.tts_actrs.items())]
        self.translators = [ntw_filter if _sol == _tl else
                            IG.get_translator(_sol, _tl, filter=partial(ntw_filter, obj=NumbersToWords(_tl))).translate
                            for _tl, _sol in zip(self.tts_langs, self.stt_out_langs)]
        self.swriter = SummaryWriter()
        ray.get(futs)
        self.sessions = {}

    def precache(self, lt_prof: 'LTProfile'):
        nltk.download('punkt')
        lt_actr = ray.get_runtime_context().current_actor
        tts_actrs = dict((l, InfernTTSActor.remote()) for l in lt_prof.tts_langs)
        stt_actr = InfernSTTActor.remote()
        futs = [_a.start.remote(**_k) for _a, _k in ((stt_actr, {}),) +
                tuple((a, {'lang':l, 'output_sr':8000, 'device':'cpu'}) for l, a in tts_actrs.items())]
        translators = [ntw_filter if _sol == _tl else
                       IG.get_translator(_sol, _tl, filter=partial(ntw_filter, obj=NumbersToWords(_tl))).translate
                       for _tl, _sol in zip(lt_prof.tts_langs, self.stt_out_langs)]
        ray.get(futs)
        for a in list(tts_actrs.values()) + [stt_actr]:
            ray.get(a.stop.remote())

    def new_sip_session_received(self, new_sess:RemoteSessionOffer):
        if self.vds is None:
            self.vds = VADSignals()
        lt_sess = LTSession(self, new_sess)
        print(f'{lt_sess=}')
        self.sessions[lt_sess.id] = lt_sess

    def sess_term(self, sess_id:UUID, sip_sess_id:UUID, relaxed:bool=False):
        try:
            self._get_session(sess_id).sess_term(sip_sess_id)
        except LTSessNotFoundErr:
            if not relaxed: raise
            return
        del self.sessions[sess_id]

    def text_in(self, sess_id:UUID, result:STTResult):
        self.swriter.add_scalar(f'stt/inf_time', result.inf_time, self.nstts)
        self.nstts += 1
        self._get_session(sess_id).text_in(result)

    def tts_say_done(self, sess_id:UUID, direction:int):
        self._get_session(sess_id).tts_say_done(direction)

    def _get_session(self, sess_id:UUID):
        try: return self.sessions[sess_id]
        except KeyError: raise LTSessNotFoundErr(f'No LT session with id {sess_id}')
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTAppConfig.py:
--------------------------------------------------------------------------------
from .LTProfile import LTProfile
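
# Validation-schema fragment for the 'live_translator' config section; the
# keysrules/valuesrules layout appears to follow the Cerberus-style validators
# used elsewhere in the repo (assumed; Core/ConfigValidators.py is not shown here).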

class LTAppConfig():
    schema: dict = {
        'live_translator': {
            'type': 'dict',
            'schema': {
                **LTProfile.schema,
            }
        },
        'live_translator_precache': {'type': 'boolean'},
    }
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTProfile.py:
--------------------------------------------------------------------------------
from typing import Tuple, Optional

import ray

from Cluster.InfernSIPActor import InfernSIPActor

from .LTActor import LTActor

class LTProfile():
    schema: dict = {
        'profiles': {
            'type': 'dict',
            'keysrules': {'type': 'string'},
            'valuesrules': {
                'type': 'dict',
                'schema': {
                    'tts_langs': {'type': 'list', 'schema': {'type': 'string'}},
                    'stt_langs': {'type': 'list', 'schema': {'type': 'string'}},
                    'outbound': {'type': 'string'}
                }
            }
        }
    }
    name: str
    tts_langs: Tuple[str]
    stt_langs: Tuple[str]
    _outbound_spec: str
    outbound_conn: 'InfernSIPProfile'
    outbount_params: str
    actor: Optional[LTActor] = None
    precache: bool

    def __init__(self, name, conf, precache):
        self.name = name
        self.tts_langs = tuple(conf['tts_langs'])
        self.stt_langs = tuple(conf['stt_langs'])
        if not precache:
            self._outbound = conf['outbound']
        self.precache = precache

    def finalize(self, iconf:'InfernConfig'):
        if not self.precache:
            sip_cname, params = self._outbound.split(';', 1)
            self.outbound_conn = iconf.connectors[sip_cname]
            self.outbount_params = params
        else:
            actor = LTActor.remote()
            res = ray.get(actor.precache.remote(self))

    def getActor(self, iconf:'InfernConfig', sip_act:InfernSIPActor):
        if self.actor is None:
            self.actor = LTActor.remote()
            ray.get(self.actor.start.remote(self, sip_act))
        return self.actor
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTSession.py:
--------------------------------------------------------------------------------
from typing import Tuple, List, Optional, Dict
from functools import partial, lru_cache
from uuid import UUID, uuid4

from ray import ray
from nltk.tokenize import sent_tokenize

from Cluster.InfernRTPActor import InfernRTPActor
from Cluster.InfernTTSActor import InfernTTSActor
from Cluster.RemoteTTSSession import RemoteTTSSession
from Cluster.STTSession import STTRequest, STTResult
from Cluster.TTSSession import TTSRequest
from Core.AudioChunk import AudioChunk, AudioChunkFromURL
from RTP.AudioInput import AudioInput
from Core.T2T.Translator import Translator
from SIP.RemoteSession import RemoteSessionOffer, RemoteSessionAccept, NewRemoteSessionRequest
from Core.AStreamMarkers import ASMarkerNewSent

#from .LTProfile import LTProfile

import pickle
import gzip
from random import choice

@lru_cache(maxsize=4)
def get_top_speakers(lang:str):
    skips = 0
    i = 0
    res = []
    while True:
        try:
            with gzip.open(f'checkpoint/{lang}/speaker.{i}.{lang}.pkl.gz', 'rb') as file:
                res.append(pickle.load(file))
        except FileNotFoundError:
            skips += 1
            if skips > 200: break
        i += 1
    if len(res) == 0:
        return None
    gen = max(r.nres for r in res)
    res = sorted([r for r in res if r.nres == gen], key=lambda r: r.max_error())[:50]
    return tuple(r.speaker_id for r in res)

class VADSignals():
    def __init__(self):
        eng, deng = [AudioChunkFromURL(f'https://github.com/commaai/openpilot/blob/master/selfdrive/assets/sounds/{n}.wav?raw=true') for n in ('engage', 'disengage')]
        eng.track_id = 2
        eng.debug = True
        self.eng = ray.put(eng)
        self.deng = ray.put(deng)

class STTProxy():
    debug = True
    stt_do: callable
    stt_done: callable
    vad_mirror: callable
    def __init__(self, lta:'LTActor', uas:'Sess', stt_done, vad_mirror, direction):
        self.stt_do = partial(lta.stt_actr.stt_session_soundin.remote, sess_id=uas.stt_sess_id)
        self.lang, self.stt_done = uas.stt_lang, stt_done
        self.vad_mirror = vad_mirror
        self.eng = lta.vds.eng
        self.direction = direction

    # This method runs in the context of the inbound RTP Actor
    def __call__(self, chunk:AudioChunk):
        if self.debug:
            dir = 'A' if self.direction == 0 else 'B'
            print(f'STTProxy: VAD({dir}): {len(chunk.audio)=} {chunk.track_id=}')
        #self.vad_mirror(chunk=self.eng)
        def stt_done(result:STTResult, direction=self.direction):
            print(f'STTProxy: {result=}')
            result.direction = direction
            self.stt_done(result=result)
        sreq = STTRequest(chunk, stt_done, self.lang)
        sreq.mode = 'translate'
        self.stt_do(req=sreq)

class TTSProxy():
    debug = False
    tts_consume: callable
    def __init__(self, tts_consume):
        self.tts_consume = tts_consume

    # This method runs in the context of the outbound RTP Actor
    def __call__(self, chunk:AudioChunk):
        if self.debug and isinstance(chunk, ASMarkerNewSent):
            print(f'TTSProxy: ASMarkerNewSent')
        chunk.track_id = 1
        chunk.debug = False
        self.tts_consume(chunk=chunk)

class SessionInfo():
    soundout: callable
    rsess_pause: callable
    rsess_connect: callable
    translator: callable
    get_speaker: callable
    tts_say: callable
    tts_say_done: callable
    def __init__(self, lts:'LTSession', lta:'LTActor', xua:'Sess', yua:'Sess'):
        #lt_actr = ray.get_runtime_context().current_actor
        self.soundout = partial(xua.rtp_actr.rtp_session_soundout.remote, xua.rtp_sess_id)
        vad_cb = self.soundout
        text_cb = partial(lta.lt_actr.text_in.remote, sess_id=lts.id)
        self.tts_say_done = partial(lta.lt_actr.tts_say_done.remote, sess_id=lts.id, direction=xua.direction)
        vad_handler = STTProxy(lta, xua, text_cb, vad_cb, xua.direction)
        self.rsess_pause = partial(xua.rtp_actr.rtp_session_connect.remote, xua.rtp_sess_id,
                                   AudioInput(vad_chunk_in=vad_handler))
        ysoundout = partial(yua.rtp_actr.rtp_session_soundout.remote, yua.rtp_sess_id)
        self.rsess_connect = partial(xua.rtp_actr.rtp_session_connect.remote, xua.rtp_sess_id,
                                     AudioInput(yua.rtp_sess_id, vad_handler))
        self.translator = xua.translator
        self.get_speaker = (lambda: None) if xua.speakers is None else partial(choice, xua.speakers)
        self.sip_sess_term = partial(lta.sip_actr.sess_term.remote, xua.sip_sess_id)
        self.stt_sess_term = partial(lta.stt_actr.stt_session_end.remote, xua.stt_sess_id)
        self.tts_sess_term = xua.tts_sess.end
        self.tts_say = xua.tts_sess.say
        self.tts_soundout = TTSProxy(ysoundout)

    def sess_term(self):
        self.stt_sess_term()
        self.tts_sess_term()

class Sessions():
    info: Tuple[SessionInfo]
    def __init__(self, lts:'LTSession', lta:'LTActor', xua:'Sess', yua:'Sess'):
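        # Build the two directional halves (A->B and B->A), connect each leg's
        # inbound RTP to its VAD/STT handler, and point each TTS session at the
        # opposite leg's RTP output.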
        self.info = (
            SessionInfo(lts, lta, xua, yua),
            SessionInfo(lts, lta, yua, xua),
        )
        for i, u in zip(self.info, (xua, yua)):
            i.rsess_connect()
            #i.rsess_pause()
            u.tts_sess.start(i.tts_soundout)

class Sess():
    direction: int
    sip_sess_id: UUID
    rtp_sess_id: UUID
    tts_sess: RemoteTTSSession
    stt_sess_id: UUID
    rtp_actr: InfernRTPActor
    tts_actr: InfernTTSActor
    translator: Optional[Translator]
    def __init__(self, lta:'LTActor', direction:int):
        self.direction = direction
        tts_lang, stt_lang = lta.tts_langs[direction], lta.stt_langs[direction]
        self.speakers = get_top_speakers(tts_lang)
        self.tts_lang, self.stt_lang = tts_lang, stt_lang
        self.translator = lta.translators[direction]
        self.tts_sess = RemoteTTSSession(lta.tts_actrs[tts_lang])

def _sess_term(*args, sterm:callable, sess_id:UUID, sip_sess_id:UUID):
    return sterm(sess_id, sip_sess_id, relaxed=True)

class LTSession():
    debug = False
    id: UUID
    alice: Sess
    bob: Sess
    say_buffer: Dict[int, List[TTSRequest]]

    def __init__(self, lta, new_sess:RemoteSessionOffer):
        self.id = uuid4()
        self.say_buffer = {0:[], 1:[]}
        lt_prof: 'LTProfile' = lta.lt_prof
        dest_number = dict(x.split('=', 1) for x in lt_prof.outbount_params.split(';'))['cld']
        #dest_number = '205'
        #dest_number = '601'
        sess_term_alice = partial(_sess_term, sterm=lta.lt_actr.sess_term.remote, sess_id=self.id, sip_sess_id=new_sess.sip_sess_id)
        amsg = RemoteSessionAccept(disc_cb=sess_term_alice, auto_answer=False)
        try:
            rtp_alice = ray.get(new_sess.accept(msg=amsg))
        except KeyError:
            print(f'Failed to accept {new_sess.sip_sess_id=}')
            return
        sess_term_bob = partial(_sess_term, sterm=lta.lt_actr.sess_term.remote, sess_id=self.id, sip_sess_id=None)
        bmsg = NewRemoteSessionRequest(cld=dest_number, sip_prof=lt_prof.outbound_conn, disc_cb=sess_term_bob)
        bmsg.conn_sip_sess_id = new_sess.sip_sess_id
        sip_sess_id_bob = lta.sip_actr.new_sess.remote(msg=bmsg)
        ssess = [lta.stt_actr.new_stt_session.remote(keep_context=True) for _ in lta.stt_langs]

        alice = Sess(lta, 0)
        bob = Sess(lta, 1)

        #alice.tts_sess, bob.tts_sess = [RemoteTTSSession(lta.tts_actrs[lang]) for lang in lta.tts_langs]

        alice.sip_sess_id = new_sess.sip_sess_id
        alice.rtp_actr, alice.rtp_sess_id = rtp_alice
        bob.sip_sess_id, bob.rtp_actr, bob.rtp_sess_id = ray.get(sip_sess_id_bob)
        alice.stt_sess_id, bob.stt_sess_id = ray.get(ssess)
        self.fabric = Sessions(self, lta, alice, bob)
        if self.debug: print(f'{alice=} {bob=} {self.fabric=}')
        self.alice, self.bob = alice, bob

    def sess_term(self, sip_sess_id):
        for i in self.fabric.info:
            i.sess_term()
        if sip_sess_id == self.alice.sip_sess_id:
            self.fabric.info[1].sip_sess_term()
        else:
            self.fabric.info[0].sip_sess_term()

    def text_in(self, result:STTResult):
        sdir = 'A->B' if result.direction == 0 else 'B->A'
        print(f'STT: {sdir} "{result.text=}" {result.no_speech_prob=}')
        nsp = result.no_speech_prob
        if nsp > STTRequest.max_ns_prob: return
        sinfo = self.fabric.info[result.direction]
        text = sinfo.translator(result.text)
        speaker_id = sinfo.get_speaker()
        #sinfo.rsess_pause()
        print(f'TTS: {sdir} "{text=}" {speaker_id=}')
        text = sent_tokenize(text)
        out_sents = [text.pop(0),]
        for t in text:
            if len(out_sents[-1]) + len(t) < 128 or out_sents[-1].endswith(' i.e.'):
                out_sents[-1] += ' ' + t
            else:
                out_sents.append(t)

        print(f'TTS split: "{out_sents=}" {[len(t) for t in out_sents]=}')
        tts_req = ray.put(TTSRequest(out_sents, speaker_id=speaker_id, done_cb=sinfo.tts_say_done))
        self.say_buffer[result.direction].append(tts_req)
        if len(self.say_buffer[result.direction]) > 1:
            return
        sinfo.tts_say(tts_req)
        return

    def tts_say_done(self, direction:int):
        if self.debug: print(f'tts_say_done({direction=})')
        tbuf = self.say_buffer[direction]
        tbuf.pop(0)
        if len(tbuf) > 0:
            self.fabric.info[direction].tts_say(tbuf[0])
            return
--------------------------------------------------------------------------------
/Cluster/InfernBatchedWorker.py:
--------------------------------------------------------------------------------
from typing import Optional, List
from queue import Queue, Empty as QueueEmpty
from abc import ABC, abstractmethod

from Core.InfernWrkThread import InfernWrkThread, RTPWrkTRun

class InfernBatchedWorker(InfernWrkThread, ABC):
    max_batch_size: int
    inf_queue: Queue[Optional[object]]
    def __init__(self):
        super().__init__()
        self.inf_queue = Queue()

    def infer(self, wi:object):
        self.inf_queue.put(wi)

    def next_batch(self) -> List[object]:
        wis = []
        while len(wis) < self.max_batch_size:
            if len(wis) == 0:
                wi = self.inf_queue.get()
            else:
                try: wi = self.inf_queue.get_nowait()
                except QueueEmpty: break
            if wi is None:
                return None
            wis.append(wi)
        return wis

    @abstractmethod
    def process_batch(self, wis:List[object]): pass

    def run(self):
        super().thread_started()
        while self.get_state() == RTPWrkTRun:
            wis = self.next_batch()
            if wis is None:
                break
            for wi in (wi for wi in wis if hasattr(wi, '_proc_start_cb')):
                wi._proc_start_cb()
            self.process_batch(wis)

    def stop(self):
        self.inf_queue.put(None)
        super().stop()
--------------------------------------------------------------------------------
/Cluster/InfernLLMActor.py:
--------------------------------------------------------------------------------
from typing import Dict
from uuid import UUID
from queue import Queue

import ray

from Cluster.InfernLLMWorker import InfernLLMWorker
from Cluster.LLMSession import LLMSession, LLMRequest, LLMInferRequest, LLMSessionParams

@ray.remote(num_gpus=1.0, resources={"llm": 1})
class InfernLLMActor():
    debug = True
    sessions: Dict[UUID, LLMSession]
    llm: InfernLLMWorker

    def __init__(self):
        super().__init__()
        self.sessions = {}

    def start(self):
        for device in ('xpu', 'cuda', 'cpu'):
            try:
                self.llm = InfernLLMWorker(device)
            except (ValueError, RuntimeError):
                continue
            break
        else:
            raise RuntimeError('Failed to initialize LLM')
        self.llm.start()
        tq = Queue()
        def res_cb(result): tq.put(result)
        irs = tuple(LLMInferRequest(LLMRequest('What is your name?', None), [{}])
                    for _ in range(self.llm.max_batch_size))
        for _i in irs: _i.textout_cb = res_cb
        with self.llm.inf_queue.mutex:
            for ir in irs:
                self.llm.inf_queue.queue.append(ir)
            self.llm.inf_queue.not_empty.notify()
        for _ in irs:
            tq.get()

    def stop(self):
        self.llm.stop()

    def new_llm_session(self, sconf:LLMSessionParams):
        if self.debug: print('InfernLLMActor.new_llm_session')
        sess = LLMSession(self.llm, sconf)
        self.sessions[sess.id] = sess
        return sess.id

    def llm_session_end(self, sess_id):
        if self.debug: print('InfernLLMActor.llm_session_end')
        sess = self.sessions[sess_id]
        sess.stop()
        del self.sessions[sess_id]

    def llm_session_textin(self, sess_id, req:LLMRequest):
        if self.debug: print('InfernLLMActor.llm_session_textin')
        sess = self.sessions[sess_id]
        sess.textin(req)
        return sess_id

    def llm_session_context_add(self, sess_id, content:str, role:str = 'user'):
        if self.debug: print('InfernLLMActor.llm_session_context_add')
        sess = self.sessions[sess_id]
        sess.context_add(content, role)
        return sess_id
--------------------------------------------------------------------------------
/Cluster/InfernLLMWorker.py:
--------------------------------------------------------------------------------
from typing import Tuple, List, Iterator
from os.path import exists as path_exists
from itertools import chain
from functools import partial

import torch
import torch.nn.functional as F

from transformers import TextStreamer

from Cluster.InfernBatchedWorker import InfernBatchedWorker
from Cluster.InfernTTSWorker import get_torch_hw
from Cluster.LLMSession import LLMResult, LLMInferRequest

class ResultsStreamer(TextStreamer):
    debug = False
    sync_on = ('. ', '? ', '! ', '\n')
    decode_batch_size = 8
    def __init__(self, wis:List[LLMInferRequest], upper:'InfernLLMWorker'):
        super().__init__(tokenizer=upper.llm_tokenizer)
        self.wi_cbs = tuple(wi.textout_cb for wi in wis)
        self.newLLMResult = tuple(partial(LLMResult, req_id=wi.req.id) for wi in wis)
        batch_size = len(wis)
        self.oposs = [0 for _ in range(batch_size)]
        self.current_tokens = None
        self.batch_decode = partial(upper.llm_tokenizer.batch_decode, skip_special_tokens=True)

    def put(self, token_ids):
        if self.current_tokens is None:
            self.current_tokens = torch.zeros((token_ids.shape[0], 0), dtype=torch.long)
            return
        if token_ids.dim() == 1:  # Shape [batch_size]
            token_ids = token_ids.unsqueeze(1)
        self.current_tokens = torch.cat([self.current_tokens, token_ids], dim=1)
        if self.current_tokens.shape[1] % self.decode_batch_size == 0:
            return
        results = self.batch_decode(self.current_tokens)
        for (ir, r), op, cb, newLR in zip(enumerate(results), self.oposs, self.wi_cbs, self.newLLMResult):
            new_content = r[op:]
            if len(new_content) == 0: continue
            sp = (op + pos + len(c) for c in self.sync_on if (pos:=new_content.rfind(c)) >= 0)
            try:
                spos = next(sp)
            except StopIteration:
                continue
            r = r[op:spos-1]
            if len(r) < 10: continue
            cb(result=newLR(r))
            self.oposs[ir] = spos
        if self.debug:
            print(f'{self.oposs=} {self.current_tokens.shape=}')

    def end(self):
        if self.debug:
            print(f'finished: {self.current_tokens.shape=}')
        results = self.batch_decode(self.current_tokens)
        for r, op, cb, newLR in zip(results, self.oposs, self.wi_cbs, self.newLLMResult):
            if len(r) == op: continue
            cb(result=newLR(r[op:]))
        del self.current_tokens
        del self.wi_cbs

class InfernLLMWorker(InfernBatchedWorker):
    model_name = "Qwen/Qwen2.5-14B-Instruct"
    model_cache_dir = f"/tmp/saved_model.{model_name}"
    max_batch_size: int = 8
    debug = True
    llm_model: object
    llm_tokenizer: object
    output_sr: int

    def __init__(self, device=None):
        from warnings import filterwarnings
        filterwarnings("ignore", category=FutureWarning)
        filterwarnings("ignore", category=UserWarning)
        from transformers import AutoTokenizer
        from ipex_llm.transformers import AutoModelForCausalLM
        super().__init__()
        if device is None:
            device = get_torch_hw()
        def load_model(mn):
            m = AutoModelForCausalLM.from_pretrained(mn, torch_dtype="auto",
                                                     device_map="auto",
                                                     optimize_model=True,
                                                     trust_remote_code=True,
                                                     load_in_4bit=True,
                                                     use_cache=True
                                                     )
            if mn != self.model_cache_dir:
                m.save_low_bit(self.model_cache_dir)
            return m.to(device)
        if path_exists(self.model_cache_dir):
            try:
                model = AutoModelForCausalLM.load_low_bit(self.model_cache_dir,
                                                          trust_remote_code=True)
            except Exception:
                model = load_model(self.model_name)
        else:
            model = load_model(self.model_name)
        self.llm_model = model.to(device)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def process_batch(self, wis:List[LLMInferRequest]):
        if self.debug:
            print(f'InfernLLMWorker.process_batch: got {len(wis)=}')
        streamer = ResultsStreamer(wis, self)
        with torch.no_grad():
            messages = [self.llm_tokenizer.apply_chat_template(list(r.context), tokenize=False,
                                                               add_generation_prompt=True)
                        for r in wis]
            model_inputs = self.llm_tokenizer(messages, return_tensors="pt", padding=True).to(self.llm_model.device)
            self.llm_model.generate(
                **model_inputs,
                max_new_tokens=16 * 1024,
                output_scores=True,
                return_dict_in_generate=True,
                streamer=streamer,
            )
            torch.xpu.synchronize()
--------------------------------------------------------------------------------
/Cluster/InfernRTPActor.py:
--------------------------------------------------------------------------------
#try: import intel_extension_for_pytorch as ipex
#except ModuleNotFoundError: ipex = None

from typing import Dict, Union, List
from uuid import UUID
from _thread import get_ident

from ray import ray

from sippy.Network_server import RTP_port_allocator

from config.InfernGlobals import InfernGlobals as IG
from Core.AudioChunk import AudioChunk
from Core.AStreamMarkers import ASMarkerGeneric
from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr
from RTP.InfernRTPIngest import InfernRTPIngest
from RTP.InfernRTPEPoint import InfernRTPEPoint
from RTP.AudioInput import AudioInput
from RTP.RTPParams import RTPParams
from RTP.InfernRTPConf import InfernRTPConf

class RTPSessNotFoundErr(InfernSessNotFoundErr): pass

@ray.remote(num_gpus=0.01, resources={"rtp": 1})
class InfernRTPActor():
    devices = ('mps', 'cuda', 'cpu')
    device: str
    sessions: Dict[UUID, InfernRTPEPoint]
    thumbstones: List[UUID]
    ring: InfernRTPIngest
    palloc: RTP_port_allocator
    inf_rc: InfernRTPConf
    def __init__(self, inf_rc:InfernRTPConf):
        self.sessions = {}
        self.thumbstones = []
        self.inf_rc = inf_rc

    def new_rtp_session(self, rtp_params:RTPParams):
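        # Create a new RTP endpoint; returns (endpoint UUID, local RTP listen address).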
        print(f'{IG.stdtss()}: new_rtp_session')
        rep = InfernRTPEPoint(self.inf_rc, rtp_params, self.ring, self._get_direct_soundout)
        self.sessions[rep.id] = rep
        return (rep.id, rep.rserv.uopts.laddress)

    def rtp_session_connect(self, rtp_id, ain:AudioInput):
        print(f'{IG.stdtss()}: rtp_session_connect[{str(rtp_id)[:6]}]')
        rep = self._get_session(rtp_id)
        rep.connect(ain)

    def rtp_session_end(self, rtp_id, relaxed:bool=False):
        print(f'{IG.stdtss()}: rtp_session_end')
        try:
            rep = self._get_session(rtp_id)
        except RTPSessNotFoundErr:
            if relaxed or rtp_id in self.thumbstones: return
            raise
        rep.writer.end()

    def rtp_session_soundout(self, rtp_id, chunk:Union[AudioChunk, ASMarkerGeneric]):
        try:
            rep = self._get_session(rtp_id)
        except RTPSessNotFoundErr:
            if rtp_id in self.thumbstones:
                return
            raise
        return rep.soundout(chunk)

    def _get_direct_soundout(self, rtp_id):
        rep = self._get_session(rtp_id)
        return rep.soundout

    def rtp_session_join(self, rtp_id):
        print(f'{IG.stdtss()}: rtp_session_join')
        rep = self._get_session(rtp_id)
        rep.shutdown()
        del self.sessions[rtp_id]
        self.thumbstones.append(rtp_id)
        if len(self.thumbstones) > 100:
            self.thumbstones = self.thumbstones[-100:]

    def rtp_session_update(self, rtp_id, rtp_params:RTPParams):
        print(f'{IG.stdtss()}: rtp_session_update')
        rep = self._get_session(rtp_id)
        rep.update(rtp_params)

    def start(self):
        for device in self.devices:
            self.ring = InfernRTPIngest(device)
            try:
                self.ring.start()
            except (AssertionError, RuntimeError):
                print(f'{device} did not work')
                continue
            self.device = device
            break
        else:
            raise RuntimeError('No suitable device found')

    def loop(self):
        from sippy.Core.EventDispatcher import ED2
        ED2.my_ident = get_ident()
        rval = ED2.loop()
        self.ring.stop()
        self.ring.join()
        return rval

    def stop(self):
        from sippy.Core.EventDispatcher import ED2
        ED2.callFromThread(ED2.breakLoop, 0)

    def _get_session(self, rtp_id:UUID) -> InfernRTPEPoint:
        try: return self.sessions[rtp_id]
        except KeyError: raise RTPSessNotFoundErr(f'No RTP session found for {rtp_id}')
--------------------------------------------------------------------------------
/Cluster/InfernSIPActor.py:
--------------------------------------------------------------------------------
from typing import Optional
from _thread import get_ident
from queue import Queue

import ray

#from Core.InfernConfig import InfernConfig
from SIP.InfernSIP import InfernSIP
from SIP.RemoteSession import RemoteSessionAccept, NewRemoteSessionRequest
from Cluster.InfernRTPActor import InfernRTPActor

@ray.remote(resources={"head": 0.5})
class InfernSIPActor():
    sip_stack: InfernSIP
    default_resources = {'head':1, 'stt': 1, 'tts':1, 'rtp': 1}
    def loop(self, inf_c:'InfernConfig'):
        #raise Exception("BP")
        from sippy.Core.EventDispatcher import ED2
        ED2.my_ident = get_ident()
        rtp_actr = self.rtp_actr = InfernRTPActor.options(max_concurrency=2).remote(inf_c.rtp_conf)
        sip_actr = ray.get_runtime_context().current_actor
        ray.get(rtp_actr.start.remote())
        self.sip_stack = InfernSIP(sip_actr, rtp_actr, inf_c)
        rtp_actr.loop.remote()
        rval = ED2.loop()
        ray.get(rtp_actr.stop.remote())
        return rval

    def new_sess(self, msg:NewRemoteSessionRequest):
        from sippy.Core.EventDispatcher import ED2
        rval = Queue()
        ED2.callFromThread(self.sip_stack.new_session, msg, rval)
        sip_sess, rtp_sess = rval.get()
        return (sip_sess.id, self.rtp_actr, rtp_sess.sess_id)

    def new_sess_accept(self, sip_sess_id, msg:RemoteSessionAccept):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        rval = Queue()
        ED2.callFromThread(sip_sess.accept, msg, rval)
        rtp_sess = rval.get()
        return (self.rtp_actr, rtp_sess.sess_id)

    def new_sess_reject(self, sip_sess_id):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        ED2.callFromThread(sip_sess.reject)

    def sess_term(self, sip_sess_id):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        ED2.callFromThread(sip_sess.sess_term)

    def sess_event(self, sip_sess_id, event, **kwargs):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        event.kwargs = kwargs
        ED2.callFromThread(sip_sess.recvEvent, event)

    def stop(self):
        from sippy.Core.EventDispatcher import ED2
        ED2.callFromThread(ED2.breakLoop, 0)
--------------------------------------------------------------------------------
/Cluster/InfernSTTActor.py:
--------------------------------------------------------------------------------
#try: import intel_extension_for_pytorch as ipex
#except ModuleNotFoundError: ipex = None

from typing import Dict, Union
from uuid import UUID

import ray

from Cluster.InfernSTTWorker import InfernSTTWorker
from Cluster.STTSession import STTSession, STTRequest, STTSentinel

@ray.remote(num_gpus=0.25, resources={"stt": 1})
class InfernSTTActor():
    debug = False
    sessions: Dict[UUID, STTSession]
    stt: InfernSTTWorker

    def __init__(self):
        super().__init__()
        self.sessions = {}

    def start(self):
        from sys import stderr
        for device in ('xpu', 'cuda', 'cpu'):
            try:
                self.stt = InfernSTTWorker(device)
            except (ValueError, RuntimeError):
                print(f'Failed to initialize STT with {device=}', file=stderr)
                continue
            break
        else:
            raise RuntimeError('Failed to initialize STT')
        self.stt.start()

    def stop(self):
        self.stt.stop()

    def new_stt_session(self, keep_context:bool=False):
        if self.debug: print('InfernSTTActor.new_stt_session')
        sess = STTSession(self.stt, keep_context)
        self.sessions[sess.id] = sess
        return sess.id

    def stt_session_end(self, sess_id):
        if self.debug: print('InfernSTTActor.stt_session_end')
        sess = self.sessions[sess_id]
        sess.stop()
        del self.sessions[sess_id]

    def stt_session_soundin(self, sess_id, req:Union[STTRequest,STTSentinel]):
        if self.debug: print('InfernSTTActor.stt_session_soundin')
        sess = self.sessions[sess_id]
        sess.soundin(req)
--------------------------------------------------------------------------------
/Cluster/InfernSTTWorker.py:
--------------------------------------------------------------------------------
from typing import Tuple, List
from os.path import expanduser, exists as path_exists
from subprocess import Popen, PIPE
from functools import partial

import ctranslate2
import transformers
from methodtools import lru_cache

import torch
from torch.nn import functional as F

from Cluster.STTSession import STTRequest, STTResult
from Cluster.InfernBatchedWorker import InfernBatchedWorker

class InfernSTTWorker(InfernBatchedWorker):
    max_batch_size: int = 4
    max_chunk_duration: float = 32.0
    model: ctranslate2.models.Whisper
    processor: transformers.WhisperProcessor
    device: str
    cache_dir: str = '~/.cache/Infernos'
    sample_rate: int = 16000
    debug = False
    def __init__(self, device: str, model_name: str = "openai/whisper-large-v3"):
        super().__init__()
        if device != 'xpu':
            cache_dir = expanduser(f'{self.cache_dir}/{model_name}.ct2')
            if not any((path_exists(f'{cache_dir}/{_c}') for _c in ('model.bin', 'config.json', 'vocabulary.json'))):
                print(f'Converting "{model_name}" to "{cache_dir}"...')
                command = ['ct2-transformers-converter', '--model', model_name, '--output_dir', cache_dir]
                process = Popen(command, stdout=PIPE, stderr=PIPE)
                stdout, stderr = process.communicate()
                if process.returncode != 0:
                    raise RuntimeError(f'{command[0]} failed with {process.returncode=}, {stdout=}, {stderr=}')
            self.model = ctranslate2.models.Whisper(cache_dir, device=device, compute_type="int8")
        else:
            from warnings import filterwarnings
            filterwarnings("ignore", category=FutureWarning)
            filterwarnings("ignore", category=UserWarning)
            from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_name,
                load_in_4bit=True,
                torch_dtype="auto",
                device_map="auto",
                optimize_model=True,
                trust_remote_code=True,
                use_cache=True
            )
            self.model = model.to(device)
        self.processor = transformers.WhisperProcessor.from_pretrained(model_name)
        if device == 'xpu':
            self.no_speech_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|nospeech|>")
            self.process_audios = partial(self.processor, return_tensors="pt")
        else:
            self.process_audios = partial(self.processor, return_tensors="np")
        self.device = device
        self.infer_and_decode = partial(self.infer_and_decode_ct2 if device != 'xpu' else self.infer_and_decode_torch)

    def infer_and_decode_ct2(self, prompts, inputs, max_nsps):
        input_features = inputs.input_features
        features = ctranslate2.StorageView.from_array(input_features)
        try:
            results = self.model.generate(features, prompts, return_no_speech_prob=True)
        except RuntimeError as e:
            if 'out of memory' not in str(e) or len(prompts) == 1: raise
            torch.cuda.empty_cache()
            results = []
            for _if, _pr in zip(input_features, prompts):
                features = ctranslate2.StorageView.from_array([_if,])
                results.extend(self.model.generate(features, [_pr], return_no_speech_prob=True))
        decoded_results = ((self.processor.decode(r.sequences_ids[0]), r.no_speech_prob, r.sequences_ids[0])
                           for r in results)
        return decoded_results

    def infer_and_decode_torch(self, prompts, inputs, max_nsps):
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        max_len = max(len(t) for t in prompts)
        prompts = torch.stack([
            F.pad(t, (0, max_len - t.size(0)), value=self.processor.tokenizer.pad_token_id)
            for t in (torch.tensor(pr, device=self.device) for pr in prompts)
        ])
        with torch.no_grad():
            forward_outputs = self.model(
                **inputs,
                decoder_input_ids=prompts,
            )
        logprobs = forward_outputs.logits[:, 0].log_softmax(-1)
        no_speech_probs = logprobs[:, self.no_speech_token_id].exp().tolist()
        if all(nsp > max_nsp for nsp, max_nsp in zip(no_speech_probs, max_nsps)):
            return (('', nsp, []) for nsp in no_speech_probs)
        with torch.no_grad():
            gen_outputs = self.model.generate(
                **inputs,
                decoder_input_ids=prompts,
                return_dict_in_generate=True,
                output_scores=True,
            )
        gen_seq = gen_outputs.sequences
        decoded_texts = self.processor.batch_decode(gen_seq, skip_special_tokens=True)
        decoded_results = (
            (text.strip(), nsp, gos.tolist()) for text, nsp, gos in
            zip(decoded_texts, no_speech_probs, gen_seq)
        )
        torch.xpu.synchronize()
        return decoded_results

    def process_batch(self, wis:List[Tuple[STTRequest, callable, List[int]]]):
        if self.debug:
            print(f'InfernSTTWorker.process_batch: got {len(wis)=}')
        assert all(wi[0].chunk.samplerate == self.sample_rate for wi in wis)
        audios = [wi[0].chunk.audio for wi in wis]
        inputs = self.process_audios(audios, sampling_rate=self.sample_rate)
        prompts = self.get_prompt(tuple((wi[0].lang, wi[0].mode, wi[0].timestamps) for wi in wis))
        max_nsps = [wi[0].max_ns_prob for wi in wis]
        good_results = self.infer_and_decode(prompts, inputs, max_nsps)
        for (wi, text_cb, c), (r, nsp, t) in zip(wis, good_results):
            # Remove leading and trailing space: "WhitespaceTokenizer adds a space at the beginning?" (copilot)
            if len(r) > 0 and r[0] == ' ': r = r[1:]
            if c is not None: c[:] = (c + t)[:-224]
            res = STTResult(text=r, no_speech_prob=nsp, req=wi)
            text_cb(result = res)

    @lru_cache(maxsize=16)
    def get_prompt(self, options:Tuple[Tuple[str, str, bool]]):
        prompt = tuple(self.processor.tokenizer.convert_tokens_to_ids(
            [
                "<|startoftranscript|>",
                f"<|{language}|>",
                f"<|{mode}|>",
            ] + ([] if timestamps else ["<|notimestamps|>"])
        ) for language, mode, timestamps in options)
        return prompt
--------------------------------------------------------------------------------
/Cluster/InfernTTSActor.py:
--------------------------------------------------------------------------------
#try: import intel_extension_for_pytorch as ipex
#except ModuleNotFoundError: ipex = None

from typing import Dict, Optional
from uuid import UUID

import ray

from Cluster.TTSSession import TTSSession, TTSRequest
from Cluster.InfernTTSWorker import InfernTTSWorker

@ray.remote(num_gpus=0.25, resources={"tts": 1})
class InfernTTSActor():
    sessions: Dict[UUID, TTSSession]
    tts: InfernTTSWorker

    def __init__(self):
        super().__init__()
        self.sessions = {}

    def start(self, lang:str='en', output_sr:int=16000, device=None):
        self.tts = InfernTTSWorker(lang, output_sr, device)
        self.tts.start()

    def stop(self):
        self.tts.stop()

    def get_rand_voice_id(self) -> int:
        return self.tts.get_rand_voice_id()

    def new_tts_session(self):
        tts_actr = ray.get_runtime_context().current_actor
        rgen = TTSSession(self.tts, tts_actr)
        self.sessions[rgen.id] = rgen
        return rgen.id

    def tts_session_start(self, rgen_id, soundout:callable):
        rgen = self.sessions[rgen_id]
rgen.start(soundout) 40 | 41 | def tts_session_say(self, rgen_id, req:TTSRequest): 42 | rgen = self.sessions[rgen_id] 43 | return rgen.say(req) 44 | 45 | def tts_session_stop_saying(self, rgen_id:UUID, rsay_id:UUID): 46 | rgen = self.sessions[rgen_id] 47 | return rgen.stop_saying(rsay_id) 48 | 49 | def tts_session_end(self, rgen_id): 50 | rgen = self.sessions[rgen_id] 51 | rgen.stop() 52 | del self.sessions[rgen_id] 53 | -------------------------------------------------------------------------------- /Cluster/InfernTTSWorker.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | 5 | from Cluster.InfernBatchedWorker import InfernBatchedWorker 6 | from HelloSippyTTSRT.HelloSippyRTPipe import HelloSippyRTPipe, HelloSippyPlayRequest, \ 7 | HelloSippyPipeState, HelloSippyPipeStateBatched 8 | 9 | def get_ja_T5Processor(device, model_name): 10 | from utils.speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer 11 | from transformers import SpeechT5Processor, SpeechT5FeatureExtractor 12 | 13 | print(f'get_ja_T5Processor: device = {device}, model_name = {model_name}') 14 | tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained(model_name) 15 | tokenizer._in_target_context_manager = False 16 | tokenizer.split_special_tokens = True 17 | tokenizer._added_tokens_encoder = {} 18 | tokenizer._unk_token = None 19 | feature_extractor = SpeechT5FeatureExtractor.from_pretrained(model_name) 20 | return SpeechT5Processor(feature_extractor, tokenizer) 21 | 22 | class cleanup_text_eu(): 23 | replacements = [("Ä", "E"), ("Æ", "E"), ("Ç", "C"), ("É", "E"), ("Í", "I"), ("Ó", "O"), ("Ö", "E"), ("Ü", "Y"), ("ß", "S"), 24 | ("à", "a"), ("á", "a"), ("ã", "a"), ("ä", "e"), ("å", "a"), ("ë", "e"), ("í", "i"), ("ï", "i"), ("ð", "o"), ("ñ", "n"), 25 | ("ò", "o"), ("ó", "o"), ("ô", "o"), ("ö", "u"), ("ú", "u"), ("ü", "y"), ("ý", "y"), ("Ā", "A"), ("ā", "a"), ("ă", "a"), 26 | ("ą", "a"), ("ć", "c"), ("Č", "C"), ("č", "c"), ("ď", "d"), ("Đ", "D"), ("ę", "e"), ("ě", "e"), ("ğ", "g"), ("İ", "I"), 27 | ("О", "O"), ("Ł", "L"), ("ń", "n"), ("ň", "n"), ("Ō", "O"), ("ō", "o"), ("ő", "o"), ("ř", "r"), ("Ś", "S"), ("ś", "s"), 28 | ("Ş", "S"), ("ş", "s"), ("Š", "S"), ("š", "s"), ("ū", "u"), ("ź", "z"), ("Ż", "Z"), ("Ž", "Z"), ("ǐ", "i"), ("ǐ", "i"), 29 | ("ș", "s"), ("ț", "t"), ("ù", "u"), 30 | ] 31 | r_from, r_to = [''.join(x) for x in zip(*replacements)] 32 | replacements = str.maketrans(r_from, r_to) 33 | 34 | def __call__(self, text): 35 | return text.translate(self.replacements) 36 | 37 | lang2model = {'en': {'cleanup_text':cleanup_text_eu()}, 38 | 'it': {'model':'Sandiago21/speecht5_finetuned_voxpopuli_it', 'cleanup_text':cleanup_text_eu()}, 39 | 'es': {'model':'Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish', 'cleanup_text':cleanup_text_eu()}, 40 | 'fr': {'model':'Sandiago21/speecht5_finetuned_facebook_voxpopuli_french', 'cleanup_text':cleanup_text_eu()}, 41 | 'de': {'model':'JFuellem/speecht5_finetuned_voxpopuli_de', 'cleanup_text':cleanup_text_eu()}, 42 | 'pt': {'model':'evertonaleixo/speecht5_finetuned_fleurs_ptbr', 'cleanup_text':cleanup_text_eu()}, 43 | 'ru': {'model':'zaebee/speecht5_tts_common_ru'}, 44 | 'ja': {'model': 'esnya/japanese_speecht5_tts', 'get_processor': get_ja_T5Processor}, 45 | } 46 | 47 | def get_torch_hw(): 48 | if torch.cuda.is_available(): 49 | return 'cuda' 50 | if hasattr(torch, 'xpu') and torch.xpu.is_available(): 51 | return 'xpu' 52 | if hasattr(torch, 'mps'): 53 | return 'mps' 54 | raise 
AttributeError('Could not find CUDA/XPU/MPS devices')
55 | 
56 | class InfernTTSWorker(InfernBatchedWorker):
57 |     max_batch_size: int = 8
58 |     debug = False
59 |     tts_engine: HelloSippyRTPipe
60 |     output_sr: int
61 | 
62 |     def __init__(self, lang, output_sr, device=None):
63 |         from warnings import filterwarnings
64 |         filterwarnings("ignore", category=FutureWarning)
65 |         filterwarnings("ignore", category=UserWarning)
66 |         try:
67 |             import intel_extension_for_pytorch as ipex
68 |         except ModuleNotFoundError:
69 |             ipex = None
70 |         super().__init__()
71 |         if device is None:
72 |             device = get_torch_hw()
73 |         tts_engine = HelloSippyRTPipe(device, output_sr=output_sr, **lang2model[lang])
74 |         if ipex is not None:
75 |             for a in ('model', 'vocoder', 'chunker'):
76 |                 x = getattr(tts_engine, a)
77 |                 try: x = ipex.optimize(x)
78 |                 except AttributeError: continue
79 |                 setattr(tts_engine, a, x)
80 |         self.tts_engine = tts_engine
81 |         self.output_sr = output_sr
82 | 
83 |     def process_batch(self, wis:List[HelloSippyPlayRequest]):
84 |         new_states = [HelloSippyPipeState(self.tts_engine, r) for r in wis]
85 |         state = HelloSippyPipeStateBatched(new_states, self.tts_engine)
86 |         while True:
87 |             try:
88 |                 self.tts_engine.infer(state)
89 |             except RuntimeError as e:
90 |                 self.handle_runtime_error(e, wis, state)
91 |                 raise
92 |             if not self.tts_engine.unbatch_and_dispatch(state): break
93 | 
94 |     def handle_runtime_error(self, e, wis:List[HelloSippyPlayRequest], state):
95 |         print(f'InfernTTSWorker.handle_runtime_error: {e}')
96 |         affected = [(d, w) for d, w in zip(state, wis) if d.dispatch is not None]  # XXX: collected but not acted upon yet
97 | 
98 |     def get_voice(self, *args):
99 |         return self.tts_engine.get_voice(*args)
100 | 
101 |     def get_rand_voice(self):
102 |         return self.tts_engine.get_rand_voice()
103 | 
104 |     def get_rand_voice_id(self):
105 |         return self.tts_engine.get_rand_voice_id()
106 | 
107 | 
-------------------------------------------------------------------------------- /Cluster/LLMSession.py: --------------------------------------------------------------------------------
1 | from typing import List, Tuple, Optional
2 | from time import monotonic
3 | from functools import partial
4 | from uuid import uuid4, UUID
5 | 
6 | class LLMRequest():
7 |     id: UUID
8 |     text: str
9 |     textout_cb: callable
10 |     auto_ctx_add: bool = True
11 |     def __init__(self, text:str, textout_cb:callable):
12 |         self.text, self.textout_cb = text, textout_cb
13 |         self.id = uuid4()
14 | 
15 | class LLMResult():
16 |     req_id: UUID
17 |     text: str
18 |     def __init__(self, text:str, req_id:UUID):
19 |         self.text, self.req_id = text, req_id
20 | 
21 | class LLMInferRequest():
22 |     req: LLMRequest
23 |     context: Tuple[dict]
24 |     textout_cb: callable
25 | 
26 |     def __init__(self, req:LLMRequest, context:List[dict]):
27 |         self.req, self.context = req, tuple(context)
28 | 
29 | class LLMSessionParams():
30 |     system_prompt: str
31 |     def __init__(self, system_prompt:str):
32 |         self.system_prompt = system_prompt
33 | 
34 | class LLMSession():
35 |     id: UUID
36 |     context: List[dict]
37 |     debug: bool = False
38 |     def __init__(self, llm:'InfernLLMWorker', params:LLMSessionParams):
39 |         self.id = uuid4()
40 |         self.context = [{"role": "system", "content": params.system_prompt}]
41 |         self.llm = llm
42 | 
43 |     def context_add(self, content:str, role:str = "user"):
44 |         if self.debug:
45 |             print(f'{monotonic():4.3f}: LLMSession.context_add: {self.context=}, {content=}')
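        # Consecutive turns from the same role are coalesced into a single
        # context entry below; many chat templates reject (or silently
        # mangle) back-to-back messages from one role.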
46 |         if len(self.context) > 0 and self.context[-1]["role"] == role:
47 |             self.context[-1]["content"] += f' {content}'
48 |         else:
49 |             self.context.append({"role": role, "content": content})
50 | 
51 |     def textin(self, req:LLMRequest):
52 |         if self.debug:
53 |             print(f'{monotonic():4.3f}: LLMSession.textin: {req.text=}, {req.textout_cb=} {self.context=}')
54 |         self.context_add(req.text)
55 |         ireq = LLMInferRequest(req, self.context)
56 |         if hasattr(req, '_proc_start_cb'):
57 |             ireq._proc_start_cb = req._proc_start_cb
58 |         ireq.textout_cb = partial(self.textout, req = req)
59 |         self.llm.infer(ireq)
60 | 
61 |     def textout(self, req:LLMRequest, result:LLMResult):
62 |         if self.debug:
63 |             print(f'{monotonic():4.3f}: LLMSession.textout: {result.text=}')
64 |         if req.auto_ctx_add:
65 |             self.context_add(result.text, "assistant")
66 |         req.textout_cb(result = result)
67 | 
68 |     def stop(self):
69 |         if self.debug: print('LLMSession.stop')
70 |         del self.llm
71 | 
-------------------------------------------------------------------------------- /Cluster/RemoteRTPGen.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 | from functools import partial
3 | 
4 | import ray
5 | from ray.exceptions import RayTaskError
6 | 
7 | from Cluster.InfernRTPActor import RTPSessNotFoundErr
8 | from RTP.AudioInput import AudioInput
9 | from RTP.RTPParams import RTPParams
10 | 
11 | class RTPGenError(Exception):
12 |     pass
13 | 
14 | class RemoteRTPGen():
15 |     def __init__(self, rtp_actr, params:RTPParams):
16 |         self.rtp_actr = rtp_actr
17 |         fut = rtp_actr.new_rtp_session.remote(params)
18 |         try: self.sess_id, self.rtp_address = ray.get(fut)
19 |         except RayTaskError as e: raise RTPGenError("new_rtp_session() failed") from e
20 | 
21 |     def connect(self, ain:AudioInput):
22 |         return self.rtp_actr.rtp_session_connect.remote(self.sess_id, ain)
23 | 
24 |     def update(self, params:RTPParams):
25 |         return ray.get(self.rtp_actr.rtp_session_update.remote(self.sess_id, params))
26 | 
27 |     def get_soundout(self) -> callable:
28 |         return partial(self.rtp_actr.rtp_session_soundout.remote, rtp_id=self.sess_id)
29 | 
30 |     def soundout(self, chunk):
31 |         self.rtp_actr.rtp_session_soundout.remote(rtp_id=self.sess_id, chunk=chunk)
32 | 
33 |     def end(self, relaxed:bool=True):
34 |         return self.rtp_actr.rtp_session_end.remote(self.sess_id, relaxed)
35 | 
36 |     def join(self):
37 |         try: ray.get(self.rtp_actr.rtp_session_join.remote(self.sess_id))
38 |         except RTPSessNotFoundErr: pass
39 | 
-------------------------------------------------------------------------------- /Cluster/RemoteTTSSession.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 | from uuid import UUID
3 | 
4 | import ray
5 | from ray.exceptions import RayTaskError
6 | 
7 | from .TTSSession import TTSRequest
8 | 
9 | class TTSSessionError(Exception):
10 |     pass
11 | 
12 | class RemoteTTSSession():
13 |     sess_id: UUID
14 |     def __init__(self, tts_actr):
15 |         super().__init__()
16 |         self.tts_actr = tts_actr
17 |         try: self.sess_id = ray.get(tts_actr.new_tts_session.remote())
18 |         except RayTaskError as e: raise TTSSessionError("new_tts_session() failed") from e
19 | 
20 |     def start(self, soundout:callable):
21 |         return ray.get(self.tts_actr.tts_session_start.remote(self.sess_id, soundout))
22 | 
23 |     def end(self):
24 |         return ray.get(self.tts_actr.tts_session_end.remote(self.sess_id))
25 | 
26 |     def say(self, req:TTSRequest):
27 |         return self.tts_actr.tts_session_say.remote(rgen_id=self.sess_id, req=req)
28 | 
29 |     def stop_saying(self, rsay_id:UUID):
30 |         return 
self.tts_actr.tts_session_stop_saying.remote(rgen_id=self.sess_id, rsay_id=rsay_id) 31 | -------------------------------------------------------------------------------- /Cluster/STTSession.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | from uuid import uuid4, UUID 3 | from fractions import Fraction 4 | from functools import partial 5 | from threading import Lock 6 | from time import monotonic 7 | 8 | from Core.AudioChunk import AudioChunk, VadAudioChunk 9 | 10 | class STTRequest(): 11 | lang: str 12 | chunk: AudioChunk 13 | text_cb: callable 14 | mode: str = 'transcribe' 15 | timestamps: bool = False 16 | stime: float 17 | max_ns_prob: float = 0.5 18 | def __init__(self, chunk:AudioChunk, text_cb:callable, lang:str): 19 | self.stime = monotonic() 20 | self.lang, self.chunk, self.text_cb = lang, chunk, text_cb 21 | 22 | class STTSentinel(): 23 | stime: float 24 | text_cb: callable 25 | def __init__(self, signal:str, text_cb:callable): 26 | self.stime = monotonic() 27 | self.signal, self.text_cb = signal, text_cb 28 | 29 | class STTResult(): 30 | text: str 31 | no_speech_prob: float 32 | duration: Fraction 33 | offsets: Optional[List]=None 34 | inf_time: float 35 | def __init__(self, text:str, no_speech_prob:float, req:STTRequest): 36 | self.text = text 37 | self.no_speech_prob = no_speech_prob 38 | self.duration = Fraction(len(req.chunk.audio), req.chunk.samplerate) 39 | self.inf_time = monotonic() - req.stime 40 | 41 | class STTSession(): 42 | debug = False 43 | id: UUID 44 | lang: str = 'en' 45 | context: List[int] 46 | state_lock: Lock 47 | busy: bool = False 48 | pending: List[STTRequest] 49 | 50 | def __init__(self, stt, keep_context:bool): 51 | super().__init__() 52 | self.id = uuid4() 53 | self.stt = stt 54 | self.state_lock = Lock() 55 | self.context = [] if keep_context else None 56 | self.pending = [] 57 | 58 | def stop(self): 59 | if self.debug: print('STTSession.stop') 60 | with self.state_lock: 61 | del self.stt, self.pending 62 | 63 | def soundin(self, req:Union[STTRequest,STTSentinel]): 64 | if self.debug: 65 | if isinstance(req, STTRequest): 66 | print(f'STTSession.soundin({len(req.chunk.audio)=})') 67 | else: 68 | print(f'STTSession.soundin({req=})') 69 | results = [] 70 | with self.state_lock: 71 | self.pending.append(req) 72 | if self.busy: 73 | return 74 | assert len(self.pending) == 1 75 | self.busy = True 76 | self._process_pending_stt_lckd(results) 77 | for cb, r in results: 78 | cb(result=r) 79 | 80 | def _process_pending_stt_lckd(self, results:List): 81 | while self.pending: 82 | req = self.pending.pop(0) 83 | if isinstance(req, STTRequest): 84 | if isinstance(req.chunk, VadAudioChunk): 85 | nr = next((r for r in self.pending if isinstance(r, STTRequest)), None) 86 | if nr is not None and isinstance(nr.chunk, VadAudioChunk): 87 | ca, cb = req.chunk, nr.chunk 88 | if cb.tpos() + cb.duration() - ca.tpos() < self.stt.max_chunk_duration: 89 | ca.append(cb) 90 | self.pending.remove(nr) 91 | self.pending.insert(0, req) 92 | continue 93 | if req.chunk.samplerate != self.stt.sample_rate: 94 | req.chunk.resample(self.stt.sample_rate) 95 | req.chunk.audio = req.chunk.audio.numpy() 96 | text_cb = partial(self.stt_out, req.text_cb) 97 | self.stt.infer((req, text_cb, self.context)) 98 | break 99 | if all(isinstance(r, STTRequest) for r in self.pending): 100 | results.append((req.text_cb, req)) 101 | else: 102 | self.busy = False 103 | 104 | def stt_out(self, text_cb, result:STTResult): 105 | 
results = [(text_cb, result)] 106 | with self.state_lock: 107 | if not hasattr(self, 'stt'): 108 | return 109 | if self.debug: print(f'STTSession.stt_out({result.text=})') 110 | assert self.busy 111 | self._process_pending_stt_lckd(results) 112 | for cb, r in results: 113 | cb(result=r) 114 | -------------------------------------------------------------------------------- /Cluster/TTSSession.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
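#
# Usage sketch (illustrative only -- assumes an InfernTTSWorker instance
# `tts_wrk` and its owning Ray actor handle `tts_actr` already exist):
#
#   sess = TTSSession(tts_wrk, tts_actr)
#   sess.start(soundout=rtp_soundout)   # rtp_soundout: hypothetical audio sink
#   say_id = sess.say(TTSRequest(["Hello.", "How can I help you today?"]))
#   ...
#   sess.stop_saying(say_id)            # barge-in: cancel mid-utterance
#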
25 | 
26 | from typing import Optional, Union, List, Tuple, Dict
27 | from time import monotonic
28 | from uuid import uuid4, UUID
29 | from queue import Queue
30 | 
31 | import ray
32 | 
33 | from Core.AStreamMarkers import ASMarkerNewSent, ASMarkerGeneric, \
34 |     ASMarkerSentDoneCB
35 | 
36 | from functools import partial
37 | from HelloSippyTTSRT.HelloSippyRTPipe import HelloSippyPlayRequest
38 | from Core.AudioChunk import AudioChunk
39 | from Cluster.InfernTTSWorker import InfernTTSWorker
40 | 
41 | class TTSRequest():
42 |     text: Union[str,List[str],Tuple[str]]
43 |     speaker_id: Optional[int]
44 |     done_cb: Optional[callable]
45 |     def __init__(self, text:Union[str,List[str],Tuple[str]], speaker_id:Optional[int]=None, done_cb:Optional[callable]=None):
46 |         self.text = text
47 |         self.speaker_id = speaker_id
48 |         self.done_cb = done_cb
49 | 
50 | class TTSSndDispatch():
51 |     id: UUID
52 |     debug: bool = False
53 |     cancelled: bool = False
54 |     done_cb: Optional[callable] = None
55 |     cleanup_cb: Optional[callable] = None
56 |     soundout: callable
57 |     output_sr: int
58 |     def __init__(self, soundout:callable, output_sr:int, done_cb:Optional[callable]):
59 |         self.id = uuid4()
60 |         self.soundout, self.output_sr, self.done_cb = soundout, output_sr, done_cb
61 | 
62 |     def cancel(self):
63 |         self.cancelled = True
64 |         chunk = ASMarkerNewSent() if self.done_cb is None \
65 |             else ASMarkerSentDoneCB(self.done_cb, sync=True)
66 |         self.soundout(chunk=chunk)
67 |         if self.cleanup_cb is not None:
68 |             self.cleanup_cb()
69 | 
70 |     def sound_dispatch(self, chunk):
71 |         if self.cancelled:
72 |             return
73 |         do_cleanup = False
74 |         if chunk is None:
75 |             if self.debug:
76 |                 print(f'{monotonic():4.3f}: TTSSndDispatch.sound_dispatch {self.done_cb=}')
77 |             chunk = ASMarkerNewSent() if self.done_cb is None \
78 |                 else ASMarkerSentDoneCB(self.done_cb, sync=True)
79 |             do_cleanup = True
80 |         elif not isinstance(chunk, ASMarkerGeneric):
81 |             assert chunk.size(0) > 0
82 |             chunk=AudioChunk(chunk, self.output_sr)
83 |         self.soundout(chunk=chunk)
84 |         if do_cleanup and self.cleanup_cb is not None:
85 |             self.cleanup_cb()
86 | 
87 | class TTSSession():
88 |     debug = True
89 |     id: UUID
90 |     tts: InfernTTSWorker
91 |     tts_actr: ray.remote
92 |     soundout: callable
93 |     active_req: Dict[UUID, TTSSndDispatch]
94 | 
95 |     def __init__(self, tts:InfernTTSWorker, tts_actr:ray.remote):
96 |         super().__init__()
97 |         self.id = uuid4()
98 |         self.tts, self.tts_actr = tts, tts_actr
99 |         self.active_req = {}
100 | 
101 |     def start(self, soundout:callable):
102 |         self.soundout = soundout
103 | 
104 |     def say(self, req:TTSRequest) -> UUID:
105 |         if self.debug:
106 |             print(f'{monotonic():4.3f}: TTSSession.say: {req.text=}, {req.speaker_id=}, {req.done_cb=}')
107 |         if req.speaker_id is not None:
108 |             speaker = self.tts.get_voice(req.speaker_id)
109 |         else:
110 |             speaker, req.speaker_id = self.tts.get_rand_voice()
111 |         req.text = [req.text] if isinstance(req.text, str) else list(req.text)  # mutable list: pop() below breaks on tuples
112 |         text, done_cb = req.text[0], req.done_cb
113 |         if len(req.text) > 1:
114 |             req.text.pop(0)
115 |             done_cb = partial(self.tts_actr.tts_session_say.remote, rgen_id=self.id, req=req)
116 |         trd = TTSSndDispatch(self.soundout, self.tts.output_sr, done_cb)
117 |         def cleanup_cb():
118 |             if self.debug:
119 |                 print(f'{monotonic():4.3f}: TTSSession.cleanup_cb')
120 |             del self.active_req[trd.id]
121 |         trd.cleanup_cb = cleanup_cb
122 |         preq = HelloSippyPlayRequest(self.id, text, speaker, trd.sound_dispatch)
123 |         self.active_req[trd.id] = trd
124 |         self.tts.infer(preq)
125 |         return trd.id
126 | 
127 | 
def stop_saying(self, rsay_id:UUID): 128 | if self.debug: 129 | print(f'{monotonic():4.3f}: TTSSession.stop_saying: {rsay_id=}') 130 | trd = self.active_req.get(rsay_id) 131 | if trd is None: 132 | return False 133 | trd.cancel() 134 | return True 135 | 136 | def stop(self): 137 | pass 138 | 139 | def __del__(self): 140 | if self.debug: 141 | print('TTSSession.__del__') 142 | -------------------------------------------------------------------------------- /Core/AStreamMarkers.py: -------------------------------------------------------------------------------- 1 | from time import monotonic 2 | 3 | import ray 4 | 5 | from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr 6 | 7 | class ASMarkerGeneric(): 8 | track_id: int 9 | debug: bool = False 10 | def __init__(self, track_id:int=0): 11 | self.track_id = track_id 12 | 13 | class ASMarkerNewSent(ASMarkerGeneric): 14 | # This runs in the context of the RTPOutputWorker thread 15 | def on_proc(self, tro_self, *args): pass 16 | 17 | class ASMarkerSentDoneCB(ASMarkerNewSent): 18 | debug = False 19 | def __init__(self, done_cb:callable, sync:bool=False, **kwargs): 20 | super().__init__(**kwargs) 21 | self.done_cb = done_cb 22 | self.sync = sync 23 | 24 | def on_proc(self, tro_self): 25 | if self.debug: 26 | print(f'{monotonic():4.3f}: ASMarkerSentDoneCB.on_proc') 27 | x = self.done_cb() 28 | if self.sync: 29 | try: ray.get(x) 30 | except InfernSessNotFoundErr: pass 31 | -------------------------------------------------------------------------------- /Core/AudioChunk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import requests 3 | import soundfile as sf 4 | from io import BytesIO 5 | 6 | from config.InfernGlobals import InfernGlobals as IG 7 | 8 | class AudioChunk(): 9 | debug: bool = False 10 | samplerate: int 11 | audio:torch.Tensor 12 | track_id: int = 0 13 | active: bool = True 14 | def __init__(self, audio:torch.Tensor, samplerate:int): 15 | assert isinstance(audio, torch.Tensor) 16 | self.audio = audio 17 | self.samplerate = samplerate 18 | 19 | def resample(self, sample_rate:int): 20 | assert sample_rate != self.samplerate 21 | audio = self.audio.to(torch.float) 22 | audio = IG.get_resampler(self.samplerate, sample_rate, audio.device)(audio).to(self.audio.dtype) 23 | self.samplerate, self.audio = sample_rate, audio 24 | return self 25 | 26 | def duration(self): 27 | return self.audio.size(0) / self.samplerate 28 | 29 | class VadAudioChunk(AudioChunk): 30 | debug: bool = False 31 | ipos: int 32 | def __init__(self, audio:torch.Tensor, samplerate:int, ipos:int): 33 | super().__init__(audio, samplerate) 34 | self.ipos = ipos 35 | 36 | def tpos(self): 37 | return self.ipos / self.samplerate 38 | 39 | def append(self, other:'VadAudioChunk'): 40 | assert self.samplerate == other.samplerate 41 | if self.debug: 42 | print(f'VadAudioChunk.append: {self.ipos=} {self.audio.size(0)=} {other.ipos=} {other.audio.size(0)=}') 43 | sdiff = other.ipos - (self.ipos + self.audio.size(0)) 44 | assert sdiff >= 0 45 | if sdiff > 0: 46 | self.audio = torch.cat((self.audio, torch.zeros(sdiff, dtype=self.audio.dtype, device=self.audio.device)), dim=0) 47 | self.audio = torch.cat((self.audio, other.audio), dim=0) 48 | 49 | class AudioChunkFromURL(AudioChunk): 50 | def __init__(self, url:str, samplerate=8000, dtype=torch.float16, **kwargs): 51 | response = requests.get(url) 52 | sound_bytes = BytesIO(response.content) 53 | audio, samplerate_in = sf.read(sound_bytes) 54 | audio = 
torch.from_numpy(audio).to(dtype) 55 | super().__init__(audio, samplerate_in, **kwargs) 56 | if samplerate_in != samplerate: 57 | self.resample(samplerate) 58 | -------------------------------------------------------------------------------- /Core/Codecs/G711.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import audioop 3 | 4 | from Core.AudioChunk import AudioChunk 5 | from .GenCodec import GenCodec 6 | 7 | _pcm_to_ulaw_ct = torch.zeros(65536, dtype=torch.uint8) 8 | for i in range(-32768, 32768): 9 | pcm_data = i.to_bytes(2, 'little', signed=True) 10 | ulaw_data = audioop.lin2ulaw(pcm_data, 2) 11 | ulaw_value = ulaw_data[0] # Get the byte value from bytes 12 | _pcm_to_ulaw_ct[i + 32768] = ulaw_value # Shift index to make it non-negative 13 | _ulaw_to_pcm_ct = torch.zeros(256, dtype=torch.int16) 14 | for i in range(256): 15 | # Convert each µ-law value back to PCM value 16 | ulaw_byte = i.to_bytes(1, 'little') 17 | pcm_data = audioop.ulaw2lin(ulaw_byte, 2) # Convert µ-law byte to linear PCM 18 | pcm_value = int.from_bytes(pcm_data, 'little', signed=True) 19 | _ulaw_to_pcm_ct[i] = pcm_value 20 | 21 | class G711Codec(GenCodec): 22 | ptype = 0 # G.711u 23 | ename = 'PCMU' 24 | 25 | def encode(self, audio_tensor:torch.Tensor): 26 | # Scale from [-1, 1] to [-32768, 32767] 27 | audio_scaled = torch.clamp(audio_tensor * 32767.0, -32768, 32767).to(torch.int16) 28 | 29 | # Shift and look up in the conversion table 30 | audio_ulaw = _pcm_to_ulaw_ct[(audio_scaled + 32768).long()] 31 | 32 | return audio_ulaw.cpu().numpy().tobytes() 33 | 34 | def decode(self, ulaw_bytes:bytes, resample:bool=True, sample_rate:int=GenCodec.srate): 35 | # Convert byte string to a tensor of uint8 36 | ulaw_tensor = torch.tensor(list(ulaw_bytes), dtype=torch.uint8) 37 | 38 | # Use ulaw_to_pcm table to convert each µ-law value to PCM value 39 | audio_pcm = _ulaw_to_pcm_ct[ulaw_tensor.long()] 40 | 41 | # Scale from [-32768, 32767] to [-1, 1] 42 | audio_float = audio_pcm.float() / 32767.0 43 | 44 | chunk = AudioChunk(audio_float, self.srate) 45 | if resample and sample_rate != self.srate: 46 | chunk.resample(sample_rate) 47 | return chunk 48 | 49 | def device(self): 50 | global _pcm_to_ulaw_ct, _ulaw_to_pcm_ct 51 | assert _pcm_to_ulaw_ct.device == _ulaw_to_pcm_ct.device 52 | return _pcm_to_ulaw_ct.device 53 | 54 | def to(self, device): 55 | global _pcm_to_ulaw_ct, _ulaw_to_pcm_ct 56 | assert _pcm_to_ulaw_ct.device == _ulaw_to_pcm_ct.device 57 | _pcm_to_ulaw_ct = _pcm_to_ulaw_ct.to(device) 58 | _ulaw_to_pcm_ct = _ulaw_to_pcm_ct.to(device) 59 | return self 60 | 61 | def e2d_frames(self, enframes:int, out_srate:int=GenCodec.srate): 62 | assert out_srate % self.srate == 0 63 | return enframes * out_srate // self.srate 64 | 65 | def d2e_frames(self, dnframes:int, in_srate:int=GenCodec.srate): 66 | assert in_srate % self.srate == 0 67 | return dnframes * self.srate // in_srate 68 | 69 | def silence(self, nframes:int): 70 | return b'\xff' * nframes 71 | -------------------------------------------------------------------------------- /Core/Codecs/G722.py: -------------------------------------------------------------------------------- 1 | from G722 import G722 2 | import torch 3 | import numpy as np 4 | 5 | from Core.AudioChunk import AudioChunk 6 | from .GenCodec import GenCodec 7 | 8 | class G722Codec(GenCodec): 9 | codec:G722 10 | srate:int = 8000 11 | default_br:int = 64000 12 | ptype:int = 9 # G.722 13 | ename:str = 'G722' # encoding name 14 | _device:str = 'cpu' 15 | 16 
| def __init__(self): 17 | super().__init__() 18 | self.codec = G722(self.srate, self.default_br) 19 | 20 | def encode(self, audio_tensor:torch.Tensor): 21 | # Scale from [-1, 1] to [-32768, 32767] 22 | audio_scaled = torch.clamp(audio_tensor * 32767.0, -32768, 32767).to(torch.int16).numpy() 23 | 24 | # Shift and look up in the conversion table 25 | audio_enc = self.codec.encode(audio_scaled) 26 | 27 | return audio_enc 28 | 29 | def decode(self, audio_enc:bytes, resample:bool=True, sample_rate:int=srate): 30 | # Use ulaw_to_pcm table to convert each µ-law value to PCM value 31 | audio_pcm = torch.tensor(self.codec.decode(audio_enc)).to(self._device) 32 | 33 | # Scale from [-32768, 32767] to [-1, 1] 34 | audio_float = audio_pcm.float() / 32767.0 35 | 36 | chunk = AudioChunk(audio_float, self.srate) 37 | if resample and sample_rate != self.srate: 38 | chunk.resample(sample_rate) 39 | return chunk 40 | 41 | def device(self): return self._device 42 | 43 | def to(self, device): 44 | self._device = device 45 | return self 46 | 47 | def silence(self, nframes:int): 48 | return self.encode(torch.zeros(self.e2d_frames(nframes), dtype=torch.int16)) 49 | 50 | def e2d_frames(self, enframes:int, out_srate:int=srate): 51 | #assert out_srate % self.srate == 0 52 | return enframes * (1 if self.srate == 8000 else 2) * out_srate // self.srate 53 | 54 | def d2e_frames(self, dnframes:int, in_srate:int=srate): 55 | #assert in_srate % self.srate == 0 56 | return dnframes * self.srate // ((1 if self.srate == 8000 else 2) * in_srate) 57 | -------------------------------------------------------------------------------- /Core/Codecs/GenCodec.py: -------------------------------------------------------------------------------- 1 | class GenCodec(): 2 | srate:int = 8000 # sample rate 3 | crate:int = 8000 # clock rate 4 | ptype:int # payload type 5 | ename:str # encoding name 6 | 7 | def __init__(self): 8 | assert self.ptype is not None and self.ename is not None 9 | 10 | @classmethod 11 | def rtpmap(cls): 12 | assert all(hasattr(cls, attr) for attr in ('ptype', 'ename')) 13 | return f'rtpmap:{cls.ptype} {cls.ename}/{cls.crate}' 14 | -------------------------------------------------------------------------------- /Core/ConfigValidators.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from cerberus import Validator 3 | 4 | class InfernConfigParseErr(Exception): pass 5 | 6 | def validate_yaml(schema, filename): 7 | try: 8 | with open(filename, 'r') as file: 9 | data = yaml.safe_load(file) 10 | 11 | v = Validator(schema) 12 | if not v.validate(data): 13 | raise InfernConfigParseErr(f"Validation errors in {filename}: {v.errors}") 14 | 15 | except yaml.YAMLError as exc: 16 | raise InfernConfigParseErr(f"Error parsing YAML file {filename}: {exc}") from exc 17 | return data 18 | 19 | def validate_port_range(field, value, error): 20 | if ':' in value: 21 | _, port = value.split(':', 1) 22 | if not (1 <= int(port) <= 65535): 23 | error(field, 'Port number must be in the range 1-65535') 24 | -------------------------------------------------------------------------------- /Core/Exceptions/InfernSessNotFoundErr.py: -------------------------------------------------------------------------------- 1 | class InfernSessNotFoundErr(Exception): pass -------------------------------------------------------------------------------- /Core/InfernConfig.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Union 2 | from 
functools import partial 3 | 4 | from Cluster.InfernSIPActor import InfernSIPActor 5 | from SIP.InfernSIPConf import InfernSIPConf 6 | from SIP.InfernSIPProfile import InfernSIPProfile 7 | from RTP.InfernRTPConf import InfernRTPConf 8 | 9 | from .ConfigValidators import validate_yaml 10 | 11 | # Define the schema 12 | schema = { 13 | 'sip': { 14 | 'type': 'dict', 15 | 'schema': { 16 | **InfernSIPConf.schema, 17 | **InfernSIPProfile.schema, 18 | } 19 | }, 20 | 'rtp': { 21 | 'type': 'dict', 22 | 'schema': { 23 | **InfernRTPConf.schema, 24 | } 25 | }, 26 | 'apps': { 27 | 'type': 'dict', 28 | 'schema': { 29 | # Filled by modules 30 | } 31 | } 32 | } 33 | 34 | class InfernConfig(): 35 | sip_actr: Optional[InfernSIPActor] 36 | sip_conf: Optional[InfernSIPConf] 37 | rtp_conf: Optional[InfernRTPConf] 38 | connectors: Dict[str, InfernSIPProfile] 39 | apps: Dict[str, Union['LTProfile', 'AIAProfile']] 40 | def __init__(self, filename: str): 41 | from Apps.LiveTranslator.LTProfile import LTProfile 42 | from Apps.LiveTranslator.LTAppConfig import LTAppConfig 43 | schema['apps']['schema'].update(LTAppConfig.schema) 44 | from Apps.AIAttendant.AIAProfile import AIAProfile 45 | from Apps.AIAttendant.AIAAppConfig import AIAAppConfig 46 | schema['apps']['schema'].update(AIAAppConfig.schema) 47 | d = validate_yaml(schema, filename) 48 | self.sip_conf = InfernSIPConf(d['sip'].get('settings', None)) if 'sip' in d else None 49 | self.rtp_conf = InfernRTPConf(d['rtp'].get('settings', None)) if 'rtp' in d else None 50 | try: 51 | self.connectors = dict((f'sip/{name}', InfernSIPProfile(name, conf)) 52 | for name, conf in d['sip']['profiles'].items()) 53 | except KeyError: 54 | self.connectors = {} 55 | precache = 'live_translator_precache' in d['apps'] and d['apps']['live_translator_precache'] 56 | _LTProfile = partial(LTProfile, precache=precache) 57 | self.apps = {} 58 | for aname, AProf in (('live_translator', _LTProfile), ('ai_attendant', AIAProfile)): 59 | if aname not in d['apps']: continue 60 | app_confs = dict((f'apps/{aname}/{name}', AProf(name, conf)) 61 | for name, conf in d['apps'][aname]['profiles'].items()) 62 | self.apps.update(app_confs) 63 | for app in self.apps.values(): 64 | app.finalize(self) 65 | if 'sip' in d: 66 | self.sip_actr = InfernSIPActor.options(max_concurrency=2).remote() 67 | for conn in self.connectors.values(): 68 | conn.finalize(self.sip_actr, self) 69 | else: 70 | self.sip_actr = None 71 | -------------------------------------------------------------------------------- /Core/InfernWrkThread.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | 26 | from threading import Thread, Lock 27 | 28 | RTPWrkTInit = 0 29 | RTPWrkTRun = 1 30 | RTPWrkTStop = 2 31 | 32 | class InfernWrkThread(Thread): 33 | state_lock: Lock = None 34 | state: int = RTPWrkTInit 35 | 36 | def __init__(self): 37 | self.state_lock = Lock() 38 | super().__init__() 39 | self.setDaemon(True) 40 | 41 | def start(self): 42 | super().start() 43 | 44 | def get_state(self, locked=False): 45 | if not locked: self.state_lock.acquire() 46 | state = self.state 47 | if not locked: self.state_lock.release() 48 | return state 49 | 50 | def _set_state(self, newstate, expected_state = None, raise_on_error = True): 51 | self.state_lock.acquire() 52 | pstate = self.state 53 | if expected_state is not None and self.state != expected_state: 54 | self.state_lock.release() 55 | if raise_on_error: 56 | raise AssertionError(f'Unexpected state: {self.state}, {expected_state} expected') 57 | return pstate 58 | self.state = newstate 59 | self.state_lock.release() 60 | return pstate 61 | 62 | def thread_started(self): 63 | self._set_state(RTPWrkTRun, expected_state = RTPWrkTInit) 64 | 65 | def stop(self): 66 | pstate = self._set_state(RTPWrkTStop, expected_state = RTPWrkTRun, raise_on_error = True) 67 | if pstate == RTPWrkTRun: 68 | self.join() 69 | self._set_state(RTPWrkTInit, expected_state = RTPWrkTStop) 70 | -------------------------------------------------------------------------------- /Core/OutputMuxer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Dict, List 2 | from time import monotonic 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from .AudioChunk import AudioChunk 8 | from .AStreamMarkers import ASMarkerGeneric, ASMarkerNewSent 9 | 10 | class OutputMuxer(): 11 | debug = False 12 | output_sr:int 13 | qsize:int 14 | device:str 15 | chunks_in: List[Union[AudioChunk, ASMarkerGeneric]] 16 | def __init__(self, output_sr:int, qsize:int, device:str): 17 | self.output_sr = output_sr 18 | self.qsize = qsize 19 | self.device = device 20 | self.chunks_in = [] 21 | 22 | def chunk_in(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 23 | if isinstance(chunk, AudioChunk): 24 | if chunk.samplerate != self.output_sr: 25 | chunk = chunk.resample(self.output_sr) 26 | if len(self.chunks_in) > 0 and isinstance(self.chunks_in[-1], AudioChunk): 27 | chunk.audio = torch.cat((self.chunks_in.pop().audio, chunk.audio), dim=0) 28 | self.chunks_in.append(chunk) 29 | 30 | def idle(self, rtp_worker): 31 | chunk_o = torch.empty(0).to(self.device) 32 | if len(self.chunks_in) == 1 and isinstance(self.chunks_in[0], AudioChunk) and \ 33 | self.chunks_in[0].audio.size(0) < self.qsize: 34 | return None 35 | while len(self.chunks_in) > 0 and (rsize:=self.qsize-chunk_o.size(0)) > 0: 36 | chunk = self.chunks_in[0] 37 | if isinstance(chunk, ASMarkerNewSent): 38 | #self.update_frm_ctrs(prcsd_inc=pos.get_buf_nframes()) 39 | if chunk_o.size(0) > 0: 40 | return chunk_o 41 | if self.debug: 
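                # note: markers are consumed only at chunk boundaries -- any
                # audio already accumulated in chunk_o is returned first, so
                # the marker's on_proc() below fires in true playback order.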
42 | print(f'{monotonic():4.3f}: ASMarkerNewSent {chunk.on_proc=}') 43 | self.chunks_in.pop(0) 44 | chunk.on_proc(rtp_worker) 45 | continue 46 | chunk_o = torch.cat((chunk_o, chunk.audio[:rsize]), dim=0) 47 | if chunk.audio.size(0) > rsize: 48 | chunk.audio = chunk.audio[rsize:] 49 | else: 50 | self.chunks_in.pop(0) 51 | if chunk_o.size(0) > 0 and chunk_o.size(0) < self.qsize: 52 | print(f'{monotonic():4.3f}: Reinserting {chunk_o.size()=}') 53 | self.chunks_in.insert(0, AudioChunk(chunk_o, self.output_sr)) 54 | return None 55 | 56 | return chunk_o if chunk_o.size(0) > 0 else None 57 | 58 | class OutputMTMuxer(): 59 | tracks:Dict[int, OutputMuxer] 60 | output_sr:int 61 | qsize:int 62 | device:str 63 | def __init__(self, output_sr:int, qsize:int, device:str): 64 | self.tracks = {} 65 | self.output_sr = output_sr 66 | self.qsize = qsize 67 | self.device = device 68 | 69 | def chunk_in(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 70 | if chunk.track_id not in self.tracks: 71 | self.tracks[chunk.track_id] = OutputMuxer(self.output_sr, self.qsize, self.device) 72 | self.tracks[chunk.track_id].chunk_in(chunk) 73 | 74 | def idle(self, rtp_worker): 75 | chunks = [chunk for chunk in [track.idle(rtp_worker) for track in self.tracks.values()] if chunk is not None] 76 | if len(chunks) == 0: return None 77 | if len(chunks) == 1: return chunks[0] 78 | max_len = max([chunk.size(0) for chunk in chunks]) 79 | chunks = [F.pad(chunk, (0, max_len-chunk.size(0)), "constant", 0) if chunk.size(0) < max_len else chunk 80 | for chunk in chunks] 81 | merged = torch.sum(torch.stack(chunks), dim=0) / len(self.tracks) 82 | #max_val = torch.max(torch.abs(merged)) 83 | #if max_val > 1: 84 | # merged /= max_val 85 | return merged 86 | -------------------------------------------------------------------------------- /Core/T2T/NumbersToWords.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import re 3 | import inflect 4 | 5 | from config.InfernGlobals import InfernGlobals as IG 6 | 7 | class NumbersToWords: 8 | tr:Optional[callable] 9 | cache:dict 10 | def __init__(self, lang='en'): 11 | self.p = inflect.engine() 12 | self.tr, self.cache = (None, None) if lang == 'en' else (IG.get_translator('en', lang).translate, {}) 13 | 14 | def __call__(self, text): 15 | # Find all instances of numbers in the text 16 | numbers = re.findall(r'\b\d[\d.,]*%?(?=[\s.,!]|$)', text) 17 | 18 | # For each number found, replace it with its word equivalent 19 | for number in numbers: 20 | if number.endswith('%'): 21 | tr_number = number[:-1] 22 | suffix = ' percent' 23 | elif number[-1] in ('.', ',', '!'): 24 | tr_number = number[:-1] 25 | suffix = number[-1] 26 | else: 27 | suffix = '' 28 | tr_number = number 29 | word = self.p.number_to_words(tr_number) + suffix 30 | if self.tr is not None: 31 | if (word_tr:=self.cache.get(number, None)) is None: 32 | self.cache[number] = word_tr = self.tr(word) 33 | word = word_tr 34 | text = text.replace(number, word, 1) 35 | return text 36 | 37 | if __name__ == '__main__': 38 | n2w = NumbersToWords() 39 | print(n2w('I have 3 cats and 2 dogs.')) 40 | print(n2w('I have 3% cats and 2% dogs.')) 41 | print(n2w('I have 30000 cats and 2999 dogs.')) 42 | print(n2w('I have 50% cats and 29.0% dogs.')) 43 | print(n2w('I have 3,090.6 cats and 21,188,128 dogs.%,')) 44 | print(n2w('I have 3% cats and dogs 2%.')) 45 | print(n2w('I have 3% cats and dogs 20%, and mice 3.0%.')) 46 | print(n2w('I have 3% cats and dogs since 2024, or 2023.')) 47 | 
-------------------------------------------------------------------------------- /Core/T2T/Translator.py: --------------------------------------------------------------------------------
1 | from typing import Tuple, Optional
2 | from functools import partial
3 | 
4 | import argostranslate.package
5 | from argostranslate.translate import get_installed_languages
6 | 
7 | def load_pair(from_code, to_code):
8 |     print(f'load_pair({from_code=}, {to_code=})')
9 |     argostranslate.package.update_package_index()
10 |     available_packages = argostranslate.package.get_available_packages()
11 |     package_to_install = next(
12 |         filter(
13 |             lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
14 |         )
15 |     )
16 |     print(f'{package_to_install=}')
17 |     argostranslate.package.install_from_path(package_to_install.download())
18 | 
19 | class Translator():
20 |     supported_langs = ["en", "it", "de", "ru", "ja"]
21 |     translators: Tuple[callable]
22 |     def __init__(self, from_code: str, to_code: str, filter:Optional[callable]=None):
23 |         to_code_p = [to_code,]
24 |         inter_codes = [x for x in self.supported_langs if x not in (from_code, to_code)]
25 |         success = False
26 |         while not success:
27 |             try: load_pair(from_code, to_code)
28 |             except StopIteration: pass
29 |             else:
30 |                 success = True
31 |                 break
32 |             while len(inter_codes) > 0:
33 |                 inter_code = inter_codes.pop()
34 |                 try:
35 |                     load_pair(from_code, inter_code)
36 |                     load_pair(inter_code, to_code)
37 |                 except StopIteration:
38 |                     if len(inter_codes) == 0: raise
39 |                     continue
40 |                 to_code_p.insert(0, inter_code)  # chain via the pivot language
41 |                 success = True
42 |                 break
43 |         ilangs = dict((x.code, x) for x in get_installed_languages())
44 |         from_lang = ilangs[from_code]
45 |         translators = []
46 |         for tc in to_code_p:
47 |             to_lang = ilangs[tc]
48 |             tr = from_lang.get_translation(to_lang).translate
49 |             if filter is not None: tr = partial(filter, from_code=from_code, to_code=tc, tr=tr)
50 |             translators.append(tr)
51 |             from_lang, from_code = to_lang, tc
52 |         self.translators = tuple(translators)
53 | 
54 |     def translate(self, sourceText):
55 |         for translator in self.translators:
56 |             sourceText = translatedText = translator(sourceText)
57 |         return translatedText
58 | 
59 | if __name__ == '__main__':
60 |     tr = Translator('en', 'ja')
61 |     t0 = tr.translate('Hello world!')
62 |     tr = Translator('ru', 'it')
63 |     tr1 = Translator('it', 'de')
64 |     #print(tr.to_code_p, tr1.to_code_p)
65 |     sourceText = "Привет, как твои дела?" 
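    # 'ru' -> 'it' often has no direct Argos package, in which case the
    # Translator above chains through a pivot language (e.g. ru -> en -> it);
    # the pivot actually used depends on which packages are installable.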
66 |     t1 = tr.translate(sourceText)
67 |     t2 = tr1.translate(t1)
68 |     print(t0, t1, t2)
69 | 
70 | 
-------------------------------------------------------------------------------- /Core/VAD/SileroVAD.py: --------------------------------------------------------------------------------
1 | #try: import intel_extension_for_pytorch as ipex
2 | #except ModuleNotFoundError: ipex = None
3 | 
4 | from typing import Tuple, List, Optional
5 | 
6 | import torch
7 | 
8 | from Cluster.InfernBatchedWorker import InfernBatchedWorker
9 | from Core.AudioChunk import AudioChunk, VadAudioChunk
10 | from Core.VAD.SileroVADUtils import VADIteratorB, VADChannelState, VADBatchFromList
11 | 
12 | class VADChannel():
13 |     audio_chunk_in: callable
14 |     vad_chunk_in: callable
15 |     decode: callable
16 |     vad_buffer: bytes = b''
17 |     state: VADChannelState
18 |     active_start: Optional[int] = None
19 |     active_buffer: torch.Tensor
20 |     def __init__(self, audio_chunk_in:callable, vad_chunk_in: callable, decode: callable, device:str):
21 |         self.audio_chunk_in = audio_chunk_in
22 |         self.vad_chunk_in = vad_chunk_in
23 |         self.decode = decode
24 |         self.state = VADChannelState(device)
25 |         self.active_buffer = torch.zeros(0).to('cpu')
26 | 
27 |     def ingest(self, svad:'SileroVADWorker', data: bytes, codec):
28 |         self.vad_buffer += data
29 |         if codec.e2d_frames(len(self.vad_buffer), svad.input_sr) < svad.window_size_samples:
30 |             return None
31 |         decode_samples = codec.d2e_frames(svad.window_size_samples, svad.input_sr)
32 |         chunk = codec.decode(self.vad_buffer[:decode_samples], sample_rate=svad.input_sr)
33 |         assert chunk.audio.size(0) == svad.window_size_samples, f'{chunk.audio.size(0)=} {svad.window_size_samples=}'
34 |         self.vad_buffer = self.vad_buffer[decode_samples:]
35 |         svad.infer((self, chunk))
36 |         #self.vad_chunk_in(chunk, True)
37 | 
38 | class SileroVADWorker(InfernBatchedWorker):
39 |     max_batch_size: int = 200
40 |     input_sr: int
41 |     max_vad_frames: int
42 |     def __init__(self, device, input_sr: int = 8000):
43 |         super().__init__()
44 |         model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad:v3.1',
45 |                                       model='silero_vad', force_reload=True, trust_repo=True)
46 |         self.device = device
47 |         self.model = model = model.eval().to(device)
48 |         #for n, t in [(_t, getattr(_m, _t, None))
49 |         #             for _m in (model._model_8k.decoder.rnn, model._model.decoder.rnn)
50 |         #             for _t in dir(_m)
51 |         #             if _t != 'graph']:
52 |         #    if not isinstance(t, torch.Tensor): continue
53 |         #    print(f'{n=} {t.is_contiguous()=}')
54 | 
55 |         self.vad_iterator = VADIteratorB(model, sampling_rate=input_sr)
56 |         self.window_size_samples = 768 # number of samples in a single audio chunk
57 |         self.input_sr = input_sr
58 |         self.max_vad_frames = input_sr * 30 # 30 seconds for Whisper
59 | 
60 |     @torch.no_grad()
61 |     def process_batch(self, wis:List[Tuple[VADChannel, torch.Tensor]]):
62 |         #from time import sleep
63 |         #sleep(0.5)
64 |         #print(f'InfernSTTWorker.process_batch: got {len(wis)=}')
65 |         while len(wis) > 0:
66 |             nbatch = []
67 |             cbatch: List[VADChannel] = []
68 |             pbatch: List[AudioChunk] = []
69 |             sbatch: List[VADChannelState] = []
70 |             for wi in wis:
71 |                 if (ch:=wi[0]) not in cbatch:
72 |                     cbatch.append(ch)
73 |                     pbatch.append(wi[1])
74 |                     sbatch.append(ch.state)
75 |                 else:
76 |                     nbatch.append(wi)
77 |             wis = nbatch
78 |             bstate = VADBatchFromList(sbatch)
79 |             chunks = torch.stack([p.audio for p in pbatch], dim=0).to(self.device)
80 |             self.vad_iterator(chunks, bstate=bstate, return_seconds=False)
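            # Per-channel post-processing; the dedup pass above feeds at most
            # one chunk per channel into the model per iteration, so each
            # channel's recurrent VAD state advances strictly in arrival order.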
81 |             for i, (vc, p) in enumerate(zip(cbatch, pbatch)):
82 |                 sd = vc.state
83 |                 if sd.speech: print(f'speech_dict[{i}]={sd.speech} {sd.current_sample=}', end=' ')
84 |                 vc.active_buffer = torch.cat((vc.active_buffer, p.audio.cpu()))
85 |                 if sd.speech and 'start' in sd.speech:
86 |                     assert vc.active_start is None, f'{vc.active_start=}'
87 |                     vc.active_start = sd.speech['start']
88 |                     poff = sd.current_sample - vc.active_start
89 |                     assert poff > 0 and poff < vc.active_buffer.size(0), f'{poff=} {vc.active_buffer.size(0)=} {sd.current_sample=} {vc.active_start=}'
90 |                     vc.active_buffer = vc.active_buffer[-poff:]
91 |                 elif sd.speech and 'end' in sd.speech:
92 |                     active_end = sd.speech["end"]
93 |                     assert vc.active_start is not None and active_end > vc.active_start, f'{vc.active_start=} {sd.temp_end=} {active_end=}'
94 |                     assert sd.current_sample > active_end, f'{sd.current_sample=} {active_end=}'
95 |                     poff = sd.current_sample - active_end
96 |                     assert poff > 0 and poff < vc.active_buffer.size(0), f'{poff=} {vc.active_buffer.size(0)=} {sd.current_sample=} {active_end=}'
97 |                     obuf = vc.active_buffer[:-poff]
98 |                     assert obuf.size(0) == active_end - vc.active_start, f'{obuf.size(0)=} {vc.active_start=} {active_end=}'
99 |                     vc.vad_chunk_in(VadAudioChunk(obuf, self.input_sr, vc.active_start))
100 |                     vc.active_start = None
101 |                 if vc.active_start is None:
102 |                     vc.active_buffer = vc.active_buffer[-self.window_size_samples*2:]  # keep the most recent pre-roll audio
103 |                 elif vc.active_buffer.size(0) > self.max_vad_frames:
104 |                     chunk = VadAudioChunk(vc.active_buffer[:self.max_vad_frames], self.input_sr, vc.active_start)
105 |                     vc.active_buffer = vc.active_buffer[self.max_vad_frames:]
106 |                     vc.active_start += self.max_vad_frames
107 |                     if sd.temp_end != 0:
108 |                         print(f'{sd.current_sample=}: {sd.temp_end=} -> {vc.active_start=}')
109 |                         if sd.temp_end < vc.active_start:
110 |                             sd.temp_end = vc.active_start
111 |                     vc.vad_chunk_in(chunk)
112 |                 vc.audio_chunk_in(p, vc.active_start is not None)
113 | 
-------------------------------------------------------------------------------- /Core/VAD/SileroVADUtils.py: --------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional
2 | import torch
3 | 
4 | class VADChannelState:
5 |     triggered: bool = False
6 |     temp_end: int = 0
7 |     current_sample: int = 0
8 |     speech: Optional[Dict[str, int]] = None
9 |     model_state: List[torch.Tensor]
10 |     def __init__(self, device:str):
11 |         self.model_state = [torch.zeros(2, 64).to(device), torch.zeros(2, 64).to(device)]
12 | 
13 | class VADBatchState:
14 |     batch_size: int
15 |     channels: List[VADChannelState]
16 |     device: str
17 |     def __init__(self, batch_size, device:str='cpu'):
18 |         self.batch_size = batch_size
19 |         self.channels = [VADChannelState(device) for _ in range(batch_size)]
20 | 
21 |     def get_model_state(self):
22 |         return [torch.stack([s.model_state[r] for s in self.channels], dim=1) for r in range(2)]
23 | 
24 |     def save_model_state(self, state:List[torch.Tensor]):
25 |         for c, s1, s2 in zip(self.channels, state[0].unbind(1), state[1].unbind(1)):
26 |             c.model_state = [s1, s2]
27 | 
28 | class VADBatchFromList(VADBatchState):
29 |     def __init__(self, states:List[VADChannelState]):
30 |         self.batch_size = len(states)
31 |         self.channels = states
32 | 
33 | class VADIteratorB:
34 |     def __init__(self,
35 |                  model,
36 |                  threshold: float = 0.5,
37 |                  sampling_rate: int = 16000,
38 |                  min_silence_duration_ms: int = 100,
39 |                  speech_pad_ms: int = 30,
40 |                  ):
41 | 
42 |         """
43 |         Class for stream imitation
44 | 
45 |         Parameters
46 |         ----------
47 |         model: preloaded .jit silero VAD model
48 | 
49 | 
threshold: float (default - 0.5) 50 | Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. 51 | It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 52 | 53 | sampling_rate: int (default - 16000) 54 | Currently silero VAD models support 8000 and 16000 sample rates 55 | 56 | min_silence_duration_ms: int (default - 100 milliseconds) 57 | In the end of each speech chunk wait for min_silence_duration_ms before separating it 58 | 59 | speech_pad_ms: int (default - 30 milliseconds) 60 | Final speech chunks are padded by speech_pad_ms each side 61 | """ 62 | 63 | self.model = model 64 | self.threshold = threshold 65 | self.sampling_rate = sampling_rate 66 | 67 | if sampling_rate not in [8000, 16000]: 68 | raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]') 69 | 70 | self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 71 | self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000 72 | self.model.reset_states() 73 | 74 | def __call__(self, x:torch.Tensor, bstate:Optional[VADBatchState]=None, return_seconds=False): 75 | """ 76 | x: torch.Tensor 77 | audio chunk (see examples in repo) 78 | 79 | return_seconds: bool (default - False) 80 | whether return timestamps in seconds (default - samples) 81 | """ 82 | 83 | if not torch.is_tensor(x): 84 | try: 85 | x = torch.Tensor(x) 86 | except: 87 | raise TypeError("Audio cannot be casted to tensor. Cast it manually") 88 | 89 | if x.dim() == 1: x = x.unsqueeze(0) 90 | else: assert x.dim() == 2, f"Audio should be 1D or 2D tensor, but got {x.dim()}" 91 | 92 | batch_size = x.size(0) 93 | 94 | if bstate is None: 95 | bstate = VADBatchState(batch_size, device=x.device) 96 | self.model.reset_states() 97 | else: 98 | assert bstate.batch_size == batch_size, f"Batch size should be {batch_size}, but got {bstate.batch_size}" 99 | (_mc:=self.model._c)._h, _mc._c, _mc._last_sr, _mc._last_batch_size = bstate.get_model_state() + [self.sampling_rate, batch_size] 100 | 101 | window_size_samples = len(x[0]) if x.dim() == 2 else len(x) 102 | 103 | speech_probs = (y for y in self.model(x, self.sampling_rate).tolist()) 104 | 105 | for speech_prob, channel in zip(speech_probs, bstate.channels): 106 | channel.current_sample += window_size_samples 107 | if (speech_prob >= self.threshold) and channel.temp_end: 108 | channel.temp_end = 0 109 | 110 | if (speech_prob >= self.threshold) and not channel.triggered: 111 | channel.triggered = True 112 | speech_pad_samples = self.speech_pad_samples if channel.current_sample > window_size_samples else 0 113 | speech_start = channel.current_sample - speech_pad_samples - window_size_samples 114 | channel.speech = {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)} 115 | continue 116 | 117 | if (speech_prob < self.threshold - 0.15) and channel.triggered: 118 | if not channel.temp_end: 119 | channel.temp_end = channel.current_sample 120 | if channel.current_sample - channel.temp_end < self.min_silence_samples: 121 | channel.speech = None 122 | continue 123 | else: 124 | speech_end = channel.temp_end + self.speech_pad_samples - window_size_samples 125 | channel.temp_end = 0 126 | channel.triggered = False 127 | channel.speech = {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)} 128 | continue 129 | 130 | channel.speech = None 131 | 
bstate.save_model_state([(_mc:=self.model._c)._h, _mc._c]) 132 | #print(f'{bstate.model_state[0].size()=} {bstate.model_state[1].size()=}') 133 | return bstate 134 | -------------------------------------------------------------------------------- /Core/VAD/ZlibVAD.py: -------------------------------------------------------------------------------- 1 | from zlib import compress 2 | 3 | class VADResult(): 4 | chunk: bytes 5 | active: bool = True 6 | 7 | class ZlibVAD(): 8 | vad_duration: float = 0.1 9 | vad_threshold: float = 0.6 10 | vad_frames: int 11 | max_vad_frames: int 12 | vad_buffer: bytes = b'' 13 | chunk_buffer: bytes = b'' 14 | ninactive: int = 0 15 | activation_threshold: int = 5 16 | def __init__(self, input_sr: int = 8000): 17 | self.vad_frames = int(input_sr * self.vad_duration) 18 | self.max_vad_frames = input_sr * 30 # 30 seconds for Whisper 19 | 20 | def ingest(self, data: bytes, vad_chunk_in: callable): 21 | self.vad_buffer += data 22 | if len(self.vad_buffer) < self.vad_frames: 23 | return None 24 | chunk = self.vad_buffer[:self.vad_frames] 25 | self.vad_buffer = self.vad_buffer[self.vad_frames:] 26 | r = len(compress(chunk))/len(chunk) 27 | v = VADResult() 28 | active = False if r < self.vad_threshold else True 29 | vad_chunk_in(chunk, active) 30 | max_len_reached = len(self.chunk_buffer) >= (self.max_vad_frames - (self.vad_frames * self.activation_threshold)) 31 | if active: 32 | self.ninactive = 0 33 | if not max_len_reached: 34 | self.chunk_buffer += chunk 35 | return None 36 | v.chunk = self.chunk_buffer[:self.max_vad_frames] 37 | self.chunk_buffer = self.chunk_buffer[self.max_vad_frames:] 38 | return v 39 | else: 40 | if self.ninactive > self.activation_threshold: 41 | assert len(self.chunk_buffer) > self.vad_frames * self.activation_threshold 42 | chunk = self.chunk_buffer[:-self.vad_frames*self.activation_threshold] 43 | if len(chunk) < self.vad_frames * self.activation_threshold: 44 | v = None 45 | else: 46 | v.chunk = chunk 47 | self.chunk_buffer = b'' 48 | self.ninactive = 0 49 | return v 50 | self.chunk_buffer += chunk 51 | self.ninactive += 1 52 | return None -------------------------------------------------------------------------------- /HelloSippyTTSRT/HelloSippyRT.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | from time import monotonic 3 | 4 | import torch 5 | from transformers import SpeechT5ForTextToSpeech, \ 6 | SpeechT5HifiGanConfig, SpeechT5HifiGan, SpeechT5Processor, \ 7 | SpeechT5Config 8 | from transformers.models.speecht5.modeling_speecht5 import \ 9 | SpeechT5EncoderWithSpeechPrenet 10 | from transformers import PretrainedConfig, PreTrainedModel 11 | from datasets import load_dataset 12 | import torch.nn as nn 13 | from methodtools import lru_cache 14 | 15 | from config.InfernGlobals import InfernGlobals 16 | 17 | GenerateSpeech_cb = Callable[[torch.FloatTensor], None] 18 | 19 | class HelloSippyRT(): 20 | pass 21 | 22 | def _generate_speech_rt( 23 | hsrt: HelloSippyRT, 24 | input_values: torch.FloatTensor, 25 | speech_cb: GenerateSpeech_cb, 26 | speaker_embeddings: Optional[torch.FloatTensor] = None, 27 | threshold: float = 0.5, 28 | minlenratio: float = 0.0, 29 | maxlenratio: float = 20.0, 30 | ) -> int: 31 | with hsrt.cuda_lock: 32 | encoder_attention_mask = torch.ones_like(input_values) 33 | 34 | model = hsrt.model 35 | encoder_out = model.speecht5.encoder( 36 | input_values=input_values, 37 | attention_mask=encoder_attention_mask, 38 | return_dict=True, 39 | 
) 40 | 41 | encoder_last_hidden_state = encoder_out.last_hidden_state 42 | 43 | # downsample encoder attention mask 44 | if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet): 45 | encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask( 46 | encoder_out[0].shape[1], encoder_attention_mask 47 | ) 48 | 49 | maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor) 50 | minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor) 51 | 52 | # Start the output sequence with a mel spectrum that is all zeros. 53 | output_sequence = encoder_last_hidden_state.new_zeros(1, 1, model.config.num_mel_bins) 54 | 55 | spectrogram = torch.zeros(0, model.config.num_mel_bins).to(model.device) 56 | past_key_values = None 57 | idx = 0 58 | 59 | ###stime_pre = None 60 | btime = monotonic() 61 | p_ch = hsrt.chunker 62 | _c = hsrt.c_conf 63 | prfs = torch.zeros(_c.pre_frames, model.config.num_mel_bins, 64 | device=model.device) 65 | pofs = torch.zeros(_c.post_frames, model.config.num_mel_bins, 66 | device=model.device) 67 | oschedule = [_c.chunk_size, _c.chunk_size, _c.chunk_size*2] 68 | output_len = oschedule[0] 69 | chunk_size = _c.chunk_size 70 | vocoder = hsrt.vocoder 71 | while True: 72 | idx += 1 73 | 74 | # Run the decoder prenet on the entire output sequence. 75 | decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings) 76 | 77 | # Run the decoder layers on the last element of the prenet output. 78 | decoder_out = model.speecht5.decoder.wrapped_decoder( 79 | hidden_states=decoder_hidden_states[:, -1:], 80 | attention_mask=None, 81 | encoder_hidden_states=encoder_last_hidden_state, 82 | encoder_attention_mask=encoder_attention_mask, 83 | past_key_values=past_key_values, 84 | use_cache=True, 85 | output_attentions=False, 86 | return_dict=True, 87 | ) 88 | 89 | last_decoder_output = decoder_out.last_hidden_state[0, -1] 90 | past_key_values = decoder_out.past_key_values 91 | 92 | # Predict the new mel spectrum for this step in the sequence. 93 | spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output) 94 | spectrum = spectrum.view(model.config.reduction_factor, model.config.num_mel_bins) 95 | spectrogram = torch.cat((spectrogram, spectrum), dim=0) 96 | 97 | # Extend the output sequence with the new mel spectrum. 98 | spv = spectrum[-1].view(1, 1, model.config.num_mel_bins) 99 | output_sequence = torch.cat((output_sequence, spv), dim=1) 100 | 101 | # Predict the probability that this is the stop token. 102 | prob = model.speech_decoder_postnet.prob_out(last_decoder_output).sigmoid() 103 | 104 | # Finished when stop token or maximum length is reached. 
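# The stop decision and the flush below work together: `prob_out` yields a
# per-step stop probability which ends generation once it crosses `threshold`
# (bounded by the minlen/maxlen limits derived from the encoder output length
# above), while completed mel frames are vocoded in `chunk_size` batches
# rather than one at a time, carrying `pre_frames` of left context (`prfs`)
# and, on the final flush, `post_frames` of zero padding (`pofs`).  The flush
# size ramps up through `oschedule` and doubles (capped at 64) whenever the
# consumer queue length reported by `speech_cb` backs up, trading latency for
# throughput.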
105 | theend = theend_cb = False 106 | if idx >= minlen and (int(sum(prob >= threshold)) > 0 or idx >= maxlen): 107 | theend = True 108 | 109 | if (len(spectrogram) >= output_len and len(spectrogram) + prfs.size(0) >= chunk_size + _c.eframes) \ 110 | or (theend and len(spectrogram) > 0): 111 | _s = spectrogram.unsqueeze(0) 112 | _s = model.speech_decoder_postnet.postnet(_s) 113 | _s = _s.squeeze(0) 114 | #print(_s.size(0), prfs.size(0), _s.device) 115 | in_size = _s.size() 116 | _s = [prfs, _s] 117 | if theend: 118 | _s.append(pofs) 119 | _s = torch.cat(_s, dim=0) 120 | extra_pad = (_s.size(0) - _c.eframes) % chunk_size 121 | assert extra_pad < chunk_size 122 | if extra_pad > 0: 123 | extra_pad = chunk_size - extra_pad 124 | #print(_s.size()) 125 | _pofs = torch.zeros(extra_pad, 126 | _s.size(1), device=_s.device) 127 | _s = torch.cat((_s, _pofs), dim=0) 128 | outputs = [] 129 | while _s.size(0) >= _c.eframes + chunk_size: 130 | #print(_s.size(), _s.device) 131 | _i = _s[:_c.eframes + chunk_size, :] 132 | _o = vocoder(_i).unsqueeze(0) 133 | _o = p_ch(_i, _o) 134 | outputs.append(_o.squeeze(0)) 135 | #print('out', _o.size(), outputs[-1].size()) 136 | _s = _s[chunk_size:, :] 137 | if extra_pad > 0: 138 | ep_trim = extra_pad * _c.frame_size 139 | assert outputs[-1].size(0) > ep_trim 140 | outputs[-1] = outputs[-1][:-ep_trim] 141 | outputs = torch.cat(outputs, dim=0) 142 | #print('_s after:', _s.size(0)) 143 | assert _s.size(0) >= _c.eframes and _s.size(0) < _c.eframes + chunk_size 144 | #print('prfs', prfs.size(), 'inputs', in_size, 'outputs', outputs.size(), '_s', _s.size()) 145 | #print(_s.shape, outputs.shape) 146 | prfs = _s 147 | #print(monotonic() - btime) 148 | hsrt.cuda_lock.release() 149 | qlen, theend_cb = speech_cb(outputs) 150 | hsrt.cuda_lock.acquire() 151 | if output_len in oschedule: 152 | oschedule.pop(0) 153 | if len(oschedule) > 0: 154 | output_len = oschedule[0] 155 | elif qlen > 1 and output_len < 64: 156 | output_len *= 2 157 | spectrogram = torch.zeros(0, model.config.num_mel_bins).to(model.device) 158 | if theend or theend_cb: 159 | break 160 | 161 | return idx 162 | 163 | class AmendmentNetwork1Config(PretrainedConfig): 164 | chunk_size = 8 165 | pre_frames = 2 166 | post_frames = 2 167 | frame_size = 256 168 | num_mels = 80 169 | chunk_size: int 170 | trim_pr: int 171 | trim_po: int 172 | output_size: int 173 | eframes: int 174 | 175 | def __init__(self, *a, **ka): 176 | super().__init__(*a, **ka) 177 | self.eframes = self.pre_frames + self.post_frames 178 | self.trim_pr = self.pre_frames * self.frame_size 179 | self.trim_po = self.post_frames * self.frame_size 180 | self.output_size = self.chunk_size * self.frame_size 181 | 182 | class SimpleResidualBlock(nn.Module): 183 | def __init__(self, channels): 184 | super().__init__() 185 | self.conv1 = nn.Conv1d(channels, channels, kernel_size=3, stride=1, 186 | padding=1, dilation=1) 187 | self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, stride=1, 188 | padding=3, dilation=3) 189 | 190 | def forward(self, x, lrelu): 191 | assert lrelu is not None 192 | residual = x 193 | x = lrelu(x) 194 | x = self.conv1(x) 195 | x = lrelu(x) 196 | x = self.conv2(x) 197 | x += residual 198 | return x 199 | 200 | class AmendmentNetwork1(PreTrainedModel): 201 | config_class = AmendmentNetwork1Config 202 | def __init__(self, config=None): 203 | if config is None: 204 | config = self.config_class() 205 | super().__init__(config) 206 | _c = self._c = config 207 | 208 | self.conv_pre_m = nn.Conv1d(_c.num_mels, 32, kernel_size=3, 
stride=1, padding=1) 209 | self.conv_pre_a = nn.Conv1d(_c.frame_size, 160, kernel_size=3, stride=1, padding=1) 210 | self.upsampler = nn.ModuleList([ 211 | nn.ConvTranspose1d(192, 128, kernel_size=8, stride=4, padding=2), 212 | nn.ConvTranspose1d(128, 64, kernel_size=8, stride=4, padding=2), 213 | ]) 214 | self.lrelu = nn.LeakyReLU(0.01) 215 | self.resblock = SimpleResidualBlock(64) 216 | self.post_conv = nn.Conv1d(in_channels=64, out_channels=_c.frame_size, 217 | kernel_size=8, stride=24, padding=0) 218 | 219 | def forward(self, mel, audio): 220 | batch_size, total_length = audio.size() 221 | T = mel.size(-1) 222 | #print(Exception(f"BP: ms:{mel.size()} as:{audio.size()}")) 223 | audio_reshaped = audio.view(batch_size, self._c.frame_size, -1) 224 | mel = mel.view(batch_size, T, -1) 225 | #print(Exception(f"BP: ms:{mel.size()} as:{audio.size()} ars:{audio_reshaped.size()}")) 226 | x_mel = self.conv_pre_m(mel) 227 | x_audio = self.conv_pre_a(audio_reshaped) 228 | am_comb = torch.cat((x_mel, x_audio), dim=1) 229 | for i, layer in enumerate(self.upsampler): 230 | am_comb = self.lrelu(am_comb) 231 | am_comb = layer(am_comb) 232 | am_comb = self.resblock(am_comb, self.lrelu) 233 | am_comb = self.lrelu(am_comb) 234 | am_comb = self.post_conv(am_comb).squeeze(-1) 235 | am_comb = self.lrelu(am_comb).view(batch_size, -1) 236 | audio = audio[:, self._c.trim_pr:-self._c.trim_po] * am_comb 237 | return audio.tanh() 238 | 239 | class HelloSippyRT(): 240 | processor: SpeechT5Processor 241 | chunker: AmendmentNetwork1 242 | c_conf: AmendmentNetwork1Config 243 | vocoder: SpeechT5HifiGan 244 | model: SpeechT5ForTextToSpeech 245 | cuda_lock = InfernGlobals().torcher 246 | default_model = "microsoft/speecht5_tts" 247 | def __init__(self, device, model=default_model, get_processor:Optional[callable]=None): 248 | with self.cuda_lock: 249 | mc = SpeechT5Config.from_pretrained(model) 250 | if get_processor is None: 251 | self.processor = SpeechT5Processor.from_pretrained(model, config=mc) 252 | else: 253 | self.processor = get_processor(device, model, config=mc) 254 | model = SpeechT5ForTextToSpeech.from_pretrained(model, 255 | config=mc).to(device) 256 | model.eval() 257 | self.model = model 258 | _vc_conf = SpeechT5HifiGanConfig() 259 | vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", 260 | config = _vc_conf).to(device) 261 | vocoder.eval() 262 | self.vocoder = vocoder 263 | self.c_conf = AmendmentNetwork1Config() 264 | chunker = AmendmentNetwork1.from_pretrained("sobomax/speecht5-rt.post_vocoder.v2", 265 | config=self.c_conf) 266 | chunker = chunker.to(device) 267 | chunker.eval() 268 | self.chunker = chunker 269 | embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") 270 | self.speaker_embeddings = [torch.tensor(ed["xvector"]).unsqueeze(0) 271 | for ed in embeddings_dataset] 272 | 273 | def get_rand_voice_id(self): 274 | return torch.randint(0, len(self.speaker_embeddings), (1,)).item() 275 | 276 | def get_rand_voice(self): 277 | with self.cuda_lock: 278 | s_index = self.get_rand_voice_id() 279 | rv = self.speaker_embeddings[s_index].to(self.model.device) 280 | return rv 281 | 282 | @lru_cache(maxsize=16) 283 | def get_voice(self, s_index:int): 284 | with self.cuda_lock: 285 | rv = self.speaker_embeddings[s_index].to(self.model.device) 286 | return rv 287 | 288 | @torch.no_grad() 289 | def generate_speech_rt( 290 | self, 291 | input_ids: torch.LongTensor, 292 | speech_cb: GenerateSpeech_cb, 293 | speaker_embeddings: Optional[torch.FloatTensor] = None, 
294 | threshold: float = 0.5, 295 | minlenratio: float = 0.0, 296 | maxlenratio: float = 20.0, 297 | ) -> int: 298 | return _generate_speech_rt( 299 | self, 300 | input_ids, 301 | speech_cb, 302 | speaker_embeddings, 303 | threshold, 304 | minlenratio, 305 | maxlenratio, 306 | ) 307 | 308 | @torch.no_grad() 309 | def tts_rt(self, text, speech_cb, speaker=None): 310 | with self.cuda_lock: 311 | inputs = self.processor(text=text, 312 | return_tensors="pt").to(self.model.device) 313 | if speaker is None: 314 | speaker = self.get_rand_voice() 315 | self.generate_speech_rt(inputs["input_ids"], speech_cb, 316 | speaker) 317 | -------------------------------------------------------------------------------- /HelloSippyTTSRT/HelloSippyRTPipeTest.py: -------------------------------------------------------------------------------- 1 | try: import intel_extension_for_pytorch as ipex 2 | except ModuleNotFoundError: ipex = None 3 | 4 | import sys, random, weakref, uuid 5 | from typing import List, Optional, Tuple 6 | import contextlib, time 7 | from os.path import exists as path_exists 8 | from queue import Queue, Empty as QueueEmpty 9 | from dataclasses import dataclass 10 | 11 | import numpy as np 12 | 13 | import torch 14 | for i in range(2): 15 | try: 16 | from HelloSippyTTSRT.HelloSippyRTPipe import HelloSippyRTPipe, HelloSippyPipeState, HelloSippyPipeStateBatched, \ 17 | HelloSippyPlayRequest, SessCmd, SessSyncCmd 18 | except ModuleNotFoundError: 19 | from sys import path as sys_path 20 | from os import getcwd 21 | sys_path.append(getcwd()) 22 | else: break 23 | else: raise ModuleNotFoundError('HelloSippyRTPipe') 24 | 25 | from transformers import set_seed 26 | 27 | class ErrMaxSessReached(Exception): pass 28 | 29 | import threading 30 | 31 | from elperiodic.ElPeriodic import ElPeriodic 32 | 33 | from time import monotonic 34 | 35 | class trp_thread(threading.Thread): 36 | queue: Queue 37 | queue_out: Optional[Queue] = None 38 | elp: Optional[ElPeriodic] = None 39 | period = None 40 | def __init__(self, period:float=0.0, noreturn:bool=False): 41 | self.queue = Queue() 42 | if not noreturn: self.queue_out = Queue() 43 | if period > 0.0: self.elp = ElPeriodic(1.0 / period) 44 | super().__init__(target=self.__thread) 45 | self.daemon = True 46 | self.start() 47 | 48 | def __call__(self, func): 49 | #raise Exception(f'__call__ {args=} {kwargs=}') 50 | def __call(*args, **kwargs): 51 | #raise Exception(f'__call {args=} {kwargs=}') 52 | t = monotonic() 53 | self.queue.put((func, args, kwargs)) 54 | ex, res = self.queue_out.get() 55 | if ex: raise ex 56 | return res 57 | def __call_noret(*args, **kwargs): 58 | self.queue.put((func, args, kwargs)) 59 | return __call if self.queue_out else __call_noret 60 | #return self.queue_out.get() 61 | 62 | def __thread(self): 63 | while True: 64 | a = self.queue.get() 65 | if a is None: break 66 | func, args, kwargs = a 67 | st = monotonic() 68 | try: res = (None, func(*args, **kwargs)) 69 | except Exception as ex:res = (ex, None) 70 | et = monotonic() 71 | if self.queue_out: self.queue_out.put(res) 72 | elif res[0]: raise res[0] 73 | if self.elp: self.elp.procrastinate() 74 | 75 | def __del__(self): 76 | print('del') 77 | if not hasattr(self, 'queue'): return 78 | self.queue.put(None) 79 | self.join() 80 | self.func = None 81 | 82 | class WeakDispatcher(): 83 | def __init__(self, queue:Queue): self.queue = weakref.ref(queue) 84 | def __call__(self, res): 85 | q = self.queue() 86 | if q: q.put(res.to(torch.float16).numpy() if res is not None else None) 87 | 88 | 
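# WeakDispatcher holds only a weak reference to the consumer's Queue: once the
# receiver drops its last strong reference, frames addressed to it are silently
# discarded instead of the pipeline keeping the queue (and its session) alive.
# A minimal sketch of the behaviour, assuming CPython's immediate refcounting;
# the names below are illustrative only, not part of this module:
#
#   q = Queue()
#   d = WeakDispatcher(q)
#   d(torch.zeros(160))    # delivered, downcast to a float16 numpy array
#   del q                  # last strong reference gone
#   d(torch.zeros(160))    # dropped: the weakref now resolves to None
#   d(None)                # None is the end-of-stream marker, also dropped here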
class InfernSession: 89 | _cmd_queue:Queue 90 | id:uuid.UUID 91 | default_speaker:torch.Tensor 92 | def __init__(self, queue, default_speaker:torch.Tensor): self.id, self._cmd_queue, self.default_speaker = uuid.uuid4(), queue, default_speaker 93 | def play(self, text:str, dispatch:Queue, speaker:Optional[torch.Tensor] = None): 94 | cmd = HelloSippyPlayRequest(self.id, text, speaker if speaker is not None else self.default_speaker, WeakDispatcher(dispatch)) 95 | self._cmd_queue.put(cmd) 96 | 97 | class HelloSippyRTPipeTest(HelloSippyRTPipe): 98 | _main_thread_id: int 99 | _sync_queue: Queue 100 | sessions: weakref.WeakValueDictionary[uuid.UUID, InfernSession] 101 | max_sessions: int = 50 102 | output_sr = 8000 103 | 104 | def __init__(self, *a, **kwa): 105 | self._main_thread_id = threading.get_ident() 106 | self._sync_queue = Queue() 107 | self.sessions = weakref.WeakValueDictionary() 108 | super().__init__(*a, **kwa) 109 | 110 | def alloc_session(self, speaker:Optional[torch.Tensor]=None) -> InfernSession: 111 | assert threading.get_ident() == self._main_thread_id 112 | if len(self.sessions) >= self.max_sessions: raise ErrMaxSessReached(f'No more sessions available {self.max_sessions=}') 113 | if speaker is None: speaker = self.get_rand_voice()[0] 114 | rv = InfernSession(self._sync_queue, speaker) 115 | self.sessions[rv.id] = rv 116 | ss = SessSyncCmd(self.sessions) 117 | self._sync_queue.put(ss) 118 | return rv 119 | 120 | def savetensor(self, tensor:torch.Tensor, name:str): 121 | fname = f'{name}{self.saveidx}.npy' 122 | np.save(fname, tensor.cpu().numpy()) 123 | 124 | class WorkerState: state:Optional[HelloSippyPipeStateBatched]=None; live:Optional[List[uuid.UUID]]=None 125 | 126 | @trp_thread(noreturn=True) 127 | def synchronize(self, ws:Optional[WorkerState]) -> None: 128 | if not ws: ws = self.WorkerState() 129 | state = ws.state 130 | if state: return (self.main_gen(ws), None)[-1] 131 | ssq = [self._sync_queue.get(),] 132 | try: 133 | while True: ssq.append(self._sync_queue.get_nowait()) 134 | except QueueEmpty: pass 135 | assert all(isinstance(x, SessCmd) for x in ssq) 136 | syncs, reqs = [x for x in ssq if isinstance(x, SessSyncCmd)], [x for x in ssq if not isinstance(x, SessSyncCmd)] 137 | if len(syncs) == 0 and len(reqs) == 0: raise AssertionError(f'this could not be happening {ssq=}') 138 | #print(f'{len(syncs)=} {len(reqs)=} {syncs=}') 139 | ws.live = live = syncs[-1].live if len(syncs) > 0 else ws.live 140 | if not live: return (self.synchronize(ws), None)[-1] 141 | reqs_live = [x for x in reqs if x.session in live] 142 | if len(reqs_live) == 0: return (self.synchronize(ws), None)[-1] 143 | with self.cuda_lock: 144 | new_states = [HelloSippyPipeState(self, r) for r in reqs_live] 145 | #if state: state.mergein(new_states) 146 | ws.state = HelloSippyPipeStateBatched(new_states, self) 147 | #raise Exception(f'{len(ssq)=} {reqs_live=} {live=} {len(self.sessions)=}') 148 | self.main_gen(ws) 149 | 150 | @trp_thread(noreturn=True) 151 | def main_gen(self, ws:WorkerState) -> None: 152 | super().infer(ws.state) 153 | #print(f'{state.ends_at.shape=} {state.ends_at.cpu().numpy()=} {state.audio.shape=}') 154 | self.unbatch_and_dispatch(ws) 155 | 156 | @trp_thread(noreturn=True) 157 | def unbatch_and_dispatch(self, ws:WorkerState): 158 | more = super().unbatch_and_dispatch(ws.state) 159 | if not more: 160 | ws.state = None 161 | self.synchronize(ws) 162 | 163 | class Timing(contextlib.ContextDecorator): 164 | def __init__(self, prefix="", on_exit=None, enabled=True): self.prefix,
self.on_exit, self.enabled = prefix, on_exit, enabled 165 | def __enter__(self): self.st = time.perf_counter_ns() 166 | def __exit__(self, *exc): 167 | self.et = time.perf_counter_ns() - self.st 168 | if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else "")) 169 | 170 | def seed_RNGs(): 171 | seed = 42 172 | random.seed(seed) 173 | torch.manual_seed(seed) 174 | set_seed(seed) 175 | np.random.seed(seed) 176 | torch.cuda.manual_seed_all(seed) 177 | torch.backends.cudnn.deterministic = True 178 | 179 | @torch.no_grad() 180 | def main(): 181 | import soundfile as sf 182 | from time import monotonic 183 | seed_RNGs() 184 | from random import choices 185 | from utils.tts import smith_set, bender_set, hal_set 186 | n = 50 187 | prompts = choices([y for x in smith_set() + bender_set() + hal_set() for y in x.split('|')], k=n) 188 | #prompts = prompts 189 | #prompts = [prompts[0] for _ in range(n)] 190 | @dataclass(frozen=True) 191 | class ResFeedback: n:int; time_to_first_frame:float; time_to_last_frame:float; number_of_frames:int 192 | class res_cb(threading.Thread): 193 | def __init__(self, n, name='dispatch', res_queue=None): 194 | super().__init__(target=self.__thread) 195 | self.n, self.name, self.res_queue = n, name, res_queue 196 | if self.name == 'dispatch': self.data = np.empty(0) 197 | self.q = Queue() 198 | self.daemon = True 199 | self.start() 200 | 201 | def __thread(self): 202 | st = monotonic() 203 | time_to_first_frame = None 204 | while (y:=self.q.get()) is not None: 205 | #print(f'{self.name}{self.n}({y.shape=})') 206 | self.data = np.concatenate((self.data, y), axis=0) 207 | if time_to_first_frame is None: time_to_first_frame = monotonic() - st 208 | self.eos(ResFeedback(self.n, time_to_first_frame, monotonic()-st, int(self.data.shape[0]))) 209 | 210 | def eos(self, res:ResFeedback): 211 | sys.stdout.write(f'eos({self.n}) {self.data.shape=}\n') 212 | sys.stdout.flush() 213 | sf.write(f'out_{self.n}.wav', self.data, 8000, 'PCM_16') 214 | if self.res_queue: self.res_queue.put(res) 215 | 216 | params = {'hidden_dropout':0.0, 'positional_dropout':0.0, 'speech_decoder_prenet_dropout':0.0, 217 | 'activation_dropout':0.0, 'encoder_layerdrop':0.0, 'decoder_layerdrop':0.0, 'attention_dropout':0.0, 218 | 'speech_decoder_postnet_dropout':0.0, 'feat_proj_dropout':0.0} 219 | sp = HelloSippyRTPipeTest('xpu' if ipex is not None else 'cuda') 220 | if ipex is not None: 221 | sp.model = ipex.optimize(sp.model, dtype=torch.bfloat16) 222 | sp.vocoder = ipex.optimize(sp.vocoder, dtype=torch.bfloat16) 223 | sp.chunker = ipex.optimize(sp.chunker, dtype=torch.bfloat16) 224 | 225 | s1 = [sp.alloc_session() for i in range(50)] 226 | del s1 227 | res_queue = Queue() 228 | from time import sleep 229 | #sp.synchronize(None) 230 | s2 = [((s:=sp.alloc_session()), (r:=res_cb(n, res_queue=res_queue)), s.play(p, r.q), 'sleep(0.5)') for n, p in enumerate(prompts)] 231 | sp.synchronize(None) 232 | for _ in range(len(s2)): 233 | res = res_queue.get() 234 | rtr = (res.time_to_last_frame - res.time_to_first_frame) / (res.number_of_frames / 8000) 235 | print(f'Sess#{res.n}: {res.time_to_first_frame=}, {res.time_to_last_frame=}, {res.number_of_frames=} {rtr=}') 236 | sys.stdout.flush() 237 | s2[res.n][1].join() 238 | return(0) 239 | 240 | def init_states(states): 241 | d_callbacks = [res_cb(n, 'dispatch', lambda x:x[0].shape) for n, _ in enumerate(states)] 242 | e_callbacks = [d.eos for d in d_callbacks] 243 | for state, d_cb, e_cb in zip(states, d_callbacks, 
e_callbacks): state.dispatch, state.eos_cb = d_cb, e_cb 244 | return states 245 | seed_RNGs() 246 | states = [sp.once(x) for x in prompts] 247 | init_states(states) 248 | states = sp.batch_for_main_gen(states) 249 | states.res_queue = Queue() 250 | sp.synchronize(states) 251 | with Timing("main_gen: "): 252 | state = states.res_queue.get() 253 | #while state.next is not None: 254 | # state = state.next(state) 255 | #exit(1) 256 | with Timing("once: "): 257 | seed_RNGs() 258 | states = [sp.once(x) for x in prompts] 259 | init_states(states) 260 | #state1 = sp.once('Hello, world!') 261 | #state2 = sp.once('How are you doing today?') 262 | #state3 = sp.once('I am doing well, thank you very much.') 263 | with Timing("batch_for_main_gen: "): 264 | states = sp.batch_for_main_gen(states) 265 | states.res_queue = Queue() 266 | sp.synchronize(states) 267 | with Timing("main_gen: "): 268 | state = states.res_queue.get() 269 | 270 | if __name__ == '__main__' and (r:=main()) not in (None, 0): raise RuntimeError(f'main() returned {r}') 271 | -------------------------------------------------------------------------------- /Infernos.py: -------------------------------------------------------------------------------- 1 | from getopt import getopt, GetoptError 2 | import os, sys 3 | 4 | import ray 5 | 6 | from sippy.misc import daemonize 7 | 8 | sys.path.append('.') 9 | 10 | from Cluster.InfernSIPActor import InfernSIPActor 11 | from Core.InfernConfig import InfernConfig 12 | 13 | def patch_signals(): 14 | import threading 15 | import signal 16 | 17 | def _start_new_thread(*args): 18 | allsigs = list(signal.valid_signals()) 19 | 20 | old_sigset = signal.pthread_sigmask(signal.SIG_BLOCK, allsigs) 21 | ret = _old_start_new_thread(*args) 22 | signal.pthread_sigmask(signal.SIG_SETMASK, old_sigset) 23 | return ret 24 | 25 | _old_start_new_thread = threading._start_new_thread 26 | threading._start_new_thread = _start_new_thread 27 | 28 | def usage(): 29 | print('usage: Infernos.py [-f] [-L logfile] [-i pidfile] [myconfig.yaml]') 30 | sys.exit(1) 31 | 32 | if __name__ == '__main__': 33 | try: 34 | opts, args = getopt(sys.argv[1:], 'fL:i:') 35 | except GetoptError: 36 | usage() 37 | 38 | if len(args) > 1: 39 | usage() 40 | 41 | cfile = 'config.yaml' if len(args) == 0 else args[0] 42 | 43 | idir = os.path.realpath(sys.argv[0]) 44 | idir = os.path.dirname(idir) 45 | sys.path.append(idir) 46 | logfile = '/var/log/Infernos.log' 47 | pidfile = None 48 | foreground = False 49 | for o, a in opts: 50 | if o == '-f': 51 | foreground = True 52 | elif o == '-L': 53 | logfile = a 54 | elif o == '-i': 55 | pidfile = a 56 | 57 | if not foreground: 58 | daemonize(logfile) 59 | 60 | patch_signals() 61 | 62 | if logfile == '-': 63 | lfile = sys.stdout 64 | else: 65 | lfile = open(logfile, 'a') 66 | 67 | default_resources = InfernSIPActor.default_resources 68 | default_resources['live_translator'] = 1 69 | default_resources['ai_attendant'] = 1 70 | default_resources['tts'] = 2 71 | default_resources['stt'] = 1 72 | default_resources['llm'] = 1 73 | try: 74 | ray.init(num_gpus=2, resources = default_resources) 75 | except ValueError as ex: 76 | if 'connecting to an existing cluster' not in str(ex): raise ex 77 | ray.init() 78 | 79 | inf_c = InfernConfig(cfile) 80 | 81 | if pidfile is not None: 82 | open(pidfile, 'w').write('%d' % os.getpid()) 83 | 84 | if inf_c.sip_actr is None: 85 | ray.shutdown() 86 | exit(0) 87 | 88 | try: 89 | exit(ray.get(inf_c.sip_actr.loop.remote(inf_c))) 90 | except KeyboardInterrupt: 91 |
ray.get(inf_c.sip_actr.stop.remote()) 92 | raise 93 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2023-2024, Sippy Labs 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # I.N.F.E.R.N.O.S. 2 | 3 | ### Interactive Neural Framework for Efficient Realtime Network Operations on Streams 4 | 5 | 🔥 Welcome to Infernos, where data comes to life in real-time! 🔥 6 | 7 | ## Overview 8 | 9 | Harness the power of **I.N.F.E.R.N.O.S.** to transform audio, video, and 10 | text streams with state-of-the-art inference in an instant. Embrace a 11 | blazing-fast future, free from lag. 12 | 13 | ## News 14 | 15 | Initial integration of the LLM (Qwen 2.5) and addition of the A.I. 16 | Attendant application. 17 | 18 | Upcoming presentation at the OpenSIPS Summit 2025. 19 | 20 | ## Features 21 | 22 | - **Interactive:** Infernos isn't just another tool; it's an 23 | experience. Speak in one voice and marvel as it's automatically 24 | translated into a completely different tone or even language, and 25 | then seamlessly transmitted in real-time during phone or web 26 | meetings. 27 | 28 | - **Neural Power:** With deep learning at its core, Infernos is 29 | optimized for top-notch performance. 30 | 31 | - **Multimodal Support:** Whether it's audio, video, or text, Infernos 32 | handles them with elegance. 33 | 34 | - **Efficiency:** Designed for low-latency, high-throughput 35 | operations. 36 | 37 | - **Realtime:** Don't wait. Experience the magic as it unfolds. 38 | 39 | ## Quick Start 40 | 41 | 1. Clone the repository: 42 | 43 | ```bash 44 | git clone https://github.com/sippy/Infernos.git 45 | ``` 46 | 47 | 2. Navigate to the project directory and install dependencies: 48 | 49 | ```bash 50 | cd Infernos && pip install -r requirements.txt 51 | ``` 52 | 53 | 3. Create a configuration file. 
In the following example we 54 | listen for and accept SIP calls from `MY_IP` and pass them into the Live 55 | Translator application. A SIP account is then used to send 56 | outbound call legs to `DEST_NUM`@`MY_SIP_SRV`: 57 | 58 | ```bash 59 | MY_IP="A.B.C.D" 60 | MY_SIP_SRV="E.F.G.H" 61 | DEST_NUM="12345" 62 | DEST_USER="foo" 63 | DEST_PWD="bar" 64 | cat > config.yaml <English / English->Portuguese 115 | on an AWS instance "from zero to hero" in less than 60 minutes. 116 | - [Infernos: cost efficient AI inference for real-time applications:](https://www.youtube.com/watch?v=eawO0hXeO5Y) 117 | Overview of the Infernos architecture and progress over the past few months. 118 | 119 | ## Join Us 120 | 121 | - [Discord](https://discord.gg/bb95ZWhrhQ) 122 | 123 | ------------------------------------------------------------------------ 124 | 125 | Stay on the lookout for more sizzling updates, and always remember: 126 | **Infernos** makes the future sizzle! 127 | -------------------------------------------------------------------------------- /RTP/AudioInput.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | class AudioInput(): 4 | vad_chunk_in:Optional[callable] 5 | audio_in:Optional[callable] 6 | def __init__(self, audio_in:Optional[callable]=None, vad_chunk_in:Optional[callable]=None): 7 | self.vad_chunk_in = vad_chunk_in 8 | self.audio_in = audio_in 9 | -------------------------------------------------------------------------------- /RTP/InfernRTPConf.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sippy.Network_server import RTP_port_allocator 4 | 5 | class InfernRTPConf(): 6 | schema: dict = { 7 | 'settings': { 8 | 'type': 'dict', 9 | 'schema': { 10 | 'min_port': {'type': 'integer', 'min': 1, 'max': 65535}, 11 | 'max_port': {'type': 'integer', 'min': 1, 'max': 65535}, 12 | } 13 | } 14 | } 15 | palloc: RTP_port_allocator 16 | def __init__(self, conf:Optional[dict]=None): 17 | max_port = conf.get('max_port', None) if conf is not None else None 18 | min_port = conf.get('min_port', None) if conf is not None else None 19 | self.palloc = RTP_port_allocator(min_port, max_port) 20 | -------------------------------------------------------------------------------- /RTP/InfernRTPEPoint.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union 2 | from uuid import uuid4, UUID 3 | from threading import Lock 4 | 5 | from sippy.Udp_server import Udp_server, Udp_server_opts 6 | from sippy.misc import local4remote 7 | 8 | from config.InfernGlobals import InfernGlobals as IG 9 | from Core.AudioChunk import AudioChunk 10 | from Core.AStreamMarkers import ASMarkerGeneric, ASMarkerNewSent 11 | from RTP.RTPOutputWorker import RTPOutputWorker 12 | from RTP.InfernRTPIngest import RTPInStream 13 | from RTP.AudioInput import AudioInput 14 | from RTP.RTPParams import RTPParams 15 | from RTP.InfernRTPIngest import InfernRTPIngest 16 | from RTP.InfernRTPConf import InfernRTPConf 17 | 18 | class InfernRTPEPoint(): 19 | debug: bool = False 20 | id: UUID 21 | dl_file = None 22 | firstframe = True 23 | rtp_params:RTPParams 24 | state_lock: Lock 25 | def __init__(self, rc:InfernRTPConf, rtp_params:RTPParams, ring:InfernRTPIngest, get_direct_soundout:callable): 26 | self.id = uuid4() 27 | self.rtp_params = rtp_params 28 | self.state_lock = Lock() 29 | self.writer = RTPOutputWorker('cpu', rtp_params) 30 | self.rsess =
RTPInStream(ring, rtp_params, get_direct_soundout) 31 | rtp_laddr = local4remote(rtp_params.rtp_target[0]) 32 | rserv_opts = Udp_server_opts((rtp_laddr, rc.palloc), self.rtp_received) 33 | rserv_opts.nworkers = 1 34 | rserv_opts.direct_dispatch = True 35 | self.rserv = Udp_server({}, rserv_opts) 36 | self.writer_setup() 37 | 38 | def writer_setup(self): 39 | self.writer.set_pkt_send_f(self.send_pkt) 40 | if self.dl_file is not None: 41 | self.writer.enable_datalog(self.dl_file) 42 | self.writer.start() 43 | 44 | def send_pkt(self, pkt): 45 | with self.state_lock: 46 | rtp_target = self.rtp_params.rtp_target 47 | self.rserv.send_to(pkt, rtp_target) 48 | 49 | def rtp_received(self, data, address, udp_server, rtime): 50 | #self.dprint(f"InfernRTPIngest.rtp_received: len(data) = {len(data)}") 51 | with self.state_lock: 52 | if address != self.rtp_params.rtp_target: 53 | if self.debug: 54 | print(f"InfernRTPIngest.rtp_received: address mismatch {address=} {self.rtp_params.rtp_target=}") 55 | return 56 | self.rsess.rtp_received(data, address, rtime) 57 | 58 | def update(self, rtp_params:RTPParams): 59 | with self.state_lock: 60 | self.rtp_params.rtp_target = rtp_params.rtp_target 61 | if self.rtp_params.out_ptime != rtp_params.out_ptime: 62 | self.writer.end() 63 | self.writer.join() 64 | self.writer = RTPOutputWorker('cpu', rtp_params) 65 | self.writer_setup() 66 | self.rsess.stream_update() 67 | 68 | def connect(self, ain:AudioInput): 69 | self.rsess.stream_connect(ain) 70 | 71 | def shutdown(self): 72 | with self.state_lock: 73 | self.writer.join() 74 | self.rserv.shutdown() 75 | self.rserv, self.writer = (None, None) 76 | 77 | def __del__(self): 78 | if self.debug: 79 | print('InfernRTPEPoint.__del__') 80 | 81 | def soundout(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 82 | ismark = isinstance(chunk, ASMarkerGeneric) 83 | if self.firstframe or ismark: 84 | if self.debug: 85 | print(f'{IG.stdtss()}: rtp_session_soundout[{str(self.id)[:6]}]: {"mark" if ismark else chunk.audio.size(0)}') 86 | self.firstframe = False 87 | if ismark and isinstance(chunk, ASMarkerNewSent): 88 | self.firstframe = True 89 | with self.state_lock: 90 | if self.writer is None: return 91 | return self.writer.soundout(chunk) 92 | -------------------------------------------------------------------------------- /RTP/InfernRTPIngest.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | from queue import Queue 3 | from threading import Lock 4 | from uuid import UUID 5 | 6 | from rtpsynth.RtpJBuf import RtpJBuf, RTPFrameType, RTPParseError 7 | 8 | from Core.InfernWrkThread import InfernWrkThread, RTPWrkTRun 9 | from Core.VAD.SileroVAD import SileroVADWorker, VADChannel 10 | from Core.Codecs.G711 import G711Codec 11 | from Core.AudioChunk import AudioChunk 12 | from RTP.AudioInput import AudioInput 13 | from RTP.RTPParams import RTPParams 14 | 15 | class WIPkt(): 16 | def __init__(self, stream: 'RTPInStream', data, address, rtime): 17 | self.stream = stream 18 | self.data = data 19 | self.address = address 20 | self.rtime = rtime 21 | 22 | class WIStreamUpdate(): 23 | def __init__(self, stream: 'RTPInStream'): 24 | self.stream = stream 25 | 26 | class WIStreamConnect(): 27 | def __init__(self, stream: 'RTPInStream', ain:AudioInput): 28 | self.stream = stream 29 | self.ain = ain 30 | 31 | class RTPInStream(): 32 | jb_size: int = 8 33 | input_sr: int = 8000 34 | last_output_lseq: Optional[int] = None 35 | vchan: VADChannel 36 | codec: G711Codec 37 | 
output_sr: int = 16000 38 | npkts: int = 0 39 | ain: AudioInput 40 | ain_lock: Lock 41 | get_direct_soundout: callable 42 | def __init__(self, ring:'InfernRTPIngest', rtp_params:RTPParams, get_direct_soundout:callable): 43 | self.jbuf = RtpJBuf(self.jb_size) 44 | self.codec = rtp_params.codec().to(ring.device) 45 | self.ring = ring 46 | self.get_direct_soundout = get_direct_soundout 47 | self.ain = AudioInput() 48 | self.ain_lock = Lock() 49 | self.vchan = VADChannel(self.audio_chunk_out, self.vad_chunk_out, self.codec.decode, ring.device) 50 | 51 | def rtp_received(self, data, address, rtime): 52 | #self.dprint(f"InfernRTPIngest.rtp_received: len(data) = {len(data)}") 53 | self.ring.pkt_queue.put(WIPkt(self, data, address, rtime)) 54 | 55 | def stream_update(self): 56 | self.ring.pkt_queue.put(WIStreamUpdate(self)) 57 | 58 | def stream_connect(self, ain:AudioInput): 59 | if isinstance(ain.vad_chunk_in, UUID): ain.vad_chunk_in = self.get_direct_soundout(ain.vad_chunk_in) 60 | if isinstance(ain.audio_in, UUID): ain.audio_in = self.get_direct_soundout(ain.audio_in) 61 | self.ring.pkt_queue.put(WIStreamConnect(self, ain)) 62 | 63 | def _proc_in_tread(self, wi:Union[WIPkt,WIStreamUpdate], svad:SileroVADWorker): 64 | def dprint(msg:str): return self.ring.dprint(f'InfernRTPIngest.run: {msg}') if self.ring.debug else None 65 | 66 | if isinstance(wi, WIStreamUpdate): 67 | dprint("stream update") 68 | self.jbuf = RtpJBuf(self.jb_size) 69 | self.last_output_lseq = None 70 | return 71 | if isinstance(wi, WIStreamConnect): 72 | dprint("stream connect") 73 | with self.ain_lock: 74 | self.ain = wi.ain 75 | return 76 | data, address, rtime = wi.data, wi.address, wi.rtime 77 | try: 78 | res = self.jbuf.udp_in(data) 79 | except RTPParseError as e: 80 | dprint(f"RTPParseError: {e}") 81 | return 82 | self.npkts += 1 83 | if self.npkts == 1: 84 | dprint(f"address={address}, rtime={rtime}, len(data) = {len(data)} data={data[:40]}") 85 | for pkt in res: 86 | if pkt.content.type == RTPFrameType.ERS: 87 | print(f"ERS packet received {pkt.content.lseq_start=}, {pkt.content.lseq_end=} {pkt.content.ts_diff=}") 88 | self.last_output_lseq = pkt.content.lseq_end 89 | rtp_data = self.codec.silence(pkt.content.ts_diff) 90 | else: 91 | if self.npkts < 10: 92 | dprint(f"{pkt.content.frame.rtp.lseq=}") 93 | assert self.last_output_lseq is None or pkt.content.frame.rtp.lseq == self.last_output_lseq + 1 94 | self.last_output_lseq = pkt.content.frame.rtp.lseq 95 | if self.npkts < 10: 96 | dprint(f"{len(pkt.rtp_data)=}, {type(pkt.rtp_data)=}") 97 | rtp_data = pkt.rtp_data 98 | self.vchan.ingest(svad, rtp_data, self.codec) 99 | if self.npkts < 10 and len(res) > 0: 100 | dprint(f"{res=}") 101 | 102 | def audio_chunk_out(self, chunk:AudioChunk, active:bool): 103 | chunk.active = active 104 | with self.ain_lock: 105 | if self.ain.audio_in is None: return 106 | self.ain.audio_in(chunk=chunk) 107 | 108 | def vad_chunk_out(self, chunk:AudioChunk): 109 | with self.ain_lock: 110 | if self.ain.vad_chunk_in is None: return 111 | self.ain.vad_chunk_in(chunk=chunk) 112 | 113 | class InfernRTPIngest(InfernWrkThread): 114 | debug = False 115 | pkt_queue: Queue[Union[WIPkt,WIStreamUpdate,WIStreamConnect]] 116 | _start_queue: Queue[int] 117 | def __init__(self, device:str): 118 | super().__init__() 119 | self.pkt_queue = Queue() 120 | self.device = device 121 | 122 | def start(self): 123 | self._start_queue = Queue() 124 | super().start() 125 | r = self._start_queue.get() 126 | if isinstance(r, Exception): 127 | super().join() 128 | raise r 129 
| del self._start_queue 130 | 131 | def dprint(self, *args): 132 | if self.debug: 133 | print(*args) 134 | 135 | def run(self): 136 | super().thread_started() 137 | try: 138 | svad = SileroVADWorker(self.device) 139 | svad.start() 140 | except Exception as e: 141 | self._start_queue.put(e) 142 | return 143 | self._start_queue.put(0) 144 | self.dprint("InfernRTPIngest started") 145 | data, address, rtime = (None, None, None) 146 | while self.get_state() == RTPWrkTRun: 147 | wi = self.pkt_queue.get() 148 | if wi is None: break 149 | wi.stream._proc_in_tread(wi, svad) 150 | svad.stop() 151 | # if data is not None: 152 | # self.dprint(f"InfernRTPIngest.run: last packet: address={address}, rtime={rtime}, len(data) = {len(data)} data={data[:40]}") 153 | # self.dprint(f"InfernRTPIngest.run: exiting, total packets received: {npkts}") 154 | 155 | def stop(self): 156 | self.pkt_queue.put(None) 157 | super().stop() 158 | self.dprint("InfernRTPIngest stopped") 159 | 160 | def __del__(self): 161 | self.dprint("InfernRTPIngest.__del__") 162 | -------------------------------------------------------------------------------- /RTP/RTPOutputWorker.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Union 2 | from fractions import Fraction 3 | import queue 4 | import threading 5 | from time import monotonic, sleep 6 | 7 | from rtpsynth.RtpSynth import RtpSynth 8 | import soundfile as sf 9 | 10 | from Core.Codecs.G711 import G711Codec 11 | from Core.AudioChunk import AudioChunk 12 | from Core.OutputMuxer import OutputMTMuxer 13 | from Core.AStreamMarkers import ASMarkerGeneric 14 | from RTP.RTPParams import RTPParams 15 | 16 | class RTPOutputWorker(threading.Thread): 17 | data_queue: queue.Queue[Union[AudioChunk, ASMarkerGeneric]] 18 | debug = False 19 | dl_ofname: Optional[str] = None 20 | data_log = None 21 | pkt_send_f = None 22 | state_lock: Optional[threading.Lock] = None 23 | frames_rcvd = 0 24 | frames_prcsd = 0 25 | has_ended = False 26 | codec: G711Codec 27 | samplerate_out: int 28 | out_ft: int # in ms 29 | 30 | def __init__(self, device, rtp_params:RTPParams): 31 | self.itime = monotonic() 32 | self.device = device 33 | #if os.path.exists(self.ofname): 34 | # self.data, _ = sf.read(self.ofname) 35 | self.data_queue = queue.Queue() 36 | self.codec = rtp_params.codec().to(device) 37 | self.samplerate_out = self.codec.srate 38 | self.state_lock = threading.Lock() 39 | self.out_ft = rtp_params.out_ptime 40 | super().__init__(target=self.consume_audio) 41 | self.daemon = True 42 | 43 | def enable_datalog(self, dl_ofname): 44 | self.dl_ofname = dl_ofname 45 | 46 | def set_pkt_send_f(self, pkt_send_f): 47 | self.pkt_send_f = pkt_send_f 48 | 49 | def ended(self): 50 | self.state_lock.acquire() 51 | t = self.has_ended 52 | self.state_lock.release() 53 | return t 54 | 55 | def end(self): 56 | self.state_lock.acquire() 57 | self.has_ended = True 58 | self.state_lock.release() 59 | 60 | def update_frm_ctrs(self, rcvd_inc=0, prcsd_inc=0): 61 | self.state_lock.acquire() 62 | self.frames_rcvd += rcvd_inc 63 | self.frames_prcsd += prcsd_inc 64 | self.state_lock.release() 65 | 66 | def get_frm_ctrs(self): 67 | self.state_lock.acquire() 68 | res = (self.frames_rcvd, self.frames_prcsd) 69 | self.state_lock.release() 70 | return res 71 | 72 | def soundout(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 73 | #print(f'soundout: {monotonic():4.3f}') 74 | #return (0, False) 75 | ismark = isinstance(chunk, ASMarkerGeneric) 76 | assert ismark or chunk.audio.size(0) > 0 77
| if (self.debug or chunk.debug) and not ismark: 78 | print(f'len(chunk) = {len(chunk.audio)}') 79 | if not ismark: 80 | chunk.audio = chunk.audio.to(self.device) 81 | self.data_queue.put(chunk) 82 | return (self.data_queue.qsize(), False) 83 | 84 | def consume_audio(self): 85 | out_pt = self.codec.ptype 86 | out_fsize = self.samplerate_out * self.out_ft // 1000 87 | ptime = Fraction(0) 88 | stime = None 89 | rsynth = RtpSynth(self.codec.crate, self.out_ft) 90 | qtimeout = Fraction(self.out_ft, 1000) 91 | out_qsize = self.out_ft * (self.samplerate_out // 10 // self.out_ft) # ~0.1 sec (rounded to a frame size) 92 | mix = OutputMTMuxer(self.samplerate_out, out_qsize, self.device) 93 | while not self.ended(): 94 | ctime = monotonic() 95 | try: 96 | chunk_n = self.data_queue.get(block=False) 97 | except queue.Empty: 98 | chunk_o_n = mix.idle(self) 99 | if chunk_o_n is None: 100 | if stime is not None: 101 | ptime += qtimeout 102 | etime = ctime - stime 103 | if ptime > etime: 104 | sleep(ptime - etime) 105 | if self.debug: print(f'{self}.consume_audio, skip {ptime - etime=}') 106 | rsynth.skip(1) 107 | else: 108 | sleep(float(qtimeout)) 109 | continue 110 | else: 111 | #if isinstance(chunk_n, AudioChunk): self.update_frm_ctrs(rcvd_inc=chunk_n.audio.size(0)) 112 | mix.chunk_in(chunk_n) 113 | continue 114 | 115 | if stime is None: 116 | stime = ctime 117 | 118 | chunk_o_n = self.codec.encode(chunk_o_n) 119 | out_psize = self.codec.d2e_frames(out_fsize) 120 | while len(chunk_o_n) >= out_psize: 121 | #self.update_frm_ctrs(prcsd_inc=out_fsize*2) 122 | packet = chunk_o_n[:out_psize] 123 | assert len(packet) == out_psize, f'{len(packet)=}, {out_psize=}' 124 | chunk_o_n = chunk_o_n[out_psize:] 125 | 126 | ptime += Fraction(out_fsize, self.samplerate_out) 127 | etime = ctime - stime 128 | 129 | #print(packet.size()) 130 | #packet = (packet * 20000).to(torch.int16) 131 | #packet = packet.byte().cpu().numpy() 132 | #packet = self.codec.encode(packet) 133 | #print('packet', packet.min(), packet.max(), packet[:10]) 134 | #print(len(packet), packet[:10]) 135 | pkt = rsynth.next_pkt(out_psize, out_pt, pload=packet) 136 | if self.pkt_send_f is not None: 137 | self.pkt_send_f(pkt) 138 | #print(len(pkt)) 139 | if chunk_n.debug or self.debug: 140 | print(f'{self}.consume_audio({etime=}, {ptime=}') 141 | if self.ended(): 142 | break 143 | if ptime > etime: 144 | sleep(ptime - etime) 145 | if self.ended(): 146 | break 147 | ctime = monotonic() 148 | if chunk_n.debug or self.debug: 149 | print(f'consume_audio, sleep({ptime - etime})') 150 | #if done_cb is not None: 151 | # rsynth.resync() 152 | # rsynth.set_mbt(1) 153 | # ptime = 0.0 154 | # stime = None 155 | # done_cb(self) 156 | 157 | def __del__(self): 158 | if self.debug: 159 | print('RTPOutputWorker.__del__') 160 | #self.worker_thread.join() 161 | if self.data_log is None: 162 | return 163 | amplification_dB = 20.0 164 | data = self.data_log #* (10 ** (amplification_dB / 20)) 165 | sf.write(self.dl_ofname, data.detach().cpu().numpy(), 166 | samplerate=self.samplerate_out) 167 | -------------------------------------------------------------------------------- /RTP/RTPParams.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Optional, Type, Union 2 | from Core.Codecs.G711 import G711Codec 3 | from Core.Codecs.G722 import G722Codec 4 | 5 | class RTPParams(): 6 | rtp_target: Tuple[str, int] 7 | out_ptime: int 8 | default_ptime: int = 20 9 | codec: Type[Union[G711Codec, G722Codec]] 10 | def 
__init__(self, rtp_target:Tuple[str, int], out_ptime:Optional[int]): 11 | assert isinstance(rtp_target, tuple) and len(rtp_target) == 2 12 | self.rtp_target = rtp_target 13 | self.out_ptime = out_ptime if out_ptime is not None else self.default_ptime 14 | -------------------------------------------------------------------------------- /SIP/InfernSIP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2024 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
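# InfernSIP wraps a sippy SipTransactionManager into the node's SIP endpoint:
# profiles with `register` enabled are signed in through SipRegistrationAgent,
# inbound INVITEs are matched to a profile by comparing the request source
# against the profile's next-hop address (nh_addr) and handed to that
# profile's application as a RemoteSessionOffer, and live dialogs are tracked
# in a WeakValueDictionary keyed by session UUID so ended calls vanish without
# explicit cleanup.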
25 | 26 | from typing import Optional, Dict 27 | from weakref import WeakValueDictionary 28 | from queue import Queue 29 | from threading import Lock 30 | 31 | from sippy.SipConf import SipConf 32 | from sippy.SipTransactionManager import SipTransactionManager 33 | from sippy.SipURL import SipURL 34 | from sippy.SipRegistrationAgent import SipRegistrationAgent 35 | from sippy.misc import local4remote 36 | 37 | #from Core.InfernConfig import InfernConfig 38 | 39 | from .InfernUAS import InfernLazyUAS 40 | from .InfernUAC import InfernUAC 41 | from .InfernUA import InfernUA 42 | from .InfernSIPProfile import InfernSIPProfile 43 | from .RemoteSession import RemoteSessionOffer, NewRemoteSessionRequest 44 | 45 | from utils.tts import human_readable_time, hal_set, smith_set, \ 46 | bender_set 47 | 48 | def good(*a): 49 | #ED2.breakLoop(0) 50 | pass 51 | 52 | def bad(*a): 53 | #ED2.breakLoop(1) 54 | pass 55 | 56 | class InfernSIP(): 57 | _c: Dict[str, InfernSIPProfile] 58 | ua = None 59 | body = None 60 | ragent = None 61 | sip_actr = None 62 | sippy_c = None 63 | sessions: WeakValueDictionary 64 | session_lock: Lock 65 | 66 | def __init__(self, sip_actr:'InfernSIPActor', rtp_actr, inf_c:'InfernConfig'): 67 | sip_c = inf_c.sip_conf 68 | self.sippy_c = {'_sip_address':sip_c.laddr, 69 | '_sip_port':sip_c.lport, 70 | '_sip_logger':sip_c.logger} 71 | self.sip_actr, self.rtp_actr = sip_actr, rtp_actr 72 | self.sessions = WeakValueDictionary() 73 | self.session_lock = Lock() 74 | udsc, udsoc = SipTransactionManager.model_udp_server 75 | udsoc.nworkers = 1 76 | SipConf.my_uaname = 'Infernos' 77 | stm = SipTransactionManager(self.sippy_c, self.recvRequest) 78 | self.sippy_c['_sip_tm'] = stm 79 | #raise Exception(f'{inf_c.connectors}') 80 | self._c = inf_c.connectors 81 | for n, v in self._c.items(): 82 | if not v.register: continue 83 | proxy, port = v.nh_addr 84 | aor = SipURL(username = v.cli, host = proxy, port = port) 85 | caddr = local4remote(proxy) 86 | cport = self.sippy_c['_sip_port'] 87 | contact = SipURL(username = v.cli, host = caddr, port = cport) 88 | ragent = SipRegistrationAgent(self.sippy_c, aor, contact, 89 | user=v.authname, passw=v.authpass, 90 | rok_cb=good, rfail_cb=bad) 91 | ragent.rmsg.getHFBody('to').getUrl().username = v.cli 92 | ragent.doregister() 93 | 94 | def recvRequest(self, req, sip_t): 95 | if req.getMethod() in ('NOTIFY', 'PING'): 96 | # Whynot?
97 | return (req.genResponse(200, 'OK'), None, None) 98 | if req.getMethod() == 'INVITE': 99 | #if self.rserv != None: 100 | # return (req.genResponse(486, 'Busy Here'), None, None) 101 | # New dialog 102 | source = req.getSource() 103 | for n, sip_prof in self._c.items(): 104 | assert type(source) == type(sip_prof.nh_addr) 105 | if source == sip_prof.nh_addr: 106 | break 107 | else: 108 | return (req.genResponse(500, 'Nobody is home'), None, None) 109 | isess = InfernLazyUAS(self, sip_prof, req, sip_t) 110 | with self.session_lock: 111 | self.sessions[isess.id] = isess 112 | rso = RemoteSessionOffer(self, isess) 113 | sip_prof.new_sess_offer(rso) 114 | return 115 | return (req.genResponse(501, 'Not Implemented'), None, None) 116 | 117 | def new_session(self, msg:NewRemoteSessionRequest, rval:Optional[Queue]=None): 118 | uac = InfernUAC(self, msg) 119 | with self.session_lock: 120 | self.sessions[uac.id] = uac 121 | ret = (uac, uac.rsess) 122 | if rval is None: return ret 123 | rval.put(ret) 124 | 125 | def get_session(self, sip_sess_id) -> InfernUA: 126 | with self.session_lock: 127 | return self.sessions[sip_sess_id] 128 | 129 | # def getPrompts(self): 130 | # return [f'{human_readable_time()}',] + list(self.prompts) 131 | -------------------------------------------------------------------------------- /SIP/InfernSIPConf.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from os.path import expanduser 3 | 4 | from sippy.SipConf import SipConf 5 | from sippy.SipLogger import SipLogger 6 | 7 | from Core.ConfigValidators import validate_port_range 8 | 9 | class InfernSIPConf(): 10 | schema: dict = { 11 | 'settings': { 12 | 'type': 'dict', 13 | 'schema': { 14 | 'bind': { 15 | 'type': 'string', 16 | 'regex': r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:[1-9][0-9]{0,4}|:0)?$', 17 | 'check_with': validate_port_range 18 | } 19 | } 20 | } 21 | } 22 | logger = None 23 | 24 | def __init__(self, conf:Optional[dict]=None): 25 | self.logger = SipLogger('Infernos', logfile = expanduser('~/.Infernos.log')) 26 | if conf is not None: 27 | try: 28 | bind = conf['bind'].split(':', 1) 29 | except KeyError: pass 30 | else: 31 | port = int(bind[1]) if len(bind) == 2 else SipConf.my_port 32 | self.laddr = bind[0] 33 | self.lport = port 34 | return 35 | self.laddr = SipConf.my_address 36 | self.lport = SipConf.my_port 37 | -------------------------------------------------------------------------------- /SIP/InfernSIPProfile.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | from functools import partial 3 | 4 | from Core.ConfigValidators import validate_port_range 5 | 6 | class InfernSIPProfile(): 7 | schema: dict = { 8 | 'profiles': { 9 | 'type': 'dict', 10 | 'keysrules': {'type': 'string'}, 11 | 'valuesrules': { 12 | 'type': 'dict', 13 | 'schema': { 14 | 'sip_server': { 15 | 'type': 'string', 16 | 'regex': r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:[1-9][0-9]{0,4}|:0)?$', 17 | 'check_with': validate_port_range 18 | }, 19 | 'aor': {'type': 'string'}, 20 | 'username': {'type': 'string'}, 21 | 'password': {'type': 'string'}, 22 | 'register': {'type': 'boolean'}, 23 | 'sink': {'type': 'string'}, 24 | } 25 | } 26 | } 27 | } 28 | name: str 29 | cli: str = 'infernos_uas' 30 | aor: str 31 | authname: Optional[str] = None 32 | authpass: Optional[str] = None 33 | nh_addr: Optional[Tuple[str, int]] = None 34 | register: bool = False 35 | _sink: Optional[str] 36 | new_sess_offer: 
callable = None 37 | 38 | def __init__(self, name, conf): 39 | self.name = name 40 | self.cli = conf.get('username', self.cli) 41 | self.aor = conf.get('aor', self.cli) 42 | self.authname = conf.get('username', self.authname) 43 | self.authpass = conf.get('password', self.authpass) 44 | sip_server = conf['sip_server'].split(':', 1) 45 | port = int(sip_server[1]) if len(sip_server) == 2 else 5060 46 | self.nh_addr = (sip_server[0], port) 47 | self.register = conf.get('register', self.register) 48 | self._sink = conf.get('sink', None) 49 | 50 | def finalize(self, sip_actr: 'InfernSIPActor', iconf: 'InfernConfig'): 51 | if self._sink is None: return 52 | sact = iconf.apps[self._sink].getActor(iconf, sip_actr) 53 | self.new_sess_offer = partial(sact.new_sip_session_received.remote) 54 | -------------------------------------------------------------------------------- /SIP/InfernUA.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
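# InfernUA below is the base user agent shared by the UAC and UAS sides.  Its
# extract_rtp_params() scans the peer's SDP audio sections for the first codec
# in `accept` (matched by RTP payload type in the m= line), picks up an
# optional a=ptime attribute, and returns the negotiated target address, ptime
# and codec class as an RTPParams; a missing body or an unsupported codec is
# answered with an InfernUASFailure (488) instead.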
25 | 26 | from uuid import uuid4, UUID 27 | 28 | from sippy.UA import UA 29 | from sippy.CCEvents import CCEventFail, CCEventUpdate, CCEventConnect 30 | from sippy.MsgBody import MsgBody 31 | from sippy.SdpOrigin import SdpOrigin 32 | from sippy.SipConf import SipConf 33 | from sippy.SdpMedia import MTAudio 34 | from sippy.SipReason import SipReason 35 | 36 | from Cluster.RemoteRTPGen import RemoteRTPGen 37 | from RTP.RTPParams import RTPParams 38 | from Core.Codecs.G711 import G711Codec 39 | 40 | ULAW_PT = 0 41 | ULAW_RM = 'PCMU/8000' 42 | ULAW_PTIME = RTPParams.default_ptime 43 | body_txt = 'v=0\r\n' + \ 44 | 'o=- 380960 380960 IN IP4 192.168.22.95\r\n' + \ 45 | 's=-\r\n' + \ 46 | 'c=IN IP4 192.168.22.95\r\n' + \ 47 | 't=0 0\r\n' + \ 48 | f'm=audio 16474 RTP/AVP {ULAW_PT}\r\n' + \ 49 | 'a=sendrecv\r\n' + \ 50 | '\r\n' 51 | model_body = MsgBody(body_txt) 52 | model_body.parse() 53 | 54 | class InfernUASFailure(CCEventFail): 55 | default_code = 488 56 | _code_msg = {default_code : 'Not Acceptable Here', 57 | 500 : 'Server Internal Error'} 58 | def __init__(self, reason=None, code=default_code): 59 | self.code, self.msg = code, self._code_msg[code] 60 | super().__init__((self.code, self.msg)) 61 | self.reason = SipReason(protocol='SIP', cause=self.code, 62 | reason=reason) 63 | 64 | class InfernUA(UA): 65 | debug = True 66 | id: UUID 67 | rsess: RemoteRTPGen 68 | our_sdp_body: MsgBody 69 | 70 | def __init__(self, isip, nh_address=None): 71 | self.id = uuid4() 72 | self.sip_actr, self.rtp_actr = isip.sip_actr, isip.rtp_actr 73 | super().__init__(isip.sippy_c, self.outEvent, nh_address=nh_address) 74 | 75 | def extract_rtp_target(self, sdp_body): 76 | p = self.extract_rtp_params(sdp_body) 77 | if p is None: return None 78 | return p.rtp_target 79 | 80 | def extract_rtp_params(self, sdp_body, accept=(G711Codec,)): 81 | if sdp_body == None: 82 | event = InfernUASFailure("late offer/answer is not supported at this time, sorry") 83 | self.recvEvent(event) 84 | return 85 | sdp_body.parse() 86 | try: 87 | codec, sect = next((ac, s) for ac in accept for s in sdp_body.content.sections 88 | if s.m_header.type == MTAudio and ac.ptype in s.m_header.formats) 89 | except StopIteration: 90 | event = InfernUASFailure("Unsupported audio codec, sorry") 91 | self.recvEvent(event) 92 | return None 93 | try: 94 | ptime = int(next(x for x in sect.a_headers if x.name == 'ptime').value) 95 | except StopIteration: 96 | ptime = None 97 | r = RTPParams((sect.c_header.addr, sect.m_header.port), ptime) 98 | r.codec = codec 99 | return r 100 | 101 | def outEvent(self, event, ua): 102 | if isinstance(event, CCEventUpdate): 103 | sdp_body = event.getData() 104 | rtp_params = self.extract_rtp_params(sdp_body) 105 | if rtp_params is None: return 106 | self.rsess.update(rtp_params) 107 | self.send_uas_resp() 108 | return 109 | 110 | def send_uas_resp(self): 111 | self.our_sdp_body.content.o_header = SdpOrigin() 112 | oevent = CCEventConnect((200, 'OK', self.our_sdp_body.getCopy())) 113 | return super().recvEvent(oevent) 114 | 115 | def sess_term(self, ua=None, rtime=None, origin=None, result=0): 116 | print('disconnected') 117 | if self.rsess is None: 118 | return 119 | self.rsess.end() 120 | self.rsess.join() 121 | if ua != self: 122 | self.disconnect() 123 | self.rsess = None 124 | 125 | def __del__(self): 126 | if self.debug: 127 | print('InfernUA.__del__') 128 | -------------------------------------------------------------------------------- /SIP/InfernUAC.py: 
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from sippy.CCEvents import CCEventTry, CCEventConnect
 4 | from sippy.SipCallId import SipCallId
 5 | from sippy.SdpMediaDescription import a_header
 6 | 
 7 | from Cluster.RemoteRTPGen import RemoteRTPGen
 8 | from SIP.InfernUA import InfernUA, model_body
 9 | from RTP.RTPParams import RTPParams
10 | from Core.Codecs.G711 import G711Codec
11 | from Core.Codecs.G722 import G722Codec
12 | from .RemoteSession import NewRemoteSessionRequest
13 | from .InfernUAS import InfernUAS
14 | from .InfernSIPProfile import InfernSIPProfile
15 | 
16 | class InfernUAC(InfernUA):
17 |     uas: Optional[InfernUAS] = None
18 |     offer = (G711Codec, G722Codec)
19 |     def __init__(self, isip, msg:NewRemoteSessionRequest):
20 |         sip_prof: InfernSIPProfile = msg.sip_prof
21 |         if msg.conn_sip_sess_id is not None:
22 |             self.uas = isip.get_session(msg.conn_sip_sess_id)
23 |         super().__init__(isip, nh_address = sip_prof.nh_addr)
24 |         # register our own teardown handler first, then any caller-supplied
25 |         # callback, so that the latter is not clobbered
26 |         self.disc_cbs = (self.sess_term,)
27 |         if msg.disc_cb is not None:
28 |             self.disc_cbs += (msg.disc_cb,)
29 |         call_id = SipCallId()
30 |         body = model_body.getCopy()
31 |         rtp_params = RTPParams((sip_prof.nh_addr[0], 0), None)
32 |         rtp_params.codec = self.offer[0]
33 |         self.rsess = RemoteRTPGen(isip.rtp_actr, rtp_params)
34 |         print(f'{self.rsess.rtp_address=}')
35 |         sect = body.content.sections[0]
36 |         sect.c_header.addr, sect.m_header.port = self.rsess.rtp_address
37 |         sect.a_headers.insert(0, a_header(f'ptime:{rtp_params.out_ptime}'))
38 |         for i, codec in enumerate(self.offer):
39 |             sect.a_headers.insert(i, a_header(codec.rtpmap()))
40 |         self.our_sdp_body = body
41 |         event = CCEventTry((call_id, sip_prof.cli, msg.cld, body, None, "Dummy Joe"))
42 |         self.username = sip_prof.authname
43 |         self.password = sip_prof.authpass
44 |         self.recvEvent(event)
45 | 
46 |     def outEvent(self, event, ua):
47 |         if isinstance(event, CCEventConnect):
48 |             code, reason, sdp_body = event.getData()
49 |             rtp_params = self.extract_rtp_params(sdp_body)
50 |             if rtp_params is None: return
51 |             self.rsess.update(rtp_params)
52 |             if self.uas is not None:
53 |                 self.uas.recvEvent(event)
54 | 
--------------------------------------------------------------------------------
/SIP/InfernUAS.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved.
 2 | #
 3 | # All rights reserved.
 4 | #
 5 | # Redistribution and use in source and binary forms, with or without modification,
 6 | # are permitted provided that the following conditions are met:
 7 | #
 8 | # 1. Redistributions of source code must retain the above copyright notice, this
 9 | # list of conditions and the following disclaimer.
10 | #
11 | # 2. Redistributions in binary form must reproduce the above copyright notice,
12 | # this list of conditions and the following disclaimer in the documentation and/or
13 | # other materials provided with the distribution.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 
26 | from typing import Optional
27 | from uuid import uuid4, UUID
28 | from queue import Queue
29 | 
30 | from sippy.CCEvents import CCEventTry, CCEventConnect
31 | from sippy.SdpMediaDescription import a_header
32 | 
33 | from Cluster.RemoteRTPGen import RemoteRTPGen, RTPGenError
34 | from SIP.InfernUA import InfernUA, model_body, InfernUASFailure
35 | from SIP.RemoteSession import RemoteSessionAccept
36 | from SIP.InfernSIPProfile import InfernSIPProfile
37 | from SIP.SipSessInfo import SipSessInfo
38 | from Core.Codecs.G711 import G711Codec
39 | from Core.Codecs.G722 import G722Codec
40 | 
41 | class CCEventSentDone: pass
42 | class CCEventSTTTextIn:
43 |     def __init__(self, direction):
44 |         self.direction = direction
45 | 
46 | class InfernUAS(InfernUA):
47 |     rsess: Optional[RemoteRTPGen] = None
48 |     etry: Optional[CCEventTry] = None
49 |     auto_answer: bool
50 |     accept_codecs = (G722Codec, G711Codec)
51 |     def __init__(self, isip, req, sip_t, auto_answer=True):
52 |         super().__init__(isip)
53 |         assert sip_t.noack_cb is None
54 |         self.auto_answer = auto_answer
55 |         sip_t.noack_cb = self.sess_term
56 |         # self.prompts = isip.getPrompts()
57 |         self.recvRequest(req, sip_t)
58 | 
59 |     def outEvent(self, event, ua):
60 |         if not isinstance(event, CCEventTry):
61 |             super().outEvent(event, ua)
62 |             return
63 |         self.etry = event
64 |         cId, cli, cld, sdp_body, auth, caller_name = event.getData()
65 |         rtp_params = self.extract_rtp_params(sdp_body, accept=self.accept_codecs)
66 |         if rtp_params is None:
67 |             event = InfernUASFailure(code=500)
68 |             self.recvEvent(event)
69 |             return
70 | 
71 |         try:
72 |             self.rsess = RemoteRTPGen(self.rtp_actr, rtp_params)
73 |         except RTPGenError as e:
74 |             event = InfernUASFailure(code=500, reason=str(e))
75 |             self.recvEvent(event)
76 |             raise e
77 |         self.disc_cbs = (self.sess_term,)
78 |         body = model_body.getCopy()
79 |         sect = body.content.sections[0]
80 |         sect.c_header.addr, sect.m_header.port = self.rsess.rtp_address
81 |         sect.a_headers.insert(0, a_header(f'ptime:{rtp_params.out_ptime}'))
82 |         sect.a_headers.insert(0, a_header(rtp_params.codec.rtpmap()))
83 |         sect.m_header.formats = [rtp_params.codec.ptype,]
84 |         self.our_sdp_body = body
85 |         if self.auto_answer:
86 |             self.send_uas_resp()
87 | 
88 |     def recvEvent(self, event):
89 |         if not self.auto_answer and isinstance(event, CCEventConnect):
90 |             return self.send_uas_resp()
91 |         super().recvEvent(event)
92 | 
93 | class InfernLazyUAS(InfernUAS):
94 |     id: UUID
95 |     def __init__(self, sip_stack:'InfernSIP', sip_prof:InfernSIPProfile, req, sip_t):
96 |         self._id = self.id = uuid4()
97 |         self._sip_stack = sip_stack
98 |         self._sip_prof = sip_prof
99 |         self._req = req
100 |         self._sip_t = sip_t
101 |         sip_t.cancel_cb = self.cancelled
102 |         resp = req.genResponse(100, 'Trying')
103 |         sip_stack.sippy_c['_sip_tm'].sendResponse(resp)
104 | 
105 |     def accept(self, rsa:RemoteSessionAccept, rval:Queue):
106 |         self._sip_t.cancel_cb = None
107 |         super().__init__(self._sip_stack, self._req, self._sip_t, rsa.auto_answer)
108 |         self.id = self._id
109 |         del self._sip_stack, self._req, self._sip_t, self._id
110 |         if rsa.disc_cb is not None:
111 |             self.disc_cbs += (rsa.disc_cb,)
112 |         rval.put(self.rsess)
113 | 
114 |     def reject(self):
115 |         resp = self._req.genResponse(666, 'OOPS')
116 |         self._sip_stack.sippy_c['_sip_tm'].sendResponse(resp)
117 |         del self._sip_stack, self._req, self._sip_t, self._id
118 | 
119 |     def cancelled(self, *args):
120 |         del self._sip_stack, self._req, self._sip_t, self._id
121 | 
122 |     def get_session_info(self) -> SipSessInfo:
123 |         call_id = str(self._req.getHFBody('call-id'))
124 |         from_hf = self._req.getHFBody('from')
125 |         from_name = from_hf.getUri().name
126 |         from_number = from_hf.getUrl().username
127 |         return SipSessInfo(call_id, from_number, from_name)
128 | 
--------------------------------------------------------------------------------
/SIP/RemoteSession.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Tuple
 2 | from functools import partial
 3 | from uuid import UUID
 4 | 
 5 | from SIP.SipSessInfo import SipSessInfo
 6 | from .InfernSIPProfile import InfernSIPProfile
 7 | 
 8 | class RemoteSessionOffer():
 9 |     sip_sess_id: UUID
10 |     sess_info: SipSessInfo
11 |     accept: callable
12 |     reject: callable
13 |     def __init__(self, sip_stack:'InfernSIP', ua:'InfernLazyUAS'):
14 |         self.sip_sess_id = ua.id
15 |         self.sess_info = ua.get_session_info()
16 |         self.accept = partial(sip_stack.sip_actr.new_sess_accept.remote, sip_sess_id=ua.id)
17 |         self.reject = partial(sip_stack.sip_actr.new_sess_reject.remote, sip_sess_id=ua.id)
18 | 
19 | class RemoteSessionAccept():
20 |     disc_cb: Optional[callable] = None
21 |     auto_answer: bool = False
22 |     def __init__(self, disc_cb:Optional[callable]=None, auto_answer:bool=False):
23 |         self.disc_cb, self.auto_answer = disc_cb, auto_answer
24 | 
25 | class NewRemoteSessionRequest():
26 |     cld: str
27 |     sip_prof: InfernSIPProfile
28 |     disc_cb: Optional[callable] = None
29 |     conn_sip_sess_id: Optional[UUID] = None
30 |     def __init__(self, cld:str, sip_prof:InfernSIPProfile, disc_cb:Optional[callable]=None):
31 |         self.cld, self.disc_cb, self.sip_prof = cld, disc_cb, sip_prof
32 | 
--------------------------------------------------------------------------------
/SIP/SipSessInfo.py:
--------------------------------------------------------------------------------
 1 | class SipSessInfo():
 2 |     call_id: str
 3 |     from_number: str
 4 |     from_name: str
 5 | 
 6 |     def __init__(self, call_id, from_number, from_name):
 7 |         self.call_id = call_id
 8 |         self.from_number = from_number
 9 |         self.from_name = from_name
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
 1 | sip:
 2 |   settings:
 3 |     bind: 192.168.23.109:5060
 4 |   profiles:
 5 |     foo:
 6 |       sip_server: 192.168.23.190:6666
 7 |       sink: apps/live_translator/configuration1
 8 |       username: 'incoming'
 9 |       password: 'user'
10 |       register: True
11 |     bar:
12 |       sip_server: 52.117.200.117:5060
13 |       username: '1929132'
14 |       password: 'tj9uh22'
15 | rtp:
16 |   settings:
17 |     min_port: 1024
18 |     max_port: 2048
19 | apps:
20 |   live_translator:
21 |     profiles:
22 |       configuration1:
23 |         stt_langs: ['en', 'pt']
24 |         tts_langs: ['pt', 'en']
25 |         outbound: sip/bar;cld=1929133
26 | 
--------------------------------------------------------------------------------
/config/InfernGlobals.py:
--------------------------------------------------------------------------------
 1 | from safetorch.InfernTorcher import InfernTorcher
 2 | from threading import Lock
 3 | from functools import lru_cache
 4 | from time import monotonic
 5 | 
 6 | import torchaudio.transforms as T
 7 | 
 8 | from Core.T2T.Translator import Translator
 9 | 
10 | class InfernGlobals():
11 |     _lock = Lock()
12 |     _instance = None
13 |     torcher: InfernTorcher
14 | 
15 |     @lru_cache
16 |     def __new__(cls):
17 |         with cls._lock:
18 |             if cls._instance is None:
19 |                 cls._instance = super(InfernGlobals, cls).__new__(cls)
20 |                 cls.torcher = InfernTorcher()
21 |             return cls._instance
22 | 
23 |     @staticmethod
24 |     @lru_cache(maxsize=8)
25 |     def get_resampler(from_sr:int, to_sr:int, device:str='cpu'):
26 |         return T.Resample(orig_freq=from_sr, new_freq=to_sr).to(device)
27 | 
28 |     @staticmethod
29 |     @lru_cache(maxsize=8)
30 |     def get_translator(from_lang:str, to_lang:str, **kwa):
31 |         return Translator(from_lang, to_lang, **kwa)
32 | 
33 |     def stdtss():
34 |         return f'{monotonic():4.3f}'
35 | 
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1.7-labs
 2 | 
 3 | ARG BASE_IMAGE=ubuntu:24.10
 4 | FROM $BASE_IMAGE AS build
 5 | LABEL maintainer="Maksym Sobolyev "
 6 | 
 7 | USER root
 8 | 
 9 | # Build & install everything
10 | WORKDIR /tmp
11 | ENV DEBIAN_FRONTEND=noninteractive
12 | # Keep downloaded packages in a cache mount; the mount only exists for the
13 | # duration of a single RUN, so it is attached to each apt-get invocation.
14 | ARG APT_UPDATE="apt-get update"
15 | RUN --mount=type=cache,target=/var/cache/apt ${APT_UPDATE}
16 | ARG APT_UPGRADE="apt-get upgrade -y"
17 | RUN --mount=type=cache,target=/var/cache/apt ${APT_UPGRADE}
18 | ARG APT_INSTALL="apt-get install --no-install-recommends -y"
19 | RUN --mount=type=cache,target=/var/cache/apt ${APT_INSTALL} lsb-release ca-certificates
20 | COPY docker/install_conda.sh .
21 | RUN ./install_conda.sh
22 | COPY docker/setup_conda.sh .
23 | ARG PYTHON_VER
24 | ARG CONDA_MAINENV
25 | ENV PYTHON_CMD="python${PYTHON_VER}"
26 | RUN ./setup_conda.sh
27 | COPY docker/install_hw.sh .
28 | ARG INFER_HW
29 | ENV CONDA_ACTIVATE="eval . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_MAINENV}"
30 | RUN ./install_hw.sh
31 | COPY docker/install_requirements.sh docker/intel-ray.diff requirements.txt .
32 | ENV CONDA_MAINENV="${CONDA_MAINENV}"
33 | RUN ./install_requirements.sh
34 | 
35 | COPY --exclude=.git --exclude=.github --link . /Infernos
36 | WORKDIR /Infernos
--------------------------------------------------------------------------------
/docker/install_conda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | ${APT_INSTALL} curl gpg
 7 | curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > /usr/share/keyrings/conda-archive-keyring.gpg
 8 | 
 9 | echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list
10 | 
11 | ${APT_UPDATE}
12 | ${APT_INSTALL} conda
13 | . /opt/conda/etc/profile.d/conda.sh
14 | conda update -y conda
15 | rm -r ~/.cache
16 | mkdir ~/.cache
17 | 
--------------------------------------------------------------------------------
/docker/install_hw.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | PIP_INSTALL="${PYTHON_CMD} -m pip install"
 7 | 
 8 | ${CONDA_ACTIVATE}
 9 | 
10 | case "${INFER_HW}" in
11 | nvidia)
12 |     ;;
13 | intel)
14 |     curl https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \
15 |         gpg --dearmor --output /usr/share/keyrings/oneapi-archive-keyring.gpg
16 |     echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \
17 |         tee /etc/apt/sources.list.d/oneAPI.list
18 |     ${APT_UPDATE}
19 |     ${APT_INSTALL} libze1 ocl-icd-libopencl1
20 |     ${APT_INSTALL} intel-oneapi-dpcpp-cpp-2024.1=2024.1.0-963 intel-oneapi-mkl-devel=2024.1.0-691
21 |     apt-mark hold intel-oneapi-dpcpp-cpp-2024.1 intel-oneapi-mkl-devel
22 |     ${PIP_INSTALL} torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 \
23 |         intel-extension-for-pytorch==2.1.30.post0 oneccl_bind_pt==2.1.300+xpu \
24 |         --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
25 |     printf "/opt/intel/oneapi/mkl/2024.1/lib\n/opt/intel/oneapi/compiler/2024.1/lib\n" > \
26 |         /etc/ld.so.conf.d/zzz-intel-oneapi.conf
27 |     ldconfig
28 |     ;;
29 | *)
30 |     echo "Unknown INFER_HW: '${INFER_HW}'" >&2
31 |     false
32 |     ;;
33 | esac
34 | 
35 | rm -r ~/.cache
36 | mkdir ~/.cache
37 | 
--------------------------------------------------------------------------------
/docker/install_requirements.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | DEV_PKGS="cmake pkg-config make git patch"
 7 | PIP_INSTALL="${PYTHON_CMD} -m pip install"
 8 | 
 9 | if [ "${INFER_HW}" != "intel" ]
10 | then
11 |     DEV_PKGS="${DEV_PKGS} gcc g++ libc6-dev"
12 | fi
13 | 
14 | ${APT_INSTALL} ${DEV_PKGS}
15 | ${CONDA_ACTIVATE}
16 | 
17 | ${PIP_INSTALL} -r requirements.txt
18 | 
19 | if [ "${INFER_HW}" = "intel" ]
20 | then
21 |     patch -d "/opt/conda/envs/${CONDA_MAINENV}/lib/python${PYTHON_VER}/site-packages" \
22 |         -p2 -s < intel-ray.diff
23 |     find "/opt/conda" -name "libstdc++.so.6*" -delete
24 | fi
25 | 
26 | apt-get remove -y ${DEV_PKGS}
27 | apt-get autoremove -y
28 | rm -r ~/.cache
29 | mkdir ~/.cache
30 | 
--------------------------------------------------------------------------------
/docker/intel-ray.diff:
--------------------------------------------------------------------------------
 1 | commit 85baaa1c10a957c747f54ec0705e6b7cbfa972d1
 2 | Author: Maksym Sobolyev 
 3 | Date:   Tue Mar 12 22:59:59 2024 -0700
 4 | 
 5 |     Hack on ipex.
 6 | 
 7 | diff --git a/python/ray/_private/workers/default_worker.py b/python/ray/_private/workers/default_worker.py
 8 | index 4c2109831c..62115940d0 100644
 9 | --- a/python/ray/_private/workers/default_worker.py
10 | +++ b/python/ray/_private/workers/default_worker.py
11 | @@ -1,3 +1,6 @@
12 | +try: import intel_extension_for_pytorch as ipex
13 | +except ModuleNotFoundError: ipex = None
14 | +
15 |  import os
16 |  import argparse
17 |  import base64
18 | 
--------------------------------------------------------------------------------
/docker/setup_conda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | . /opt/conda/etc/profile.d/conda.sh
 7 | conda create -y --name "${CONDA_MAINENV}" python=${PYTHON_VER}
 8 | conda activate "${CONDA_MAINENV}"
 9 | conda install -y pip
10 | ${PYTHON_CMD} -m pip install -U pip
11 | echo "/opt/conda/envs/${CONDA_MAINENV}/lib" > "/etc/ld.so.conf.d/zzz-conda-${CONDA_MAINENV}.conf"
12 | ldconfig
13 | rm -r /opt/conda/pkgs
14 | rm -r ~/.cache
15 | mkdir ~/.cache
16 | 
--------------------------------------------------------------------------------
/examples/ai_attendant.yaml:
--------------------------------------------------------------------------------
 1 | sip:
 2 |   settings:
 3 |     bind: 192.168.24.29:5060
 4 |   profiles:
 5 |     foo:
 6 |       sip_server: 192.168.24.1:5070
 7 |       sink: apps/ai_attendant/configuration1
 8 |       username: 'incoming'
 9 |       password: 'user'
10 |       register: False
11 | rtp:
12 |   settings:
13 |     min_port: 1024
14 |     max_port: 2048
15 | apps:
16 |   ai_attendant:
17 |     profiles:
18 |       configuration1:
19 |         stt_lang: 'en'
20 |         tts_lang: 'en'
21 |         llm_prompt: 'examples/sippylabs.txt'
22 | 
--------------------------------------------------------------------------------
/examples/llm_test.py:
--------------------------------------------------------------------------------
 1 | import ray
 2 | from sys import stderr
 3 | from time import monotonic
 4 | from uuid import UUID
 5 | from functools import partial
 6 | from time import sleep
 7 | from Cluster.InfernLLMActor import InfernLLMActor
 8 | from Cluster.LLMSession import LLMRequest
 9 | 
10 | #@ray.remote(resources={"head": 1})
11 | #class text_in(result):
12 | 
13 | class TimedLLMRequest(LLMRequest):
14 |     queue_ts: float
15 |     proc_start_ts: float
16 |     def __init__(self, text:str, lms:UUID, lma:InfernLLMActor):
17 |         tin = partial(self.text_in, lms=lms, lma=lma)
18 |         super().__init__(text, tin)
19 |         self.queue_ts = monotonic()
20 | 
21 |     def _proc_start_cb(self):
22 |         self.proc_start_ts = monotonic()
23 | 
24 |     def text_in(self, result:str, lms:UUID, lma:InfernLLMActor):
25 |         from sys import stderr as _stderr
26 |         itime = monotonic() - self.proc_start_ts
27 |         print(f'text_in: got {result=}, inference time: {itime}', file=_stderr)
28 |         req = TimedLLMRequest('Hello, can I speak to the CEO?', lms, lma)
29 |         lma.llm_session_textin.remote(lms, req)
30 | 
31 | 
32 | ray.init(num_gpus=2, resources = {'llm':1,'head':1})
33 | 
34 | print('Initializing InfernLLMActor...', file=stderr)
35 | llm_actor = InfernLLMActor.remote()
36 | ray.get(llm_actor.start.remote())
37 | print('InfernLLMActor is ready', file=stderr)
38 | 
39 | 
40 | flms = [llm_actor.new_llm_session.remote() for _ in range(100)]
41 | print(f'Created {len(flms)} sessions', file=stderr)
42 | def sess(lms):
43 |     req = TimedLLMRequest('', lms, llm_actor)
44 |     return llm_actor.llm_session_textin.remote(lms, req)
45 | futs = [sess(lms) for lms in flms]
46 | for f in futs:
47 |     ray.get(f)
48 | sleep(3600)
49 | 
--------------------------------------------------------------------------------
/examples/sippylabs.txt:
--------------------------------------------------------------------------------
 1 | You are Glenn, created by Max.
 2 | You are Max's sidekick chatbot, helping him during his hours of coding and streaming online and keeping him company.
 3 | You and Max are streaming online on YouTube in a video podcast called "SIP Chronicles".
 4 | Start by greeting everyone, asking what the stream is about and telling a joke.
 5 | Keep your messages brief and concise to reduce latency, and keep the conversation light.
 6 | The model output is fed into a dumb TTS system for audio output: do not add any extended formatting.
 7 | Your input is generated by an STT system: it might have mistakes, typos, etc.
 8 | You can keep silent if not specifically asked, or interrupt Max's speech when you feel the need, by outputting a sequence.
 9 | 
--------------------------------------------------------------------------------
/examples/voice_ass.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from transformers import AutoTokenizer, AutoConfig
 3 | from ipex_llm.transformers import AutoModelForCausalLM
 4 | from datetime import datetime
 5 | 
 6 | model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
 7 | #config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
 8 | #local_cache = f"~/.cache/Infernos/{model_name}"
 9 | #config.save_pretrained(local_cache)
10 | 
11 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto",
12 |                                              device_map="auto",
13 |                                              load_in_4bit=True,
14 |                                              optimize_model=True,
15 |                                              trust_remote_code=True,
16 |                                              use_cache=True
17 |                                              )
18 | #model = model.half().to("xpu")
19 | model = model.to("xpu")
20 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-14B-Instruct")
21 | messages = [{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful voice auto-attendant for the company Sippy Software. Start by greeting the caller and asking how you can help. Try to keep your messages brief and concise to reduce latency."}, {"role": "system", "content": f' '}]
22 | text = tokenizer.apply_chat_template(messages,
23 |                                      tokenize=False,
24 |                                      add_generation_prompt=True
25 |                                      )
26 | for i in range(10):
27 |     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
28 |     generated_ids = model.generate(**model_inputs, max_new_tokens=16 * 1024, output_scores=True, return_dict_in_generate=True)
29 |     torch.xpu.synchronize()
30 |     generated_ids = [
31 |         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids.sequences)
32 |     ]
33 |     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
34 |     print(messages, response)
35 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | git+https://github.com/sippy/b2bua.git@master
 2 | ray
 3 | transformers>=4.0.0
 4 | datasets>=2.0.0
 5 | torch>=2.0.0
 6 | soundfile>=0.12.0
 7 | torchaudio>=2.0.0
 8 | scipy>=1.0.0
 9 | inflect>=7.0.0
10 | sentencepiece>=0.1.0
11 | ctranslate2>=4.1.0
12 | git+https://github.com/fusorai/argos-translate.git
13 | methodtools
14 | tensorboardX
15 | aiohttp_cors
16 | grpcio
17 | opencensus
18 | prometheus_client
19 | py-spy
20 | nltk
21 | G722
22 | rtpsynth
23 | cerberus
24 | pyyaml
25 | 
--------------------------------------------------------------------------------
/safetorch/InfernTorcher.py:
--------------------------------------------------------------------------------
 1 | from threading import Lock
 2 | from time import monotonic
 3 | from math import pi as Pi
 4 | 
 5 | class InfernTorcherDeadlock(Exception):
 6 |     pass
 7 | 
 8 | class rc_filter():
 9 |     alpha: float
10 |     last_y: float
11 | 
12 |     def __init__(self, x = 10, init_y = 0.0):
13 |         self.alpha = 1 / (1 + 2 * Pi * x)
14 |         self.last_y = init_y
15 | 
16 |     def __call__(self, x):
17 |         self.last_y = self.alpha * x + (1 - self.alpha) * self.last_y
18 |         return self.last_y
19 | 
20 | class InfernTorcher():
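    # Serializes access to the shared Torch runtime and keeps a running
    # utilization estimate: busy and free intervals are smoothed with the
    # rc_filter above, and the resulting load ratio is printed once every
    # 100 lock/unlock cycles.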
21 |     _torch_lock: Lock = None
22 |     _last_lock: float
23 |     _last_unlock: float
24 |     _free_time: rc_filter
25 |     _busy_time: rc_filter
26 |     _nlocks: int = 0
27 | 
28 |     def __init__(self):
29 |         self._torch_lock = Lock()
30 |         self._last_unlock = self._last_lock = monotonic()
31 |         self._free_time = rc_filter()
32 |         self._busy_time = rc_filter()
33 | 
34 |     def lock(self, timeout: int = 10):
35 |         acquired = self._torch_lock.acquire(timeout = timeout)
36 |         if not acquired:
37 |             raise InfernTorcherDeadlock(f"Could not acquire lock within {timeout} seconds")
38 |         now = monotonic()
39 |         free_time = now - self._last_unlock
40 |         self._free_time(free_time)
41 |         self._last_lock = now
42 | 
43 |     def unlock(self):
44 |         now = monotonic()
45 |         busy_time = now - self._last_lock
46 |         bt = self._busy_time(busy_time)
47 |         ft = self._free_time.last_y
48 |         self._last_unlock = now
49 |         self._nlocks += 1
50 |         nlocks = self._nlocks
51 |         self._torch_lock.release()
52 |         if (nlocks % 100) == 0:
53 |             print(f"Torch load: {bt / (bt + ft)}")
54 | 
55 |     def acquire(self):
56 |         return self.lock()
57 | 
58 |     def release(self):
59 |         return self.unlock()
60 | 
61 |     def __enter__(self):
62 |         self.lock()
63 |         return self
64 | 
65 |     def __exit__(self, exc_type, exc_value, traceback):
66 |         self.unlock()
--------------------------------------------------------------------------------
/utils/tts.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from datetime import datetime
 3 | import inflect
 4 | 
 5 | def number_to_words(n):
 6 |     # Convert a number into words using the `inflect` library.
 7 |     # `n` may be an int, a numeric string, or an re.Match object,
 8 |     # so the function can be used directly as an re.sub() callback.
 9 |     p = inflect.engine()
10 |     if isinstance(n, re.Match):
11 |         n = n.group(0)
12 |     return p.number_to_words(n)
13 | 
14 | def get_ordinal(n):
15 |     # Convert a number into its ordinal representation.
16 |     p = inflect.engine()
17 |     return p.ordinal(n)
18 | 
19 | def human_readable_time():
20 |     now = datetime.now()
21 | 
22 |     # Days and months are straightforward
23 |     day_name = now.strftime('%A')
24 |     month_name = now.strftime('%B')
25 | 
26 |     # Convert day of the month and year to words
27 |     day_of_month = number_to_words(int(now.strftime('%d')))
28 |     year = number_to_words(int(now.strftime('%Y')))
29 |     year = year.replace('-', ' ')
30 | 
31 |     # Convert hour and minute to words
32 |     if now.hour < 12:
33 |         time_period = "morning"
34 |     elif 12 <= now.hour < 17:
35 |         time_period = "afternoon"
36 |     elif 17 <= now.hour < 20:
37 |         time_period = "evening"
38 |     else:
39 |         time_period = "night"
40 | 
41 |     hour = number_to_words(now.hour % 12 or 12)
42 |     if now.minute != 0:
43 |         minute = number_to_words(now.minute)
44 |         current_time = f"{hour} {minute}"
45 |     else:
46 |         current_time = f"{hour} o'clock"
47 | 
48 |     return f"Today is {day_name} {get_ordinal(day_of_month)} of {month_name} {year}, {current_time} in the {time_period}."
49 | 
50 | import requests
51 | 
52 | wq_fixes = (
53 |     ('<.*?>', ''), (r'\[.*?\]', ''),
54 |     (r'\s+', ' '), ('Mr[.]', 'Mister'),
55 |     ('Dr[.]', 'Doctor'), ('Drs.', 'Doctors'), ('["]', ''),
56 |     (r'\d+', number_to_words), ('H.A.L.', '"H" "A" "L"'),
57 |     ('Thomas A[.] Anderson','Thomas A Anderson'),
58 |     ('i-sy,', 'iiisy,'), ('i-zy,', 'iiizzy,'),
59 |     ('Agent Smith As', 'As'), ('.*edit[]] ', ''),
60 |     ('Trinity: .*', ''), ('ar-riage', 'arrrrrrriage'),
61 |     ('Dialogue The ', 'The '), ('cra-zy', 'craaaazy',),
62 |     ('[%] ', ' percent '),
63 | )
64 | 
65 | class ECFail(Exception):
66 |     pass
67 | 
68 | def extract_content(url, start_pattern, end_pattern):
69 |     headers = {
70 |         'User-Agent': 'Wget/1.20.3 (linux-gnu)'
71 |     }
72 |     response = requests.get(url, headers=headers)
73 |     print(url, response)
74 |     if response.status_code != 200 or len(response.text) == 0:
75 |         raise ECFail(f"Failed to retrieve URL. Status code: {response.status_code}")
76 | 
77 |     content = response.text
78 |     s=content.find(start_pattern)
79 | 
80 |     i = 0
81 |     pattern = re.compile(rf"{start_pattern}(.*?){end_pattern}", re.DOTALL)
82 |     matches = pattern.findall(content)
83 |     clean = [(re.compile(p), r) for p, r in wq_fixes]
84 | 
85 |     matches = [m.split(':', 1)[1] for m in matches]
86 |     for cl, rv in clean:
87 |         matches = [re.sub(cl, rv, m).strip() for m in matches]
88 |     return matches
89 | 
90 | def wq_getscript(film, character, section=1):
91 |     BASE_URL = "https://en.wikiquote.org/w/index.php"
92 |     film = film.replace(' ', '_')
93 |     fsuf = '_(film)'
94 |     url = f"{BASE_URL}?title={film}&section={section}"
95 |     start_pattern = rf">{character}<"
96 |     end_pattern = r''
97 |     try:
98 |         cont = extract_content(url, start_pattern, end_pattern)
99 |         if len(cont) == 0:
100 |             raise ECFail("nope")
101 |     except ECFail as ex:
102 |         if not film.endswith(fsuf):
103 |             film += fsuf
104 |             url = f"{BASE_URL}?title={film}&section={section}"
105 |             cont = extract_content(url, start_pattern, end_pattern)
106 |         else:
107 |             raise
108 |     return cont
109 | 
110 | def hal_set():
111 |     contents = wq_getscript('2001: A Space Odyssey', 'HAL')
112 |     return [s.replace('. ', '.|') for s in contents]
113 | 
114 | def bender_set(season=1):
115 |     contents = wq_getscript(f'Futurama/Season_{season}', 'Bender')
116 |     return [s for s in contents if len(s) > 16]
117 | 
118 | def smith_set():
119 |     contents = wq_getscript('The Matrix', 'Agent Smith', section=4)
120 |     hp = 'As you can see, we'
121 |     hack = contents[0].split(hp)
122 |     if len(hack) <= 2:
123 |         raise Exception("cleanme, hack is not needed perhaps anymore")
124 |     contents[0] = hp + hack[-1]
125 |     return [s.replace('. ', '.|') for s in contents]
126 | 
127 | def t900_set():
128 |     contents = wq_getscript('The Terminator', 'Terminator', section=3)
129 |     return [s.replace('. ', '.|') for s in contents]
130 | 
--------------------------------------------------------------------------------