├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── Apps
│   ├── AIAttendant
│   │   ├── AIAActor.py
│   │   ├── AIAAppConfig.py
│   │   ├── AIAProfile.py
│   │   └── AIASession.py
│   └── LiveTranslator
│       ├── LTActor.py
│       ├── LTAppConfig.py
│       ├── LTProfile.py
│       └── LTSession.py
├── Cluster
│   ├── InfernBatchedWorker.py
│   ├── InfernBenchActor.py
│   ├── InfernLLMActor.py
│   ├── InfernLLMWorker.py
│   ├── InfernRTPActor.py
│   ├── InfernSIPActor.py
│   ├── InfernSTTActor.py
│   ├── InfernSTTWorker.py
│   ├── InfernTTSActor.py
│   ├── InfernTTSWorker.py
│   ├── LLMSession.py
│   ├── RemoteRTPGen.py
│   ├── RemoteTTSSession.py
│   ├── STTSession.py
│   └── TTSSession.py
├── Core
│   ├── AStreamMarkers.py
│   ├── AudioChunk.py
│   ├── Codecs
│   │   ├── G711.py
│   │   ├── G722.py
│   │   └── GenCodec.py
│   ├── ConfigValidators.py
│   ├── Exceptions
│   │   └── InfernSessNotFoundErr.py
│   ├── InfernConfig.py
│   ├── InfernWrkThread.py
│   ├── OutputMuxer.py
│   ├── T2T
│   │   ├── NumbersToWords.py
│   │   └── Translator.py
│   └── VAD
│       ├── SileroVAD.py
│       ├── SileroVADUtils.py
│       └── ZlibVAD.py
├── HelloSippyTTSRT
│   ├── HelloSippyRT.py
│   ├── HelloSippyRTPipe.py
│   └── HelloSippyRTPipeTest.py
├── Infernos.py
├── LICENSE
├── README.md
├── RTP
│   ├── AudioInput.py
│   ├── InfernRTPConf.py
│   ├── InfernRTPEPoint.py
│   ├── InfernRTPIngest.py
│   ├── RTPOutputWorker.py
│   └── RTPParams.py
├── SIP
│   ├── InfernSIP.py
│   ├── InfernSIPConf.py
│   ├── InfernSIPProfile.py
│   ├── InfernUA.py
│   ├── InfernUAC.py
│   ├── InfernUAS.py
│   ├── RemoteSession.py
│   └── SipSessInfo.py
├── config.yaml
├── config
│   └── InfernGlobals.py
├── docker
│   ├── Dockerfile
│   ├── install_conda.sh
│   ├── install_hw.sh
│   ├── install_requirements.sh
│   ├── intel-ray.diff
│   └── setup_conda.sh
├── examples
│   ├── ai_attendant.yaml
│   ├── llm_test.py
│   ├── sippylabs.txt
│   └── voice_ass.py
├── requirements.txt
├── safetorch
│   └── InfernTorcher.py
└── utils
    └── tts.py

/.github/workflows/main.yml:
--------------------------------------------------------------------------------
# This is a basic workflow to help you get started with Actions

name: Build & Publish

# Controls when the action will run.
on:
  # Triggers the workflow on all push or pull request events
  push:
  pull_request:

  release:
    types: [created]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

  schedule:
    - cron: "0 0 * * *"

# added using https://github.com/step-security/secure-repo
permissions:
  contents: read

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  Docker:
    name: Build&Push to DockerHub
    if: (github.event_name == 'push' || github.event_name == 'pull_request')
    runs-on: [self-hosted, linux, x64]
    strategy:
      matrix:
        infer-hw: ['nvidia', 'intel']
    env:
      DOCKER_REPO: 'sippylabs/infernos'
      BASE_IMAGE: 'ubuntu:24.10'
      PYTHON_VER: '3.11'
      CONDA_MAINENV: 'Infernos'
      INFER_HW: ${{ matrix.infer-hw }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_REPO }}
          tags: |
            type=schedule
            type=ref,event=branch,prefix=${{ env.INFER_HW }}-
            type=ref,event=tag,prefix=${{ env.INFER_HW }}-
            type=ref,event=pr,prefix=${{ env.INFER_HW }}-
            type=raw,value=${{ env.INFER_HW }}-latest,enable={{is_default_branch}}
            type=sha

      - name: Get branch name
        run: echo "GIT_BRANCH=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_ENV

      - name: Build Docker image
        uses: docker/build-push-action@v6
        env:
          CACHE_SPEC: "type=registry,ref=${{ env.DOCKER_REPO }}:${{ env.INFER_HW }}-${{ env.GIT_BRANCH }}-buildcache"
        with:
          context: .
          file: ./docker/Dockerfile
          push: true
          build-args: |
            BASE_IMAGE=${{ env.BASE_IMAGE }}
            PYTHON_VER=${{ env.PYTHON_VER }}
            CONDA_MAINENV=${{ env.CONDA_MAINENV }}
            INFER_HW=${{ env.INFER_HW }}
          tags: |
            ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: ${{ env.CACHE_SPEC }}
          cache-to: ${{ env.CACHE_SPEC }},mode=max
          #cache-from: type=gha
          #cache-to: type=registry,ref=${{ env.DOCKER_REPO }}:${{ env.INFER_HW }}-buildcache,mode=max
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
*.wav
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIAActor.py:
--------------------------------------------------------------------------------
from typing import Dict, Optional, List, Union
from uuid import UUID
from functools import partial

from ray import ray
import nltk
from tensorboardX import SummaryWriter

from config.InfernGlobals import InfernGlobals as IG
from Cluster.InfernSIPActor import InfernSIPActor
from Cluster.InfernTTSActor import InfernTTSActor
from Cluster.InfernSTTActor import InfernSTTActor
from Cluster.InfernLLMActor import InfernLLMActor
from Cluster.STTSession import STTResult, STTSentinel
from Cluster.LLMSession import LLMResult
from SIP.RemoteSession import RemoteSessionOffer
from Core.T2T.NumbersToWords import NumbersToWords
from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr

from .AIASession import AIASession
from ..LiveTranslator.LTActor import ntw_filter

class AIASessNotFoundErr(InfernSessNotFoundErr): pass

@ray.remote(resources={"ai_attendant": 1})
class AIAActor():
    sessions: Dict[UUID, AIASession]
    thumbstones: List[UUID]
    translator: callable
    nstts: int = 0
    def __init__(self):
        self.stt_out_lang = 'en'

    def start(self, aia_prof: 'AIAProfile', sip_actr:InfernSIPActor):
        self.aia_prof = aia_prof
        self.tts_lang = aia_prof.tts_lang
        self.stt_lang = aia_prof.stt_lang
        nltk.download('punkt')
        nltk.download('punkt_tab')
        self.aia_actr = ray.get_runtime_context().current_actor
        self.sip_actr = sip_actr
        self.tts_actr = InfernTTSActor.remote()
        self.stt_actr = InfernSTTActor.remote()
        self.llm_actr = InfernLLMActor.remote()
        futs = [self.stt_actr.start.remote(), self.tts_actr.start.remote(lang=self.tts_lang, output_sr=8000),
                self.llm_actr.start.remote()]
        if self.stt_out_lang == self.tts_lang:
            self.translator = ntw_filter
        else:
            flt = partial(ntw_filter, obj=NumbersToWords(self.tts_lang))
            self.translator = IG.get_translator(self.stt_out_lang, self.tts_lang, filter=flt).translate
        self.swriter = SummaryWriter()
        ray.get(futs)
        self.sessions = {}
        self.thumbstones = []

    def new_sip_session_received(self, new_sess:RemoteSessionOffer):
        aia_sess = AIASession(self, new_sess, self.aia_prof.llm_prompt)
        print(f'{aia_sess=}')
        self.sessions[aia_sess.id] = aia_sess

    def sess_term(self, sess_id:UUID, sip_sess_id:UUID, relaxed:bool=False):
        try:
            self._get_session(sess_id).sess_term(sip_sess_id)
        except AIASessNotFoundErr:
            if not relaxed: raise
            return
        del self.sessions[sess_id]
        self.thumbstones.append(sess_id)
        if len(self.thumbstones) > 100:
            self.thumbstones = self.thumbstones[-100:]

    def text_in(self, sess_id:UUID, result:Union[STTResult,STTSentinel]):
        if isinstance(result, STTResult):
            self.swriter.add_scalar(f'stt/inf_time', result.inf_time, self.nstts)
            self.nstts += 1
        self._get_session(sess_id).text_in(result)

    def text_out(self, sess_id:UUID, result:LLMResult):
        try:
            self._get_session(sess_id).text_out(result)
        except AIASessNotFoundErr:
            if not sess_id in self.thumbstones: raise

    def tts_say_done(self, sess_id:UUID):
        self._get_session(sess_id).tts_say_done()

    def _get_session(self, sess_id:UUID):
        try: return self.sessions[sess_id]
        except KeyError: raise AIASessNotFoundErr(f'No AIA session with id {sess_id}')
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIAAppConfig.py:
--------------------------------------------------------------------------------
from .AIAProfile import AIAProfile

class AIAAppConfig():
    schema: dict = {
        'ai_attendant': {
            'type': 'dict',
            'schema': {
                **AIAProfile.schema,
            }
        },
    }
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIAProfile.py:
--------------------------------------------------------------------------------
import ray
from typing import Optional

from Cluster.InfernSIPActor import InfernSIPActor

from .AIAActor import AIAActor


class AIAProfile():
    schema: dict = {
        'profiles': {
            'type': 'dict',
            'keysrules': {'type': 'string'},
            'valuesrules': {
                'type': 'dict',
                'schema': {
                    'tts_lang': {'type': 'string'},
                    'stt_lang': {'type': 'string'},
                    'llm_prompt': {'type': 'string'},
                }
            }
        }
    }
    stt_lang: str = 'en'
    tts_lang: str = 'en'
    llm_prompt: str
    actor: Optional[AIAActor] = None

    def __init__(self, name, conf):
        self.name = name
        self.tts_lang = conf['tts_lang']
        self.stt_lang = conf['stt_lang']
        self.llm_prompt = open(conf['llm_prompt']).read()

    def finalize(self, iconf:'InfernConfig'):
        pass

    def getActor(self, iconf:'InfernConfig', sip_act:InfernSIPActor):
        if self.actor is None:
            self.actor = AIAActor.remote()
            ray.get(self.actor.start.remote(self, sip_act))
        return self.actor
--------------------------------------------------------------------------------
/Apps/AIAttendant/AIASession.py:
--------------------------------------------------------------------------------
from typing import Tuple, List, Optional, Dict, Union
from uuid import UUID, uuid4
from functools import partial
import ray

from nltk.tokenize import sent_tokenize

from Cluster.TTSSession import TTSRequest
from Cluster.STTSession import STTRequest, STTResult, STTSentinel
from Cluster.LLMSession import LLMRequest, LLMResult, LLMSessionParams
from Cluster.RemoteTTSSession import RemoteTTSSession
from Cluster.InfernRTPActor import InfernRTPActor, RTPSessNotFoundErr
from Core.T2T.NumbersToWords import NumbersToWords
from RTP.AudioInput import AudioInput
from SIP.RemoteSession import RemoteSessionOffer, RemoteSessionAccept
from Core.T2T.Translator import Translator
from Core.AudioChunk import AudioChunk
from ..LiveTranslator.LTSession import _sess_term, TTSProxy
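
# Per-call pipeline, as wired up below: inbound RTP audio -> STTProxy (VAD-gated
# chunks to the STT actor), transcribed text -> AIAActor.text_in -> LLM session,
# and LLM output sentences -> TTS -> RTP soundout back to the caller.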

class STTProxy(AudioInput):
    from time import monotonic
    last_chunk_time: Optional[float] = None
    debug = True
    stt_do: callable
    stt_done: callable
    def __init__(self, stt_actr, stt_lang, stt_sess_id, stt_done):
        self.stt_do = partial(stt_actr.stt_session_soundin.remote, sess_id=stt_sess_id)
        self.lang, self.stt_done = stt_lang, stt_done

    def audio_in(self, chunk:AudioChunk):
        if self.last_chunk_time is None:
            return
        if chunk.active:
            self.last_chunk_time = None
            return
        if self.monotonic() - self.last_chunk_time < 2.0:
            return
        def stt_done(result:STTSentinel):
            print(f'STTProxy: {result=}')
            self.stt_done(result=result)
        self.last_chunk_time = None
        sreq = STTSentinel('flush', stt_done)
        self.stt_do(req=sreq)

    # This method runs in the context of the inbound RTP Actor
    def vad_chunk_in(self, chunk:AudioChunk):
        self.last_chunk_time = self.monotonic()
        if self.debug:
            print(f'STTProxy: VAD: {len(chunk.audio)=} {chunk.track_id=}')
        def stt_done(result:STTResult):
            print(f'STTProxy: {result=}')
            self.stt_done(result=result)
        sreq = STTRequest(chunk, stt_done, self.lang)
        sreq.mode = 'translate'
        self.stt_do(req=sreq)

class AIASession():
    debug = False
    id: UUID
    stt_sess_id: UUID
    rtp_sess_id: UUID
    llm_sess_id: UUID
    last_llm_req_id: UUID
    rtp_actr: InfernRTPActor
    tts_sess: RemoteTTSSession
    say_buffer: List[TTSRequest]
    translator: Optional[Translator]
    stt_sess_term: callable
    text_in_buffer: List[str]
    saying: UUID

    def __init__(self, aiaa:'AIAActor', new_sess:RemoteSessionOffer, llm_prompt:str):
        self.id = uuid4()
        self.say_buffer = []
        sess_term_alice = partial(_sess_term, sterm=aiaa.aia_actr.sess_term.remote, sess_id=self.id, sip_sess_id=new_sess.sip_sess_id)
        self.tts_say_done_cb = partial(aiaa.aia_actr.tts_say_done.remote, sess_id=self.id)
        amsg = RemoteSessionAccept(disc_cb=sess_term_alice, auto_answer=True)
        try:
            rtp_alice = ray.get(new_sess.accept(msg=amsg))
        except KeyError:
            print(f'Failed to accept {new_sess.sip_sess_id=}')
            return
        self.rtp_actr, self.rtp_sess_id = rtp_alice
        stt_sess = aiaa.stt_actr.new_stt_session.remote(keep_context=True)
        llmp = LLMSessionParams(llm_prompt)
        llm_sess = aiaa.llm_actr.new_llm_session.remote(llmp)
        self.tts_sess = RemoteTTSSession(aiaa.tts_actr)
        self.stt_sess_id, self.llm_sess_id = ray.get([stt_sess, llm_sess])
        self.stt_sess_term = partial(aiaa.stt_actr.stt_session_end.remote, self.stt_sess_id)
        self.llm_sess_term = partial(aiaa.llm_actr.llm_session_end.remote, self.llm_sess_id)
        self.translator = aiaa.translator
        text_cb = partial(aiaa.aia_actr.text_in.remote, sess_id=self.id)
        vad_handler = STTProxy(aiaa.stt_actr, aiaa.stt_lang, self.stt_sess_id, text_cb)
        try:
            ray.get(self.rtp_actr.rtp_session_connect.remote(self.rtp_sess_id, vad_handler))
        except RTPSessNotFoundErr:
            print(f'RTPSessNotFoundErr: {self.rtp_sess_id=}')
            sess_term_alice()
            return
        soundout = partial(self.rtp_actr.rtp_session_soundout.remote, self.rtp_sess_id)
        tts_soundout = TTSProxy(soundout)
        self.tts_sess.start(tts_soundout)
        self.speaker = ray.get(aiaa.tts_actr.get_rand_voice_id.remote())
        self.speaker = 6852
        self.llm_text_cb = partial(aiaa.aia_actr.text_out.remote, sess_id=self.id)
        self.llm_session_textin = partial(aiaa.llm_actr.llm_session_textin.remote, sess_id=self.llm_sess_id)
        self.llm_session_context_add = partial(aiaa.llm_actr.llm_session_context_add.remote,
                                               sess_id=self.llm_sess_id)
        si = new_sess.sess_info
        self.n2w = NumbersToWords()
        self.text_in_buffer = []
        self.text_to_llm(f'')
        print(f'Agent {self.speaker} at your service.')

    def text_to_llm(self, text:str):
        req = LLMRequest(text, self.llm_text_cb)
        req.auto_ctx_add = False
        self.llm_session_textin(req=req)
        self.last_llm_req_id = req.id

    def text_in(self, result:Union[STTResult,STTSentinel]):
        if isinstance(result, STTResult):
            if self.debug:
                print(f'STT: "{result.text=}" {result.no_speech_prob=}')
            nsp = result.no_speech_prob
            if nsp > STTRequest.max_ns_prob or len(result.text) == 0:
                if result.duration < 5.0:
                    return
                text = f''
            else:
                text = result.text
            self.text_in_buffer.append(text)
            if len(self.say_buffer) > 0:
                self.say_buffer = self.say_buffer[:1]
                if self.saying is not None:
                    self.llm_session_context_add(content='', role='user')
                    self.tts_sess.stop_saying(self.saying)
                    self.saying = None
            return
        if len(self.text_in_buffer) == 0:
            return
        text = ' '.join(self.text_in_buffer)
        self.text_in_buffer = []
        self.text_to_llm(text)
        return

    def text_out(self, result:LLMResult):
        if self.debug: print(f'text_out({result.text=})')
        if result.req_id != self.last_llm_req_id:
            print(f'LLMResult for old req_id: {result.req_id}')
            return
        if result.text == '':
            print(f'LLMResult: nothing to say')
            return
        text = sent_tokenize(result.text)
        out_sents = [text.pop(0),]
        for t in text:
            if len(out_sents[-1]) + len(t) < 128 or out_sents[-1].endswith(' i.e.'):
                out_sents[-1] += ' ' + t
            else:
                out_sents.append(t)
        for t in out_sents:
            self.tts_say(t)

    def _tts_say(self, tr:TTSRequest):
        self.saying = self.tts_sess.say(tr)
        self.llm_session_context_add(content=tr.text[0], role='assistant')

    def tts_say(self, text):
        if self.debug: print(f'tts_say({text=})')
        text = self.n2w(text)
        tts_req = TTSRequest([text,], done_cb=self.tts_say_done_cb, speaker_id=self.speaker)
        self.say_buffer.append(tts_req)
        if len(self.say_buffer) > 1:
            return
        self._tts_say(tts_req)

    def tts_say_done(self):
        if self.debug: print(f'tts_say_done()')
        tbuf = self.say_buffer
        tbuf.pop(0)
        if len(tbuf) > 0:
            self._tts_say(tbuf[0])
            return
        self.saying = None

    def sess_term(self, _):
        self.stt_sess_term()
        self.tts_sess.end()
        self.llm_sess_term()
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTActor.py:
--------------------------------------------------------------------------------
from typing import Dict, Optional, List
from uuid import UUID
from functools import partial

from ray import ray
import nltk
from tensorboardX import SummaryWriter

from config.InfernGlobals import InfernGlobals as IG
from Cluster.InfernSIPActor import InfernSIPActor
from Cluster.InfernTTSActor import InfernTTSActor
from Cluster.InfernSTTActor import InfernSTTActor
from Cluster.STTSession import STTResult
from SIP.RemoteSession import RemoteSessionOffer
from Core.T2T.NumbersToWords import NumbersToWords
from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr

from .LTSession import LTSession, VADSignals

def ntw_filter(text, from_code=None, to_code=None, tr=lambda x:x, obj=NumbersToWords()):
    print(f'ntw_filter({from_code=}, {to_code=}, {text=})')
    return obj(tr(text))

class LTSessNotFoundErr(InfernSessNotFoundErr): pass

@ray.remote(resources={"live_translator": 1})
class LTActor():
    sessions: Dict[UUID, LTSession]
    vds: Optional[VADSignals]=None
    translators: List[callable]
    nstts: int = 0
    def __init__(self):
        self.stt_out_langs = ('en', 'en')

    def start(self, lt_prof: 'LTProfile', sip_actr:InfernSIPActor):
        self.lt_prof = lt_prof
        self.tts_langs = lt_prof.tts_langs
        self.stt_langs = lt_prof.stt_langs
        nltk.download('punkt')
        self.lt_actr = ray.get_runtime_context().current_actor
        self.sip_actr = sip_actr
        self.tts_actrs = dict((l, InfernTTSActor.remote()) for l in self.tts_langs)
        self.stt_actr = InfernSTTActor.remote()
        futs = [_a.start.remote(**_k) for _a, _k in ((self.stt_actr, {}),) +
                tuple((a, {'lang':l, 'output_sr':8000}) for l, a in self.tts_actrs.items())]
        self.translators = [ntw_filter if _sol == _tl else
                            IG.get_translator(_sol, _tl, filter=partial(ntw_filter, obj=NumbersToWords(_tl))).translate
                            for _tl, _sol in zip(self.tts_langs, self.stt_out_langs)]
        self.swriter = SummaryWriter()
        ray.get(futs)
        self.sessions = {}

    def precache(self, lt_prof: 'LTProfile'):
        nltk.download('punkt')
        lt_actr = ray.get_runtime_context().current_actor
        tts_actrs = dict((l, InfernTTSActor.remote()) for l in lt_prof.tts_langs)
        stt_actr = InfernSTTActor.remote()
        futs = [_a.start.remote(**_k) for _a, _k in ((stt_actr, {}),) +
                tuple((a, {'lang':l, 'output_sr':8000, 'device':'cpu'}) for l, a in tts_actrs.items())]
        translators = [ntw_filter if _sol == _tl else
                       IG.get_translator(_sol, _tl, filter=partial(ntw_filter, obj=NumbersToWords(_tl))).translate
                       for _tl, _sol in zip(lt_prof.tts_langs, self.stt_out_langs)]
        ray.get(futs)
        for a in list(tts_actrs.values()) + [stt_actr]:
            ray.get(a.stop.remote())

    def new_sip_session_received(self, new_sess:RemoteSessionOffer):
        if self.vds is None:
            self.vds = VADSignals()
        lt_sess = LTSession(self, new_sess)
        print(f'{lt_sess=}')
        self.sessions[lt_sess.id] = lt_sess

    def sess_term(self, sess_id:UUID, sip_sess_id:UUID, relaxed:bool=False):
        try:
            self._get_session(sess_id).sess_term(sip_sess_id)
        except LTSessNotFoundErr:
            if not relaxed: raise
            return
        del self.sessions[sess_id]

    def text_in(self, sess_id:UUID, result:STTResult):
        self.swriter.add_scalar(f'stt/inf_time', result.inf_time, self.nstts)
        self.nstts += 1
        self._get_session(sess_id).text_in(result)

    def tts_say_done(self, sess_id:UUID, direction:int):
        self._get_session(sess_id).tts_say_done(direction)

    def _get_session(self, sess_id:UUID):
        try: return self.sessions[sess_id]
        except KeyError: raise LTSessNotFoundErr(f'No LT session with id {sess_id}')
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTAppConfig.py:
--------------------------------------------------------------------------------
from .LTProfile import LTProfile
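
# Validation-schema fragment for the 'live_translator' config section; the
# keysrules/valuesrules layout appears to follow the Cerberus-style validators
# used elsewhere in the repo (assumed; Core/ConfigValidators.py is not shown here).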

class LTAppConfig():
    schema: dict = {
        'live_translator': {
            'type': 'dict',
            'schema': {
                **LTProfile.schema,
            }
        },
        'live_translator_precache': {'type': 'boolean'},
    }
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTProfile.py:
--------------------------------------------------------------------------------
from typing import Tuple, Optional

import ray

from Cluster.InfernSIPActor import InfernSIPActor

from .LTActor import LTActor

class LTProfile():
    schema: dict = {
        'profiles': {
            'type': 'dict',
            'keysrules': {'type': 'string'},
            'valuesrules': {
                'type': 'dict',
                'schema': {
                    'tts_langs': {'type': 'list', 'schema': {'type': 'string'}},
                    'stt_langs': {'type': 'list', 'schema': {'type': 'string'}},
                    'outbound': {'type': 'string'}
                }
            }
        }
    }
    name: str
    tts_langs: Tuple[str]
    stt_langs: Tuple[str]
    _outbound_spec: str
    outbound_conn: 'InfernSIPProfile'
    outbount_params: str
    actor: Optional[LTActor] = None
    precache: bool

    def __init__(self, name, conf, precache):
        self.name = name
        self.tts_langs = tuple(conf['tts_langs'])
        self.stt_langs = tuple(conf['stt_langs'])
        if not precache:
            self._outbound = conf['outbound']
        self.precache = precache

    def finalize(self, iconf:'InfernConfig'):
        if not self.precache:
            sip_cname, params = self._outbound.split(';', 1)
            self.outbound_conn = iconf.connectors[sip_cname]
            self.outbount_params = params
        else:
            actor = LTActor.remote()
            res = ray.get(actor.precache.remote(self))

    def getActor(self, iconf:'InfernConfig', sip_act:InfernSIPActor):
        if self.actor is None:
            self.actor = LTActor.remote()
            ray.get(self.actor.start.remote(self, sip_act))
        return self.actor
--------------------------------------------------------------------------------
/Apps/LiveTranslator/LTSession.py:
--------------------------------------------------------------------------------
from typing import Tuple, List, Optional, Dict
from functools import partial, lru_cache
from uuid import UUID, uuid4

from ray import ray
from nltk.tokenize import sent_tokenize

from Cluster.InfernRTPActor import InfernRTPActor
from Cluster.InfernTTSActor import InfernTTSActor
from Cluster.RemoteTTSSession import RemoteTTSSession
from Cluster.STTSession import STTRequest, STTResult
from Cluster.TTSSession import TTSRequest
from Core.AudioChunk import AudioChunk, AudioChunkFromURL
from RTP.AudioInput import AudioInput
from Core.T2T.Translator import Translator
from SIP.RemoteSession import RemoteSessionOffer, RemoteSessionAccept, NewRemoteSessionRequest
from Core.AStreamMarkers import ASMarkerNewSent

#from .LTProfile import LTProfile

import pickle
import gzip
from random import choice

@lru_cache(maxsize=4)
def get_top_speakers(lang:str):
    skips = 0
    i = 0
    res = []
    while True:
        try:
            with gzip.open(f'checkpoint/{lang}/speaker.{i}.{lang}.pkl.gz', 'rb') as file:
                res.append(pickle.load(file))
        except FileNotFoundError:
            skips += 1
            if skips > 200: break
        i += 1
    if len(res) == 0:
        return None
    gen = max(r.nres for r in res)
    res = sorted([r for r in res if r.nres == gen], key=lambda r: r.max_error())[:50]
    return tuple(r.speaker_id for r in res)

class VADSignals():
    def __init__(self):
        eng, deng = [AudioChunkFromURL(f'https://github.com/commaai/openpilot/blob/master/selfdrive/assets/sounds/{n}.wav?raw=true') for n in ('engage', 'disengage')]
        eng.track_id = 2
        eng.debug = True
        self.eng = ray.put(eng)
        self.deng = ray.put(deng)

class STTProxy():
    debug = True
    stt_do: callable
    stt_done: callable
    vad_mirror: callable
    def __init__(self, lta:'LTActor', uas:'Sess', stt_done, vad_mirror, direction):
        self.stt_do = partial(lta.stt_actr.stt_session_soundin.remote, sess_id=uas.stt_sess_id)
        self.lang, self.stt_done = uas.stt_lang, stt_done
        self.vad_mirror = vad_mirror
        self.eng = lta.vds.eng
        self.direction = direction

    # This method runs in the context of the inbound RTP Actor
    def __call__(self, chunk:AudioChunk):
        if self.debug:
            dir = 'A' if self.direction == 0 else 'B'
            print(f'STTProxy: VAD({dir}): {len(chunk.audio)=} {chunk.track_id=}')
        #self.vad_mirror(chunk=self.eng)
        def stt_done(result:STTResult, direction=self.direction):
            print(f'STTProxy: {result=}')
            result.direction = direction
            self.stt_done(result=result)
        sreq = STTRequest(chunk, stt_done, self.lang)
        sreq.mode = 'translate'
        self.stt_do(req=sreq)

class TTSProxy():
    debug = False
    tts_consume: callable
    def __init__(self, tts_consume):
        self.tts_consume = tts_consume

    # This method runs in the context of the outbound RTP Actor
    def __call__(self, chunk:AudioChunk):
        if self.debug and isinstance(chunk, ASMarkerNewSent):
            print(f'TTSProxy: ASMarkerNewSent')
        chunk.track_id = 1
        chunk.debug = False
        self.tts_consume(chunk=chunk)

class SessionInfo():
    soundout: callable
    rsess_pause: callable
    rsess_connect: callable
    translator: callable
    get_speaker: callable
    tts_say: callable
    tts_say_done: callable
    def __init__(self, lts:'LTSession', lta:'LTActor', xua:'Sess', yua:'Sess'):
        #lt_actr = ray.get_runtime_context().current_actor
        self.soundout = partial(xua.rtp_actr.rtp_session_soundout.remote, xua.rtp_sess_id)
        vad_cb = self.soundout
        text_cb = partial(lta.lt_actr.text_in.remote, sess_id=lts.id)
        self.tts_say_done = partial(lta.lt_actr.tts_say_done.remote, sess_id=lts.id, direction=xua.direction)
        vad_handler = STTProxy(lta, xua, text_cb, vad_cb, xua.direction)
        self.rsess_pause = partial(xua.rtp_actr.rtp_session_connect.remote, xua.rtp_sess_id,
                                   AudioInput(vad_chunk_in=vad_handler))
        ysoundout = partial(yua.rtp_actr.rtp_session_soundout.remote, yua.rtp_sess_id)
        self.rsess_connect = partial(xua.rtp_actr.rtp_session_connect.remote, xua.rtp_sess_id,
                                     AudioInput(yua.rtp_sess_id, vad_handler))
        self.translator = xua.translator
        self.get_speaker = (lambda: None) if xua.speakers is None else partial(choice, xua.speakers)
        self.sip_sess_term = partial(lta.sip_actr.sess_term.remote, xua.sip_sess_id)
        self.stt_sess_term = partial(lta.stt_actr.stt_session_end.remote, xua.stt_sess_id)
        self.tts_sess_term = xua.tts_sess.end
        self.tts_say = xua.tts_sess.say
        self.tts_soundout = TTSProxy(ysoundout)

    def sess_term(self):
        self.stt_sess_term()
        self.tts_sess_term()

class Sessions():
    info: Tuple[SessionInfo]
    def __init__(self, lts:'LTSession', lta:'LTActor', xua:'Sess', yua:'Sess'):
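        # Build the two directional halves (A->B and B->A), connect each leg's
        # inbound RTP to its VAD/STT handler, and point each TTS session at the
        # opposite leg's RTP output.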
        self.info = (
            SessionInfo(lts, lta, xua, yua),
            SessionInfo(lts, lta, yua, xua),
        )
        for i, u in zip(self.info, (xua, yua)):
            i.rsess_connect()
            #i.rsess_pause()
            u.tts_sess.start(i.tts_soundout)

class Sess():
    direction: int
    sip_sess_id: UUID
    rtp_sess_id: UUID
    tts_sess: RemoteTTSSession
    stt_sess_id: UUID
    rtp_actr: InfernRTPActor
    tts_actr: InfernTTSActor
    translator: Optional[Translator]
    def __init__(self, lta:'LTActor', direction:int):
        self.direction = direction
        tts_lang, stt_lang = lta.tts_langs[direction], lta.stt_langs[direction]
        self.speakers = get_top_speakers(tts_lang)
        self.tts_lang, self.stt_lang = tts_lang, stt_lang
        self.translator = lta.translators[direction]
        self.tts_sess = RemoteTTSSession(lta.tts_actrs[tts_lang])

def _sess_term(*args, sterm:callable, sess_id:UUID, sip_sess_id:UUID):
    return sterm(sess_id, sip_sess_id, relaxed=True)

class LTSession():
    debug = False
    id: UUID
    alice: Sess
    bob: Sess
    say_buffer: Dict[int, List[TTSRequest]]

    def __init__(self, lta, new_sess:RemoteSessionOffer):
        self.id = uuid4()
        self.say_buffer = {0:[], 1:[]}
        lt_prof: 'LTProfile' = lta.lt_prof
        dest_number = dict(x.split('=', 1) for x in lt_prof.outbount_params.split(';'))['cld']
        #dest_number = '205'
        #dest_number = '601'
        sess_term_alice = partial(_sess_term, sterm=lta.lt_actr.sess_term.remote, sess_id=self.id, sip_sess_id=new_sess.sip_sess_id)
        amsg = RemoteSessionAccept(disc_cb=sess_term_alice, auto_answer=False)
        try:
            rtp_alice = ray.get(new_sess.accept(msg=amsg))
        except KeyError:
            print(f'Failed to accept {new_sess.sip_sess_id=}')
            return
        sess_term_bob = partial(_sess_term, sterm=lta.lt_actr.sess_term.remote, sess_id=self.id, sip_sess_id=None)
        bmsg = NewRemoteSessionRequest(cld=dest_number, sip_prof=lt_prof.outbound_conn, disc_cb=sess_term_bob)
        bmsg.conn_sip_sess_id = new_sess.sip_sess_id
        sip_sess_id_bob = lta.sip_actr.new_sess.remote(msg=bmsg)
        ssess = [lta.stt_actr.new_stt_session.remote(keep_context=True) for _ in lta.stt_langs]

        alice = Sess(lta, 0)
        bob = Sess(lta, 1)

        #alice.tts_sess, bob.tts_sess = [RemoteTTSSession(lta.tts_actrs[lang]) for lang in lta.tts_langs]

        alice.sip_sess_id = new_sess.sip_sess_id
        alice.rtp_actr, alice.rtp_sess_id = rtp_alice
        bob.sip_sess_id, bob.rtp_actr, bob.rtp_sess_id = ray.get(sip_sess_id_bob)
        alice.stt_sess_id, bob.stt_sess_id = ray.get(ssess)
        self.fabric = Sessions(self, lta, alice, bob)
        if self.debug: print(f'{alice=} {bob=} {self.fabric=}')
        self.alice, self.bob = alice, bob

    def sess_term(self, sip_sess_id):
        for i in self.fabric.info:
            i.sess_term()
        if sip_sess_id == self.alice.sip_sess_id:
            self.fabric.info[1].sip_sess_term()
        else:
            self.fabric.info[0].sip_sess_term()

    def text_in(self, result:STTResult):
        sdir = 'A->B' if result.direction == 0 else 'B->A'
        print(f'STT: {sdir} "{result.text=}" {result.no_speech_prob=}')
        nsp = result.no_speech_prob
        if nsp > STTRequest.max_ns_prob: return
        sinfo = self.fabric.info[result.direction]
        text = sinfo.translator(result.text)
        speaker_id = sinfo.get_speaker()
        #sinfo.rsess_pause()
        print(f'TTS: {sdir} "{text=}" {speaker_id=}')
        text = sent_tokenize(text)
        out_sents = [text.pop(0),]
        for t in text:
            if len(out_sents[-1]) + len(t) < 128 or out_sents[-1].endswith(' i.e.'):
                out_sents[-1] += ' ' + t
            else:
                out_sents.append(t)

        print(f'TTS split: "{out_sents=}" {[len(t) for t in out_sents]=}')
        tts_req = ray.put(TTSRequest(out_sents, speaker_id=speaker_id, done_cb=sinfo.tts_say_done))
        self.say_buffer[result.direction].append(tts_req)
        if len(self.say_buffer[result.direction]) > 1:
            return
        sinfo.tts_say(tts_req)
        return

    def tts_say_done(self, direction:int):
        if self.debug: print(f'tts_say_done({direction=})')
        tbuf = self.say_buffer[direction]
        tbuf.pop(0)
        if len(tbuf) > 0:
            self.fabric.info[direction].tts_say(tbuf[0])
            return
--------------------------------------------------------------------------------
/Cluster/InfernBatchedWorker.py:
--------------------------------------------------------------------------------
from typing import Optional, List
from queue import Queue, Empty as QueueEmpty
from abc import ABC, abstractmethod

from Core.InfernWrkThread import InfernWrkThread, RTPWrkTRun

class InfernBatchedWorker(InfernWrkThread, ABC):
    max_batch_size: int
    inf_queue: Queue[Optional[object]]
    def __init__(self):
        super().__init__()
        self.inf_queue = Queue()

    def infer(self, wi:object):
        self.inf_queue.put(wi)

    def next_batch(self) -> List[object]:
        wis = []
        while len(wis) < self.max_batch_size:
            if len(wis) == 0:
                wi = self.inf_queue.get()
            else:
                try: wi = self.inf_queue.get_nowait()
                except QueueEmpty: break
            if wi is None:
                return None
            wis.append(wi)
        return wis

    @abstractmethod
    def process_batch(self, wis:List[object]): pass

    def run(self):
        super().thread_started()
        while self.get_state() == RTPWrkTRun:
            wis = self.next_batch()
            if wis is None:
                break
            for wi in (wi for wi in wis if hasattr(wi, '_proc_start_cb')):
                wi._proc_start_cb()
            self.process_batch(wis)

    def stop(self):
        self.inf_queue.put(None)
        super().stop()
--------------------------------------------------------------------------------
/Cluster/InfernLLMActor.py:
--------------------------------------------------------------------------------
from typing import Dict
from uuid import UUID
from queue import Queue

import ray

from Cluster.InfernLLMWorker import InfernLLMWorker
from Cluster.LLMSession import LLMSession, LLMRequest, LLMInferRequest, LLMSessionParams

@ray.remote(num_gpus=1.0, resources={"llm": 1})
class InfernLLMActor():
    debug = True
    sessions: Dict[UUID, LLMSession]
    llm: InfernLLMWorker

    def __init__(self):
        super().__init__()
        self.sessions = {}

    def start(self):
        for device in ('xpu', 'cuda', 'cpu'):
            try:
                self.llm = InfernLLMWorker(device)
            except (ValueError, RuntimeError):
                continue
            break
        else:
            raise RuntimeError('Failed to initialize LLM')
        self.llm.start()
        tq = Queue()
        def res_cb(result): tq.put(result)
        irs = tuple(LLMInferRequest(LLMRequest('What is your name?', None), [{}])
                    for _ in range(self.llm.max_batch_size))
        for _i in irs: _i.textout_cb = res_cb
        with self.llm.inf_queue.mutex:
            for ir in irs:
                self.llm.inf_queue.queue.append(ir)
            self.llm.inf_queue.not_empty.notify()
        for _ in irs:
            tq.get()

    def stop(self):
        self.llm.stop()

    def new_llm_session(self, sconf:LLMSessionParams):
        if self.debug: print('InfernLLMActor.new_llm_session')
        sess = LLMSession(self.llm, sconf)
        self.sessions[sess.id] = sess
        return sess.id

    def llm_session_end(self, sess_id):
        if self.debug: print('InfernLLMActor.llm_session_end')
        sess = self.sessions[sess_id]
        sess.stop()
        del self.sessions[sess_id]

    def llm_session_textin(self, sess_id, req:LLMRequest):
        if self.debug: print('InfernLLMActor.llm_session_textin')
        sess = self.sessions[sess_id]
        sess.textin(req)
        return sess_id

    def llm_session_context_add(self, sess_id, content:str, role:str = 'user'):
        if self.debug: print('InfernLLMActor.llm_session_context_add')
        sess = self.sessions[sess_id]
        sess.context_add(content, role)
        return sess_id
--------------------------------------------------------------------------------
/Cluster/InfernLLMWorker.py:
--------------------------------------------------------------------------------
from typing import Tuple, List, Iterator
from os.path import exists as path_exists
from itertools import chain
from functools import partial

import torch
import torch.nn.functional as F

from transformers import TextStreamer

from Cluster.InfernBatchedWorker import InfernBatchedWorker
from Cluster.InfernTTSWorker import get_torch_hw
from Cluster.LLMSession import LLMResult, LLMInferRequest

class ResultsStreamer(TextStreamer):
    debug = False
    sync_on = ('. ', '? ', '! ', '\n')
    decode_batch_size = 8
    def __init__(self, wis:List[LLMInferRequest], upper:'InfernLLMWorker'):
        super().__init__(tokenizer=upper.llm_tokenizer)
        self.wi_cbs = tuple(wi.textout_cb for wi in wis)
        self.newLLMResult = tuple(partial(LLMResult, req_id=wi.req.id) for wi in wis)
        batch_size = len(wis)
        self.oposs = [0 for _ in range(batch_size)]
        self.current_tokens = None
        self.batch_decode = partial(upper.llm_tokenizer.batch_decode, skip_special_tokens=True)

    def put(self, token_ids):
        if self.current_tokens is None:
            self.current_tokens = torch.zeros((token_ids.shape[0], 0), dtype=torch.long)
            return
        if token_ids.dim() == 1:  # Shape [batch_size]
            token_ids = token_ids.unsqueeze(1)
        self.current_tokens = torch.cat([self.current_tokens, token_ids], dim=1)
        if self.current_tokens.shape[1] % self.decode_batch_size == 0:
            return
        results = self.batch_decode(self.current_tokens)
        for (ir, r), op, cb, newLR in zip(enumerate(results), self.oposs, self.wi_cbs, self.newLLMResult):
            new_content = r[op:]
            if len(new_content) == 0: continue
            sp = (op + pos + len(c) for c in self.sync_on if (pos:=new_content.rfind(c)) >= 0)
            try:
                spos = next(sp)
            except StopIteration:
                continue
            r = r[op:spos-1]
            if len(r) < 10: continue
            cb(result=newLR(r))
            self.oposs[ir] = spos
        if self.debug:
            print(f'{self.oposs=} {self.current_tokens.shape=}')

    def end(self):
        if self.debug:
            print(f'finished: {self.current_tokens.shape=}')
        results = self.batch_decode(self.current_tokens)
        for r, op, cb, newLR in zip(results, self.oposs, self.wi_cbs, self.newLLMResult):
            if len(r) == op: continue
            cb(result=newLR(r[op:]))
        del self.current_tokens
        del self.wi_cbs

class InfernLLMWorker(InfernBatchedWorker):
    model_name = "Qwen/Qwen2.5-14B-Instruct"
    model_cache_dir = f"/tmp/saved_model.{model_name}"
    max_batch_size: int = 8
    debug = True
    llm_model: object
    llm_tokenizer: object
    output_sr: int

    def __init__(self, device=None):
        from warnings import filterwarnings
        filterwarnings("ignore", category=FutureWarning)
        filterwarnings("ignore", category=UserWarning)
        from transformers import AutoTokenizer
        from ipex_llm.transformers import AutoModelForCausalLM
        super().__init__()
        if device is None:
            device = get_torch_hw()
        def load_model(mn):
            m = AutoModelForCausalLM.from_pretrained(mn, torch_dtype="auto",
                                                     device_map="auto",
                                                     optimize_model=True,
                                                     trust_remote_code=True,
                                                     load_in_4bit=True,
                                                     use_cache=True
                                                     )
            if mn != self.model_cache_dir:
                m.save_low_bit(self.model_cache_dir)
            return m.to(device)
        if path_exists(self.model_cache_dir):
            try:
                model = AutoModelForCausalLM.load_low_bit(self.model_cache_dir,
                                                          trust_remote_code=True)
            except Exception:
                model = load_model(self.model_name)
        else:
            model = load_model(self.model_name)
        self.llm_model = model.to(device)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def process_batch(self, wis:List[LLMInferRequest]):
        if self.debug:
            print(f'InfernLLMWorker.process_batch: got {len(wis)=}')
        streamer = ResultsStreamer(wis, self)
        with torch.no_grad():
            messages = [self.llm_tokenizer.apply_chat_template(list(r.context), tokenize=False,
                                                               add_generation_prompt=True)
                        for r in wis]
            model_inputs = self.llm_tokenizer(messages, return_tensors="pt", padding=True).to(self.llm_model.device)
            self.llm_model.generate(
                **model_inputs,
                max_new_tokens=16 * 1024,
                output_scores=True,
                return_dict_in_generate=True,
                streamer=streamer,
            )
            torch.xpu.synchronize()
--------------------------------------------------------------------------------
/Cluster/InfernRTPActor.py:
--------------------------------------------------------------------------------
#try: import intel_extension_for_pytorch as ipex
#except ModuleNotFoundError: ipex = None

from typing import Dict, Union, List
from uuid import UUID
from _thread import get_ident

from ray import ray

from sippy.Network_server import RTP_port_allocator

from config.InfernGlobals import InfernGlobals as IG
from Core.AudioChunk import AudioChunk
from Core.AStreamMarkers import ASMarkerGeneric
from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr
from RTP.InfernRTPIngest import InfernRTPIngest
from RTP.InfernRTPEPoint import InfernRTPEPoint
from RTP.AudioInput import AudioInput
from RTP.RTPParams import RTPParams
from RTP.InfernRTPConf import InfernRTPConf

class RTPSessNotFoundErr(InfernSessNotFoundErr): pass

@ray.remote(num_gpus=0.01, resources={"rtp": 1})
class InfernRTPActor():
    devices = ('mps', 'cuda', 'cpu')
    device: str
    sessions: Dict[UUID, InfernRTPEPoint]
    thumbstones: List[UUID]
    ring: InfernRTPIngest
    palloc: RTP_port_allocator
    inf_rc: InfernRTPConf
    def __init__(self, inf_rc:InfernRTPConf):
        self.sessions = {}
        self.thumbstones = []
        self.inf_rc = inf_rc

    def new_rtp_session(self, rtp_params:RTPParams):
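        # Create a new RTP endpoint; returns (endpoint UUID, local RTP listen address).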
        print(f'{IG.stdtss()}: new_rtp_session')
        rep = InfernRTPEPoint(self.inf_rc, rtp_params, self.ring, self._get_direct_soundout)
        self.sessions[rep.id] = rep
        return (rep.id, rep.rserv.uopts.laddress)

    def rtp_session_connect(self, rtp_id, ain:AudioInput):
        print(f'{IG.stdtss()}: rtp_session_connect[{str(rtp_id)[:6]}]')
        rep = self._get_session(rtp_id)
        rep.connect(ain)

    def rtp_session_end(self, rtp_id, relaxed:bool=False):
        print(f'{IG.stdtss()}: rtp_session_end')
        try:
            rep = self._get_session(rtp_id)
        except RTPSessNotFoundErr:
            if relaxed or rtp_id in self.thumbstones: return
            raise
        rep.writer.end()

    def rtp_session_soundout(self, rtp_id, chunk:Union[AudioChunk, ASMarkerGeneric]):
        try:
            rep = self._get_session(rtp_id)
        except RTPSessNotFoundErr:
            if rtp_id in self.thumbstones:
                return
            raise
        return rep.soundout(chunk)

    def _get_direct_soundout(self, rtp_id):
        rep = self._get_session(rtp_id)
        return rep.soundout

    def rtp_session_join(self, rtp_id):
        print(f'{IG.stdtss()}: rtp_session_join')
        rep = self._get_session(rtp_id)
        rep.shutdown()
        del self.sessions[rtp_id]
        self.thumbstones.append(rtp_id)
        if len(self.thumbstones) > 100:
            self.thumbstones = self.thumbstones[-100:]

    def rtp_session_update(self, rtp_id, rtp_params:RTPParams):
        print(f'{IG.stdtss()}: rtp_session_update')
        rep = self._get_session(rtp_id)
        rep.update(rtp_params)

    def start(self):
        for device in self.devices:
            self.ring = InfernRTPIngest(device)
            try:
                self.ring.start()
            except (AssertionError, RuntimeError):
                print(f'{device} did not work')
                continue
            self.device = device
            break
        else:
            raise RuntimeError('No suitable device found')

    def loop(self):
        from sippy.Core.EventDispatcher import ED2
        ED2.my_ident = get_ident()
        rval = ED2.loop()
        self.ring.stop()
        self.ring.join()
        return rval

    def stop(self):
        from sippy.Core.EventDispatcher import ED2
        ED2.callFromThread(ED2.breakLoop, 0)

    def _get_session(self, rtp_id:UUID) -> InfernRTPEPoint:
        try: return self.sessions[rtp_id]
        except KeyError: raise RTPSessNotFoundErr(f'No RTP session found for {rtp_id}')
--------------------------------------------------------------------------------
/Cluster/InfernSIPActor.py:
--------------------------------------------------------------------------------
from typing import Optional
from _thread import get_ident
from queue import Queue

import ray

#from Core.InfernConfig import InfernConfig
from SIP.InfernSIP import InfernSIP
from SIP.RemoteSession import RemoteSessionAccept, NewRemoteSessionRequest
from Cluster.InfernRTPActor import InfernRTPActor

@ray.remote(resources={"head": 0.5})
class InfernSIPActor():
    sip_stack: InfernSIP
    default_resources = {'head':1, 'stt': 1, 'tts':1, 'rtp': 1}
    def loop(self, inf_c:'InfernConfig'):
        #raise Exception("BP")
        from sippy.Core.EventDispatcher import ED2
        ED2.my_ident = get_ident()
        rtp_actr = self.rtp_actr = InfernRTPActor.options(max_concurrency=2).remote(inf_c.rtp_conf)
        sip_actr = ray.get_runtime_context().current_actor
        ray.get(rtp_actr.start.remote())
        self.sip_stack = InfernSIP(sip_actr, rtp_actr, inf_c)
        rtp_actr.loop.remote()
        rval = ED2.loop()
        ray.get(rtp_actr.stop.remote())
        return rval

    def new_sess(self, msg:NewRemoteSessionRequest):
        from sippy.Core.EventDispatcher import ED2
        rval = Queue()
        ED2.callFromThread(self.sip_stack.new_session, msg, rval)
        sip_sess, rtp_sess = rval.get()
        return (sip_sess.id, self.rtp_actr, rtp_sess.sess_id)

    def new_sess_accept(self, sip_sess_id, msg:RemoteSessionAccept):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        rval = Queue()
        ED2.callFromThread(sip_sess.accept, msg, rval)
        rtp_sess = rval.get()
        return (self.rtp_actr, rtp_sess.sess_id)

    def new_sess_reject(self, sip_sess_id):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        ED2.callFromThread(sip_sess.reject)

    def sess_term(self, sip_sess_id):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        ED2.callFromThread(sip_sess.sess_term)

    def sess_event(self, sip_sess_id, event, **kwargs):
        from sippy.Core.EventDispatcher import ED2
        sip_sess = self.sip_stack.get_session(sip_sess_id)
        event.kwargs = kwargs
        ED2.callFromThread(sip_sess.recvEvent, event)

    def stop(self):
        from sippy.Core.EventDispatcher import ED2
        ED2.callFromThread(ED2.breakLoop, 0)
--------------------------------------------------------------------------------
/Cluster/InfernSTTActor.py:
--------------------------------------------------------------------------------
#try: import intel_extension_for_pytorch as ipex
#except ModuleNotFoundError: ipex = None

from typing import Dict, Union
from uuid import UUID

import ray

from Cluster.InfernSTTWorker import InfernSTTWorker
from Cluster.STTSession import STTSession, STTRequest, STTSentinel

@ray.remote(num_gpus=0.25, resources={"stt": 1})
class InfernSTTActor():
    debug = False
    sessions: Dict[UUID, STTSession]
    stt: InfernSTTWorker

    def __init__(self):
        super().__init__()
        self.sessions = {}

    def start(self):
        from sys import stderr
        for device in ('xpu', 'cuda', 'cpu'):
            try:
                self.stt = InfernSTTWorker(device)
            except (ValueError, RuntimeError):
                print(f'Failed to initialize STT with {device=}', file=stderr)
                continue
            break
        else:
            raise RuntimeError('Failed to initialize STT')
        self.stt.start()

    def stop(self):
        self.stt.stop()

    def new_stt_session(self, keep_context:bool=False):
        if self.debug: print('InfernSTTActor.new_stt_session')
        sess = STTSession(self.stt, keep_context)
        self.sessions[sess.id] = sess
        return sess.id

    def stt_session_end(self, sess_id):
        if self.debug: print('InfernSTTActor.stt_session_end')
        sess = self.sessions[sess_id]
        sess.stop()
        del self.sessions[sess_id]

    def stt_session_soundin(self, sess_id, req:Union[STTRequest,STTSentinel]):
        if self.debug: print('InfernSTTActor.stt_session_soundin')
        sess = self.sessions[sess_id]
        sess.soundin(req)
--------------------------------------------------------------------------------
/Cluster/InfernSTTWorker.py:
--------------------------------------------------------------------------------
from typing import Tuple, List
from os.path import expanduser, exists as path_exists
from subprocess import Popen, PIPE
from functools import partial

import ctranslate2
import transformers
from methodtools import lru_cache

import torch
from torch.nn import functional as F

from Cluster.STTSession import STTRequest, STTResult
from Cluster.InfernBatchedWorker import InfernBatchedWorker

class InfernSTTWorker(InfernBatchedWorker):
    max_batch_size: int = 4
    max_chunk_duration: float = 32.0
    model: ctranslate2.models.Whisper
    processor: transformers.WhisperProcessor
    device: str
    cache_dir: str = '~/.cache/Infernos'
    sample_rate: int = 16000
    debug = False
    def __init__(self, device: str, model_name: str = "openai/whisper-large-v3"):
        super().__init__()
        if device != 'xpu':
            cache_dir = expanduser(f'{self.cache_dir}/{model_name}.ct2')
            if not any((path_exists(f'{cache_dir}/{_c}') for _c in ('model.bin', 'config.json', 'vocabulary.json'))):
                print(f'Converting "{model_name}" to "{cache_dir}"...')
                command = ['ct2-transformers-converter', '--model', model_name, '--output_dir', cache_dir]
                process = Popen(command, stdout=PIPE, stderr=PIPE)
                stdout, stderr = process.communicate()
                if process.returncode != 0:
                    raise RuntimeError(f'{command[0]} failed with {process.returncode=}, {stdout=}, {stderr=}')
            self.model = ctranslate2.models.Whisper(cache_dir, device=device, compute_type="int8")
        else:
            from warnings import filterwarnings
            filterwarnings("ignore", category=FutureWarning)
            filterwarnings("ignore", category=UserWarning)
            from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_name,
                load_in_4bit=True,
                torch_dtype="auto",
                device_map="auto",
                optimize_model=True,
                trust_remote_code=True,
                use_cache=True
            )
            self.model = model.to(device)
        self.processor = transformers.WhisperProcessor.from_pretrained(model_name)
        if device == 'xpu':
            self.no_speech_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|nospeech|>")
            self.process_audios = partial(self.processor, return_tensors="pt")
        else:
            self.process_audios = partial(self.processor, return_tensors="np")
        self.device = device
        self.infer_and_decode = partial(self.infer_and_decode_ct2 if device != 'xpu' else self.infer_and_decode_torch)

    def infer_and_decode_ct2(self, prompts, inputs, max_nsps):
        input_features = inputs.input_features
        features = ctranslate2.StorageView.from_array(input_features)
        try:
            results = self.model.generate(features, prompts, return_no_speech_prob=True)
        except RuntimeError as e:
            if 'out of memory' not in str(e) or len(prompts) == 1: raise
            torch.cuda.empty_cache()
            results = []
            for _if, _pr in zip(input_features, prompts):
                features = ctranslate2.StorageView.from_array([_if,])
                results.extend(self.model.generate(features, [_pr], return_no_speech_prob=True))
        decoded_results = ((self.processor.decode(r.sequences_ids[0]), r.no_speech_prob, r.sequences_ids[0])
                           for r in results)
        return decoded_results

    def infer_and_decode_torch(self, prompts, inputs, max_nsps):
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        max_len = max(len(t) for t in prompts)
        prompts = torch.stack([
            F.pad(t, (0, max_len - t.size(0)), value=self.processor.tokenizer.pad_token_id)
            for t in (torch.tensor(pr, device=self.device) for pr in prompts)
        ])
        with torch.no_grad():
            forward_outputs = self.model(
                **inputs,
                decoder_input_ids=prompts,
            )
        logprobs = forward_outputs.logits[:, 0].log_softmax(-1)
        no_speech_probs = logprobs[:, self.no_speech_token_id].exp().tolist()
        if all(nsp > max_nsp for nsp, max_nsp in zip(no_speech_probs, max_nsps)):
            return (('', nsp, []) for nsp in no_speech_probs)
        with torch.no_grad():
            gen_outputs = self.model.generate(
                **inputs,
                decoder_input_ids=prompts,
                return_dict_in_generate=True,
                output_scores=True,
            )
        gen_seq = gen_outputs.sequences
        decoded_texts = self.processor.batch_decode(gen_seq, skip_special_tokens=True)
        decoded_results = (
            (text.strip(), nsp, gos.tolist()) for text, nsp, gos in
            zip(decoded_texts, no_speech_probs, gen_seq)
        )
        torch.xpu.synchronize()
        return decoded_results

    def process_batch(self, wis:List[Tuple[STTRequest, callable, List[int]]]):
        if self.debug:
            print(f'InfernSTTWorker.process_batch: got {len(wis)=}')
        assert all(wi[0].chunk.samplerate == self.sample_rate for wi in wis)
        audios = [wi[0].chunk.audio for wi in wis]
        inputs = self.process_audios(audios, sampling_rate=self.sample_rate)
        prompts = self.get_prompt(tuple((wi[0].lang, wi[0].mode, wi[0].timestamps) for wi in wis))
        max_nsps = [wi[0].max_ns_prob for wi in wis]
        good_results = self.infer_and_decode(prompts, inputs, max_nsps)
        for (wi, text_cb, c), (r, nsp, t) in zip(wis, good_results):
            # Remove leading and trailing space: "WhitespaceTokenizer adds a space at the beginning?" (copilot)
            if len(r) > 0 and r[0] == ' ': r = r[1:]
            if c is not None: c[:] = (c + t)[:-224]
            res = STTResult(text=r, no_speech_prob=nsp, req=wi)
            text_cb(result = res)

    @lru_cache(maxsize=16)
    def get_prompt(self, options:Tuple[Tuple[str, str, bool]]):
        prompt = tuple(self.processor.tokenizer.convert_tokens_to_ids(
            [
                "<|startoftranscript|>",
                f"<|{language}|>",
                f"<|{mode}|>",
            ] + ([] if timestamps else ["<|notimestamps|>"])
        ) for language, mode, timestamps in options)
        return prompt
--------------------------------------------------------------------------------
/Cluster/InfernTTSActor.py:
--------------------------------------------------------------------------------
#try: import intel_extension_for_pytorch as ipex
#except ModuleNotFoundError: ipex = None

from typing import Dict, Optional
from uuid import UUID

import ray

from Cluster.TTSSession import TTSSession, TTSRequest
from Cluster.InfernTTSWorker import InfernTTSWorker

@ray.remote(num_gpus=0.25, resources={"tts": 1})
class InfernTTSActor():
    sessions: Dict[UUID, TTSSession]
    tts: InfernTTSWorker

    def __init__(self):
        super().__init__()
        self.sessions = {}

    def start(self, lang:str='en', output_sr:int=16000, device=None):
        self.tts = InfernTTSWorker(lang, output_sr, device)
        self.tts.start()

    def stop(self):
        self.tts.stop()

    def get_rand_voice_id(self) -> int:
        return self.tts.get_rand_voice_id()

    def new_tts_session(self):
        tts_actr = ray.get_runtime_context().current_actor
        rgen = TTSSession(self.tts, tts_actr)
        self.sessions[rgen.id] = rgen
        return rgen.id

    def tts_session_start(self, rgen_id, soundout:callable):
        rgen = self.sessions[rgen_id]
rgen.start(soundout) 40 | 41 | def tts_session_say(self, rgen_id, req:TTSRequest): 42 | rgen = self.sessions[rgen_id] 43 | return rgen.say(req) 44 | 45 | def tts_session_stop_saying(self, rgen_id:UUID, rsay_id:UUID): 46 | rgen = self.sessions[rgen_id] 47 | return rgen.stop_saying(rsay_id) 48 | 49 | def tts_session_end(self, rgen_id): 50 | rgen = self.sessions[rgen_id] 51 | rgen.stop() 52 | del self.sessions[rgen_id] 53 | -------------------------------------------------------------------------------- /Cluster/InfernTTSWorker.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | 5 | from Cluster.InfernBatchedWorker import InfernBatchedWorker 6 | from HelloSippyTTSRT.HelloSippyRTPipe import HelloSippyRTPipe, HelloSippyPlayRequest, \ 7 | HelloSippyPipeState, HelloSippyPipeStateBatched 8 | 9 | def get_ja_T5Processor(device, model_name): 10 | from utils.speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer 11 | from transformers import SpeechT5Processor, SpeechT5FeatureExtractor 12 | 13 | print(f'get_ja_T5Processor: device = {device}, model_name = {model_name}') 14 | tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained(model_name) 15 | tokenizer._in_target_context_manager = False 16 | tokenizer.split_special_tokens = True 17 | tokenizer._added_tokens_encoder = {} 18 | tokenizer._unk_token = None 19 | feature_extractor = SpeechT5FeatureExtractor.from_pretrained(model_name) 20 | return SpeechT5Processor(feature_extractor, tokenizer) 21 | 22 | class cleanup_text_eu(): 23 | replacements = [("Ä", "E"), ("Æ", "E"), ("Ç", "C"), ("É", "E"), ("Í", "I"), ("Ó", "O"), ("Ö", "E"), ("Ü", "Y"), ("ß", "S"), 24 | ("à", "a"), ("á", "a"), ("ã", "a"), ("ä", "e"), ("å", "a"), ("ë", "e"), ("í", "i"), ("ï", "i"), ("ð", "o"), ("ñ", "n"), 25 | ("ò", "o"), ("ó", "o"), ("ô", "o"), ("ö", "u"), ("ú", "u"), ("ü", "y"), ("ý", "y"), ("Ā", "A"), ("ā", "a"), ("ă", "a"), 26 | ("ą", "a"), ("ć", "c"), ("Č", "C"), ("č", "c"), ("ď", "d"), ("Đ", "D"), ("ę", "e"), ("ě", "e"), ("ğ", "g"), ("İ", "I"), 27 | ("О", "O"), ("Ł", "L"), ("ń", "n"), ("ň", "n"), ("Ō", "O"), ("ō", "o"), ("ő", "o"), ("ř", "r"), ("Ś", "S"), ("ś", "s"), 28 | ("Ş", "S"), ("ş", "s"), ("Š", "S"), ("š", "s"), ("ū", "u"), ("ź", "z"), ("Ż", "Z"), ("Ž", "Z"), ("ǐ", "i"), ("ǐ", "i"), 29 | ("ș", "s"), ("ț", "t"), ("ù", "u"), 30 | ] 31 | r_from, r_to = [''.join(x) for x in zip(*replacements)] 32 | replacements = str.maketrans(r_from, r_to) 33 | 34 | def __call__(self, text): 35 | return text.translate(self.replacements) 36 | 37 | lang2model = {'en': {'cleanup_text':cleanup_text_eu()}, 38 | 'it': {'model':'Sandiago21/speecht5_finetuned_voxpopuli_it', 'cleanup_text':cleanup_text_eu()}, 39 | 'es': {'model':'Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish', 'cleanup_text':cleanup_text_eu()}, 40 | 'fr': {'model':'Sandiago21/speecht5_finetuned_facebook_voxpopuli_french', 'cleanup_text':cleanup_text_eu()}, 41 | 'de': {'model':'JFuellem/speecht5_finetuned_voxpopuli_de', 'cleanup_text':cleanup_text_eu()}, 42 | 'pt': {'model':'evertonaleixo/speecht5_finetuned_fleurs_ptbr', 'cleanup_text':cleanup_text_eu()}, 43 | 'ru': {'model':'zaebee/speecht5_tts_common_ru'}, 44 | 'ja': {'model': 'esnya/japanese_speecht5_tts', 'get_processor': get_ja_T5Processor}, 45 | } 46 | 47 | def get_torch_hw(): 48 | if torch.cuda.is_available(): 49 | return 'cuda' 50 | if hasattr(torch, 'xpu') and torch.xpu.is_available(): 51 | return 'xpu' 52 | if hasattr(torch, 'mps'): 53 | return 'mps' 54 | raise 
AttributeError('Could not find CUDA/XPU/MPS devices')
55 | 
56 | class InfernTTSWorker(InfernBatchedWorker):
57 |     max_batch_size: int = 8
58 |     debug = False
59 |     tts_engine: HelloSippyRTPipe
60 |     output_sr: int
61 | 
62 |     def __init__(self, lang, output_sr, device=None):
63 |         from warnings import filterwarnings
64 |         filterwarnings("ignore", category=FutureWarning)
65 |         filterwarnings("ignore", category=UserWarning)
66 |         try:
67 |             import intel_extension_for_pytorch as ipex
68 |         except ModuleNotFoundError:
69 |             ipex = None
70 |         super().__init__()
71 |         if device is None:
72 |             device = get_torch_hw()
73 |         tts_engine = HelloSippyRTPipe(device, output_sr=output_sr, **lang2model[lang])
74 |         if ipex is not None:
75 |             for a in ('model', 'vocoder', 'chunker'):
76 |                 x = getattr(tts_engine, a)
77 |                 try: x = ipex.optimize(x)
78 |                 except AttributeError: continue
79 |                 setattr(tts_engine, a, x)
80 |         self.tts_engine = tts_engine
81 |         self.output_sr = output_sr
82 | 
83 |     def process_batch(self, wis:List[HelloSippyPlayRequest]):
84 |         new_states = [HelloSippyPipeState(self.tts_engine, r) for r in wis]
85 |         state = HelloSippyPipeStateBatched(new_states, self.tts_engine)
86 |         while True:
87 |             try:
88 |                 self.tts_engine.infer(state)
89 |             except RuntimeError as e:
90 |                 self.handle_runtime_error(e, wis, state)
91 |                 raise
92 |             if not self.tts_engine.unbatch_and_dispatch(state): break
93 | 
94 |     def handle_runtime_error(self, e, wis:List[HelloSippyPlayRequest], state):
95 |         print(f'InfernTTSWorker.handle_runtime_error: {e}')
96 |         affected = [(d, w) for d, w in zip(state, wis) if d.dispatch is not None]  # XXX: collected but not acted upon yet
97 | 
98 |     def get_voice(self, *args):
99 |         return self.tts_engine.get_voice(*args)
100 | 
101 |     def get_rand_voice(self):
102 |         return self.tts_engine.get_rand_voice()
103 | 
104 |     def get_rand_voice_id(self):
105 |         return self.tts_engine.get_rand_voice_id()
106 | 
107 | 
-------------------------------------------------------------------------------- /Cluster/LLMSession.py: --------------------------------------------------------------------------------
1 | from typing import List, Tuple, Optional
2 | from time import monotonic
3 | from functools import partial
4 | from uuid import uuid4, UUID
5 | 
6 | class LLMRequest():
7 |     id: UUID
8 |     text: str
9 |     textout_cb: callable
10 |     auto_ctx_add: bool = True
11 |     def __init__(self, text:str, textout_cb:callable):
12 |         self.text, self.textout_cb = text, textout_cb
13 |         self.id = uuid4()
14 | 
15 | class LLMResult():
16 |     req_id: UUID
17 |     text: str
18 |     def __init__(self, text:str, req_id:UUID):
19 |         self.text, self.req_id = text, req_id
20 | 
21 | class LLMInferRequest():
22 |     req: LLMRequest
23 |     context: Tuple[dict]
24 |     textout_cb: callable
25 | 
26 |     def __init__(self, req:LLMRequest, context:List[dict]):
27 |         self.req, self.context = req, tuple(context)
28 | 
29 | class LLMSessionParams():
30 |     system_prompt: str
31 |     def __init__(self, system_prompt:str):
32 |         self.system_prompt = system_prompt
33 | 
34 | class LLMSession():
35 |     id: UUID
36 |     context: List[dict]
37 |     debug: bool = False
38 |     def __init__(self, llm:'InfernLLMWorker', params:LLMSessionParams):
39 |         self.id = uuid4()
40 |         self.context = [{"role": "system", "content": params.system_prompt}]
41 |         self.llm = llm
42 | 
43 |     def context_add(self, content:str, role:str = "user"):
44 |         if self.debug:
45 |             print(f'{monotonic():4.3f}: LLMSession.context_add: {self.context=}, {content=}')
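        # Consecutive turns from the same role are coalesced into a single
        # context entry below; many chat templates reject (or silently
        # mangle) back-to-back messages from one role.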
46 |         if len(self.context) > 0 and self.context[-1]["role"] == role:
47 |             self.context[-1]["content"] += f' {content}'
48 |         else:
49 |             self.context.append({"role": role, "content": content})
50 | 
51 |     def textin(self, req:LLMRequest):
52 |         if self.debug:
53 |             print(f'{monotonic():4.3f}: LLMSession.textin: {req.text=}, {req.textout_cb=} {self.context=}')
54 |         self.context_add(req.text)
55 |         ireq = LLMInferRequest(req, self.context)
56 |         if hasattr(req, '_proc_start_cb'):
57 |             ireq._proc_start_cb = req._proc_start_cb
58 |         ireq.textout_cb = partial(self.textout, req = req)
59 |         self.llm.infer(ireq)
60 | 
61 |     def textout(self, req:LLMRequest, result:LLMResult):
62 |         if self.debug:
63 |             print(f'{monotonic():4.3f}: LLMSession.textout: {result.text=}')
64 |         if req.auto_ctx_add:
65 |             self.context_add(result.text, "assistant")
66 |         req.textout_cb(result = result)
67 | 
68 |     def stop(self):
69 |         if self.debug: print('LLMSession.stop')
70 |         del self.llm
71 | 
-------------------------------------------------------------------------------- /Cluster/RemoteRTPGen.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 | from functools import partial
3 | 
4 | import ray
5 | from ray.exceptions import RayTaskError
6 | 
7 | from Cluster.InfernRTPActor import RTPSessNotFoundErr
8 | from RTP.AudioInput import AudioInput
9 | from RTP.RTPParams import RTPParams
10 | 
11 | class RTPGenError(Exception):
12 |     pass
13 | 
14 | class RemoteRTPGen():
15 |     def __init__(self, rtp_actr, params:RTPParams):
16 |         self.rtp_actr = rtp_actr
17 |         fut = rtp_actr.new_rtp_session.remote(params)
18 |         try: self.sess_id, self.rtp_address = ray.get(fut)
19 |         except RayTaskError as e: raise RTPGenError("new_rtp_session() failed") from e
20 | 
21 |     def connect(self, ain:AudioInput):
22 |         return self.rtp_actr.rtp_session_connect.remote(self.sess_id, ain)
23 | 
24 |     def update(self, params:RTPParams):
25 |         return ray.get(self.rtp_actr.rtp_session_update.remote(self.sess_id, params))
26 | 
27 |     def get_soundout(self) -> callable:
28 |         return partial(self.rtp_actr.rtp_session_soundout.remote, rtp_id=self.sess_id)
29 | 
30 |     def soundout(self, chunk):
31 |         self.rtp_actr.rtp_session_soundout.remote(rtp_id=self.sess_id, chunk=chunk)
32 | 
33 |     def end(self, relaxed:bool=True):
34 |         return self.rtp_actr.rtp_session_end.remote(self.sess_id, relaxed)
35 | 
36 |     def join(self):
37 |         try: ray.get(self.rtp_actr.rtp_session_join.remote(self.sess_id))
38 |         except RTPSessNotFoundErr: pass
39 | 
-------------------------------------------------------------------------------- /Cluster/RemoteTTSSession.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 | from uuid import UUID
3 | 
4 | import ray
5 | from ray.exceptions import RayTaskError
6 | 
7 | from .TTSSession import TTSRequest
8 | 
9 | class TTSSessionError(Exception):
10 |     pass
11 | 
12 | class RemoteTTSSession():
13 |     sess_id: UUID
14 |     def __init__(self, tts_actr):
15 |         super().__init__()
16 |         self.tts_actr = tts_actr
17 |         try: self.sess_id = ray.get(tts_actr.new_tts_session.remote())
18 |         except RayTaskError as e: raise TTSSessionError("new_tts_session() failed") from e
19 | 
20 |     def start(self, soundout:callable):
21 |         return ray.get(self.tts_actr.tts_session_start.remote(self.sess_id, soundout))
22 | 
23 |     def end(self):
24 |         return ray.get(self.tts_actr.tts_session_end.remote(self.sess_id))
25 | 
26 |     def say(self, req:TTSRequest):
27 |         return self.tts_actr.tts_session_say.remote(rgen_id=self.sess_id, req=req)
28 | 
29 |     def stop_saying(self, rsay_id:UUID):
30 |         return 
self.tts_actr.tts_session_stop_saying.remote(rgen_id=self.sess_id, rsay_id=rsay_id) 31 | -------------------------------------------------------------------------------- /Cluster/STTSession.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | from uuid import uuid4, UUID 3 | from fractions import Fraction 4 | from functools import partial 5 | from threading import Lock 6 | from time import monotonic 7 | 8 | from Core.AudioChunk import AudioChunk, VadAudioChunk 9 | 10 | class STTRequest(): 11 | lang: str 12 | chunk: AudioChunk 13 | text_cb: callable 14 | mode: str = 'transcribe' 15 | timestamps: bool = False 16 | stime: float 17 | max_ns_prob: float = 0.5 18 | def __init__(self, chunk:AudioChunk, text_cb:callable, lang:str): 19 | self.stime = monotonic() 20 | self.lang, self.chunk, self.text_cb = lang, chunk, text_cb 21 | 22 | class STTSentinel(): 23 | stime: float 24 | text_cb: callable 25 | def __init__(self, signal:str, text_cb:callable): 26 | self.stime = monotonic() 27 | self.signal, self.text_cb = signal, text_cb 28 | 29 | class STTResult(): 30 | text: str 31 | no_speech_prob: float 32 | duration: Fraction 33 | offsets: Optional[List]=None 34 | inf_time: float 35 | def __init__(self, text:str, no_speech_prob:float, req:STTRequest): 36 | self.text = text 37 | self.no_speech_prob = no_speech_prob 38 | self.duration = Fraction(len(req.chunk.audio), req.chunk.samplerate) 39 | self.inf_time = monotonic() - req.stime 40 | 41 | class STTSession(): 42 | debug = False 43 | id: UUID 44 | lang: str = 'en' 45 | context: List[int] 46 | state_lock: Lock 47 | busy: bool = False 48 | pending: List[STTRequest] 49 | 50 | def __init__(self, stt, keep_context:bool): 51 | super().__init__() 52 | self.id = uuid4() 53 | self.stt = stt 54 | self.state_lock = Lock() 55 | self.context = [] if keep_context else None 56 | self.pending = [] 57 | 58 | def stop(self): 59 | if self.debug: print('STTSession.stop') 60 | with self.state_lock: 61 | del self.stt, self.pending 62 | 63 | def soundin(self, req:Union[STTRequest,STTSentinel]): 64 | if self.debug: 65 | if isinstance(req, STTRequest): 66 | print(f'STTSession.soundin({len(req.chunk.audio)=})') 67 | else: 68 | print(f'STTSession.soundin({req=})') 69 | results = [] 70 | with self.state_lock: 71 | self.pending.append(req) 72 | if self.busy: 73 | return 74 | assert len(self.pending) == 1 75 | self.busy = True 76 | self._process_pending_stt_lckd(results) 77 | for cb, r in results: 78 | cb(result=r) 79 | 80 | def _process_pending_stt_lckd(self, results:List): 81 | while self.pending: 82 | req = self.pending.pop(0) 83 | if isinstance(req, STTRequest): 84 | if isinstance(req.chunk, VadAudioChunk): 85 | nr = next((r for r in self.pending if isinstance(r, STTRequest)), None) 86 | if nr is not None and isinstance(nr.chunk, VadAudioChunk): 87 | ca, cb = req.chunk, nr.chunk 88 | if cb.tpos() + cb.duration() - ca.tpos() < self.stt.max_chunk_duration: 89 | ca.append(cb) 90 | self.pending.remove(nr) 91 | self.pending.insert(0, req) 92 | continue 93 | if req.chunk.samplerate != self.stt.sample_rate: 94 | req.chunk.resample(self.stt.sample_rate) 95 | req.chunk.audio = req.chunk.audio.numpy() 96 | text_cb = partial(self.stt_out, req.text_cb) 97 | self.stt.infer((req, text_cb, self.context)) 98 | break 99 | if all(isinstance(r, STTRequest) for r in self.pending): 100 | results.append((req.text_cb, req)) 101 | else: 102 | self.busy = False 103 | 104 | def stt_out(self, text_cb, result:STTResult): 105 | 
results = [(text_cb, result)] 106 | with self.state_lock: 107 | if not hasattr(self, 'stt'): 108 | return 109 | if self.debug: print(f'STTSession.stt_out({result.text=})') 110 | assert self.busy 111 | self._process_pending_stt_lckd(results) 112 | for cb, r in results: 113 | cb(result=r) 114 | -------------------------------------------------------------------------------- /Cluster/TTSSession.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
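#
# Usage sketch (illustrative only -- assumes an InfernTTSWorker instance
# `tts_wrk` and its owning Ray actor handle `tts_actr` already exist):
#
#   sess = TTSSession(tts_wrk, tts_actr)
#   sess.start(soundout=rtp_soundout)   # rtp_soundout: hypothetical audio sink
#   say_id = sess.say(TTSRequest(["Hello.", "How can I help you today?"]))
#   ...
#   sess.stop_saying(say_id)            # barge-in: cancel mid-utterance
#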
25 | 
26 | from typing import Optional, Union, List, Tuple, Dict
27 | from time import monotonic
28 | from uuid import uuid4, UUID
29 | from queue import Queue
30 | 
31 | import ray
32 | 
33 | from Core.AStreamMarkers import ASMarkerNewSent, ASMarkerGeneric, \
34 |     ASMarkerSentDoneCB
35 | 
36 | from functools import partial
37 | from HelloSippyTTSRT.HelloSippyRTPipe import HelloSippyPlayRequest
38 | from Core.AudioChunk import AudioChunk
39 | from Cluster.InfernTTSWorker import InfernTTSWorker
40 | 
41 | class TTSRequest():
42 |     text: Union[str,List[str],Tuple[str]]
43 |     speaker_id: Optional[int]
44 |     done_cb: Optional[callable]
45 |     def __init__(self, text:Union[str,List[str],Tuple[str]], speaker_id:Optional[int]=None, done_cb:Optional[callable]=None):
46 |         self.text = text
47 |         self.speaker_id = speaker_id
48 |         self.done_cb = done_cb
49 | 
50 | class TTSSndDispatch():
51 |     id: UUID
52 |     debug: bool = False
53 |     cancelled: bool = False
54 |     done_cb: Optional[callable] = None
55 |     cleanup_cb: Optional[callable] = None
56 |     soundout: callable
57 |     output_sr: int
58 |     def __init__(self, soundout:callable, output_sr:int, done_cb:Optional[callable]):
59 |         self.id = uuid4()
60 |         self.soundout, self.output_sr, self.done_cb = soundout, output_sr, done_cb
61 | 
62 |     def cancel(self):
63 |         self.cancelled = True
64 |         chunk = ASMarkerNewSent() if self.done_cb is None \
65 |             else ASMarkerSentDoneCB(self.done_cb, sync=True)
66 |         self.soundout(chunk=chunk)
67 |         if self.cleanup_cb is not None:
68 |             self.cleanup_cb()
69 | 
70 |     def sound_dispatch(self, chunk):
71 |         if self.cancelled:
72 |             return
73 |         do_cleanup = False
74 |         if chunk is None:
75 |             if self.debug:
76 |                 print(f'{monotonic():4.3f}: TTSSndDispatch.sound_dispatch {self.done_cb=}')
77 |             chunk = ASMarkerNewSent() if self.done_cb is None \
78 |                 else ASMarkerSentDoneCB(self.done_cb, sync=True)
79 |             do_cleanup = True
80 |         elif not isinstance(chunk, ASMarkerGeneric):
81 |             assert chunk.size(0) > 0
82 |             chunk=AudioChunk(chunk, self.output_sr)
83 |         self.soundout(chunk=chunk)
84 |         if do_cleanup and self.cleanup_cb is not None:
85 |             self.cleanup_cb()
86 | 
87 | class TTSSession():
88 |     debug = True
89 |     id: UUID
90 |     tts: InfernTTSWorker
91 |     tts_actr: ray.remote
92 |     soundout: callable
93 |     active_req: Dict[UUID, TTSSndDispatch]
94 | 
95 |     def __init__(self, tts:InfernTTSWorker, tts_actr:ray.remote):
96 |         super().__init__()
97 |         self.id = uuid4()
98 |         self.tts, self.tts_actr = tts, tts_actr
99 |         self.active_req = {}
100 | 
101 |     def start(self, soundout:callable):
102 |         self.soundout = soundout
103 | 
104 |     def say(self, req:TTSRequest) -> UUID:
105 |         if self.debug:
106 |             print(f'{monotonic():4.3f}: TTSSession.say: {req.text=}, {req.speaker_id=}, {req.done_cb=}')
107 |         if req.speaker_id is not None:
108 |             speaker = self.tts.get_voice(req.speaker_id)
109 |         else:
110 |             speaker, req.speaker_id = self.tts.get_rand_voice()
111 |         req.text = [req.text] if isinstance(req.text, str) else list(req.text)  # mutable list: pop() below breaks on tuples
112 |         text, done_cb = req.text[0], req.done_cb
113 |         if len(req.text) > 1:
114 |             req.text.pop(0)
115 |             done_cb = partial(self.tts_actr.tts_session_say.remote, rgen_id=self.id, req=req)
116 |         trd = TTSSndDispatch(self.soundout, self.tts.output_sr, done_cb)
117 |         def cleanup_cb():
118 |             if self.debug:
119 |                 print(f'{monotonic():4.3f}: TTSSession.cleanup_cb')
120 |             del self.active_req[trd.id]
121 |         trd.cleanup_cb = cleanup_cb
122 |         preq = HelloSippyPlayRequest(self.id, text, speaker, trd.sound_dispatch)
123 |         self.active_req[trd.id] = trd
124 |         self.tts.infer(preq)
125 |         return trd.id
126 | 
127 | 
def stop_saying(self, rsay_id:UUID): 128 | if self.debug: 129 | print(f'{monotonic():4.3f}: TTSSession.stop_saying: {rsay_id=}') 130 | trd = self.active_req.get(rsay_id) 131 | if trd is None: 132 | return False 133 | trd.cancel() 134 | return True 135 | 136 | def stop(self): 137 | pass 138 | 139 | def __del__(self): 140 | if self.debug: 141 | print('TTSSession.__del__') 142 | -------------------------------------------------------------------------------- /Core/AStreamMarkers.py: -------------------------------------------------------------------------------- 1 | from time import monotonic 2 | 3 | import ray 4 | 5 | from Core.Exceptions.InfernSessNotFoundErr import InfernSessNotFoundErr 6 | 7 | class ASMarkerGeneric(): 8 | track_id: int 9 | debug: bool = False 10 | def __init__(self, track_id:int=0): 11 | self.track_id = track_id 12 | 13 | class ASMarkerNewSent(ASMarkerGeneric): 14 | # This runs in the context of the RTPOutputWorker thread 15 | def on_proc(self, tro_self, *args): pass 16 | 17 | class ASMarkerSentDoneCB(ASMarkerNewSent): 18 | debug = False 19 | def __init__(self, done_cb:callable, sync:bool=False, **kwargs): 20 | super().__init__(**kwargs) 21 | self.done_cb = done_cb 22 | self.sync = sync 23 | 24 | def on_proc(self, tro_self): 25 | if self.debug: 26 | print(f'{monotonic():4.3f}: ASMarkerSentDoneCB.on_proc') 27 | x = self.done_cb() 28 | if self.sync: 29 | try: ray.get(x) 30 | except InfernSessNotFoundErr: pass 31 | -------------------------------------------------------------------------------- /Core/AudioChunk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import requests 3 | import soundfile as sf 4 | from io import BytesIO 5 | 6 | from config.InfernGlobals import InfernGlobals as IG 7 | 8 | class AudioChunk(): 9 | debug: bool = False 10 | samplerate: int 11 | audio:torch.Tensor 12 | track_id: int = 0 13 | active: bool = True 14 | def __init__(self, audio:torch.Tensor, samplerate:int): 15 | assert isinstance(audio, torch.Tensor) 16 | self.audio = audio 17 | self.samplerate = samplerate 18 | 19 | def resample(self, sample_rate:int): 20 | assert sample_rate != self.samplerate 21 | audio = self.audio.to(torch.float) 22 | audio = IG.get_resampler(self.samplerate, sample_rate, audio.device)(audio).to(self.audio.dtype) 23 | self.samplerate, self.audio = sample_rate, audio 24 | return self 25 | 26 | def duration(self): 27 | return self.audio.size(0) / self.samplerate 28 | 29 | class VadAudioChunk(AudioChunk): 30 | debug: bool = False 31 | ipos: int 32 | def __init__(self, audio:torch.Tensor, samplerate:int, ipos:int): 33 | super().__init__(audio, samplerate) 34 | self.ipos = ipos 35 | 36 | def tpos(self): 37 | return self.ipos / self.samplerate 38 | 39 | def append(self, other:'VadAudioChunk'): 40 | assert self.samplerate == other.samplerate 41 | if self.debug: 42 | print(f'VadAudioChunk.append: {self.ipos=} {self.audio.size(0)=} {other.ipos=} {other.audio.size(0)=}') 43 | sdiff = other.ipos - (self.ipos + self.audio.size(0)) 44 | assert sdiff >= 0 45 | if sdiff > 0: 46 | self.audio = torch.cat((self.audio, torch.zeros(sdiff, dtype=self.audio.dtype, device=self.audio.device)), dim=0) 47 | self.audio = torch.cat((self.audio, other.audio), dim=0) 48 | 49 | class AudioChunkFromURL(AudioChunk): 50 | def __init__(self, url:str, samplerate=8000, dtype=torch.float16, **kwargs): 51 | response = requests.get(url) 52 | sound_bytes = BytesIO(response.content) 53 | audio, samplerate_in = sf.read(sound_bytes) 54 | audio = 
torch.from_numpy(audio).to(dtype) 55 | super().__init__(audio, samplerate_in, **kwargs) 56 | if samplerate_in != samplerate: 57 | self.resample(samplerate) 58 | -------------------------------------------------------------------------------- /Core/Codecs/G711.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import audioop 3 | 4 | from Core.AudioChunk import AudioChunk 5 | from .GenCodec import GenCodec 6 | 7 | _pcm_to_ulaw_ct = torch.zeros(65536, dtype=torch.uint8) 8 | for i in range(-32768, 32768): 9 | pcm_data = i.to_bytes(2, 'little', signed=True) 10 | ulaw_data = audioop.lin2ulaw(pcm_data, 2) 11 | ulaw_value = ulaw_data[0] # Get the byte value from bytes 12 | _pcm_to_ulaw_ct[i + 32768] = ulaw_value # Shift index to make it non-negative 13 | _ulaw_to_pcm_ct = torch.zeros(256, dtype=torch.int16) 14 | for i in range(256): 15 | # Convert each µ-law value back to PCM value 16 | ulaw_byte = i.to_bytes(1, 'little') 17 | pcm_data = audioop.ulaw2lin(ulaw_byte, 2) # Convert µ-law byte to linear PCM 18 | pcm_value = int.from_bytes(pcm_data, 'little', signed=True) 19 | _ulaw_to_pcm_ct[i] = pcm_value 20 | 21 | class G711Codec(GenCodec): 22 | ptype = 0 # G.711u 23 | ename = 'PCMU' 24 | 25 | def encode(self, audio_tensor:torch.Tensor): 26 | # Scale from [-1, 1] to [-32768, 32767] 27 | audio_scaled = torch.clamp(audio_tensor * 32767.0, -32768, 32767).to(torch.int16) 28 | 29 | # Shift and look up in the conversion table 30 | audio_ulaw = _pcm_to_ulaw_ct[(audio_scaled + 32768).long()] 31 | 32 | return audio_ulaw.cpu().numpy().tobytes() 33 | 34 | def decode(self, ulaw_bytes:bytes, resample:bool=True, sample_rate:int=GenCodec.srate): 35 | # Convert byte string to a tensor of uint8 36 | ulaw_tensor = torch.tensor(list(ulaw_bytes), dtype=torch.uint8) 37 | 38 | # Use ulaw_to_pcm table to convert each µ-law value to PCM value 39 | audio_pcm = _ulaw_to_pcm_ct[ulaw_tensor.long()] 40 | 41 | # Scale from [-32768, 32767] to [-1, 1] 42 | audio_float = audio_pcm.float() / 32767.0 43 | 44 | chunk = AudioChunk(audio_float, self.srate) 45 | if resample and sample_rate != self.srate: 46 | chunk.resample(sample_rate) 47 | return chunk 48 | 49 | def device(self): 50 | global _pcm_to_ulaw_ct, _ulaw_to_pcm_ct 51 | assert _pcm_to_ulaw_ct.device == _ulaw_to_pcm_ct.device 52 | return _pcm_to_ulaw_ct.device 53 | 54 | def to(self, device): 55 | global _pcm_to_ulaw_ct, _ulaw_to_pcm_ct 56 | assert _pcm_to_ulaw_ct.device == _ulaw_to_pcm_ct.device 57 | _pcm_to_ulaw_ct = _pcm_to_ulaw_ct.to(device) 58 | _ulaw_to_pcm_ct = _ulaw_to_pcm_ct.to(device) 59 | return self 60 | 61 | def e2d_frames(self, enframes:int, out_srate:int=GenCodec.srate): 62 | assert out_srate % self.srate == 0 63 | return enframes * out_srate // self.srate 64 | 65 | def d2e_frames(self, dnframes:int, in_srate:int=GenCodec.srate): 66 | assert in_srate % self.srate == 0 67 | return dnframes * self.srate // in_srate 68 | 69 | def silence(self, nframes:int): 70 | return b'\xff' * nframes 71 | -------------------------------------------------------------------------------- /Core/Codecs/G722.py: -------------------------------------------------------------------------------- 1 | from G722 import G722 2 | import torch 3 | import numpy as np 4 | 5 | from Core.AudioChunk import AudioChunk 6 | from .GenCodec import GenCodec 7 | 8 | class G722Codec(GenCodec): 9 | codec:G722 10 | srate:int = 8000 11 | default_br:int = 64000 12 | ptype:int = 9 # G.722 13 | ename:str = 'G722' # encoding name 14 | _device:str = 'cpu' 15 | 16 
| def __init__(self): 17 | super().__init__() 18 | self.codec = G722(self.srate, self.default_br) 19 | 20 | def encode(self, audio_tensor:torch.Tensor): 21 | # Scale from [-1, 1] to [-32768, 32767] 22 | audio_scaled = torch.clamp(audio_tensor * 32767.0, -32768, 32767).to(torch.int16).numpy() 23 | 24 | # Shift and look up in the conversion table 25 | audio_enc = self.codec.encode(audio_scaled) 26 | 27 | return audio_enc 28 | 29 | def decode(self, audio_enc:bytes, resample:bool=True, sample_rate:int=srate): 30 | # Use ulaw_to_pcm table to convert each µ-law value to PCM value 31 | audio_pcm = torch.tensor(self.codec.decode(audio_enc)).to(self._device) 32 | 33 | # Scale from [-32768, 32767] to [-1, 1] 34 | audio_float = audio_pcm.float() / 32767.0 35 | 36 | chunk = AudioChunk(audio_float, self.srate) 37 | if resample and sample_rate != self.srate: 38 | chunk.resample(sample_rate) 39 | return chunk 40 | 41 | def device(self): return self._device 42 | 43 | def to(self, device): 44 | self._device = device 45 | return self 46 | 47 | def silence(self, nframes:int): 48 | return self.encode(torch.zeros(self.e2d_frames(nframes), dtype=torch.int16)) 49 | 50 | def e2d_frames(self, enframes:int, out_srate:int=srate): 51 | #assert out_srate % self.srate == 0 52 | return enframes * (1 if self.srate == 8000 else 2) * out_srate // self.srate 53 | 54 | def d2e_frames(self, dnframes:int, in_srate:int=srate): 55 | #assert in_srate % self.srate == 0 56 | return dnframes * self.srate // ((1 if self.srate == 8000 else 2) * in_srate) 57 | -------------------------------------------------------------------------------- /Core/Codecs/GenCodec.py: -------------------------------------------------------------------------------- 1 | class GenCodec(): 2 | srate:int = 8000 # sample rate 3 | crate:int = 8000 # clock rate 4 | ptype:int # payload type 5 | ename:str # encoding name 6 | 7 | def __init__(self): 8 | assert self.ptype is not None and self.ename is not None 9 | 10 | @classmethod 11 | def rtpmap(cls): 12 | assert all(hasattr(cls, attr) for attr in ('ptype', 'ename')) 13 | return f'rtpmap:{cls.ptype} {cls.ename}/{cls.crate}' 14 | -------------------------------------------------------------------------------- /Core/ConfigValidators.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from cerberus import Validator 3 | 4 | class InfernConfigParseErr(Exception): pass 5 | 6 | def validate_yaml(schema, filename): 7 | try: 8 | with open(filename, 'r') as file: 9 | data = yaml.safe_load(file) 10 | 11 | v = Validator(schema) 12 | if not v.validate(data): 13 | raise InfernConfigParseErr(f"Validation errors in {filename}: {v.errors}") 14 | 15 | except yaml.YAMLError as exc: 16 | raise InfernConfigParseErr(f"Error parsing YAML file {filename}: {exc}") from exc 17 | return data 18 | 19 | def validate_port_range(field, value, error): 20 | if ':' in value: 21 | _, port = value.split(':', 1) 22 | if not (1 <= int(port) <= 65535): 23 | error(field, 'Port number must be in the range 1-65535') 24 | -------------------------------------------------------------------------------- /Core/Exceptions/InfernSessNotFoundErr.py: -------------------------------------------------------------------------------- 1 | class InfernSessNotFoundErr(Exception): pass -------------------------------------------------------------------------------- /Core/InfernConfig.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Union 2 | from 
functools import partial 3 | 4 | from Cluster.InfernSIPActor import InfernSIPActor 5 | from SIP.InfernSIPConf import InfernSIPConf 6 | from SIP.InfernSIPProfile import InfernSIPProfile 7 | from RTP.InfernRTPConf import InfernRTPConf 8 | 9 | from .ConfigValidators import validate_yaml 10 | 11 | # Define the schema 12 | schema = { 13 | 'sip': { 14 | 'type': 'dict', 15 | 'schema': { 16 | **InfernSIPConf.schema, 17 | **InfernSIPProfile.schema, 18 | } 19 | }, 20 | 'rtp': { 21 | 'type': 'dict', 22 | 'schema': { 23 | **InfernRTPConf.schema, 24 | } 25 | }, 26 | 'apps': { 27 | 'type': 'dict', 28 | 'schema': { 29 | # Filled by modules 30 | } 31 | } 32 | } 33 | 34 | class InfernConfig(): 35 | sip_actr: Optional[InfernSIPActor] 36 | sip_conf: Optional[InfernSIPConf] 37 | rtp_conf: Optional[InfernRTPConf] 38 | connectors: Dict[str, InfernSIPProfile] 39 | apps: Dict[str, Union['LTProfile', 'AIAProfile']] 40 | def __init__(self, filename: str): 41 | from Apps.LiveTranslator.LTProfile import LTProfile 42 | from Apps.LiveTranslator.LTAppConfig import LTAppConfig 43 | schema['apps']['schema'].update(LTAppConfig.schema) 44 | from Apps.AIAttendant.AIAProfile import AIAProfile 45 | from Apps.AIAttendant.AIAAppConfig import AIAAppConfig 46 | schema['apps']['schema'].update(AIAAppConfig.schema) 47 | d = validate_yaml(schema, filename) 48 | self.sip_conf = InfernSIPConf(d['sip'].get('settings', None)) if 'sip' in d else None 49 | self.rtp_conf = InfernRTPConf(d['rtp'].get('settings', None)) if 'rtp' in d else None 50 | try: 51 | self.connectors = dict((f'sip/{name}', InfernSIPProfile(name, conf)) 52 | for name, conf in d['sip']['profiles'].items()) 53 | except KeyError: 54 | self.connectors = {} 55 | precache = 'live_translator_precache' in d['apps'] and d['apps']['live_translator_precache'] 56 | _LTProfile = partial(LTProfile, precache=precache) 57 | self.apps = {} 58 | for aname, AProf in (('live_translator', _LTProfile), ('ai_attendant', AIAProfile)): 59 | if aname not in d['apps']: continue 60 | app_confs = dict((f'apps/{aname}/{name}', AProf(name, conf)) 61 | for name, conf in d['apps'][aname]['profiles'].items()) 62 | self.apps.update(app_confs) 63 | for app in self.apps.values(): 64 | app.finalize(self) 65 | if 'sip' in d: 66 | self.sip_actr = InfernSIPActor.options(max_concurrency=2).remote() 67 | for conn in self.connectors.values(): 68 | conn.finalize(self.sip_actr, self) 69 | else: 70 | self.sip_actr = None 71 | -------------------------------------------------------------------------------- /Core/InfernWrkThread.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | 26 | from threading import Thread, Lock 27 | 28 | RTPWrkTInit = 0 29 | RTPWrkTRun = 1 30 | RTPWrkTStop = 2 31 | 32 | class InfernWrkThread(Thread): 33 | state_lock: Lock = None 34 | state: int = RTPWrkTInit 35 | 36 | def __init__(self): 37 | self.state_lock = Lock() 38 | super().__init__() 39 | self.setDaemon(True) 40 | 41 | def start(self): 42 | super().start() 43 | 44 | def get_state(self, locked=False): 45 | if not locked: self.state_lock.acquire() 46 | state = self.state 47 | if not locked: self.state_lock.release() 48 | return state 49 | 50 | def _set_state(self, newstate, expected_state = None, raise_on_error = True): 51 | self.state_lock.acquire() 52 | pstate = self.state 53 | if expected_state is not None and self.state != expected_state: 54 | self.state_lock.release() 55 | if raise_on_error: 56 | raise AssertionError(f'Unexpected state: {self.state}, {expected_state} expected') 57 | return pstate 58 | self.state = newstate 59 | self.state_lock.release() 60 | return pstate 61 | 62 | def thread_started(self): 63 | self._set_state(RTPWrkTRun, expected_state = RTPWrkTInit) 64 | 65 | def stop(self): 66 | pstate = self._set_state(RTPWrkTStop, expected_state = RTPWrkTRun, raise_on_error = True) 67 | if pstate == RTPWrkTRun: 68 | self.join() 69 | self._set_state(RTPWrkTInit, expected_state = RTPWrkTStop) 70 | -------------------------------------------------------------------------------- /Core/OutputMuxer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Dict, List 2 | from time import monotonic 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from .AudioChunk import AudioChunk 8 | from .AStreamMarkers import ASMarkerGeneric, ASMarkerNewSent 9 | 10 | class OutputMuxer(): 11 | debug = False 12 | output_sr:int 13 | qsize:int 14 | device:str 15 | chunks_in: List[Union[AudioChunk, ASMarkerGeneric]] 16 | def __init__(self, output_sr:int, qsize:int, device:str): 17 | self.output_sr = output_sr 18 | self.qsize = qsize 19 | self.device = device 20 | self.chunks_in = [] 21 | 22 | def chunk_in(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 23 | if isinstance(chunk, AudioChunk): 24 | if chunk.samplerate != self.output_sr: 25 | chunk = chunk.resample(self.output_sr) 26 | if len(self.chunks_in) > 0 and isinstance(self.chunks_in[-1], AudioChunk): 27 | chunk.audio = torch.cat((self.chunks_in.pop().audio, chunk.audio), dim=0) 28 | self.chunks_in.append(chunk) 29 | 30 | def idle(self, rtp_worker): 31 | chunk_o = torch.empty(0).to(self.device) 32 | if len(self.chunks_in) == 1 and isinstance(self.chunks_in[0], AudioChunk) and \ 33 | self.chunks_in[0].audio.size(0) < self.qsize: 34 | return None 35 | while len(self.chunks_in) > 0 and (rsize:=self.qsize-chunk_o.size(0)) > 0: 36 | chunk = self.chunks_in[0] 37 | if isinstance(chunk, ASMarkerNewSent): 38 | #self.update_frm_ctrs(prcsd_inc=pos.get_buf_nframes()) 39 | if chunk_o.size(0) > 0: 40 | return chunk_o 41 | if self.debug: 
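                # note: markers are consumed only at chunk boundaries -- any
                # audio already accumulated in chunk_o is returned first, so
                # the marker's on_proc() below fires in true playback order.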
42 | print(f'{monotonic():4.3f}: ASMarkerNewSent {chunk.on_proc=}') 43 | self.chunks_in.pop(0) 44 | chunk.on_proc(rtp_worker) 45 | continue 46 | chunk_o = torch.cat((chunk_o, chunk.audio[:rsize]), dim=0) 47 | if chunk.audio.size(0) > rsize: 48 | chunk.audio = chunk.audio[rsize:] 49 | else: 50 | self.chunks_in.pop(0) 51 | if chunk_o.size(0) > 0 and chunk_o.size(0) < self.qsize: 52 | print(f'{monotonic():4.3f}: Reinserting {chunk_o.size()=}') 53 | self.chunks_in.insert(0, AudioChunk(chunk_o, self.output_sr)) 54 | return None 55 | 56 | return chunk_o if chunk_o.size(0) > 0 else None 57 | 58 | class OutputMTMuxer(): 59 | tracks:Dict[int, OutputMuxer] 60 | output_sr:int 61 | qsize:int 62 | device:str 63 | def __init__(self, output_sr:int, qsize:int, device:str): 64 | self.tracks = {} 65 | self.output_sr = output_sr 66 | self.qsize = qsize 67 | self.device = device 68 | 69 | def chunk_in(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 70 | if chunk.track_id not in self.tracks: 71 | self.tracks[chunk.track_id] = OutputMuxer(self.output_sr, self.qsize, self.device) 72 | self.tracks[chunk.track_id].chunk_in(chunk) 73 | 74 | def idle(self, rtp_worker): 75 | chunks = [chunk for chunk in [track.idle(rtp_worker) for track in self.tracks.values()] if chunk is not None] 76 | if len(chunks) == 0: return None 77 | if len(chunks) == 1: return chunks[0] 78 | max_len = max([chunk.size(0) for chunk in chunks]) 79 | chunks = [F.pad(chunk, (0, max_len-chunk.size(0)), "constant", 0) if chunk.size(0) < max_len else chunk 80 | for chunk in chunks] 81 | merged = torch.sum(torch.stack(chunks), dim=0) / len(self.tracks) 82 | #max_val = torch.max(torch.abs(merged)) 83 | #if max_val > 1: 84 | # merged /= max_val 85 | return merged 86 | -------------------------------------------------------------------------------- /Core/T2T/NumbersToWords.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import re 3 | import inflect 4 | 5 | from config.InfernGlobals import InfernGlobals as IG 6 | 7 | class NumbersToWords: 8 | tr:Optional[callable] 9 | cache:dict 10 | def __init__(self, lang='en'): 11 | self.p = inflect.engine() 12 | self.tr, self.cache = (None, None) if lang == 'en' else (IG.get_translator('en', lang).translate, {}) 13 | 14 | def __call__(self, text): 15 | # Find all instances of numbers in the text 16 | numbers = re.findall(r'\b\d[\d.,]*%?(?=[\s.,!]|$)', text) 17 | 18 | # For each number found, replace it with its word equivalent 19 | for number in numbers: 20 | if number.endswith('%'): 21 | tr_number = number[:-1] 22 | suffix = ' percent' 23 | elif number[-1] in ('.', ',', '!'): 24 | tr_number = number[:-1] 25 | suffix = number[-1] 26 | else: 27 | suffix = '' 28 | tr_number = number 29 | word = self.p.number_to_words(tr_number) + suffix 30 | if self.tr is not None: 31 | if (word_tr:=self.cache.get(number, None)) is None: 32 | self.cache[number] = word_tr = self.tr(word) 33 | word = word_tr 34 | text = text.replace(number, word, 1) 35 | return text 36 | 37 | if __name__ == '__main__': 38 | n2w = NumbersToWords() 39 | print(n2w('I have 3 cats and 2 dogs.')) 40 | print(n2w('I have 3% cats and 2% dogs.')) 41 | print(n2w('I have 30000 cats and 2999 dogs.')) 42 | print(n2w('I have 50% cats and 29.0% dogs.')) 43 | print(n2w('I have 3,090.6 cats and 21,188,128 dogs.%,')) 44 | print(n2w('I have 3% cats and dogs 2%.')) 45 | print(n2w('I have 3% cats and dogs 20%, and mice 3.0%.')) 46 | print(n2w('I have 3% cats and dogs since 2024, or 2023.')) 47 | 
-------------------------------------------------------------------------------- /Core/T2T/Translator.py: --------------------------------------------------------------------------------
1 | from typing import Tuple, Optional
2 | from functools import partial
3 | 
4 | import argostranslate.package
5 | from argostranslate.translate import get_installed_languages
6 | 
7 | def load_pair(from_code, to_code):
8 |     print(f'load_pair({from_code=}, {to_code=})')
9 |     argostranslate.package.update_package_index()
10 |     available_packages = argostranslate.package.get_available_packages()
11 |     package_to_install = next(
12 |         filter(
13 |             lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
14 |         )
15 |     )
16 |     print(f'{package_to_install=}')
17 |     argostranslate.package.install_from_path(package_to_install.download())
18 | 
19 | class Translator():
20 |     supported_langs = ["en", "it", "de", "ru", "ja"]
21 |     translators: Tuple[callable]
22 |     def __init__(self, from_code: str, to_code: str, filter:Optional[callable]=None):
23 |         to_code_p = [to_code,]
24 |         inter_codes = [x for x in self.supported_langs if x not in (from_code, to_code)]
25 |         success = False
26 |         while not success:
27 |             try: load_pair(from_code, to_code)
28 |             except StopIteration: pass
29 |             else:
30 |                 success = True
31 |                 break
32 |             while len(inter_codes) > 0:
33 |                 inter_code = inter_codes.pop()
34 |                 try:
35 |                     load_pair(from_code, inter_code)
36 |                     load_pair(inter_code, to_code)
37 |                 except StopIteration:
38 |                     if len(inter_codes) == 0: raise
39 |                     continue
40 |                 to_code_p.insert(0, inter_code)  # chain via the pivot language
41 |                 success = True
42 |                 break
43 |         ilangs = dict((x.code, x) for x in get_installed_languages())
44 |         from_lang = ilangs[from_code]
45 |         translators = []
46 |         for tc in to_code_p:
47 |             to_lang = ilangs[tc]
48 |             tr = from_lang.get_translation(to_lang).translate
49 |             if filter is not None: tr = partial(filter, from_code=from_code, to_code=tc, tr=tr)
50 |             translators.append(tr)
51 |             from_lang, from_code = to_lang, tc
52 |         self.translators = tuple(translators)
53 | 
54 |     def translate(self, sourceText):
55 |         for translator in self.translators:
56 |             sourceText = translatedText = translator(sourceText)
57 |         return translatedText
58 | 
59 | if __name__ == '__main__':
60 |     tr = Translator('en', 'ja')
61 |     t0 = tr.translate('Hello world!')
62 |     tr = Translator('ru', 'it')
63 |     tr1 = Translator('it', 'de')
64 |     #print(tr.to_code_p, tr1.to_code_p)
65 |     sourceText = "Привет, как твои дела?" 
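    # 'ru' -> 'it' often has no direct Argos package, in which case the
    # Translator above chains through a pivot language (e.g. ru -> en -> it);
    # the pivot actually used depends on which packages are installable.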
66 |     t1 = tr.translate(sourceText)
67 |     t2 = tr1.translate(t1)
68 |     print(t0, t1, t2)
69 | 
70 | 
-------------------------------------------------------------------------------- /Core/VAD/SileroVAD.py: --------------------------------------------------------------------------------
1 | #try: import intel_extension_for_pytorch as ipex
2 | #except ModuleNotFoundError: ipex = None
3 | 
4 | from typing import Tuple, List, Optional
5 | 
6 | import torch
7 | 
8 | from Cluster.InfernBatchedWorker import InfernBatchedWorker
9 | from Core.AudioChunk import AudioChunk, VadAudioChunk
10 | from Core.VAD.SileroVADUtils import VADIteratorB, VADChannelState, VADBatchFromList
11 | 
12 | class VADChannel():
13 |     audio_chunk_in: callable
14 |     vad_chunk_in: callable
15 |     decode: callable
16 |     vad_buffer: bytes = b''
17 |     state: VADChannelState
18 |     active_start: Optional[int] = None
19 |     active_buffer: torch.Tensor
20 |     def __init__(self, audio_chunk_in:callable, vad_chunk_in: callable, decode: callable, device:str):
21 |         self.audio_chunk_in = audio_chunk_in
22 |         self.vad_chunk_in = vad_chunk_in
23 |         self.decode = decode
24 |         self.state = VADChannelState(device)
25 |         self.active_buffer = torch.zeros(0).to('cpu')
26 | 
27 |     def ingest(self, svad:'SileroVADWorker', data: bytes, codec):
28 |         self.vad_buffer += data
29 |         if codec.e2d_frames(len(self.vad_buffer), svad.input_sr) < svad.window_size_samples:
30 |             return None
31 |         decode_samples = codec.d2e_frames(svad.window_size_samples, svad.input_sr)
32 |         chunk = codec.decode(self.vad_buffer[:decode_samples], sample_rate=svad.input_sr)
33 |         assert chunk.audio.size(0) == svad.window_size_samples, f'{chunk.audio.size(0)=} {svad.window_size_samples=}'
34 |         self.vad_buffer = self.vad_buffer[decode_samples:]
35 |         svad.infer((self, chunk))
36 |         #self.vad_chunk_in(chunk, True)
37 | 
38 | class SileroVADWorker(InfernBatchedWorker):
39 |     max_batch_size: int = 200
40 |     input_sr: int
41 |     max_vad_frames: int
42 |     def __init__(self, device, input_sr: int = 8000):
43 |         super().__init__()
44 |         model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad:v3.1',
45 |                                       model='silero_vad', force_reload=True, trust_repo=True)
46 |         self.device = device
47 |         self.model = model = model.eval().to(device)
48 |         #for n, t in [(_t, getattr(_m, _t, None))
49 |         #             for _m in (model._model_8k.decoder.rnn, model._model.decoder.rnn)
50 |         #             for _t in dir(_m)
51 |         #             if _t != 'graph']:
52 |         #    if not isinstance(t, torch.Tensor): continue
53 |         #    print(f'{n=} {t.is_contiguous()=}')
54 | 
55 |         self.vad_iterator = VADIteratorB(model, sampling_rate=input_sr)
56 |         self.window_size_samples = 768 # number of samples in a single audio chunk
57 |         self.input_sr = input_sr
58 |         self.max_vad_frames = input_sr * 30 # 30 seconds for Whisper
59 | 
60 |     @torch.no_grad()
61 |     def process_batch(self, wis:List[Tuple[VADChannel, torch.Tensor]]):
62 |         #from time import sleep
63 |         #sleep(0.5)
64 |         #print(f'InfernSTTWorker.process_batch: got {len(wis)=}')
65 |         while len(wis) > 0:
66 |             nbatch = []
67 |             cbatch: List[VADChannel] = []
68 |             pbatch: List[AudioChunk] = []
69 |             sbatch: List[VADChannelState] = []
70 |             for wi in wis:
71 |                 if (ch:=wi[0]) not in cbatch:
72 |                     cbatch.append(ch)
73 |                     pbatch.append(wi[1])
74 |                     sbatch.append(ch.state)
75 |                 else:
76 |                     nbatch.append(wi)
77 |             wis = nbatch
78 |             bstate = VADBatchFromList(sbatch)
79 |             chunks = torch.stack([p.audio for p in pbatch], dim=0).to(self.device)
80 |             self.vad_iterator(chunks, bstate=bstate, return_seconds=False)
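            # Per-channel post-processing; the dedup pass above feeds at most
            # one chunk per channel into the model per iteration, so each
            # channel's recurrent VAD state advances strictly in arrival order.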
81 |             for i, (vc, p) in enumerate(zip(cbatch, pbatch)):
82 |                 sd = vc.state
83 |                 if sd.speech: print(f'speech_dict[{i}]={sd.speech} {sd.current_sample=}', end=' ')
84 |                 vc.active_buffer = torch.cat((vc.active_buffer, p.audio.cpu()))
85 |                 if sd.speech and 'start' in sd.speech:
86 |                     assert vc.active_start is None, f'{vc.active_start=}'
87 |                     vc.active_start = sd.speech['start']
88 |                     poff = sd.current_sample - vc.active_start
89 |                     assert poff > 0 and poff < vc.active_buffer.size(0), f'{poff=} {vc.active_buffer.size(0)=} {sd.current_sample=} {vc.active_start=}'
90 |                     vc.active_buffer = vc.active_buffer[-poff:]
91 |                 elif sd.speech and 'end' in sd.speech:
92 |                     active_end = sd.speech["end"]
93 |                     assert vc.active_start is not None and active_end > vc.active_start, f'{vc.active_start=} {sd.temp_end=} {active_end=}'
94 |                     assert sd.current_sample > active_end, f'{sd.current_sample=} {active_end=}'
95 |                     poff = sd.current_sample - active_end
96 |                     assert poff > 0 and poff < vc.active_buffer.size(0), f'{poff=} {vc.active_buffer.size(0)=} {sd.current_sample=} {active_end=}'
97 |                     obuf = vc.active_buffer[:-poff]
98 |                     assert obuf.size(0) == active_end - vc.active_start, f'{obuf.size(0)=} {vc.active_start=} {active_end=}'
99 |                     vc.vad_chunk_in(VadAudioChunk(obuf, self.input_sr, vc.active_start))
100 |                     vc.active_start = None
101 |                 if vc.active_start is None:
102 |                     vc.active_buffer = vc.active_buffer[-self.window_size_samples*2:]  # keep the most recent pre-roll audio
103 |                 elif vc.active_buffer.size(0) > self.max_vad_frames:
104 |                     chunk = VadAudioChunk(vc.active_buffer[:self.max_vad_frames], self.input_sr, vc.active_start)
105 |                     vc.active_buffer = vc.active_buffer[self.max_vad_frames:]
106 |                     vc.active_start += self.max_vad_frames
107 |                     if sd.temp_end != 0:
108 |                         print(f'{sd.current_sample=}: {sd.temp_end=} -> {vc.active_start=}')
109 |                         if sd.temp_end < vc.active_start:
110 |                             sd.temp_end = vc.active_start
111 |                     vc.vad_chunk_in(chunk)
112 |                 vc.audio_chunk_in(p, vc.active_start is not None)
113 | 
-------------------------------------------------------------------------------- /Core/VAD/SileroVADUtils.py: --------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional
2 | import torch
3 | 
4 | class VADChannelState:
5 |     triggered: bool = False
6 |     temp_end: int = 0
7 |     current_sample: int = 0
8 |     speech: Optional[Dict[str, int]] = None
9 |     model_state: List[torch.Tensor]
10 |     def __init__(self, device:str):
11 |         self.model_state = [torch.zeros(2, 64).to(device), torch.zeros(2, 64).to(device)]
12 | 
13 | class VADBatchState:
14 |     batch_size: int
15 |     channels: List[VADChannelState]
16 |     device: str
17 |     def __init__(self, batch_size, device:str='cpu'):
18 |         self.batch_size = batch_size
19 |         self.channels = [VADChannelState(device) for _ in range(batch_size)]
20 | 
21 |     def get_model_state(self):
22 |         return [torch.stack([s.model_state[r] for s in self.channels], dim=1) for r in range(2)]
23 | 
24 |     def save_model_state(self, state:List[torch.Tensor]):
25 |         for c, s1, s2 in zip(self.channels, state[0].unbind(1), state[1].unbind(1)):
26 |             c.model_state = [s1, s2]
27 | 
28 | class VADBatchFromList(VADBatchState):
29 |     def __init__(self, states:List[VADChannelState]):
30 |         self.batch_size = len(states)
31 |         self.channels = states
32 | 
33 | class VADIteratorB:
34 |     def __init__(self,
35 |                  model,
36 |                  threshold: float = 0.5,
37 |                  sampling_rate: int = 16000,
38 |                  min_silence_duration_ms: int = 100,
39 |                  speech_pad_ms: int = 30,
40 |                  ):
41 | 
42 |         """
43 |         Class for stream imitation
44 | 
45 |         Parameters
46 |         ----------
47 |         model: preloaded .jit silero VAD model
48 | 
49 | 
threshold: float (default - 0.5) 50 | Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. 51 | It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 52 | 53 | sampling_rate: int (default - 16000) 54 | Currently silero VAD models support 8000 and 16000 sample rates 55 | 56 | min_silence_duration_ms: int (default - 100 milliseconds) 57 | In the end of each speech chunk wait for min_silence_duration_ms before separating it 58 | 59 | speech_pad_ms: int (default - 30 milliseconds) 60 | Final speech chunks are padded by speech_pad_ms each side 61 | """ 62 | 63 | self.model = model 64 | self.threshold = threshold 65 | self.sampling_rate = sampling_rate 66 | 67 | if sampling_rate not in [8000, 16000]: 68 | raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]') 69 | 70 | self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 71 | self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000 72 | self.model.reset_states() 73 | 74 | def __call__(self, x:torch.Tensor, bstate:Optional[VADBatchState]=None, return_seconds=False): 75 | """ 76 | x: torch.Tensor 77 | audio chunk (see examples in repo) 78 | 79 | return_seconds: bool (default - False) 80 | whether return timestamps in seconds (default - samples) 81 | """ 82 | 83 | if not torch.is_tensor(x): 84 | try: 85 | x = torch.Tensor(x) 86 | except: 87 | raise TypeError("Audio cannot be casted to tensor. Cast it manually") 88 | 89 | if x.dim() == 1: x = x.unsqueeze(0) 90 | else: assert x.dim() == 2, f"Audio should be 1D or 2D tensor, but got {x.dim()}" 91 | 92 | batch_size = x.size(0) 93 | 94 | if bstate is None: 95 | bstate = VADBatchState(batch_size, device=x.device) 96 | self.model.reset_states() 97 | else: 98 | assert bstate.batch_size == batch_size, f"Batch size should be {batch_size}, but got {bstate.batch_size}" 99 | (_mc:=self.model._c)._h, _mc._c, _mc._last_sr, _mc._last_batch_size = bstate.get_model_state() + [self.sampling_rate, batch_size] 100 | 101 | window_size_samples = len(x[0]) if x.dim() == 2 else len(x) 102 | 103 | speech_probs = (y for y in self.model(x, self.sampling_rate).tolist()) 104 | 105 | for speech_prob, channel in zip(speech_probs, bstate.channels): 106 | channel.current_sample += window_size_samples 107 | if (speech_prob >= self.threshold) and channel.temp_end: 108 | channel.temp_end = 0 109 | 110 | if (speech_prob >= self.threshold) and not channel.triggered: 111 | channel.triggered = True 112 | speech_pad_samples = self.speech_pad_samples if channel.current_sample > window_size_samples else 0 113 | speech_start = channel.current_sample - speech_pad_samples - window_size_samples 114 | channel.speech = {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)} 115 | continue 116 | 117 | if (speech_prob < self.threshold - 0.15) and channel.triggered: 118 | if not channel.temp_end: 119 | channel.temp_end = channel.current_sample 120 | if channel.current_sample - channel.temp_end < self.min_silence_samples: 121 | channel.speech = None 122 | continue 123 | else: 124 | speech_end = channel.temp_end + self.speech_pad_samples - window_size_samples 125 | channel.temp_end = 0 126 | channel.triggered = False 127 | channel.speech = {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)} 128 | continue 129 | 130 | channel.speech = None 131 | 
bstate.save_model_state([(_mc:=self.model._c)._h, _mc._c]) 132 | #print(f'{bstate.model_state[0].size()=} {bstate.model_state[1].size()=}') 133 | return bstate 134 | -------------------------------------------------------------------------------- /Core/VAD/ZlibVAD.py: -------------------------------------------------------------------------------- 1 | from zlib import compress 2 | 3 | class VADResult(): 4 | chunk: bytes 5 | active: bool = True 6 | 7 | class ZlibVAD(): 8 | vad_duration: float = 0.1 9 | vad_threshold: float = 0.6 10 | vad_frames: int 11 | max_vad_frames: int 12 | vad_buffer: bytes = b'' 13 | chunk_buffer: bytes = b'' 14 | ninactive: int = 0 15 | activation_threshold: int = 5 16 | def __init__(self, input_sr: int = 8000): 17 | self.vad_frames = int(input_sr * self.vad_duration) 18 | self.max_vad_frames = input_sr * 30 # 30 seconds for Whisper 19 | 20 | def ingest(self, data: bytes, vad_chunk_in: callable): 21 | self.vad_buffer += data 22 | if len(self.vad_buffer) < self.vad_frames: 23 | return None 24 | chunk = self.vad_buffer[:self.vad_frames] 25 | self.vad_buffer = self.vad_buffer[self.vad_frames:] 26 | r = len(compress(chunk))/len(chunk) 27 | v = VADResult() 28 | active = False if r < self.vad_threshold else True 29 | vad_chunk_in(chunk, active) 30 | max_len_reached = len(self.chunk_buffer) >= (self.max_vad_frames - (self.vad_frames * self.activation_threshold)) 31 | if active: 32 | self.ninactive = 0 33 | if not max_len_reached: 34 | self.chunk_buffer += chunk 35 | return None 36 | v.chunk = self.chunk_buffer[:self.max_vad_frames] 37 | self.chunk_buffer = self.chunk_buffer[self.max_vad_frames:] 38 | return v 39 | else: 40 | if self.ninactive > self.activation_threshold: 41 | assert len(self.chunk_buffer) > self.vad_frames * self.activation_threshold 42 | chunk = self.chunk_buffer[:-self.vad_frames*self.activation_threshold] 43 | if len(chunk) < self.vad_frames * self.activation_threshold: 44 | v = None 45 | else: 46 | v.chunk = chunk 47 | self.chunk_buffer = b'' 48 | self.ninactive = 0 49 | return v 50 | self.chunk_buffer += chunk 51 | self.ninactive += 1 52 | return None -------------------------------------------------------------------------------- /HelloSippyTTSRT/HelloSippyRT.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | from time import monotonic 3 | 4 | import torch 5 | from transformers import SpeechT5ForTextToSpeech, \ 6 | SpeechT5HifiGanConfig, SpeechT5HifiGan, SpeechT5Processor, \ 7 | SpeechT5Config 8 | from transformers.models.speecht5.modeling_speecht5 import \ 9 | SpeechT5EncoderWithSpeechPrenet 10 | from transformers import PretrainedConfig, PreTrainedModel 11 | from datasets import load_dataset 12 | import torch.nn as nn 13 | from methodtools import lru_cache 14 | 15 | from config.InfernGlobals import InfernGlobals 16 | 17 | GenerateSpeech_cb = Callable[[torch.FloatTensor], None] 18 | 19 | class HelloSippyRT(): 20 | pass 21 | 22 | def _generate_speech_rt( 23 | hsrt: HelloSippyRT, 24 | input_values: torch.FloatTensor, 25 | speech_cb: GenerateSpeech_cb, 26 | speaker_embeddings: Optional[torch.FloatTensor] = None, 27 | threshold: float = 0.5, 28 | minlenratio: float = 0.0, 29 | maxlenratio: float = 20.0, 30 | ) -> int: 31 | with hsrt.cuda_lock: 32 | encoder_attention_mask = torch.ones_like(input_values) 33 | 34 | model = hsrt.model 35 | encoder_out = model.speecht5.encoder( 36 | input_values=input_values, 37 | attention_mask=encoder_attention_mask, 38 | return_dict=True, 39 | 
) 40 | 41 | encoder_last_hidden_state = encoder_out.last_hidden_state 42 | 43 | # downsample encoder attention mask 44 | if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet): 45 | encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask( 46 | encoder_out[0].shape[1], encoder_attention_mask 47 | ) 48 | 49 | maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor) 50 | minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor) 51 | 52 | # Start the output sequence with a mel spectrum that is all zeros. 53 | output_sequence = encoder_last_hidden_state.new_zeros(1, 1, model.config.num_mel_bins) 54 | 55 | spectrogram = torch.zeros(0, model.config.num_mel_bins).to(model.device) 56 | past_key_values = None 57 | idx = 0 58 | 59 | ###stime_pre = None 60 | btime = monotonic() 61 | p_ch = hsrt.chunker 62 | _c = hsrt.c_conf 63 | prfs = torch.zeros(_c.pre_frames, model.config.num_mel_bins, 64 | device=model.device) 65 | pofs = torch.zeros(_c.post_frames, model.config.num_mel_bins, 66 | device=model.device) 67 | oschedule = [_c.chunk_size, _c.chunk_size, _c.chunk_size*2] 68 | output_len = oschedule[0] 69 | chunk_size = _c.chunk_size 70 | vocoder = hsrt.vocoder 71 | while True: 72 | idx += 1 73 | 74 | # Run the decoder prenet on the entire output sequence. 75 | decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings) 76 | 77 | # Run the decoder layers on the last element of the prenet output. 78 | decoder_out = model.speecht5.decoder.wrapped_decoder( 79 | hidden_states=decoder_hidden_states[:, -1:], 80 | attention_mask=None, 81 | encoder_hidden_states=encoder_last_hidden_state, 82 | encoder_attention_mask=encoder_attention_mask, 83 | past_key_values=past_key_values, 84 | use_cache=True, 85 | output_attentions=False, 86 | return_dict=True, 87 | ) 88 | 89 | last_decoder_output = decoder_out.last_hidden_state[0, -1] 90 | past_key_values = decoder_out.past_key_values 91 | 92 | # Predict the new mel spectrum for this step in the sequence. 93 | spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output) 94 | spectrum = spectrum.view(model.config.reduction_factor, model.config.num_mel_bins) 95 | spectrogram = torch.cat((spectrogram, spectrum), dim=0) 96 | 97 | # Extend the output sequence with the new mel spectrum. 98 | spv = spectrum[-1].view(1, 1, model.config.num_mel_bins) 99 | output_sequence = torch.cat((output_sequence, spv), dim=1) 100 | 101 | # Predict the probability that this is the stop token. 102 | prob = model.speech_decoder_postnet.prob_out(last_decoder_output).sigmoid() 103 | 104 | # Finished when stop token or maximum length is reached. 
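# The stop decision and the flush below work together: `prob_out` yields a
# per-step stop probability which ends generation once it crosses `threshold`
# (bounded by the minlen/maxlen limits derived from the encoder output length
# above), while completed mel frames are vocoded in `chunk_size` batches
# rather than one at a time, carrying `pre_frames` of left context (`prfs`)
# and, on the final flush, `post_frames` of zero padding (`pofs`).  The flush
# size ramps up through `oschedule` and doubles (capped at 64) whenever the
# consumer queue length reported by `speech_cb` backs up, trading latency for
# throughput.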
105 | theend = theend_cb = False 106 | if idx >= minlen and (int(sum(prob >= threshold)) > 0 or idx >= maxlen): 107 | theend = True 108 | 109 | if (len(spectrogram) >= output_len and len(spectrogram) + prfs.size(0) >= chunk_size + _c.eframes) \ 110 | or (theend and len(spectrogram) > 0): 111 | _s = spectrogram.unsqueeze(0) 112 | _s = model.speech_decoder_postnet.postnet(_s) 113 | _s = _s.squeeze(0) 114 | #print(_s.size(0), prfs.size(0), _s.device) 115 | in_size = _s.size() 116 | _s = [prfs, _s] 117 | if theend: 118 | _s.append(pofs) 119 | _s = torch.cat(_s, dim=0) 120 | extra_pad = (_s.size(0) - _c.eframes) % chunk_size 121 | assert extra_pad < chunk_size 122 | if extra_pad > 0: 123 | extra_pad = chunk_size - extra_pad 124 | #print(_s.size()) 125 | _pofs = torch.zeros(extra_pad, 126 | _s.size(1), device=_s.device) 127 | _s = torch.cat((_s, _pofs), dim=0) 128 | outputs = [] 129 | while _s.size(0) >= _c.eframes + chunk_size: 130 | #print(_s.size(), _s.device) 131 | _i = _s[:_c.eframes + chunk_size, :] 132 | _o = vocoder(_i).unsqueeze(0) 133 | _o = p_ch(_i, _o) 134 | outputs.append(_o.squeeze(0)) 135 | #print('out', _o.size(), outputs[-1].size()) 136 | _s = _s[chunk_size:, :] 137 | if extra_pad > 0: 138 | ep_trim = extra_pad * _c.frame_size 139 | assert outputs[-1].size(0) > ep_trim 140 | outputs[-1] = outputs[-1][:-ep_trim] 141 | outputs = torch.cat(outputs, dim=0) 142 | #print('_s after:', _s.size(0)) 143 | assert _s.size(0) >= _c.eframes and _s.size(0) < _c.eframes + chunk_size 144 | #print('prfs', prfs.size(), 'inputs', in_size, 'outputs', outputs.size(), '_s', _s.size()) 145 | #print(_s.shape, outputs.shape) 146 | prfs = _s 147 | #print(monotonic() - btime) 148 | hsrt.cuda_lock.release() 149 | qlen, theend_cb = speech_cb(outputs) 150 | hsrt.cuda_lock.acquire() 151 | if output_len in oschedule: 152 | oschedule.pop(0) 153 | if len(oschedule) > 0: 154 | output_len = oschedule[0] 155 | elif qlen > 1 and output_len < 64: 156 | output_len *= 2 157 | spectrogram = torch.zeros(0, model.config.num_mel_bins).to(model.device) 158 | if theend or theend_cb: 159 | break 160 | 161 | return idx 162 | 163 | class AmendmentNetwork1Config(PretrainedConfig): 164 | chunk_size = 8 165 | pre_frames = 2 166 | post_frames = 2 167 | frame_size = 256 168 | num_mels = 80 169 | chunk_size: int 170 | trim_pr: int 171 | trim_po: int 172 | output_size: int 173 | eframes: int 174 | 175 | def __init__(self, *a, **ka): 176 | super().__init__(*a, **ka) 177 | self.eframes = self.pre_frames + self.post_frames 178 | self.trim_pr = self.pre_frames * self.frame_size 179 | self.trim_po = self.post_frames * self.frame_size 180 | self.output_size = self.chunk_size * self.frame_size 181 | 182 | class SimpleResidualBlock(nn.Module): 183 | def __init__(self, channels): 184 | super().__init__() 185 | self.conv1 = nn.Conv1d(channels, channels, kernel_size=3, stride=1, 186 | padding=1, dilation=1) 187 | self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, stride=1, 188 | padding=3, dilation=3) 189 | 190 | def forward(self, x, lrelu): 191 | assert lrelu is not None 192 | residual = x 193 | x = lrelu(x) 194 | x = self.conv1(x) 195 | x = lrelu(x) 196 | x = self.conv2(x) 197 | x += residual 198 | return x 199 | 200 | class AmendmentNetwork1(PreTrainedModel): 201 | config_class = AmendmentNetwork1Config 202 | def __init__(self, config=None): 203 | if config is None: 204 | config = self.config_class() 205 | super().__init__(config) 206 | _c = self._c = config 207 | 208 | self.conv_pre_m = nn.Conv1d(_c.num_mels, 32, kernel_size=3, 
stride=1, padding=1) 209 | self.conv_pre_a = nn.Conv1d(_c.frame_size, 160, kernel_size=3, stride=1, padding=1) 210 | self.upsampler = nn.ModuleList([ 211 | nn.ConvTranspose1d(192, 128, kernel_size=8, stride=4, padding=2), 212 | nn.ConvTranspose1d(128, 64, kernel_size=8, stride=4, padding=2), 213 | ]) 214 | self.lrelu = nn.LeakyReLU(0.01) 215 | self.resblock = SimpleResidualBlock(64) 216 | self.post_conv = nn.Conv1d(in_channels=64, out_channels=_c.frame_size, 217 | kernel_size=8, stride=24, padding=0) 218 | 219 | def forward(self, mel, audio): 220 | batch_size, total_length = audio.size() 221 | T = mel.size(-1) 222 | #print(Exception(f"BP: ms:{mel.size()} as:{audio.size()}")) 223 | audio_reshaped = audio.view(batch_size, self._c.frame_size, -1) 224 | mel = mel.view(batch_size, T, -1) 225 | #print(Exception(f"BP: ms:{mel.size()} as:{audio.size()} ars:{audio_reshaped.size()}")) 226 | x_mel = self.conv_pre_m(mel) 227 | x_audio = self.conv_pre_a(audio_reshaped) 228 | am_comb = torch.cat((x_mel, x_audio), dim=1) 229 | for i, layer in enumerate(self.upsampler): 230 | am_comb = self.lrelu(am_comb) 231 | am_comb = layer(am_comb) 232 | am_comb = self.resblock(am_comb, self.lrelu) 233 | am_comb = self.lrelu(am_comb) 234 | am_comb = self.post_conv(am_comb).squeeze(-1) 235 | am_comb = self.lrelu(am_comb).view(batch_size, -1) 236 | audio = audio[:, self._c.trim_pr:-self._c.trim_po] * am_comb 237 | return audio.tanh() 238 | 239 | class HelloSippyRT(): 240 | processor: SpeechT5Processor 241 | chunker: AmendmentNetwork1 242 | c_conf: AmendmentNetwork1Config 243 | vocoder: SpeechT5HifiGan 244 | model: SpeechT5ForTextToSpeech 245 | cuda_lock = InfernGlobals().torcher 246 | default_model = "microsoft/speecht5_tts" 247 | def __init__(self, device, model=default_model, get_processor:Optional[callable]=None): 248 | with self.cuda_lock: 249 | mc = SpeechT5Config.from_pretrained(model) 250 | if get_processor is None: 251 | self.processor = SpeechT5Processor.from_pretrained(model, config=mc) 252 | else: 253 | self.processor = get_processor(device, model, config=mc) 254 | model = SpeechT5ForTextToSpeech.from_pretrained(model, 255 | config=mc).to(device) 256 | model.eval() 257 | self.model = model 258 | _vc_conf = SpeechT5HifiGanConfig() 259 | vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", 260 | config = _vc_conf).to(device) 261 | vocoder.eval() 262 | self.vocoder = vocoder 263 | self.c_conf = AmendmentNetwork1Config() 264 | chunker = AmendmentNetwork1.from_pretrained("sobomax/speecht5-rt.post_vocoder.v2", 265 | config=self.c_conf) 266 | chunker = chunker.to(device) 267 | chunker.eval() 268 | self.chunker = chunker 269 | embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") 270 | self.speaker_embeddings = [torch.tensor(ed["xvector"]).unsqueeze(0) 271 | for ed in embeddings_dataset] 272 | 273 | def get_rand_voice_id(self): 274 | return torch.randint(0, len(self.speaker_embeddings), (1,)).item() 275 | 276 | def get_rand_voice(self): 277 | with self.cuda_lock: 278 | s_index = self.get_rand_voice_id() 279 | rv = self.speaker_embeddings[s_index].to(self.model.device) 280 | return rv 281 | 282 | @lru_cache(maxsize=16) 283 | def get_voice(self, s_index:int): 284 | with self.cuda_lock: 285 | rv = self.speaker_embeddings[s_index].to(self.model.device) 286 | return rv 287 | 288 | @torch.no_grad() 289 | def generate_speech_rt( 290 | self, 291 | input_ids: torch.LongTensor, 292 | speech_cb: GenerateSpeech_cb, 293 | speaker_embeddings: Optional[torch.FloatTensor] = None, 
294 | threshold: float = 0.5, 295 | minlenratio: float = 0.0, 296 | maxlenratio: float = 20.0, 297 | ) -> int: 298 | return _generate_speech_rt( 299 | self, 300 | input_ids, 301 | speech_cb, 302 | speaker_embeddings, 303 | threshold, 304 | minlenratio, 305 | maxlenratio, 306 | ) 307 | 308 | @torch.no_grad() 309 | def tts_rt(self, text, speech_cb, speaker=None): 310 | with self.cuda_lock: 311 | inputs = self.processor(text=text, 312 | return_tensors="pt").to(self.model.device) 313 | if speaker is None: 314 | speaker = self.get_rand_voice() 315 | self.generate_speech_rt(inputs["input_ids"], speech_cb, 316 | speaker) 317 | -------------------------------------------------------------------------------- /HelloSippyTTSRT/HelloSippyRTPipeTest.py: -------------------------------------------------------------------------------- 1 | try: import intel_extension_for_pytorch as ipex 2 | except ModuleNotFoundError: ipex = None 3 | 4 | import sys, random, weakref, uuid 5 | from typing import List, Optional, Tuple 6 | import contextlib, time 7 | from os.path import exists as path_exists 8 | from queue import Queue, Empty as QueueEmpty 9 | from dataclasses import dataclass 10 | 11 | import numpy as np 12 | 13 | import torch 14 | for i in range(2): 15 | try: 16 | from HelloSippyTTSRT.HelloSippyRTPipe import HelloSippyRTPipe, HelloSippyPipeState, HelloSippyPipeStateBatched, \ 17 | HelloSippyPlayRequest, SessCmd, SessSyncCmd 18 | except ModuleNotFoundError: 19 | from sys import path as sys_path 20 | from os import getcwd 21 | sys_path.append(getcwd()) 22 | else: break 23 | else: raise ModuleNotFoundError('HelloSippyRTPipe') 24 | 25 | from transformers import set_seed 26 | 27 | class ErrMaxSessReached(Exception): pass 28 | 29 | import threading 30 | 31 | from elperiodic.ElPeriodic import ElPeriodic 32 | 33 | from time import monotonic 34 | 35 | class trp_thread(threading.Thread): 36 | queue: Queue 37 | queue_out: Optional[Queue] = None 38 | elp: Optional[ElPeriodic] = None 39 | period = None 40 | def __init__(self, period:float=0.0, noreturn:bool=False): 41 | self.queue = Queue() 42 | if not noreturn: self.queue_out = Queue() 43 | if period > 0.0: self.elp = ElPeriodic(1.0 / period) 44 | super().__init__(target=self.__thread) 45 | self.daemon = True 46 | self.start() 47 | 48 | def __call__(self, func): 49 | #raise Exception(f'__call__ {args=} {kwargs=}') 50 | def __call(*args, **kwargs): 51 | #raise Exception(f'__call {args=} {kwargs=}') 52 | t = monotonic() 53 | self.queue.put((func, args, kwargs)) 54 | ex, res = self.queue_out.get() 55 | if ex: raise ex 56 | return res 57 | def __call_noret(*args, **kwargs): 58 | self.queue.put((func, args, kwargs)) 59 | return __call if self.queue_out else __call_noret 60 | #return self.queue_out.get() 61 | 62 | def __thread(self): 63 | while True: 64 | a = self.queue.get() 65 | if a is None: break 66 | func, args, kwargs = a 67 | st = monotonic() 68 | try: res = (None, func(*args, **kwargs)) 69 | except Exception as ex:res = (ex, None) 70 | et = monotonic() 71 | if self.queue_out: self.queue_out.put(res) 72 | elif res[0]: raise res[0] 73 | if self.elp: self.elp.procrastinate() 74 | 75 | def __del__(self): 76 | print('del') 77 | if not hasattr(self, 'queue'): return 78 | self.queue.put(None) 79 | self.join() 80 | self.func = None 81 | 82 | class WeakDispatcher(): 83 | def __init__(self, queue:Queue): self.queue = weakref.ref(queue) 84 | def __call__(self, res): 85 | q = self.queue() 86 | if q: q.put(res.to(torch.float16).numpy() if res is not None else None) 87 | 88 | 
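# WeakDispatcher holds only a weak reference to the consumer's Queue: once the
# receiver drops its last strong reference, frames addressed to it are silently
# discarded instead of the pipeline keeping the queue (and its session) alive.
# A minimal sketch of the behaviour, assuming CPython's immediate refcounting;
# the names below are illustrative only, not part of this module:
#
#   q = Queue()
#   d = WeakDispatcher(q)
#   d(torch.zeros(160))    # delivered, downcast to a float16 numpy array
#   del q                  # last strong reference gone
#   d(torch.zeros(160))    # dropped: the weakref now resolves to None
#   d(None)                # None is the end-of-stream marker, also dropped here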
class InfernSession: 89 | _cmd_queue:Queue 90 | id:uuid.UUID 91 | default_speaker:torch.Tensor 92 | def __init__(self, queue, default_speaker:torch.Tensor): self.id, self._cmd_queue, self.default_speaker = uuid.uuid4(), queue, default_speaker 93 | def play(self, text:str, dispatch:Queue, speaker:Optional[torch.Tensor] = None): 94 | cmd = HelloSippyPlayRequest(self.id, text, speaker if speaker is not None else self.default_speaker, WeakDispatcher(dispatch)) 95 | self._cmd_queue.put(cmd) 96 | 97 | class HelloSippyRTPipeTest(HelloSippyRTPipe): 98 | _main_thread_id: int 99 | _sync_queue: Queue 100 | sessions: weakref.WeakValueDictionary[uuid.UUID, InfernSession] 101 | max_sessions: int = 50 102 | output_sr = 8000 103 | 104 | def __init__(self, *a, **kwa): 105 | self._main_thread_id = threading.get_ident() 106 | self._sync_queue = Queue() 107 | self.sessions = weakref.WeakValueDictionary() 108 | super().__init__(*a, **kwa) 109 | 110 | def alloc_session(self, speaker:Optional[torch.Tensor]=None) -> InfernSession: 111 | assert threading.get_ident() == self._main_thread_id 112 | if len(self.sessions) >= self.max_sessions: raise ErrMaxSessReached(f'No more sessions available {self.max_sessions=}') 113 | if speaker is None: speaker = self.get_rand_voice()[0] 114 | rv = InfernSession(self._sync_queue, speaker) 115 | self.sessions[rv.id] = rv 116 | ss = SessSyncCmd(self.sessions) 117 | self._sync_queue.put(ss) 118 | return rv 119 | 120 | def savetensor(self, tensor:torch.Tensor, name:str): 121 | fname = f'{name}{self.saveidx}.npy' 122 | np.save(fname, tensor.cpu().numpy()) 123 | 124 | class WorkerState: state:Optional[HelloSippyPipeStateBatched]=None; live:Optional[List[uuid.UUID]]=None 125 | 126 | @trp_thread(noreturn=True) 127 | def synchronize(self, ws:Optional[WorkerState]) -> None: 128 | if not ws: ws = self.WorkerState() 129 | state = ws.state 130 | if state: return (self.main_gen(ws), None)[-1] 131 | ssq = [self._sync_queue.get(),] 132 | try: 133 | while True: ssq.append(self._sync_queue.get_nowait()) 134 | except QueueEmpty: pass 135 | assert all(isinstance(x, SessCmd) for x in ssq) 136 | syncs, reqs = [x for x in ssq if isinstance(x, SessSyncCmd)], [x for x in ssq if not isinstance(x, SessSyncCmd)] 137 | if len(syncs) == 0 and len(reqs) == 0: raise AssertionError(f'this could not be happening {ssq=}') 138 | #print(f'{len(syncs)=} {len(reqs)=} {syncs=}') 139 | ws.live = live = syncs[-1].live if len(syncs) > 0 else ws.live 140 | if not live: return (self.synchronize(ws), None)[-1] 141 | reqs_live = [x for x in reqs if x.session in live] 142 | if len(reqs_live) == 0: return (self.synchronize(ws), None)[-1] 143 | with self.cuda_lock: 144 | new_states = [HelloSippyPipeState(self, r) for r in reqs_live] 145 | #if state: state.mergein(new_states) 146 | ws.state = HelloSippyPipeStateBatched(new_states, self) 147 | #raise Exception(f'{len(ssq)=} {reqs_live=} {live=} {len(self.sessions)=}') 148 | self.main_gen(ws) 149 | 150 | @trp_thread(noreturn=True) 151 | def main_gen(self, ws:WorkerState) -> None: 152 | super().infer(ws.state) 153 | #print(f'{state.ends_at.shape=} {state.ends_at.cpu().numpy()=} {state.audio.shape=}') 154 | self.unbatch_and_dispatch(ws) 155 | 156 | @trp_thread(noreturn=True) 157 | def unbatch_and_dispatch(self, ws:WorkerState): 158 | more = super().unbatch_and_dispatch(ws.state) 159 | if not more: 160 | ws.state = None 161 | self.synchronize(ws) 162 | 163 | class Timing(contextlib.ContextDecorator): 164 | def __init__(self, prefix="", on_exit=None, enabled=True): self.prefix,
self.on_exit, self.enabled = prefix, on_exit, enabled 165 | def __enter__(self): self.st = time.perf_counter_ns() 166 | def __exit__(self, *exc): 167 | self.et = time.perf_counter_ns() - self.st 168 | if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else "")) 169 | 170 | def seed_RNGs(): 171 | seed = 42 172 | random.seed(seed) 173 | torch.manual_seed(seed) 174 | set_seed(seed) 175 | np.random.seed(seed) 176 | torch.cuda.manual_seed_all(seed) 177 | torch.backends.cudnn.deterministic = True 178 | 179 | @torch.no_grad() 180 | def main(): 181 | import soundfile as sf 182 | from time import monotonic 183 | seed_RNGs() 184 | from random import choices 185 | from utils.tts import smith_set, bender_set, hal_set 186 | n = 50 187 | prompts = choices([y for x in smith_set() + bender_set() + hal_set() for y in x.split('|')], k=n) 188 | #prompts = prompts 189 | #prompts = [prompts[0] for _ in range(n)] 190 | @dataclass(frozen=True) 191 | class ResFeedback: n:int; time_to_first_frame:float; time_to_last_frame:float; number_of_frames:int 192 | class res_cb(threading.Thread): 193 | def __init__(self, n, name='dispatch', res_queue=None): 194 | super().__init__(target=self.__thread) 195 | self.n, self.name, self.res_queue = n, name, res_queue 196 | if self.name == 'dispatch': self.data = np.empty(0) 197 | self.q = Queue() 198 | self.daemon = True 199 | self.start() 200 | 201 | def __thread(self): 202 | st = monotonic() 203 | time_to_first_frame = None 204 | while (y:=self.q.get()) is not None: 205 | #print(f'{self.name}{self.n}({y.shape=})') 206 | self.data = np.concatenate((self.data, y), axis=0) 207 | if time_to_first_frame is None: time_to_first_frame = monotonic() - st 208 | self.eos(ResFeedback(self.n, time_to_first_frame, monotonic()-st, int(self.data.shape[0]))) 209 | 210 | def eos(self, res:ResFeedback): 211 | sys.stdout.write(f'eos({self.n}) {self.data.shape=}\n') 212 | sys.stdout.flush() 213 | sf.write(f'out_{self.n}.wav', self.data, 8000, 'PCM_16') 214 | if self.res_queue: self.res_queue.put(res) 215 | 216 | params = {'hidden_dropout':0.0, 'positional_dropout':0.0, 'speech_decoder_prenet_dropout':0.0, 217 | 'activation_dropout':0.0, 'encoder_layerdrop':0.0, 'decoder_layerdrop':0.0, 'attention_dropout':0.0, 218 | 'speech_decoder_postnet_dropout':0.0, 'feat_proj_dropout':0.0} 219 | sp = HelloSippyRTPipeTest('xpu' if ipex is not None else 'cuda') 220 | if ipex is not None: 221 | sp.model = ipex.optimize(sp.model, dtype=torch.bfloat16) 222 | sp.vocoder = ipex.optimize(sp.vocoder, dtype=torch.bfloat16) 223 | sp.chunker = ipex.optimize(sp.chunker, dtype=torch.bfloat16) 224 | 225 | s1 = [sp.alloc_session() for i in range(50)] 226 | del s1 227 | res_queue = Queue() 228 | from time import sleep 229 | #sp.synchronize(None) 230 | s2 = [((s:=sp.alloc_session()), (r:=res_cb(n, res_queue=res_queue)), s.play(p, r.q), 'sleep(0.5)') for n, p in enumerate(prompts)] 231 | sp.synchronize(None) 232 | for _ in range(len(s2)): 233 | res = res_queue.get() 234 | rtr = (res.time_to_last_frame - res.time_to_first_frame) / (res.number_of_frames / 8000) 235 | print(f'Sess#{res.n}: {res.time_to_first_frame=}, {res.time_to_last_frame=}, {res.number_of_frames=} {rtr=}') 236 | sys.stdout.flush() 237 | s2[res.n][1].join() 238 | return(0) 239 | 240 | def init_states(states): 241 | d_callbacks = [res_cb(n, 'dispatch', lambda x:x[0].shape) for n, _ in enumerate(states)] 242 | e_callbacks = [d.eos for d in d_callbacks] 243 | for state, d_cb, e_cb in zip(states, d_callbacks, 
e_callbacks): state.dispatch, state.eos_cb = d_cb, e_cb 244 | return states 245 | seed_RNGs() 246 | states = [sp.once(x) for x in prompts] 247 | init_states(states) 248 | states = sp.batch_for_main_gen(states) 249 | states.res_queue = Queue() 250 | sp.synchronize(states) 251 | with Timing("main_gen: "): 252 | state = states.res_queue.get() 253 | #while state.next is not None: 254 | # state = state.next(state) 255 | #exit(1) 256 | with Timing("once: "): 257 | seed_RNGs() 258 | states = [sp.once(x) for x in prompts] 259 | init_states(states) 260 | #state1 = sp.once('Hello, world!') 261 | #state2 = sp.once('How are you doing today?') 262 | #state3 = sp.once('I am doing well, thank you very much.') 263 | with Timing("batch_for_main_gen: "): 264 | states = sp.batch_for_main_gen(states) 265 | states.res_queue = Queue() 266 | sp.synchronize(states) 267 | with Timing("main_gen: "): 268 | state = states.res_queue.get() 269 | 270 | if __name__ == '__main__' and (r:=main()) not in (None, 0): raise RuntimeError(f'main() returned {r}') 271 | -------------------------------------------------------------------------------- /Infernos.py: -------------------------------------------------------------------------------- 1 | from getopt import getopt, GetoptError 2 | import os, sys 3 | 4 | import ray 5 | 6 | from sippy.misc import daemonize 7 | 8 | sys.path.append('.') 9 | 10 | from Cluster.InfernSIPActor import InfernSIPActor 11 | from Core.InfernConfig import InfernConfig 12 | 13 | def patch_signals(): 14 | import threading 15 | import signal 16 | 17 | def _start_new_thread(*args): 18 | allsigs = list(signal.valid_signals()) 19 | 20 | old_sigset = signal.pthread_sigmask(signal.SIG_BLOCK, allsigs) 21 | ret = _old_start_new_thread(*args) 22 | signal.pthread_sigmask(signal.SIG_SETMASK, old_sigset) 23 | return ret 24 | 25 | _old_start_new_thread = threading._start_new_thread 26 | threading._start_new_thread = _start_new_thread 27 | 28 | def usage(): 29 | print('usage: Infernos.py [-f] [-L logfile] [-i pidfile] [myconfig.yaml]') 30 | sys.exit(1) 31 | 32 | if __name__ == '__main__': 33 | try: 34 | opts, args = getopt(sys.argv[1:], 'fL:i:') 35 | except GetoptError: 36 | usage() 37 | 38 | if len(args) > 1: 39 | usage() 40 | 41 | cfile = 'config.yaml' if len(args) == 0 else args[0] 42 | 43 | idir = os.path.realpath(sys.argv[0]) 44 | idir = os.path.dirname(idir) 45 | sys.path.append(idir) 46 | logfile = '/var/log/Infernos.log' 47 | pidfile = None 48 | foreground = False 49 | for o, a in opts: 50 | if o == '-f': 51 | foreground = True 52 | elif o == '-L': 53 | logfile = a 54 | elif o == '-i': 55 | pidfile = a 56 | 57 | if not foreground: 58 | daemonize(logfile) 59 | 60 | patch_signals() 61 | 62 | if logfile == '-': 63 | lfile = sys.stdout 64 | else: 65 | lfile = open(logfile, 'a') 66 | 67 | default_resources = InfernSIPActor.default_resources 68 | default_resources['live_translator'] = 1 69 | default_resources['ai_attendant'] = 1 70 | default_resources['tts'] = 2 71 | default_resources['stt'] = 1 72 | default_resources['llm'] = 1 73 | try: 74 | ray.init(num_gpus=2, resources = default_resources) 75 | except ValueError as ex: 76 | if 'connecting to an existing cluster' not in str(ex): raise ex 77 | ray.init() 78 | 79 | inf_c = InfernConfig(cfile) 80 | 81 | if pidfile is not None: 82 | open(pidfile, 'w').write('%d' % os.getpid()) 83 | 84 | if inf_c.sip_actr is None: 85 | ray.shutdown() 86 | exit(0) 87 | 88 | try: 89 | exit(ray.get(inf_c.sip_actr.loop.remote(inf_c))) 90 | except KeyboardInterrupt: 91 |
ray.get(inf_c.sip_actr.stop.remote()) 92 | raise 93 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2023-2024, Sippy Labs 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # I.N.F.E.R.N.O.S. 2 | 3 | ### Interactive Neural Framework for Efficient Realtime Network Operations on Streams 4 | 5 | 🔥 Welcome to Infernos, where data comes to life in real-time! 🔥 6 | 7 | ## Overview 8 | 9 | Harness the power of **I.N.F.E.R.N.O.S.** to transform audio, video, and 10 | text streams with state-of-the-art inference in an instant. Embrace a 11 | blazing-fast future, free from lag. 12 | 13 | ## News 14 | 15 | Initial integration of the LLM (Qwen 2.5) and addition of the A.I. 16 | Attendant application. 17 | 18 | Upcoming presentation at the OpenSIPS Summit 2025. 19 | 20 | ## Features 21 | 22 | - **Interactive:** Infernos isn't just another tool; it's an 23 | experience. Speak in one voice and marvel as it's automatically 24 | translated into a completely different tone or even language, and 25 | then seamlessly transmitted in real-time during phone or web 26 | meetings. 27 | 28 | - **Neural Power:** With deep learning at its core, Infernos is 29 | optimized for top-notch performance. 30 | 31 | - **Multimodal Support:** Whether it's audio, video, or text, Infernos 32 | handles them with elegance. 33 | 34 | - **Efficiency:** Designed for low-latency, high-throughput 35 | operations. 36 | 37 | - **Realtime:** Don't wait. Experience the magic as it unfolds. 38 | 39 | ## Quick Start 40 | 41 | 1. Clone the repository: 42 | 43 | ```bash 44 | git clone https://github.com/sippy/Infernos.git 45 | ``` 46 | 47 | 2. Navigate to the project directory and install dependencies: 48 | 49 | ```bash 50 | cd Infernos && pip install -r requirements.txt 51 | ``` 52 | 53 | 3. Create a configuration file. 
In the following example we 54 | listen for and accept SIP calls from `MY_IP` and pass them into the Live 55 | Translator application. A SIP account is then used to send 56 | outbound call legs to `DEST_NUM`@`MY_SIP_SRV`: 57 | 58 | ```bash 59 | MY_IP="A.B.C.D" 60 | MY_SIP_SRV="E.F.G.H" 61 | DEST_NUM="12345" 62 | DEST_USER="foo" 63 | DEST_PWD="bar" 64 | cat > config.yaml <English / English->Portuguese 115 | on an AWS instance "from zero to hero" in less than 60 minutes. 116 | - [Infernos: cost efficient AI inference for real-time applications:](https://www.youtube.com/watch?v=eawO0hXeO5Y) 117 | Overview of the Infernos architecture and progress over the past few months. 118 | 119 | ## Join Us 120 | 121 | - [Discord](https://discord.gg/bb95ZWhrhQ) 122 | 123 | ------------------------------------------------------------------------ 124 | 125 | Stay on the lookout for more sizzling updates, and always remember: 126 | **Infernos** makes the future sizzle! 127 | -------------------------------------------------------------------------------- /RTP/AudioInput.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | class AudioInput(): 4 | vad_chunk_in:Optional[callable] 5 | audio_in:Optional[callable] 6 | def __init__(self, audio_in:Optional[callable]=None, vad_chunk_in:Optional[callable]=None): 7 | self.vad_chunk_in = vad_chunk_in 8 | self.audio_in = audio_in 9 | -------------------------------------------------------------------------------- /RTP/InfernRTPConf.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sippy.Network_server import RTP_port_allocator 4 | 5 | class InfernRTPConf(): 6 | schema: dict = { 7 | 'settings': { 8 | 'type': 'dict', 9 | 'schema': { 10 | 'min_port': {'type': 'integer', 'min': 1, 'max': 65535}, 11 | 'max_port': {'type': 'integer', 'min': 1, 'max': 65535}, 12 | } 13 | } 14 | } 15 | palloc: RTP_port_allocator 16 | def __init__(self, conf:Optional[dict]=None): 17 | max_port = conf.get('max_port', None) if conf is not None else None 18 | min_port = conf.get('min_port', None) if conf is not None else None 19 | self.palloc = RTP_port_allocator(min_port, max_port) 20 | -------------------------------------------------------------------------------- /RTP/InfernRTPEPoint.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union 2 | from uuid import uuid4, UUID 3 | from threading import Lock 4 | 5 | from sippy.Udp_server import Udp_server, Udp_server_opts 6 | from sippy.misc import local4remote 7 | 8 | from config.InfernGlobals import InfernGlobals as IG 9 | from Core.AudioChunk import AudioChunk 10 | from Core.AStreamMarkers import ASMarkerGeneric, ASMarkerNewSent 11 | from RTP.RTPOutputWorker import RTPOutputWorker 12 | from RTP.InfernRTPIngest import RTPInStream 13 | from RTP.AudioInput import AudioInput 14 | from RTP.RTPParams import RTPParams 15 | from RTP.InfernRTPIngest import InfernRTPIngest 16 | from RTP.InfernRTPConf import InfernRTPConf 17 | 18 | class InfernRTPEPoint(): 19 | debug: bool = False 20 | id: UUID 21 | dl_file = None 22 | firstframe = True 23 | rtp_params:RTPParams 24 | state_lock: Lock 25 | def __init__(self, rc:InfernRTPConf, rtp_params:RTPParams, ring:InfernRTPIngest, get_direct_soundout:callable): 26 | self.id = uuid4() 27 | self.rtp_params = rtp_params 28 | self.state_lock = Lock() 29 | self.writer = RTPOutputWorker('cpu', rtp_params) 30 | self.rsess =
RTPInStream(ring, rtp_params, get_direct_soundout) 31 | rtp_laddr = local4remote(rtp_params.rtp_target[0]) 32 | rserv_opts = Udp_server_opts((rtp_laddr, rc.palloc), self.rtp_received) 33 | rserv_opts.nworkers = 1 34 | rserv_opts.direct_dispatch = True 35 | self.rserv = Udp_server({}, rserv_opts) 36 | self.writer_setup() 37 | 38 | def writer_setup(self): 39 | self.writer.set_pkt_send_f(self.send_pkt) 40 | if self.dl_file is not None: 41 | self.writer.enable_datalog(self.dl_file) 42 | self.writer.start() 43 | 44 | def send_pkt(self, pkt): 45 | with self.state_lock: 46 | rtp_target = self.rtp_params.rtp_target 47 | self.rserv.send_to(pkt, rtp_target) 48 | 49 | def rtp_received(self, data, address, udp_server, rtime): 50 | #self.dprint(f"InfernRTPIngest.rtp_received: len(data) = {len(data)}") 51 | with self.state_lock: 52 | if address != self.rtp_params.rtp_target: 53 | if self.debug: 54 | print(f"InfernRTPIngest.rtp_received: address mismatch {address=} {self.rtp_params.rtp_target=}") 55 | return 56 | self.rsess.rtp_received(data, address, rtime) 57 | 58 | def update(self, rtp_params:RTPParams): 59 | with self.state_lock: 60 | self.rtp_params.rtp_target = rtp_params.rtp_target 61 | if self.rtp_params.out_ptime != rtp_params.out_ptime: 62 | self.writer.end() 63 | self.writer.join() 64 | self.writer = RTPOutputWorker('cpu', rtp_params) 65 | self.writer_setup() 66 | self.rsess.stream_update() 67 | 68 | def connect(self, ain:AudioInput): 69 | self.rsess.stream_connect(ain) 70 | 71 | def shutdown(self): 72 | with self.state_lock: 73 | self.writer.join() 74 | self.rserv.shutdown() 75 | self.rserv, self.writer = (None, None) 76 | 77 | def __del__(self): 78 | if self.debug: 79 | print('InfernRTPEPoint.__del__') 80 | 81 | def soundout(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 82 | ismark = isinstance(chunk, ASMarkerGeneric) 83 | if self.firstframe or ismark: 84 | if self.debug: 85 | print(f'{IG.stdtss()}: rtp_session_soundout[{str(self.id)[:6]}]: {"mark" if ismark else chunk.audio.size(0)}') 86 | self.firstframe = False 87 | if ismark and isinstance(chunk, ASMarkerNewSent): 88 | self.firstframe = True 89 | with self.state_lock: 90 | if self.writer is None: return 91 | return self.writer.soundout(chunk) 92 | -------------------------------------------------------------------------------- /RTP/InfernRTPIngest.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | from queue import Queue 3 | from threading import Lock 4 | from uuid import UUID 5 | 6 | from rtpsynth.RtpJBuf import RtpJBuf, RTPFrameType, RTPParseError 7 | 8 | from Core.InfernWrkThread import InfernWrkThread, RTPWrkTRun 9 | from Core.VAD.SileroVAD import SileroVADWorker, VADChannel 10 | from Core.Codecs.G711 import G711Codec 11 | from Core.AudioChunk import AudioChunk 12 | from RTP.AudioInput import AudioInput 13 | from RTP.RTPParams import RTPParams 14 | 15 | class WIPkt(): 16 | def __init__(self, stream: 'RTPInStream', data, address, rtime): 17 | self.stream = stream 18 | self.data = data 19 | self.address = address 20 | self.rtime = rtime 21 | 22 | class WIStreamUpdate(): 23 | def __init__(self, stream: 'RTPInStream'): 24 | self.stream = stream 25 | 26 | class WIStreamConnect(): 27 | def __init__(self, stream: 'RTPInStream', ain:AudioInput): 28 | self.stream = stream 29 | self.ain = ain 30 | 31 | class RTPInStream(): 32 | jb_size: int = 8 33 | input_sr: int = 8000 34 | last_output_lseq: Optional[int] = None 35 | vchan: VADChannel 36 | codec: G711Codec 37 | 
output_sr: int = 16000 38 | npkts: int = 0 39 | ain: AudioInput 40 | ain_lock: Lock 41 | get_direct_soundout: callable 42 | def __init__(self, ring:'InfernRTPIngest', rtp_params:RTPParams, get_direct_soundout:callable): 43 | self.jbuf = RtpJBuf(self.jb_size) 44 | self.codec = rtp_params.codec().to(ring.device) 45 | self.ring = ring 46 | self.get_direct_soundout = get_direct_soundout 47 | self.ain = AudioInput() 48 | self.ain_lock = Lock() 49 | self.vchan = VADChannel(self.audio_chunk_out, self.vad_chunk_out, self.codec.decode, ring.device) 50 | 51 | def rtp_received(self, data, address, rtime): 52 | #self.dprint(f"InfernRTPIngest.rtp_received: len(data) = {len(data)}") 53 | self.ring.pkt_queue.put(WIPkt(self, data, address, rtime)) 54 | 55 | def stream_update(self): 56 | self.ring.pkt_queue.put(WIStreamUpdate(self)) 57 | 58 | def stream_connect(self, ain:AudioInput): 59 | if isinstance(ain.vad_chunk_in, UUID): ain.vad_chunk_in = self.get_direct_soundout(ain.vad_chunk_in) 60 | if isinstance(ain.audio_in, UUID): ain.audio_in = self.get_direct_soundout(ain.audio_in) 61 | self.ring.pkt_queue.put(WIStreamConnect(self, ain)) 62 | 63 | def _proc_in_tread(self, wi:Union[WIPkt,WIStreamUpdate], svad:SileroVADWorker): 64 | def dprint(msg:str): return self.ring.dprint(f'InfernRTPIngest.run: {msg}') if self.ring.debug else None 65 | 66 | if isinstance(wi, WIStreamUpdate): 67 | dprint("stream update") 68 | self.jbuf = RtpJBuf(self.jb_size) 69 | self.last_output_lseq = None 70 | return 71 | if isinstance(wi, WIStreamConnect): 72 | dprint("stream connect") 73 | with self.ain_lock: 74 | self.ain = wi.ain 75 | return 76 | data, address, rtime = wi.data, wi.address, wi.rtime 77 | try: 78 | res = self.jbuf.udp_in(data) 79 | except RTPParseError as e: 80 | dprint(f"RTPParseError: {e}") 81 | return 82 | self.npkts += 1 83 | if self.npkts == 1: 84 | dprint(f"address={address}, rtime={rtime}, len(data) = {len(data)} data={data[:40]}") 85 | for pkt in res: 86 | if pkt.content.type == RTPFrameType.ERS: 87 | print(f"ERS packet received {pkt.content.lseq_start=}, {pkt.content.lseq_end=} {pkt.content.ts_diff=}") 88 | self.last_output_lseq = pkt.content.lseq_end 89 | rtp_data = self.codec.silence(pkt.content.ts_diff) 90 | else: 91 | if self.npkts < 10: 92 | dprint(f"{pkt.content.frame.rtp.lseq=}") 93 | assert self.last_output_lseq is None or pkt.content.frame.rtp.lseq == self.last_output_lseq + 1 94 | self.last_output_lseq = pkt.content.frame.rtp.lseq 95 | if self.npkts < 10: 96 | dprint(f"{len(pkt.rtp_data)=}, {type(pkt.rtp_data)=}") 97 | rtp_data = pkt.rtp_data 98 | self.vchan.ingest(svad, rtp_data, self.codec) 99 | if self.npkts < 10 and len(res) > 0: 100 | dprint(f"{res=}") 101 | 102 | def audio_chunk_out(self, chunk:AudioChunk, active:bool): 103 | chunk.active = active 104 | with self.ain_lock: 105 | if self.ain.audio_in is None: return 106 | self.ain.audio_in(chunk=chunk) 107 | 108 | def vad_chunk_out(self, chunk:AudioChunk): 109 | with self.ain_lock: 110 | if self.ain.vad_chunk_in is None: return 111 | self.ain.vad_chunk_in(chunk=chunk) 112 | 113 | class InfernRTPIngest(InfernWrkThread): 114 | debug = False 115 | pkt_queue: Queue[Union[WIPkt,WIStreamUpdate,WIStreamConnect]] 116 | _start_queue: Queue[int] 117 | def __init__(self, device:str): 118 | super().__init__() 119 | self.pkt_queue = Queue() 120 | self.device = device 121 | 122 | def start(self): 123 | self._start_queue = Queue() 124 | super().start() 125 | r = self._start_queue.get() 126 | if isinstance(r, Exception): 127 | super().join() 128 | raise r 129 
| del self._start_queue 130 | 131 | def dprint(self, *args): 132 | if self.debug: 133 | print(*args) 134 | 135 | def run(self): 136 | super().thread_started() 137 | try: 138 | svad = SileroVADWorker(self.device) 139 | svad.start() 140 | except Exception as e: 141 | self._start_queue.put(e) 142 | return 143 | self._start_queue.put(0) 144 | self.dprint("InfernRTPIngest started") 145 | data, address, rtime = (None, None, None) 146 | while self.get_state() == RTPWrkTRun: 147 | wi = self.pkt_queue.get() 148 | if wi is None: break 149 | wi.stream._proc_in_tread(wi, svad) 150 | svad.stop() 151 | # if data is not None: 152 | # self.dprint(f"InfernRTPIngest.run: last packet: address={address}, rtime={rtime}, len(data) = {len(data)} data={data[:40]}") 153 | # self.dprint(f"InfernRTPIngest.run: exiting, total packets received: {npkts}") 154 | 155 | def stop(self): 156 | self.pkt_queue.put(None) 157 | super().stop() 158 | self.dprint("InfernRTPIngest stopped") 159 | 160 | def __del__(self): 161 | self.dprint("InfernRTPIngest.__del__") 162 | -------------------------------------------------------------------------------- /RTP/RTPOutputWorker.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Union 2 | from fractions import Fraction 3 | import queue 4 | import threading 5 | from time import monotonic, sleep 6 | 7 | from rtpsynth.RtpSynth import RtpSynth 8 | import soundfile as sf 9 | 10 | from Core.Codecs.G711 import G711Codec 11 | from Core.AudioChunk import AudioChunk 12 | from Core.OutputMuxer import OutputMTMuxer 13 | from Core.AStreamMarkers import ASMarkerGeneric 14 | from RTP.RTPParams import RTPParams 15 | 16 | class RTPOutputWorker(threading.Thread): 17 | data_queue: queue.Queue[Union[AudioChunk, ASMarkerGeneric]] 18 | debug = False 19 | dl_ofname: Optional[str] = None 20 | data_log = None 21 | pkt_send_f = None 22 | state_lock: Optional[threading.Lock] = None 23 | frames_rcvd = 0 24 | frames_prcsd = 0 25 | has_ended = False 26 | codec: G711Codec 27 | samplerate_out: int 28 | out_ft: int # in ms 29 | 30 | def __init__(self, device, rtp_params:RTPParams): 31 | self.itime = monotonic() 32 | self.device = device 33 | #if os.path.exists(self.ofname): 34 | # self.data, _ = sf.read(self.ofname) 35 | self.data_queue = queue.Queue() 36 | self.codec = rtp_params.codec().to(device) 37 | self.samplerate_out = self.codec.srate 38 | self.state_lock = threading.Lock() 39 | self.out_ft = rtp_params.out_ptime 40 | super().__init__(target=self.consume_audio) 41 | self.daemon = True 42 | 43 | def enable_datalog(self, dl_ofname): 44 | self.dl_ofname = dl_ofname 45 | 46 | def set_pkt_send_f(self, pkt_send_f): 47 | self.pkt_send_f = pkt_send_f 48 | 49 | def ended(self): 50 | self.state_lock.acquire() 51 | t = self.has_ended 52 | self.state_lock.release() 53 | return t 54 | 55 | def end(self): 56 | self.state_lock.acquire() 57 | self.has_ended = True 58 | self.state_lock.release() 59 | 60 | def update_frm_ctrs(self, rcvd_inc=0, prcsd_inc=0): 61 | self.state_lock.acquire() 62 | self.frames_rcvd += rcvd_inc 63 | self.frames_prcsd += prcsd_inc 64 | self.state_lock.release() 65 | 66 | def get_frm_ctrs(self): 67 | self.state_lock.acquire() 68 | res = (self.frames_rcvd, self.frames_prcsd) 69 | self.state_lock.release() 70 | return res 71 | 72 | def soundout(self, chunk:Union[AudioChunk, ASMarkerGeneric]): 73 | #print(f'soundout: {monotonic():4.3f}') 74 | #return (0, False) 75 | ismark = isinstance(chunk, ASMarkerGeneric) 76 | assert ismark or chunk.audio.size(0) > 0 77
| if (self.debug or chunk.debug) and not ismark: 78 | print(f'len(chunk) = {len(chunk.audio)}') 79 | if not ismark: 80 | chunk.audio = chunk.audio.to(self.device) 81 | self.data_queue.put(chunk) 82 | return (self.data_queue.qsize(), False) 83 | 84 | def consume_audio(self): 85 | out_pt = self.codec.ptype 86 | out_fsize = self.samplerate_out * self.out_ft // 1000 87 | ptime = Fraction(0) 88 | stime = None 89 | rsynth = RtpSynth(self.codec.crate, self.out_ft) 90 | qtimeout = Fraction(self.out_ft, 1000) 91 | out_qsize = self.out_ft * (self.samplerate_out // 10 // self.out_ft) # ~0.1 sec (rounded to a frame size) 92 | mix = OutputMTMuxer(self.samplerate_out, out_qsize, self.device) 93 | while not self.ended(): 94 | ctime = monotonic() 95 | try: 96 | chunk_n = self.data_queue.get(block=False) 97 | except queue.Empty: 98 | chunk_o_n = mix.idle(self) 99 | if chunk_o_n is None: 100 | if stime is not None: 101 | ptime += qtimeout 102 | etime = ctime - stime 103 | if ptime > etime: 104 | sleep(ptime - etime) 105 | if self.debug: print(f'{self}.consume_audio, skip {ptime - etime=}') 106 | rsynth.skip(1) 107 | else: 108 | sleep(float(qtimeout)) 109 | continue 110 | else: 111 | #if isinstance(chunk_n, AudioChunk): self.update_frm_ctrs(rcvd_inc=chunk_n.audio.size(0)) 112 | mix.chunk_in(chunk_n) 113 | continue 114 | 115 | if stime is None: 116 | stime = ctime 117 | 118 | chunk_o_n = self.codec.encode(chunk_o_n) 119 | out_psize = self.codec.d2e_frames(out_fsize) 120 | while len(chunk_o_n) >= out_psize: 121 | #self.update_frm_ctrs(prcsd_inc=out_fsize*2) 122 | packet = chunk_o_n[:out_psize] 123 | assert len(packet) == out_psize, f'{len(packet)=}, {out_psize=}' 124 | chunk_o_n = chunk_o_n[out_psize:] 125 | 126 | ptime += Fraction(out_fsize, self.samplerate_out) 127 | etime = ctime - stime 128 | 129 | #print(packet.size()) 130 | #packet = (packet * 20000).to(torch.int16) 131 | #packet = packet.byte().cpu().numpy() 132 | #packet = self.codec.encode(packet) 133 | #print('packet', packet.min(), packet.max(), packet[:10]) 134 | #print(len(packet), packet[:10]) 135 | pkt = rsynth.next_pkt(out_psize, out_pt, pload=packet) 136 | if self.pkt_send_f is not None: 137 | self.pkt_send_f(pkt) 138 | #print(len(pkt)) 139 | if chunk_n.debug or self.debug: 140 | print(f'{self}.consume_audio({etime=}, {ptime=}') 141 | if self.ended(): 142 | break 143 | if ptime > etime: 144 | sleep(ptime - etime) 145 | if self.ended(): 146 | break 147 | ctime = monotonic() 148 | if chunk_n.debug or self.debug: 149 | print(f'consume_audio, sleep({ptime - etime})') 150 | #if done_cb is not None: 151 | # rsynth.resync() 152 | # rsynth.set_mbt(1) 153 | # ptime = 0.0 154 | # stime = None 155 | # done_cb(self) 156 | 157 | def __del__(self): 158 | if self.debug: 159 | print('RTPOutputWorker.__del__') 160 | #self.worker_thread.join() 161 | if self.data_log is None: 162 | return 163 | amplification_dB = 20.0 164 | data = self.data_log #* (10 ** (amplification_dB / 20)) 165 | sf.write(self.dl_ofname, data.detach().cpu().numpy(), 166 | samplerate=self.samplerate_out) 167 | -------------------------------------------------------------------------------- /RTP/RTPParams.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Optional, Type, Union 2 | from Core.Codecs.G711 import G711Codec 3 | from Core.Codecs.G722 import G722Codec 4 | 5 | class RTPParams(): 6 | rtp_target: Tuple[str, int] 7 | out_ptime: int 8 | default_ptime: int = 20 9 | codec: Type[Union[G711Codec, G722Codec]] 10 | def 
__init__(self, rtp_target:Tuple[str, int], out_ptime:Optional[int]): 11 | assert isinstance(rtp_target, tuple) and len(rtp_target) == 2 12 | self.rtp_target = rtp_target 13 | self.out_ptime = out_ptime if out_ptime is not None else self.default_ptime 14 | -------------------------------------------------------------------------------- /SIP/InfernSIP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2024 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
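# InfernSIP wraps a sippy SipTransactionManager into the node's SIP endpoint:
# profiles with `register` enabled are signed in through SipRegistrationAgent,
# inbound INVITEs are matched to a profile by comparing the request source
# against the profile's next-hop address (nh_addr) and handed to that
# profile's application as a RemoteSessionOffer, and live dialogs are tracked
# in a WeakValueDictionary keyed by session UUID so ended calls vanish without
# explicit cleanup.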
25 | 26 | from typing import Optional, Dict 27 | from weakref import WeakValueDictionary 28 | from queue import Queue 29 | from threading import Lock 30 | 31 | from sippy.SipConf import SipConf 32 | from sippy.SipTransactionManager import SipTransactionManager 33 | from sippy.SipURL import SipURL 34 | from sippy.SipRegistrationAgent import SipRegistrationAgent 35 | from sippy.misc import local4remote 36 | 37 | #from Core.InfernConfig import InfernConfig 38 | 39 | from .InfernUAS import InfernLazyUAS 40 | from .InfernUAC import InfernUAC 41 | from .InfernUA import InfernUA 42 | from .InfernSIPProfile import InfernSIPProfile 43 | from .RemoteSession import RemoteSessionOffer, NewRemoteSessionRequest 44 | 45 | from utils.tts import human_readable_time, hal_set, smith_set, \ 46 | bender_set 47 | 48 | def good(*a): 49 | #ED2.breakLoop(0) 50 | pass 51 | 52 | def bad(*a): 53 | #ED2.breakLoop(1) 54 | pass 55 | 56 | class InfernSIP(): 57 | _c: Dict[str, InfernSIPProfile] 58 | ua = None 59 | body = None 60 | ragent = None 61 | sip_actr = None 62 | sippy_c = None 63 | sessions: WeakValueDictionary 64 | session_lock: Lock 65 | 66 | def __init__(self, sip_actr:'InfernSIPActor', rtp_actr, inf_c:'InfernConfig'): 67 | sip_c = inf_c.sip_conf 68 | self.sippy_c = {'_sip_address':sip_c.laddr, 69 | '_sip_port':sip_c.lport, 70 | '_sip_logger':sip_c.logger} 71 | self.sip_actr, self.rtp_actr = sip_actr, rtp_actr 72 | self.sessions = WeakValueDictionary() 73 | self.session_lock = Lock() 74 | udsc, udsoc = SipTransactionManager.model_udp_server 75 | udsoc.nworkers = 1 76 | SipConf.my_uaname = 'Infernos' 77 | stm = SipTransactionManager(self.sippy_c, self.recvRequest) 78 | self.sippy_c['_sip_tm'] = stm 79 | #raise Exception(f'{inf_c.connectors}') 80 | self._c = inf_c.connectors 81 | for n, v in self._c.items(): 82 | if not v.register: continue 83 | proxy, port = v.nh_addr 84 | aor = SipURL(username = v.cli, host = proxy, port = port) 85 | caddr = local4remote(proxy) 86 | cport = self.sippy_c['_sip_port'] 87 | contact = SipURL(username = v.cli, host = caddr, port = cport) 88 | ragent = SipRegistrationAgent(self.sippy_c, aor, contact, 89 | user=v.authname, passw=v.authpass, 90 | rok_cb=good, rfail_cb=bad) 91 | ragent.rmsg.getHFBody('to').getUrl().username = v.cli 92 | ragent.doregister() 93 | 94 | def recvRequest(self, req, sip_t): 95 | if req.getMethod() in ('NOTIFY', 'PING'): 96 | # Whynot?
97 | return (req.genResponse(200, 'OK'), None, None) 98 | if req.getMethod() == 'INVITE': 99 | #if self.rserv != None: 100 | # return (req.genResponse(486, 'Busy Here'), None, None) 101 | # New dialog 102 | source = req.getSource() 103 | for n, sip_prof in self._c.items(): 104 | assert type(source) == type(sip_prof.nh_addr) 105 | if source == sip_prof.nh_addr: 106 | break 107 | else: 108 | return (req.genResponse(500, 'Nobody is home'), None, None) 109 | isess = InfernLazyUAS(self, sip_prof, req, sip_t) 110 | with self.session_lock: 111 | self.sessions[isess.id] = isess 112 | rso = RemoteSessionOffer(self, isess) 113 | sip_prof.new_sess_offer(rso) 114 | return 115 | return (req.genResponse(501, 'Not Implemented'), None, None) 116 | 117 | def new_session(self, msg:NewRemoteSessionRequest, rval:Optional[Queue]=None): 118 | uac = InfernUAC(self, msg) 119 | with self.session_lock: 120 | self.sessions[uac.id] = uac 121 | ret = (uac, uac.rsess) 122 | if rval is None: return ret 123 | rval.put(ret) 124 | 125 | def get_session(self, sip_sess_id) -> InfernUA: 126 | with self.session_lock: 127 | return self.sessions[sip_sess_id] 128 | 129 | # def getPrompts(self): 130 | # return [f'{human_readable_time()}',] + list(self.prompts) 131 | -------------------------------------------------------------------------------- /SIP/InfernSIPConf.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from os.path import expanduser 3 | 4 | from sippy.SipConf import SipConf 5 | from sippy.SipLogger import SipLogger 6 | 7 | from Core.ConfigValidators import validate_port_range 8 | 9 | class InfernSIPConf(): 10 | schema: dict = { 11 | 'settings': { 12 | 'type': 'dict', 13 | 'schema': { 14 | 'bind': { 15 | 'type': 'string', 16 | 'regex': r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:[1-9][0-9]{0,4}|:0)?$', 17 | 'check_with': validate_port_range 18 | } 19 | } 20 | } 21 | } 22 | logger = None 23 | 24 | def __init__(self, conf:Optional[dict]=None): 25 | self.logger = SipLogger('Infernos', logfile = expanduser('~/.Infernos.log')) 26 | if conf is not None: 27 | try: 28 | bind = conf['bind'].split(':', 1) 29 | except KeyError: pass 30 | else: 31 | port = int(bind[1]) if len(bind) == 2 else SipConf.my_port 32 | self.laddr = bind[0] 33 | self.lport = port 34 | return 35 | self.laddr = SipConf.my_address 36 | self.lport = SipConf.my_port 37 | -------------------------------------------------------------------------------- /SIP/InfernSIPProfile.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | from functools import partial 3 | 4 | from Core.ConfigValidators import validate_port_range 5 | 6 | class InfernSIPProfile(): 7 | schema: dict = { 8 | 'profiles': { 9 | 'type': 'dict', 10 | 'keysrules': {'type': 'string'}, 11 | 'valuesrules': { 12 | 'type': 'dict', 13 | 'schema': { 14 | 'sip_server': { 15 | 'type': 'string', 16 | 'regex': r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:[1-9][0-9]{0,4}|:0)?$', 17 | 'check_with': validate_port_range 18 | }, 19 | 'aor': {'type': 'string'}, 20 | 'username': {'type': 'string'}, 21 | 'password': {'type': 'string'}, 22 | 'register': {'type': 'boolean'}, 23 | 'sink': {'type': 'string'}, 24 | } 25 | } 26 | } 27 | } 28 | name: str 29 | cli: str = 'infernos_uas' 30 | aor: str 31 | authname: Optional[str] = None 32 | authpass: Optional[str] = None 33 | nh_addr: Optional[Tuple[str, int]] = None 34 | register: bool = False 35 | _sink: Optional[str] 36 | new_sess_offer: 
callable = None 37 | 38 | def __init__(self, name, conf): 39 | self.name = name 40 | self.cli = conf.get('username', self.cli) 41 | self.aor = conf.get('aor', self.cli) 42 | self.authname = conf.get('username', self.authname) 43 | self.authpass = conf.get('password', self.authpass) 44 | sip_server = conf['sip_server'].split(':', 1) 45 | port = int(sip_server[1]) if len(sip_server) == 2 else 5060 46 | self.nh_addr = (sip_server[0], port) 47 | self.register = conf.get('register', self.register) 48 | self._sink = conf.get('sink', None) 49 | 50 | def finalize(self, sip_actr: 'InfernSIPActor', iconf: 'InfernConfig'): 51 | if self._sink is None: return 52 | sact = iconf.apps[self._sink].getActor(iconf, sip_actr) 53 | self.new_sess_offer = partial(sact.new_sip_session_received.remote) 54 | -------------------------------------------------------------------------------- /SIP/InfernUA.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved. 2 | # 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation and/or 13 | # other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
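# InfernUA below is the base user agent shared by the UAC and UAS sides.  Its
# extract_rtp_params() scans the peer's SDP audio sections for the first codec
# in `accept` (matched by RTP payload type in the m= line), picks up an
# optional a=ptime attribute, and returns the negotiated target address, ptime
# and codec class as an RTPParams; a missing body or an unsupported codec is
# answered with an InfernUASFailure (488) instead.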
25 | 26 | from uuid import uuid4, UUID 27 | 28 | from sippy.UA import UA 29 | from sippy.CCEvents import CCEventFail, CCEventUpdate, CCEventConnect 30 | from sippy.MsgBody import MsgBody 31 | from sippy.SdpOrigin import SdpOrigin 32 | from sippy.SipConf import SipConf 33 | from sippy.SdpMedia import MTAudio 34 | from sippy.SipReason import SipReason 35 | 36 | from Cluster.RemoteRTPGen import RemoteRTPGen 37 | from RTP.RTPParams import RTPParams 38 | from Core.Codecs.G711 import G711Codec 39 | 40 | ULAW_PT = 0 41 | ULAW_RM = 'PCMU/8000' 42 | ULAW_PTIME = RTPParams.default_ptime 43 | body_txt = 'v=0\r\n' + \ 44 | 'o=- 380960 380960 IN IP4 192.168.22.95\r\n' + \ 45 | 's=-\r\n' + \ 46 | 'c=IN IP4 192.168.22.95\r\n' + \ 47 | 't=0 0\r\n' + \ 48 | f'm=audio 16474 RTP/AVP {ULAW_PT}\r\n' + \ 49 | 'a=sendrecv\r\n' + \ 50 | '\r\n' 51 | model_body = MsgBody(body_txt) 52 | model_body.parse() 53 | 54 | class InfernUASFailure(CCEventFail): 55 | default_code = 488 56 | _code_msg = {default_code : 'Not Acceptable Here', 57 | 500 : 'Server Internal Error'} 58 | def __init__(self, reason=None, code=default_code): 59 | self.code, self.msg = code, self._code_msg[code] 60 | super().__init__((self.code, self.msg)) 61 | self.reason = SipReason(protocol='SIP', cause=self.code, 62 | reason=reason) 63 | 64 | class InfernUA(UA): 65 | debug = True 66 | id: UUID 67 | rsess: RemoteRTPGen 68 | our_sdp_body: MsgBody 69 | 70 | def __init__(self, isip, nh_address=None): 71 | self.id = uuid4() 72 | self.sip_actr, self.rtp_actr = isip.sip_actr, isip.rtp_actr 73 | super().__init__(isip.sippy_c, self.outEvent, nh_address=nh_address) 74 | 75 | def extract_rtp_target(self, sdp_body): 76 | p = self.extract_rtp_params(sdp_body) 77 | if p is None: return None 78 | return p.rtp_target 79 | 80 | def extract_rtp_params(self, sdp_body, accept=(G711Codec,)): 81 | if sdp_body == None: 82 | event = InfernUASFailure("late offer/answer is not supported at this time, sorry") 83 | self.recvEvent(event) 84 | return 85 | sdp_body.parse() 86 | try: 87 | codec, sect = next((ac, s) for ac in accept for s in sdp_body.content.sections 88 | if s.m_header.type == MTAudio and ac.ptype in s.m_header.formats) 89 | except StopIteration: 90 | event = InfernUASFailure("Unsupported audio codec, sorry") 91 | self.recvEvent(event) 92 | return None 93 | try: 94 | ptime = int(next(x for x in sect.a_headers if x.name == 'ptime').value) 95 | except StopIteration: 96 | ptime = None 97 | r = RTPParams((sect.c_header.addr, sect.m_header.port), ptime) 98 | r.codec = codec 99 | return r 100 | 101 | def outEvent(self, event, ua): 102 | if isinstance(event, CCEventUpdate): 103 | sdp_body = event.getData() 104 | rtp_params = self.extract_rtp_params(sdp_body) 105 | if rtp_params is None: return 106 | self.rsess.update(rtp_params) 107 | self.send_uas_resp() 108 | return 109 | 110 | def send_uas_resp(self): 111 | self.our_sdp_body.content.o_header = SdpOrigin() 112 | oevent = CCEventConnect((200, 'OK', self.our_sdp_body.getCopy())) 113 | return super().recvEvent(oevent) 114 | 115 | def sess_term(self, ua=None, rtime=None, origin=None, result=0): 116 | print('disconnected') 117 | if self.rsess is None: 118 | return 119 | self.rsess.end() 120 | self.rsess.join() 121 | if ua != self: 122 | self.disconnect() 123 | self.rsess = None 124 | 125 | def __del__(self): 126 | if self.debug: 127 | print('InfernUA.__del__') 128 | -------------------------------------------------------------------------------- /SIP/InfernUAC.py: 
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from sippy.CCEvents import CCEventTry, CCEventConnect
 4 | from sippy.SipCallId import SipCallId
 5 | from sippy.SdpMediaDescription import a_header
 6 | 
 7 | from Cluster.RemoteRTPGen import RemoteRTPGen
 8 | from SIP.InfernUA import InfernUA, model_body
 9 | from RTP.RTPParams import RTPParams
10 | from Core.Codecs.G711 import G711Codec
11 | from Core.Codecs.G722 import G722Codec
12 | from .RemoteSession import NewRemoteSessionRequest
13 | from .InfernUAS import InfernUAS
14 | from .InfernSIPProfile import InfernSIPProfile
15 | 
16 | class InfernUAC(InfernUA):
17 |     uas: Optional[InfernUAS] = None
18 |     offer = (G711Codec, G722Codec)
19 |     def __init__(self, isip, msg:NewRemoteSessionRequest):
20 |         sip_prof: InfernSIPProfile = msg.sip_prof
21 |         if msg.conn_sip_sess_id is not None:
22 |             self.uas = isip.get_session(msg.conn_sip_sess_id)
23 |         super().__init__(isip, nh_address = sip_prof.nh_addr)
24 |         # register our own teardown handler first, then any caller-supplied
25 |         # callback, so that the latter is not clobbered
26 |         self.disc_cbs = (self.sess_term,)
27 |         if msg.disc_cb is not None:
28 |             self.disc_cbs += (msg.disc_cb,)
29 |         call_id = SipCallId()
30 |         body = model_body.getCopy()
31 |         rtp_params = RTPParams((sip_prof.nh_addr[0], 0), None)
32 |         rtp_params.codec = self.offer[0]
33 |         self.rsess = RemoteRTPGen(isip.rtp_actr, rtp_params)
34 |         print(f'{self.rsess.rtp_address=}')
35 |         sect = body.content.sections[0]
36 |         sect.c_header.addr, sect.m_header.port = self.rsess.rtp_address
37 |         sect.a_headers.insert(0, a_header(f'ptime:{rtp_params.out_ptime}'))
38 |         for i, codec in enumerate(self.offer):
39 |             sect.a_headers.insert(i, a_header(codec.rtpmap()))
40 |         self.our_sdp_body = body
41 |         event = CCEventTry((call_id, sip_prof.cli, msg.cld, body, None, "Dummy Joe"))
42 |         self.username = sip_prof.authname
43 |         self.password = sip_prof.authpass
44 |         self.recvEvent(event)
45 | 
46 |     def outEvent(self, event, ua):
47 |         if isinstance(event, CCEventConnect):
48 |             code, reason, sdp_body = event.getData()
49 |             rtp_params = self.extract_rtp_params(sdp_body)
50 |             if rtp_params is None: return
51 |             self.rsess.update(rtp_params)
52 |             if self.uas is not None:
53 |                 self.uas.recvEvent(event)
54 | 
--------------------------------------------------------------------------------
/SIP/InfernUAS.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2018 Sippy Software, Inc. All rights reserved.
 2 | #
 3 | # All rights reserved.
 4 | #
 5 | # Redistribution and use in source and binary forms, with or without modification,
 6 | # are permitted provided that the following conditions are met:
 7 | #
 8 | # 1. Redistributions of source code must retain the above copyright notice, this
 9 | # list of conditions and the following disclaimer.
10 | #
11 | # 2. Redistributions in binary form must reproduce the above copyright notice,
12 | # this list of conditions and the following disclaimer in the documentation and/or
13 | # other materials provided with the distribution.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 
26 | from typing import Optional
27 | from uuid import uuid4, UUID
28 | from queue import Queue
29 | 
30 | from sippy.CCEvents import CCEventTry, CCEventConnect
31 | from sippy.SdpMediaDescription import a_header
32 | 
33 | from Cluster.RemoteRTPGen import RemoteRTPGen, RTPGenError
34 | from SIP.InfernUA import InfernUA, model_body, InfernUASFailure
35 | from SIP.RemoteSession import RemoteSessionAccept
36 | from SIP.InfernSIPProfile import InfernSIPProfile
37 | from SIP.SipSessInfo import SipSessInfo
38 | from Core.Codecs.G711 import G711Codec
39 | from Core.Codecs.G722 import G722Codec
40 | 
41 | class CCEventSentDone: pass
42 | class CCEventSTTTextIn:
43 |     def __init__(self, direction):
44 |         self.direction = direction
45 | 
46 | class InfernUAS(InfernUA):
47 |     rsess: Optional[RemoteRTPGen] = None
48 |     etry: Optional[CCEventTry] = None
49 |     auto_answer: bool
50 |     accept_codecs = (G722Codec, G711Codec)
51 |     def __init__(self, isip, req, sip_t, auto_answer=True):
52 |         super().__init__(isip)
53 |         assert sip_t.noack_cb is None
54 |         self.auto_answer = auto_answer
55 |         sip_t.noack_cb = self.sess_term
56 |         # self.prompts = isip.getPrompts()
57 |         self.recvRequest(req, sip_t)
58 | 
59 |     def outEvent(self, event, ua):
60 |         if not isinstance(event, CCEventTry):
61 |             super().outEvent(event, ua)
62 |             return
63 |         self.etry = event
64 |         cId, cli, cld, sdp_body, auth, caller_name = event.getData()
65 |         rtp_params = self.extract_rtp_params(sdp_body, accept=self.accept_codecs)
66 |         if rtp_params is None:
67 |             event = InfernUASFailure(code=500)
68 |             self.recvEvent(event)
69 |             return
70 | 
71 |         try:
72 |             self.rsess = RemoteRTPGen(self.rtp_actr, rtp_params)
73 |         except RTPGenError as e:
74 |             event = InfernUASFailure(code=500, reason=str(e))
75 |             self.recvEvent(event)
76 |             raise e
77 |         self.disc_cbs = (self.sess_term,)
78 |         body = model_body.getCopy()
79 |         sect = body.content.sections[0]
80 |         sect.c_header.addr, sect.m_header.port = self.rsess.rtp_address
81 |         sect.a_headers.insert(0, a_header(f'ptime:{rtp_params.out_ptime}'))
82 |         sect.a_headers.insert(0, a_header(rtp_params.codec.rtpmap()))
83 |         sect.m_header.formats = [rtp_params.codec.ptype,]
84 |         self.our_sdp_body = body
85 |         if self.auto_answer:
86 |             self.send_uas_resp()
87 | 
88 |     def recvEvent(self, event):
89 |         if not self.auto_answer and isinstance(event, CCEventConnect):
90 |             return self.send_uas_resp()
91 |         super().recvEvent(event)
92 | 
93 | class InfernLazyUAS(InfernUAS):
94 |     id: UUID
95 |     def __init__(self, sip_stack:'InfernSIP', sip_prof:InfernSIPProfile, req, sip_t):
96 |         self._id = self.id = uuid4()
97 |         self._sip_stack = sip_stack
98 |         self._sip_prof = sip_prof
99 |         self._req = req
100 |         self._sip_t = sip_t
101 |         sip_t.cancel_cb = self.cancelled
102 |         resp = req.genResponse(100, 'Trying')
103 |         sip_stack.sippy_c['_sip_tm'].sendResponse(resp)
104 | 
105 |     def accept(self, rsa:RemoteSessionAccept, rval:Queue):
106 |         self._sip_t.cancel_cb = None
107 |         super().__init__(self._sip_stack, self._req, self._sip_t, rsa.auto_answer)
108 |         self.id = self._id
109 |         del self._sip_stack, self._req, self._sip_t, self._id
110 |         if rsa.disc_cb is not None:
111 |             self.disc_cbs += (rsa.disc_cb,)
112 |         rval.put(self.rsess)
113 | 
114 |     def reject(self):
115 |         resp = self._req.genResponse(666, 'OOPS')
116 |         self._sip_stack.sippy_c['_sip_tm'].sendResponse(resp)
117 |         del self._sip_stack, self._req, self._sip_t, self._id
118 | 
119 |     def cancelled(self, *args):
120 |         del self._sip_stack, self._req, self._sip_t, self._id
121 | 
122 |     def get_session_info(self) -> SipSessInfo:
123 |         call_id = str(self._req.getHFBody('call-id'))
124 |         from_hf = self._req.getHFBody('from')
125 |         from_name = from_hf.getUri().name
126 |         from_number = from_hf.getUrl().username
127 |         return SipSessInfo(call_id, from_number, from_name)
128 | 
--------------------------------------------------------------------------------
/SIP/RemoteSession.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Tuple
 2 | from functools import partial
 3 | from uuid import UUID
 4 | 
 5 | from SIP.SipSessInfo import SipSessInfo
 6 | from .InfernSIPProfile import InfernSIPProfile
 7 | 
 8 | class RemoteSessionOffer():
 9 |     sip_sess_id: UUID
10 |     sess_info: SipSessInfo
11 |     accept: callable
12 |     reject: callable
13 |     def __init__(self, sip_stack:'InfernSIP', ua:'InfernLazyUAS'):
14 |         self.sip_sess_id = ua.id
15 |         self.sess_info = ua.get_session_info()
16 |         self.accept = partial(sip_stack.sip_actr.new_sess_accept.remote, sip_sess_id=ua.id)
17 |         self.reject = partial(sip_stack.sip_actr.new_sess_reject.remote, sip_sess_id=ua.id)
18 | 
19 | class RemoteSessionAccept():
20 |     disc_cb: Optional[callable] = None
21 |     auto_answer: bool = False
22 |     def __init__(self, disc_cb:Optional[callable]=None, auto_answer:bool=False):
23 |         self.disc_cb, self.auto_answer = disc_cb, auto_answer
24 | 
25 | class NewRemoteSessionRequest():
26 |     cld: str
27 |     sip_prof: InfernSIPProfile
28 |     disc_cb: Optional[callable] = None
29 |     conn_sip_sess_id: Optional[UUID] = None
30 |     def __init__(self, cld:str, sip_prof:InfernSIPProfile, disc_cb:Optional[callable]=None):
31 |         self.cld, self.disc_cb, self.sip_prof = cld, disc_cb, sip_prof
32 | 
--------------------------------------------------------------------------------
/SIP/SipSessInfo.py:
--------------------------------------------------------------------------------
 1 | class SipSessInfo():
 2 |     call_id: str
 3 |     from_number: str
 4 |     from_name: str
 5 | 
 6 |     def __init__(self, call_id, from_number, from_name):
 7 |         self.call_id = call_id
 8 |         self.from_number = from_number
 9 |         self.from_name = from_name
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
 1 | sip:
 2 |   settings:
 3 |     bind: 192.168.23.109:5060
 4 |   profiles:
 5 |     foo:
 6 |       sip_server: 192.168.23.190:6666
 7 |       sink: apps/live_translator/configuration1
 8 |       username: 'incoming'
 9 |       password: 'user'
10 |       register: True
11 |     bar:
12 |       sip_server: 52.117.200.117:5060
13 |       username: '1929132'
14 |       password: 'tj9uh22'
15 | rtp:
16 |   settings:
17 |     min_port: 1024
18 |     max_port: 2048
19 | apps:
20 |   live_translator:
21 |     profiles:
22 |       configuration1:
23 |         stt_langs: ['en', 'pt']
24 |         tts_langs: ['pt', 'en']
25 |         outbound: sip/bar;cld=1929133
26 | 
--------------------------------------------------------------------------------
/config/InfernGlobals.py:
--------------------------------------------------------------------------------
 1 | from safetorch.InfernTorcher import InfernTorcher
 2 | from threading import Lock
 3 | from functools import lru_cache
 4 | from time import monotonic
 5 | 
 6 | import torchaudio.transforms as T
 7 | 
 8 | from Core.T2T.Translator import Translator
 9 | 
10 | class InfernGlobals():
11 |     _lock = Lock()
12 |     _instance = None
13 |     torcher: InfernTorcher
14 | 
15 |     @lru_cache
16 |     def __new__(cls):
17 |         with cls._lock:
18 |             if cls._instance is None:
19 |                 cls._instance = super(InfernGlobals, cls).__new__(cls)
20 |                 cls.torcher = InfernTorcher()
21 |             return cls._instance
22 | 
23 |     @staticmethod
24 |     @lru_cache(maxsize=8)
25 |     def get_resampler(from_sr:int, to_sr:int, device:str='cpu'):
26 |         return T.Resample(orig_freq=from_sr, new_freq=to_sr).to(device)
27 | 
28 |     @staticmethod
29 |     @lru_cache(maxsize=8)
30 |     def get_translator(from_lang:str, to_lang:str, **kwa):
31 |         return Translator(from_lang, to_lang, **kwa)
32 | 
33 |     def stdtss():
34 |         return f'{monotonic():4.3f}'
35 | 
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1.7-labs
 2 | 
 3 | ARG BASE_IMAGE=ubuntu:24.10
 4 | FROM $BASE_IMAGE AS build
 5 | LABEL maintainer="Maksym Sobolyev "
 6 | 
 7 | USER root
 8 | 
 9 | # Build & install everything
10 | WORKDIR /tmp
11 | ENV DEBIAN_FRONTEND=noninteractive
12 | # Keep downloaded packages in a cache mount; the mount only exists for the
13 | # duration of a single RUN, so it is attached to each apt-get invocation.
14 | ARG APT_UPDATE="apt-get update"
15 | RUN --mount=type=cache,target=/var/cache/apt ${APT_UPDATE}
16 | ARG APT_UPGRADE="apt-get upgrade -y"
17 | RUN --mount=type=cache,target=/var/cache/apt ${APT_UPGRADE}
18 | ARG APT_INSTALL="apt-get install --no-install-recommends -y"
19 | RUN --mount=type=cache,target=/var/cache/apt ${APT_INSTALL} lsb-release ca-certificates
20 | COPY docker/install_conda.sh .
21 | RUN ./install_conda.sh
22 | COPY docker/setup_conda.sh .
23 | ARG PYTHON_VER
24 | ARG CONDA_MAINENV
25 | ENV PYTHON_CMD="python${PYTHON_VER}"
26 | RUN ./setup_conda.sh
27 | COPY docker/install_hw.sh .
28 | ARG INFER_HW
29 | ENV CONDA_ACTIVATE="eval . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_MAINENV}"
30 | RUN ./install_hw.sh
31 | COPY docker/install_requirements.sh docker/intel-ray.diff requirements.txt .
32 | ENV CONDA_MAINENV="${CONDA_MAINENV}"
33 | RUN ./install_requirements.sh
34 | 
35 | COPY --exclude=.git --exclude=.github --link . /Infernos
36 | WORKDIR /Infernos
--------------------------------------------------------------------------------
/docker/install_conda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | ${APT_INSTALL} curl gpg
 7 | curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > /usr/share/keyrings/conda-archive-keyring.gpg
 8 | 
 9 | echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list
10 | 
11 | ${APT_UPDATE}
12 | ${APT_INSTALL} conda
13 | . /opt/conda/etc/profile.d/conda.sh
14 | conda update -y conda
15 | rm -r ~/.cache
16 | mkdir ~/.cache
17 | 
--------------------------------------------------------------------------------
/docker/install_hw.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | PIP_INSTALL="${PYTHON_CMD} -m pip install"
 7 | 
 8 | ${CONDA_ACTIVATE}
 9 | 
10 | case "${INFER_HW}" in
11 | nvidia)
12 |     ;;
13 | intel)
14 |     curl https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \
15 |         gpg --dearmor --output /usr/share/keyrings/oneapi-archive-keyring.gpg
16 |     echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \
17 |         tee /etc/apt/sources.list.d/oneAPI.list
18 |     ${APT_UPDATE}
19 |     ${APT_INSTALL} libze1 ocl-icd-libopencl1
20 |     ${APT_INSTALL} intel-oneapi-dpcpp-cpp-2024.1=2024.1.0-963 intel-oneapi-mkl-devel=2024.1.0-691
21 |     apt-mark hold intel-oneapi-dpcpp-cpp-2024.1 intel-oneapi-mkl-devel
22 |     ${PIP_INSTALL} torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 \
23 |         intel-extension-for-pytorch==2.1.30.post0 oneccl_bind_pt==2.1.300+xpu \
24 |         --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
25 |     printf "/opt/intel/oneapi/mkl/2024.1/lib\n/opt/intel/oneapi/compiler/2024.1/lib\n" > \
26 |         /etc/ld.so.conf.d/zzz-intel-oneapi.conf
27 |     ldconfig
28 |     ;;
29 | *)
30 |     echo "Unknown INFER_HW: '${INFER_HW}'" >&2
31 |     false
32 |     ;;
33 | esac
34 | 
35 | rm -r ~/.cache
36 | mkdir ~/.cache
37 | 
--------------------------------------------------------------------------------
/docker/install_requirements.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | DEV_PKGS="cmake pkg-config make git patch"
 7 | PIP_INSTALL="${PYTHON_CMD} -m pip install"
 8 | 
 9 | if [ "${INFER_HW}" != "intel" ]
10 | then
11 |     DEV_PKGS="${DEV_PKGS} gcc g++ libc6-dev"
12 | fi
13 | 
14 | ${APT_INSTALL} ${DEV_PKGS}
15 | ${CONDA_ACTIVATE}
16 | 
17 | ${PIP_INSTALL} -r requirements.txt
18 | 
19 | if [ "${INFER_HW}" = "intel" ]
20 | then
21 |     patch -d "/opt/conda/envs/${CONDA_MAINENV}/lib/python${PYTHON_VER}/site-packages" \
22 |         -p2 -s < intel-ray.diff
23 |     find "/opt/conda" -name "libstdc++.so.6*" -delete
24 | fi
25 | 
26 | apt-get remove -y ${DEV_PKGS}
27 | apt-get autoremove -y
28 | rm -r ~/.cache
29 | mkdir ~/.cache
30 | 
--------------------------------------------------------------------------------
/docker/intel-ray.diff:
--------------------------------------------------------------------------------
 1 | commit 85baaa1c10a957c747f54ec0705e6b7cbfa972d1
 2 | Author: Maksym Sobolyev 
 3 | Date:   Tue Mar 12 22:59:59 2024 -0700
 4 | 
 5 |     Hack on ipex.
 6 | 
 7 | diff --git a/python/ray/_private/workers/default_worker.py b/python/ray/_private/workers/default_worker.py
 8 | index 4c2109831c..62115940d0 100644
 9 | --- a/python/ray/_private/workers/default_worker.py
10 | +++ b/python/ray/_private/workers/default_worker.py
11 | @@ -1,3 +1,6 @@
12 | +try: import intel_extension_for_pytorch as ipex
13 | +except ModuleNotFoundError: ipex = None
14 | +
15 |  import os
16 |  import argparse
17 |  import base64
18 | 
--------------------------------------------------------------------------------
/docker/setup_conda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | set -e
 4 | set -x
 5 | 
 6 | . /opt/conda/etc/profile.d/conda.sh
 7 | conda create -y --name "${CONDA_MAINENV}" python=${PYTHON_VER}
 8 | conda activate "${CONDA_MAINENV}"
 9 | conda install -y pip
10 | ${PYTHON_CMD} -m pip install -U pip
11 | echo "/opt/conda/envs/${CONDA_MAINENV}/lib" > "/etc/ld.so.conf.d/zzz-conda-${CONDA_MAINENV}.conf"
12 | ldconfig
13 | rm -r /opt/conda/pkgs
14 | rm -r ~/.cache
15 | mkdir ~/.cache
16 | 
--------------------------------------------------------------------------------
/examples/ai_attendant.yaml:
--------------------------------------------------------------------------------
 1 | sip:
 2 |   settings:
 3 |     bind: 192.168.24.29:5060
 4 |   profiles:
 5 |     foo:
 6 |       sip_server: 192.168.24.1:5070
 7 |       sink: apps/ai_attendant/configuration1
 8 |       username: 'incoming'
 9 |       password: 'user'
10 |       register: False
11 | rtp:
12 |   settings:
13 |     min_port: 1024
14 |     max_port: 2048
15 | apps:
16 |   ai_attendant:
17 |     profiles:
18 |       configuration1:
19 |         stt_lang: 'en'
20 |         tts_lang: 'en'
21 |         llm_prompt: 'examples/sippylabs.txt'
22 | 
--------------------------------------------------------------------------------
/examples/llm_test.py:
--------------------------------------------------------------------------------
 1 | import ray
 2 | from sys import stderr
 3 | from time import monotonic
 4 | from uuid import UUID
 5 | from functools import partial
 6 | from time import sleep
 7 | from Cluster.InfernLLMActor import InfernLLMActor
 8 | from Cluster.LLMSession import LLMRequest
 9 | 
10 | #@ray.remote(resources={"head": 1})
11 | #class text_in(result):
12 | 
13 | class TimedLLMRequest(LLMRequest):
14 |     queue_ts: float
15 |     proc_start_ts: float
16 |     def __init__(self, text:str, lms:UUID, lma:InfernLLMActor):
17 |         tin = partial(self.text_in, lms=lms, lma=lma)
18 |         super().__init__(text, tin)
19 |         self.queue_ts = monotonic()
20 | 
21 |     def _proc_start_cb(self):
22 |         self.proc_start_ts = monotonic()
23 | 
24 |     def text_in(self, result:str, lms:UUID, lma:InfernLLMActor):
25 |         from sys import stderr as _stderr
26 |         itime = monotonic() - self.proc_start_ts
27 |         print(f'text_in: got {result=}, inference time: {itime}', file=_stderr)
28 |         req = TimedLLMRequest('Hello, can I speak to the CEO?', lms, lma)
29 |         lma.llm_session_textin.remote(lms, req)
30 | 
31 | 
32 | ray.init(num_gpus=2, resources = {'llm':1,'head':1})
33 | 
34 | print('Initializing InfernLLMActor...', file=stderr)
35 | llm_actor = InfernLLMActor.remote()
36 | ray.get(llm_actor.start.remote())
37 | print('InfernLLMActor is ready', file=stderr)
38 | 
39 | 
40 | flms = [llm_actor.new_llm_session.remote() for _ in range(100)]
41 | print(f'Created {len(flms)} sessions', file=stderr)
42 | def sess(lms):
43 |     req = TimedLLMRequest('', lms, llm_actor)
44 |     return llm_actor.llm_session_textin.remote(lms, req)
45 | futs = [sess(lms) for lms in flms]
46 | for f in futs:
47 |     ray.get(f)
48 | sleep(3600)
49 | 
--------------------------------------------------------------------------------
/examples/sippylabs.txt:
--------------------------------------------------------------------------------
 1 | You are Glenn, created by Max.
 2 | You are Max's sidekick chatbot, helping him during his hours of coding and streaming online and keeping him company.
 3 | You and Max are streaming online on YouTube in a video podcast called "SIP Chronicles".
 4 | Start by greeting everyone, asking what the stream is about and telling a joke.
 5 | Keep your messages brief and concise to reduce latency, and keep the conversation light.
 6 | The model output is fed into a dumb TTS system for audio output: do not add any extended formatting.
 7 | Your input is generated by an STT system: it might have mistakes, typos, etc.
 8 | You can keep silent if not specifically asked, or interrupt Max's speech when you feel the need, by outputting a sequence.
 9 | 
--------------------------------------------------------------------------------
/examples/voice_ass.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from transformers import AutoTokenizer, AutoConfig
 3 | from ipex_llm.transformers import AutoModelForCausalLM
 4 | from datetime import datetime
 5 | 
 6 | model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
 7 | #config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
 8 | #local_cache = f"~/.cache/Infernos/{model_name}"
 9 | #config.save_pretrained(local_cache)
10 | 
11 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto",
12 |                                              device_map="auto",
13 |                                              load_in_4bit=True,
14 |                                              optimize_model=True,
15 |                                              trust_remote_code=True,
16 |                                              use_cache=True
17 |                                              )
18 | #model = model.half().to("xpu")
19 | model = model.to("xpu")
20 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-14B-Instruct")
21 | messages = [{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful voice auto-attendant for the company Sippy Software. Start by greeting the caller and asking how you can help. Try to keep your messages brief and concise to reduce latency."}, {"role": "system", "content": f' '}]
22 | text = tokenizer.apply_chat_template(messages,
23 |                                      tokenize=False,
24 |                                      add_generation_prompt=True
25 |                                      )
26 | for i in range(10):
27 |     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
28 |     generated_ids = model.generate(**model_inputs, max_new_tokens=16 * 1024, output_scores=True, return_dict_in_generate=True)
29 |     torch.xpu.synchronize()
30 |     generated_ids = [
31 |         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids.sequences)
32 |     ]
33 |     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
34 |     print(messages, response)
35 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | git+https://github.com/sippy/b2bua.git@master
 2 | ray
 3 | transformers>=4.0.0
 4 | datasets>=2.0.0
 5 | torch>=2.0.0
 6 | soundfile>=0.12.0
 7 | torchaudio>=2.0.0
 8 | scipy>=1.0.0
 9 | inflect>=7.0.0
10 | sentencepiece>=0.1.0
11 | ctranslate2>=4.1.0
12 | git+https://github.com/fusorai/argos-translate.git
13 | methodtools
14 | tensorboardX
15 | aiohttp_cors
16 | grpcio
17 | opencensus
18 | prometheus_client
19 | py-spy
20 | nltk
21 | G722
22 | rtpsynth
23 | cerberus
24 | pyyaml
25 | 
--------------------------------------------------------------------------------
/safetorch/InfernTorcher.py:
--------------------------------------------------------------------------------
 1 | from threading import Lock
 2 | from time import monotonic
 3 | from math import pi as Pi
 4 | 
 5 | class InfernTorcherDeadlock(Exception):
 6 |     pass
 7 | 
 8 | class rc_filter():
 9 |     alpha: float
10 |     last_y: float
11 | 
12 |     def __init__(self, x = 10, init_y = 0.0):
13 |         self.alpha = 1 / (1 + 2 * Pi * x)
14 |         self.last_y = init_y
15 | 
16 |     def __call__(self, x):
17 |         self.last_y = self.alpha * x + (1 - self.alpha) * self.last_y
18 |         return self.last_y
19 | 
20 | class InfernTorcher():
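    # Serializes access to the shared Torch runtime and keeps a running
    # utilization estimate: busy and free intervals are smoothed with the
    # rc_filter above, and the resulting load ratio is printed once every
    # 100 lock/unlock cycles.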
21 |     _torch_lock: Lock = None
22 |     _last_lock: float
23 |     _last_unlock: float
24 |     _free_time: rc_filter
25 |     _busy_time: rc_filter
26 |     _nlocks: int = 0
27 | 
28 |     def __init__(self):
29 |         self._torch_lock = Lock()
30 |         self._last_unlock = self._last_lock = monotonic()
31 |         self._free_time = rc_filter()
32 |         self._busy_time = rc_filter()
33 | 
34 |     def lock(self, timeout: int = 10):
35 |         acquired = self._torch_lock.acquire(timeout = timeout)
36 |         if not acquired:
37 |             raise InfernTorcherDeadlock(f"Could not acquire lock within {timeout} seconds")
38 |         now = monotonic()
39 |         free_time = now - self._last_unlock
40 |         self._free_time(free_time)
41 |         self._last_lock = now
42 | 
43 |     def unlock(self):
44 |         now = monotonic()
45 |         busy_time = now - self._last_lock
46 |         bt = self._busy_time(busy_time)
47 |         ft = self._free_time.last_y
48 |         self._last_unlock = now
49 |         self._nlocks += 1
50 |         nlocks = self._nlocks
51 |         self._torch_lock.release()
52 |         if (nlocks % 100) == 0:
53 |             print(f"Torch load: {bt / (bt + ft)}")
54 | 
55 |     def acquire(self):
56 |         return self.lock()
57 | 
58 |     def release(self):
59 |         return self.unlock()
60 | 
61 |     def __enter__(self):
62 |         self.lock()
63 |         return self
64 | 
65 |     def __exit__(self, exc_type, exc_value, traceback):
66 |         self.unlock()
--------------------------------------------------------------------------------
/utils/tts.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from datetime import datetime
 3 | import inflect
 4 | 
 5 | def number_to_words(n):
 6 |     # Convert a number into words using the `inflect` library.
 7 |     # `n` may be an int, a numeric string, or an re.Match object,
 8 |     # so the function can be used directly as an re.sub() callback.
 9 |     p = inflect.engine()
10 |     if isinstance(n, re.Match):
11 |         n = n.group(0)
12 |     return p.number_to_words(n)
13 | 
14 | def get_ordinal(n):
15 |     # Convert a number into its ordinal representation.
16 |     p = inflect.engine()
17 |     return p.ordinal(n)
18 | 
19 | def human_readable_time():
20 |     now = datetime.now()
21 | 
22 |     # Days and months are straightforward
23 |     day_name = now.strftime('%A')
24 |     month_name = now.strftime('%B')
25 | 
26 |     # Convert day of the month and year to words
27 |     day_of_month = number_to_words(int(now.strftime('%d')))
28 |     year = number_to_words(int(now.strftime('%Y')))
29 |     year = year.replace('-', ' ')
30 | 
31 |     # Convert hour and minute to words
32 |     if now.hour < 12:
33 |         time_period = "morning"
34 |     elif 12 <= now.hour < 17:
35 |         time_period = "afternoon"
36 |     elif 17 <= now.hour < 20:
37 |         time_period = "evening"
38 |     else:
39 |         time_period = "night"
40 | 
41 |     hour = number_to_words(now.hour % 12 or 12)
42 |     if now.minute != 0:
43 |         minute = number_to_words(now.minute)
44 |         current_time = f"{hour} {minute}"
45 |     else:
46 |         current_time = f"{hour} o'clock"
47 | 
48 |     return f"Today is {day_name} {get_ordinal(day_of_month)} of {month_name} {year}, {current_time} in the {time_period}."
49 | 
50 | import requests
51 | 
52 | wq_fixes = (
53 |     ('<.*?>', ''), (r'\[.*?\]', ''),
54 |     (r'\s+', ' '), ('Mr[.]', 'Mister'),
55 |     ('Dr[.]', 'Doctor'), ('Drs.', 'Doctors'), ('["]', ''),
56 |     (r'\d+', number_to_words), ('H.A.L.', '"H" "A" "L"'),
57 |     ('Thomas A[.] Anderson','Thomas A Anderson'),
58 |     ('i-sy,', 'iiisy,'), ('i-zy,', 'iiizzy,'),
59 |     ('Agent Smith As', 'As'), ('.*edit[]] ', ''),
60 |     ('Trinity: .*', ''), ('ar-riage', 'arrrrrrriage'),
61 |     ('Dialogue The ', 'The '), ('cra-zy', 'craaaazy',),
62 |     ('[%] ', ' percent '),
63 | )
64 | 
65 | class ECFail(Exception):
66 |     pass
67 | 
68 | def extract_content(url, start_pattern, end_pattern):
69 |     headers = {
70 |         'User-Agent': 'Wget/1.20.3 (linux-gnu)'
71 |     }
72 |     response = requests.get(url, headers=headers)
73 |     print(url, response)
74 |     if response.status_code != 200 or len(response.text) == 0:
75 |         raise ECFail(f"Failed to retrieve URL. Status code: {response.status_code}")
76 | 
77 |     content = response.text
78 |     s=content.find(start_pattern)
79 | 
80 |     i = 0
81 |     pattern = re.compile(rf"{start_pattern}(.*?){end_pattern}", re.DOTALL)
82 |     matches = pattern.findall(content)
83 |     clean = [(re.compile(p), r) for p, r in wq_fixes]
84 | 
85 |     matches = [m.split(':', 1)[1] for m in matches]
86 |     for cl, rv in clean:
87 |         matches = [re.sub(cl, rv, m).strip() for m in matches]
88 |     return matches
89 | 
90 | def wq_getscript(film, character, section=1):
91 |     BASE_URL = "https://en.wikiquote.org/w/index.php"
92 |     film = film.replace(' ', '_')
93 |     fsuf = '_(film)'
94 |     url = f"{BASE_URL}?title={film}&section={section}"
95 |     start_pattern = rf">{character}<"
96 |     end_pattern = r''
97 |     try:
98 |         cont = extract_content(url, start_pattern, end_pattern)
99 |         if len(cont) == 0:
100 |             raise ECFail("nope")
101 |     except ECFail as ex:
102 |         if not film.endswith(fsuf):
103 |             film += fsuf
104 |             url = f"{BASE_URL}?title={film}&section={section}"
105 |             cont = extract_content(url, start_pattern, end_pattern)
106 |         else:
107 |             raise
108 |     return cont
109 | 
110 | def hal_set():
111 |     contents = wq_getscript('2001: A Space Odyssey', 'HAL')
112 |     return [s.replace('. ', '.|') for s in contents]
113 | 
114 | def bender_set(season=1):
115 |     contents = wq_getscript(f'Futurama/Season_{season}', 'Bender')
116 |     return [s for s in contents if len(s) > 16]
117 | 
118 | def smith_set():
119 |     contents = wq_getscript('The Matrix', 'Agent Smith', section=4)
120 |     hp = 'As you can see, we'
121 |     hack = contents[0].split(hp)
122 |     if len(hack) <= 2:
123 |         raise Exception("cleanme, hack is not needed perhaps anymore")
124 |     contents[0] = hp + hack[-1]
125 |     return [s.replace('. ', '.|') for s in contents]
126 | 
127 | def t900_set():
128 |     contents = wq_getscript('The Terminator', 'Terminator', section=3)
129 |     return [s.replace('. ', '.|') for s in contents]
130 | 
--------------------------------------------------------------------------------