├── audioldm
│   ├── requirements.txt
│   └── handler.py
├── musicgen
│   ├── requirements.txt
│   └── handler.py
├── README.md
└── bark
    ├── requirements.txt
    └── handler.py

/audioldm/requirements.txt:
--------------------------------------------------------------------------------
diffusers
transformers
--------------------------------------------------------------------------------
/musicgen/requirements.txt:
--------------------------------------------------------------------------------
transformers==4.31.0
accelerate>=0.20.3
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deploy audio ML models on HF Inference Endpoints

Custom inference handlers for deploying audio models on Hugging Face Inference Endpoints:

- `audioldm`: text-to-audio generation with AudioLDM (via `diffusers`)
- `musicgen`: text-to-music generation with MusicGen (via `transformers`)
- `bark`: text-to-speech generation with Bark (via `transformers`)
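Each handler accepts the same payload shape: a text prompt under `inputs` and an optional `parameters` dict that is forwarded verbatim to the underlying `generate()`/pipeline call, so valid keys depend on the model. A minimal request sketch follows; the endpoint URL, token, and parameter values are placeholders to swap for your own:

```python
import requests

ENDPOINT_URL = "https://YOUR-ENDPOINT.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder: a token with access to the endpoint

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={
        "inputs": "a calming piano melody with soft strings",
        # forwarded as generation kwargs, e.g. max_new_tokens for musicgen/bark
        "parameters": {"max_new_tokens": 256},
    },
)
response.raise_for_status()
prediction = response.json()  # [{"generated_audio": ...}]
```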
--------------------------------------------------------------------------------
/bark/requirements.txt:
--------------------------------------------------------------------------------
git+https://github.com/huggingface/transformers.git
git+https://github.com/huggingface/optimum.git
git+https://github.com/huggingface/accelerate.git
--------------------------------------------------------------------------------
/audioldm/handler.py:
--------------------------------------------------------------------------------
from typing import Any, Dict, List

import torch
from diffusers import AudioLDMPipeline


class EndpointHandler:
    def __init__(self, path=""):
        # load the text-to-audio pipeline from the model repository
        self.pipe = AudioLDMPipeline.from_pretrained(path, torch_dtype=torch.float16).to("cuda")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (Dict[str, Any]):
                The payload with the text prompt and generation parameters.
        """
        # process input
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # forward the prompt with any generation kwargs from the payload
        if parameters is not None:
            outputs = self.pipe(inputs, **parameters)
        else:
            outputs = self.pipe(inputs)

        # postprocess: the pipeline returns a batch of numpy waveforms
        prediction = outputs.audios[0]

        return [{"generated_audio": prediction}]
--------------------------------------------------------------------------------
/bark/handler.py:
--------------------------------------------------------------------------------
from typing import Any, Dict, List

import torch
from transformers import AutoProcessor, BarkModel


class EndpointHandler:
    def __init__(self, path="suno/bark"):
        # load model and processor from the model repository
        self.processor = AutoProcessor.from_pretrained(path)
        self.model = BarkModel.from_pretrained(path, torch_dtype=torch.float16).to("cuda")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (Dict[str, Any]):
                The payload with the text prompt and generation parameters.
        """
        # process input
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # preprocess: tokenize the prompt and move it to the GPU
        inputs = self.processor(
            text=[inputs],
            padding=True,
            return_tensors="pt",
        ).to("cuda")

        # forward the prompt with any generation kwargs from the payload
        if parameters is not None:
            outputs = self.model.generate(**inputs, **parameters)
        else:
            outputs = self.model.generate(**inputs)

        # postprocess the first waveform in the batch into numpy
        prediction = outputs[0].cpu().numpy()

        return [{"generated_audio": prediction}]
--------------------------------------------------------------------------------
/musicgen/handler.py:
--------------------------------------------------------------------------------
from typing import Any, Dict, List

import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration


class EndpointHandler:
    def __init__(self, path=""):
        # load model and processor from the model repository
        self.processor = AutoProcessor.from_pretrained(path)
        self.model = MusicgenForConditionalGeneration.from_pretrained(path, torch_dtype=torch.float16).to("cuda")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (Dict[str, Any]):
                The payload with the text prompt and generation parameters.
        """
        # process input
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # preprocess: tokenize the prompt and move it to the GPU
        inputs = self.processor(
            text=[inputs],
            padding=True,
            return_tensors="pt",
        ).to("cuda")

        # generate under autocast so the fp16 weights mix safely with fp32 ops
        with torch.autocast("cuda"):
            if parameters is not None:
                outputs = self.model.generate(**inputs, **parameters)
            else:
                outputs = self.model.generate(**inputs)

        # postprocess the first waveform in the batch into numpy
        prediction = outputs[0].cpu().numpy()

        return [{"generated_audio": prediction}]
--------------------------------------------------------------------------------
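Before deploying, a handler can be smoke-tested locally from the repository root. A minimal sketch for the Bark handler, whose default path already points at `suno/bark`; the prompt is illustrative, and a CUDA GPU is required since the handlers hard-code `.to("cuda")`:

from bark.handler import EndpointHandler

handler = EndpointHandler()  # defaults to path="suno/bark"
result = handler({"inputs": "Hello, my name is Suno."})
audio = result[0]["generated_audio"]  # numpy array of audio samples
print(audio.shape, audio.dtype)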