├── .github └── ISSUE_TEMPLATE │ └── request-add-model.md ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── neutone_fx │ ├── example_clipper.py │ ├── example_clipper_prefilter.py │ ├── example_delayed_passthrough.py │ ├── example_overdrive-random.py │ ├── example_rave.py │ ├── example_rave_prefilter.py │ ├── example_rave_v1_prefilter.py │ └── example_spectral_filter.py └── neutone_gen │ ├── example_clipper.py │ └── example_musicgen_load.py ├── neutone_sdk ├── __init__.py ├── assets │ └── default_samples │ │ ├── sample_ambience.mp3 │ │ ├── sample_drums.mp3 │ │ └── sample_rhodes.mp3 ├── audio.py ├── benchmark.py ├── cached_mel_spec.py ├── constants.py ├── conv.py ├── core.py ├── filters.py ├── gcn_1d.py ├── metadata.py ├── non_realtime_sqw.py ├── non_realtime_wrapper.py ├── parameter.py ├── py.typed ├── queues.py ├── realtime_stft.py ├── sandwich.py ├── sqw.py ├── tcn_1d.py ├── utils.py └── wavform_to_wavform.py ├── pyproject.toml └── testing ├── test_cached_mel_spec.py ├── test_conv.py ├── test_profiling.py ├── test_queues.py ├── test_sandwiches.py ├── test_sqw.py └── torchscript_test.py /.github/ISSUE_TEMPLATE/request-add-model.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request add model 3 | about: Template for adding a model to the default list in Neutone 4 | title: "[MODEL] " 5 | labels: enhancement 6 | assignees: bogdanteleaga, christhetree 7 | 8 | --- 9 | 10 | ## A brief description of what your model does 11 | 12 | 13 | 14 | ## Checklist 15 | - [ ] I have checked the model works properly loaded locally in the Neutone plugin on my machine. 16 | - [ ] I have uploaded the .nm model file at a publicly available location: . 17 | 18 | ## Extra information 19 | 20 | 21 | ## Metadata 22 | 23 | The model export function should dump a json file. Please paste the contents here for review and discussions. 24 | 25 | ``` 26 | ``` 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### macOS template 2 | # General 3 | .DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must end with two \r 8 | Icon 9 | 10 | # Thumbnails 11 | ._* 12 | 13 | # Files that might appear in the root of a volume 14 | .DocumentRevisions-V100 15 | .fseventsd 16 | .Spotlight-V100 17 | .TemporaryItems 18 | .Trashes 19 | .VolumeIcon.icns 20 | .com.apple.timemachine.donotpresent 21 | 22 | # Directories potentially created on remote AFP share 23 | .AppleDB 24 | .AppleDesktop 25 | Network Trash Folder 26 | Temporary Items 27 | .apdisk 28 | 29 | ### Python template 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | *$py.class 34 | 35 | # C extensions 36 | *.so 37 | 38 | # Distribution / packaging 39 | .Python 40 | build/ 41 | develop-eggs/ 42 | dist/ 43 | downloads/ 44 | eggs/ 45 | .eggs/ 46 | lib/ 47 | lib64/ 48 | parts/ 49 | sdist/ 50 | var/ 51 | wheels/ 52 | pip-wheel-metadata/ 53 | share/python-wheels/ 54 | *.egg-info/ 55 | .installed.cfg 56 | *.egg 57 | MANIFEST 58 | 59 | # PyInstaller 60 | # Usually these files are written by a python script from a template 61 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
62 | *.manifest 63 | *.spec 64 | 65 | # Installer logs 66 | pip-log.txt 67 | pip-delete-this-directory.txt 68 | 69 | # Unit test / coverage reports 70 | htmlcov/ 71 | .tox/ 72 | .nox/ 73 | .coverage 74 | .coverage.* 75 | .cache 76 | nosetests.xml 77 | coverage.xml 78 | *.cover 79 | *.py,cover 80 | .hypothesis/ 81 | .pytest_cache/ 82 | cover/ 83 | 84 | # Translations 85 | *.mo 86 | *.pot 87 | 88 | # Django stuff: 89 | *.log 90 | local_settings.py 91 | db.sqlite3 92 | db.sqlite3-journal 93 | 94 | # Flask stuff: 95 | instance/ 96 | .webassets-cache 97 | 98 | # Scrapy stuff: 99 | .scrapy 100 | 101 | # Sphinx documentation 102 | docs/_build/ 103 | 104 | # PyBuilder 105 | .pybuilder/ 106 | target/ 107 | 108 | # Jupyter Notebook 109 | .ipynb_checkpoints 110 | 111 | # IPython 112 | profile_default/ 113 | ipython_config.py 114 | 115 | # pyenv 116 | # For a library or package, you might want to ignore these files since the code is 117 | # intended to run in multiple environments; otherwise, check them in: 118 | .python-version 119 | 120 | # pipenv 121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 123 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 124 | # install all needed dependencies. 125 | #Pipfile.lock 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | ### CUDA template 171 | *.i 172 | *.ii 173 | *.gpu 174 | *.ptx 175 | *.cubin 176 | *.fatbin 177 | 178 | ### VirtualEnv template 179 | # Virtualenv 180 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 181 | [Bb]in 182 | [Ii]nclude 183 | [Ll]ib 184 | [Ll]ib64 185 | [Ll]ocal 186 | [Ss]cripts 187 | pyvenv.cfg 188 | pip-selfcheck.json 189 | 190 | ### JupyterNotebooks template 191 | # gitignore template for Jupyter Notebooks 192 | # website: http://jupyter.org/ 193 | 194 | */.ipynb_checkpoints/* 195 | 196 | # IPython 197 | 198 | # Remove previous ipynb_checkpoints 199 | # git rm -r .ipynb_checkpoints/ 200 | 201 | # User added 202 | .idea/ 203 | exports/ 204 | export_model/ 205 | models/ 206 | out/ 207 | scratch.py 208 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_clipper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.utils import save_neutone_model 13 | 14 | logging.basicConfig() 15 | log = logging.getLogger(__name__) 16 | 
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
17 | 
18 | 
19 | class ClipperModel(nn.Module):
20 |     def forward(
21 |         self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor
22 |     ) -> Tensor:
23 |         tr.neg(min_val, out=min_val)  # min_val <- -min_val (in-place, no allocation in the audio path)
24 |         tr.mul(gain, min_val, out=min_val)  # min_val <- -gain * min
25 |         tr.mul(gain, max_val, out=max_val)  # max_val <- gain * max
26 |         tr.clip(x, min=min_val, max=max_val, out=x)  # clamp x in place between the scaled thresholds
27 |         return x
28 | 
29 | 
30 | class ClipperModelWrapper(WaveformToWaveformBase):
31 |     def get_model_name(self) -> str:
32 |         return "clipper"
33 | 
34 |     def get_model_authors(self) -> List[str]:
35 |         return ["Andrew Fyfe"]
36 | 
37 |     def get_model_short_description(self) -> str:
38 |         return "Audio clipper."
39 | 
40 |     def get_model_long_description(self) -> str:
41 |         return "Clips the input audio between -1 and 1."
42 | 
43 |     def get_technical_description(self) -> str:
44 |         return "Clips the input audio between -1 and 1."
45 | 
46 |     def get_technical_links(self) -> Dict[str, str]:
47 |         return {
48 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_fx/example_clipper.py"
49 |         }
50 | 
51 |     def get_tags(self) -> List[str]:
52 |         return ["clipper"]
53 | 
54 |     def get_model_version(self) -> str:
55 |         return "1.0.0"
56 | 
57 |     def is_experimental(self) -> bool:
58 |         return False
59 | 
60 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
61 |         return [
62 |             ContinuousNeutoneParameter("min", "min clip threshold", default_value=0.15),
63 |             ContinuousNeutoneParameter("max", "max clip threshold", default_value=0.15),
64 |             ContinuousNeutoneParameter("gain", "scale clip threshold", default_value=1.0),
65 |         ]
66 | 
67 |     @tr.jit.export
68 |     def is_input_mono(self) -> bool:
69 |         return False
70 | 
71 |     @tr.jit.export
72 |     def is_output_mono(self) -> bool:
73 |         return False
74 | 
75 |     @tr.jit.export
76 |     def get_native_sample_rates(self) -> List[int]:
77 |         return []  # Supports all sample rates
78 | 
79 |     @tr.jit.export
80 |     def get_native_buffer_sizes(self) -> List[int]:
81 |         return []  # Supports all buffer sizes
82 | 
83 |     def aggregate_params(self, params: Tensor) -> Tensor:
84 |         return params  # We want sample-level control, so no aggregation
85 | 
86 |     def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
87 |         min_val, max_val, gain = params["min"], params["max"], params["gain"]
88 |         x = self.model.forward(x, min_val, max_val, gain)
89 |         return x
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     parser = ArgumentParser()
94 |     parser.add_argument("-o", "--output", default="export_model")
95 |     args = parser.parse_args()
96 |     root_dir = pathlib.Path(args.output)
97 | 
98 |     model = ClipperModel()
99 |     wrapper = ClipperModelWrapper(model)
100 |     save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True)
101 | 
--------------------------------------------------------------------------------
/examples/neutone_fx/example_clipper_prefilter.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | from argparse import ArgumentParser
3 | from typing import Dict, List
4 | 
5 | import torch as tr
6 | import torch.nn as nn
7 | from torch import Tensor
8 | 
9 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter
10 | from neutone_sdk.filters import FIRFilter, FilterType
11 | from neutone_sdk.utils import save_neutone_model
12 | 
13 | """
14 | Example wrapper script for prefilter + models with variable sample rate.
15 | WaveformToWaveformBase.set_model_sample_rate_and_buffer_size() is used
16 | to change the sample rate of the filter to the sample rate actually in use.
17 | """
18 | 
19 | 
20 | class ClipperModel(nn.Module):
21 |     def forward(
22 |         self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor
23 |     ) -> Tensor:
24 |         tr.neg(min_val, out=min_val)
25 |         tr.mul(gain, min_val, out=min_val)
26 |         tr.mul(gain, max_val, out=max_val)
27 |         tr.clip(x, min=min_val, max=max_val, out=x)
28 |         return x
29 | 
30 | 
31 | class ClipperModelWrapper(WaveformToWaveformBase):
32 |     def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None:
33 |         super().__init__(model, use_debug_mode)
34 |         # filter to be applied before the model
35 |         self.pre_filter = FIRFilter(FilterType.LOWPASS, cutoffs=[1000.0], filt_size=257)
36 | 
37 |     def get_model_name(self) -> str:
38 |         return "clipper"
39 | 
40 |     def get_model_authors(self) -> List[str]:
41 |         return ["Andrew Fyfe"]
42 | 
43 |     def get_model_short_description(self) -> str:
44 |         return "Audio clipper."
45 | 
46 |     def get_model_long_description(self) -> str:
47 |         return "Clips the input audio between -1 and 1."
48 | 
49 |     def get_technical_description(self) -> str:
50 |         return "Clips the input audio between -1 and 1."
51 | 
52 |     def get_technical_links(self) -> Dict[str, str]:
53 |         return {
54 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_fx/example_clipper_prefilter.py"
55 |         }
56 | 
57 |     def get_tags(self) -> List[str]:
58 |         return ["clipper"]
59 | 
60 |     def get_model_version(self) -> str:
61 |         return "1.0.0"
62 | 
63 |     def is_experimental(self) -> bool:
64 |         return False
65 | 
66 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
67 |         return [
68 |             ContinuousNeutoneParameter("min", "min clip threshold", default_value=0.15),
69 |             ContinuousNeutoneParameter("max", "max clip threshold", default_value=0.15),
70 |             ContinuousNeutoneParameter("gain", "scale clip threshold", default_value=1.0),
71 |         ]
72 | 
73 |     @tr.jit.export
74 |     def is_input_mono(self) -> bool:
75 |         return False
76 | 
77 |     @tr.jit.export
78 |     def is_output_mono(self) -> bool:
79 |         return False
80 | 
81 |     def calc_model_delay_samples(self) -> int:
82 |         # model latency should also be added if the model is non-causal
83 |         return self.pre_filter.delay
84 | 
85 |     def set_model_sample_rate_and_buffer_size(
86 |         self, sample_rate: int, n_samples: int
87 |     ) -> bool:
88 |         # While the clipper works at any sample rate, the prefilter's sample rate must be updated
89 |         self.pre_filter.set_parameters(sample_rate=sample_rate)
90 |         return True
91 | 
92 |     @tr.jit.export
93 |     def get_native_sample_rates(self) -> List[int]:
94 |         return []  # Supports all sample rates
95 | 
96 |     @tr.jit.export
97 |     def get_native_buffer_sizes(self) -> List[int]:
98 |         return []  # Supports all buffer sizes
99 | 
100 |     def aggregate_params(self, params: Tensor) -> Tensor:
101 |         return params  # We want sample-level control, so no aggregation
102 | 
103 |     def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
104 |         min_val, max_val, gain = params["min"], params["max"], params["gain"]
105 |         # Apply pre-filter
106 |         x = self.pre_filter(x)
107 |         x = self.model.forward(x, min_val, max_val, gain)
108 |         return x
109 | 
110 | 
111 | if __name__ == "__main__":
112 |     parser = ArgumentParser()
113 |     parser.add_argument("-o", "--output", default="export_model")
114 |     args = parser.parse_args()
115 |     root_dir = pathlib.Path(args.output)
116 |     model = ClipperModel()
117 |     wrapper = ClipperModelWrapper(model)
118 |     save_neutone_model(wrapper, root_dir,
dump_samples=True, submission=True) 119 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_delayed_passthrough.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter 12 | from neutone_sdk.utils import save_neutone_model 13 | 14 | logging.basicConfig() 15 | log = logging.getLogger(__name__) 16 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 17 | 18 | 19 | class DelayedPassthroughModel(nn.Module): 20 | def __init__(self, delay_n_samples: int, in_ch: int = 2) -> None: 21 | super().__init__() 22 | self.delay_n_samples = delay_n_samples 23 | self.delay_buf = tr.zeros((in_ch, delay_n_samples)) 24 | 25 | def forward(self, x: Tensor) -> Tensor: 26 | x = tr.cat([self.delay_buf, x], dim=-1) 27 | self.delay_buf[:, :] = x[:, -self.delay_n_samples :] 28 | x = x[:, : -self.delay_n_samples] 29 | return x 30 | 31 | 32 | class DelayedPassthroughModelWrapper(WaveformToWaveformBase): 33 | def get_model_name(self) -> str: 34 | return "delayed.passthrough" 35 | 36 | def get_model_authors(self) -> List[str]: 37 | return ["Christopher Mitcheltree"] 38 | 39 | def get_model_short_description(self) -> str: 40 | return "Delayed passthrough model." 41 | 42 | def get_model_long_description(self) -> str: 43 | return "Delays the input audio by some number of samples. Should be tested with 50/50 dry/wet." 44 | 45 | def get_technical_description(self) -> str: 46 | return "Delays the input audio by some number of samples. Should be tested with 50/50 dry/wet." 
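    # Editor's note (added, hedged sketch): a minimal offline sanity check of the
    # delay logic above, assuming a (2, 2048) stereo input buffer. On the first
    # call the output should be 500 samples of silence followed by the start of
    # the input:
    #
    #   model = DelayedPassthroughModel(delay_n_samples=500)
    #   x = tr.rand(2, 2048)
    #   y = model(x)
    #   assert tr.all(y[:, :500] == 0)
    #   assert tr.equal(y[:, 500:], x[:, : 2048 - 500])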
47 | 48 | def get_technical_links(self) -> Dict[str, str]: 49 | return {} 50 | 51 | def get_tags(self) -> List[str]: 52 | return [] 53 | 54 | def get_model_version(self) -> str: 55 | return "1.0.0" 56 | 57 | def is_experimental(self) -> bool: 58 | return True 59 | 60 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 61 | return [] 62 | 63 | @tr.jit.export 64 | def is_input_mono(self) -> bool: 65 | return False 66 | 67 | @tr.jit.export 68 | def is_output_mono(self) -> bool: 69 | return False 70 | 71 | @tr.jit.export 72 | def get_native_sample_rates(self) -> List[int]: 73 | return [44100] # Change this to test different scenarios 74 | 75 | @tr.jit.export 76 | def get_native_buffer_sizes(self) -> List[int]: 77 | return [2048] # Change this to test different scenarios 78 | 79 | @tr.jit.export 80 | def reset_model(self) -> bool: 81 | self.model.delay_buf.fill_(0) 82 | return True 83 | 84 | @tr.jit.export 85 | def calc_model_delay_samples(self) -> int: 86 | return self.model.delay_n_samples 87 | 88 | @tr.jit.export 89 | def get_wet_default_value(self) -> float: 90 | return 0.5 91 | 92 | @tr.jit.export 93 | def get_dry_default_value(self) -> float: 94 | return 0.5 95 | 96 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 97 | x = self.model.forward(x) 98 | return x 99 | 100 | 101 | if __name__ == "__main__": 102 | parser = ArgumentParser() 103 | parser.add_argument("-o", "--output", default="export_model") 104 | args = parser.parse_args() 105 | root_dir = pathlib.Path(args.output) 106 | 107 | model = DelayedPassthroughModel( 108 | delay_n_samples=500 109 | ) # Change delay_n_samples to test different scenarios 110 | wrapper = DelayedPassthroughModelWrapper(model) 111 | save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True) 112 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_overdrive-random.py: -------------------------------------------------------------------------------- 1 | # This code is based on the following repository written by Christian J. 
Steinmetz 2 | # https://github.com/csteinmetz1/micro-tcn 3 | import logging 4 | import os 5 | from argparse import ArgumentParser 6 | from pathlib import Path 7 | from typing import Dict, List 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torch import Tensor 12 | 13 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 14 | from neutone_sdk.tcn_1d import FiLM 15 | from neutone_sdk.utils import save_neutone_model 16 | 17 | logging.basicConfig() 18 | log = logging.getLogger(__name__) 19 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 20 | 21 | 22 | # TODO(christhetree): integrate this into tcn_1d.py 23 | class PaddingCached(nn.Module): # to maintain signal continuity over sample windows 24 | def __init__(self, padding: int, channels: int) -> None: 25 | super().__init__() 26 | self.padding = padding 27 | self.channels = channels 28 | pad = torch.zeros(1, self.channels, self.padding) 29 | self.register_buffer("pad", pad) 30 | 31 | def forward(self, x: Tensor) -> Tensor: 32 | padded_x = torch.cat([self.pad, x], -1) # concat input signal to the cache 33 | self.pad = padded_x[..., -self.padding :] # discard old cache 34 | return padded_x 35 | 36 | 37 | # TODO(christhetree): integrate this into tcn_1d.py 38 | class Conv1dCached(nn.Module): # Conv1d with cache 39 | def __init__( 40 | self, 41 | in_chan: int, 42 | out_chan: int, 43 | kernel: int, 44 | stride: int, 45 | padding: int, 46 | dilation: int = 1, 47 | weight_norm: bool = False, 48 | bias: bool = False, 49 | ) -> None: 50 | super().__init__() 51 | self.pad = PaddingCached(padding * 2, in_chan) 52 | self.conv = nn.Conv1d( 53 | in_chan, out_chan, kernel, stride, dilation=dilation, bias=bias 54 | ) 55 | nn.init.normal_(self.conv.weight) # random initialization 56 | if weight_norm: 57 | self.conv = nn.utils.weight_norm(self.conv) 58 | 59 | def forward(self, x: Tensor) -> Tensor: 60 | x = self.pad(x) # get (cached input + current input) 61 | x = self.conv(x) 62 | return x 63 | 64 | 65 | # TODO(christhetree): integrate this into tcn_1d.py 66 | class TCNBlock(nn.Module): 67 | def __init__( 68 | self, 69 | in_ch: int, 70 | out_ch: int, 71 | kernel_size: int = 3, 72 | dilation: int = 1, 73 | cond_dim: int = 32, 74 | ) -> None: 75 | super(TCNBlock, self).__init__() 76 | self.in_ch = in_ch 77 | self.out_ch = out_ch 78 | self.kernel_size = kernel_size 79 | padding = kernel_size // 2 * dilation 80 | self.conv1 = Conv1dCached( 81 | in_ch, 82 | out_ch, 83 | kernel=kernel_size, 84 | stride=1, 85 | padding=padding, 86 | dilation=dilation, 87 | bias=True, 88 | ) 89 | self.res = nn.Conv1d( 90 | in_ch, out_ch, kernel_size=1, groups=1, bias=False 91 | ) # residual connection 92 | self.bn = nn.BatchNorm1d(out_ch) 93 | self.film = FiLM(out_ch, cond_dim) 94 | self.relu = nn.PReLU(out_ch) 95 | 96 | def forward(self, x: Tensor, p: Tensor) -> Tensor: 97 | x_in = x 98 | x = self.conv1(x) 99 | x = self.film(x, p) 100 | x = self.bn(x) 101 | x = self.relu(x) 102 | 103 | # residual 104 | x_res = self.res(x_in) 105 | start = (x_res.shape[-1] - x.shape[-1]) // 2 106 | stop = start + x.shape[-1] 107 | x = x + x_res[..., start:stop] 108 | return x 109 | 110 | 111 | class OverdriveModel(nn.Module): 112 | def __init__( 113 | self, 114 | ninputs: int = 1, 115 | noutputs: int = 1, 116 | nblocks: int = 4, 117 | channel_growth: int = 0, 118 | channel_width: int = 32, 119 | kernel_size: int = 13, 120 | dilation_growth: int = 2, 121 | ncondition: int = 2, 122 | ) -> None: 123 | super().__init__() 124 | 125 | # MLP layers 
for conditioning 126 | self.ncondition = ncondition 127 | self.condition = torch.nn.Sequential( 128 | torch.nn.Linear(ncondition, 16), 129 | torch.nn.ReLU(), 130 | torch.nn.Linear(16, 32), 131 | torch.nn.ReLU(), 132 | torch.nn.Linear(32, 32), # cond_dim = 32 133 | torch.nn.ReLU(), 134 | ) 135 | 136 | # main model 137 | self.blocks = torch.nn.ModuleList() 138 | for n in range(nblocks): 139 | in_ch = out_ch if n > 0 else ninputs 140 | out_ch = in_ch * channel_growth if channel_growth > 1 else channel_width 141 | dilation = dilation_growth**n 142 | self.blocks.append( 143 | TCNBlock(in_ch, out_ch, kernel_size, dilation, cond_dim=32) 144 | ) 145 | self.output = nn.Conv1d(out_ch, noutputs, kernel_size=1) 146 | 147 | # random initialization 148 | self.initialize_random() 149 | 150 | def forward(self, x: Tensor, c: Tensor) -> Tensor: 151 | p = self.condition(c) # conditioning 152 | for _, block in enumerate(self.blocks): 153 | x = block(x, p) 154 | y = torch.tanh(self.output(x)) # clipping 155 | return y 156 | 157 | def weights_init(self, m: nn.Module) -> None: 158 | classname = m.__class__.__name__ 159 | if classname == "Linear": 160 | nn.init.normal_(m.weight, 0, 0.40) 161 | 162 | def initialize_random(self) -> None: 163 | for n in self.blocks: 164 | nn.init.normal_(n.conv1.conv.weight, 0, 0.7) 165 | # nn.init.normal_(self.output.weight, 0, 0.25) 166 | self.condition.apply(self.weights_init) 167 | 168 | 169 | class OverdriveModelWrapper(WaveformToWaveformBase): 170 | def get_model_name(self) -> str: 171 | return "conv1d-overdrive.random" 172 | 173 | def get_model_authors(self) -> List[str]: 174 | return ["Nao Tokui"] 175 | 176 | def get_model_short_description(self) -> str: 177 | return "Neural distortion/overdrive effect" 178 | 179 | def get_model_long_description(self) -> str: 180 | return "Neural distortion/overdrive effect through randomly initialized Convolutional Neural Network" 181 | 182 | def get_technical_description(self) -> str: 183 | return "Random distortion/overdrive effect through randomly initialized Temporal-1D-convolution layers. Based on the idea proposed by Steinmetz et al." 184 | 185 | def get_tags(self) -> List[str]: 186 | return ["distortion", "overdrive"] 187 | 188 | def get_model_version(self) -> str: 189 | return "1.0.0" 190 | 191 | def is_experimental(self) -> bool: 192 | return False 193 | 194 | def get_technical_links(self) -> Dict[str, str]: 195 | return { 196 | "Paper": "https://arxiv.org/abs/2010.04237", 197 | "Code": "https://github.com/csteinmetz1/micro-tcn", 198 | } 199 | 200 | def get_citation(self) -> str: 201 | return "Steinmetz, C. J., & Reiss, J. D. (2020). Randomized overdrive neural networks. arXiv preprint arXiv:2010.04237." 
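    # Editor's note (added, hedged sketch): how the wrapper below drives the model.
    # The two macro controls are stacked into a (1, ncondition) tensor and scaled
    # by "depth" before the conditioning MLP turns them into the 32-dim FiLM
    # vector shared by every TCNBlock:
    #
    #   model = OverdriveModel()
    #   x = torch.rand(1, 1, 2048)      # (batch, channels, samples)
    #   c = torch.tensor([[0.5, 0.5]])  # (1, ncondition=2)
    #   y = model(x, c)                 # -> (1, 1, 2048), tanh-limited output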
202 | 203 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 204 | return [ 205 | ContinuousNeutoneParameter("depth", "Effect Depth", 0.0), 206 | ContinuousNeutoneParameter("P1", "Feature modulation 1", 0.0), 207 | ContinuousNeutoneParameter("P2", "Feature modulation 2", 0.0), 208 | ] 209 | 210 | @torch.jit.export 211 | def is_input_mono(self) -> bool: 212 | return False 213 | 214 | @torch.jit.export 215 | def is_output_mono(self) -> bool: 216 | return False 217 | 218 | @torch.jit.export 219 | def get_native_sample_rates(self) -> List[int]: 220 | return [] # Supports all sample rates 221 | 222 | @torch.jit.export 223 | def get_native_buffer_sizes(self) -> List[int]: 224 | return [] # Supports all buffer sizes 225 | 226 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 227 | # conditioning for FiLM layer 228 | p1 = params["P1"] 229 | p2 = params["P2"] 230 | depth = params["depth"] 231 | condition = torch.hstack([p1, p2]).reshape((1, -1)) * depth 232 | 233 | # main process 234 | for ch in range(x.shape[0]): # process channel by channel 235 | x_ = x[ch].reshape(1, 1, -1) 236 | x_ = self.model(x_, condition) 237 | x[ch] = x_.squeeze() 238 | return x 239 | 240 | 241 | if __name__ == "__main__": 242 | parser = ArgumentParser() 243 | parser.add_argument("-o", "--output", default="export_model") 244 | args = parser.parse_args() 245 | root_dir = Path(args.output) 246 | 247 | model = OverdriveModel() 248 | wrapper = OverdriveModelWrapper(model) 249 | metadata = wrapper.to_metadata() 250 | save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True) 251 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_rave.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import torch 8 | import torchaudio 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.audio import ( 13 | AudioSample, 14 | AudioSamplePair, 15 | render_audio_sample, 16 | ) 17 | from neutone_sdk.utils import save_neutone_model 18 | 19 | logging.basicConfig() 20 | log = logging.getLogger(__name__) 21 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 22 | 23 | 24 | class RAVEModelWrapper(WaveformToWaveformBase): 25 | def get_model_name(self) -> str: 26 | return "RAVE.example" # <-EDIT THIS 27 | 28 | def get_model_authors(self) -> List[str]: 29 | return ["Author Name"] # <-EDIT THIS 30 | 31 | def get_model_short_description(self) -> str: 32 | return "RAVE model trained on xxx sounds." # <-EDIT THIS 33 | 34 | def get_model_long_description(self) -> str: 35 | return ( # <-EDIT THIS 36 | "RAVE timbre transfer model trained on xxx sounds. Useful for xxx sounds." 37 | ) 38 | 39 | def get_technical_description(self) -> str: 40 | return "RAVE model proposed by Caillon, Antoine et al." 
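    # Editor's note (added, hedged): a worked example of the parameter mapping in
    # do_forward_pass below, assuming a hypothetical latent_size of 16:
    #   "Z edit index" = 0.5 -> idx_z    = int(0.5 * 16) = 8    (9th latent dim)
    #   "Z scale"      = 0.5 -> z_scale  = 0.5 * 2       = 1.0  (identity scale)
    #   "Z offset"     = 0.5 -> z_offset = 0.5 * 2 - 1   = 0.0  (no offset)
    # so the default Z scale / Z offset of 0.5 leave the chosen dimension unchanged.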
41 | 42 | def get_technical_links(self) -> Dict[str, str]: 43 | return { 44 | "Paper": "https://arxiv.org/abs/2111.05011", 45 | "Code": "https://github.com/acids-ircam/RAVE", 46 | } 47 | 48 | def get_tags(self) -> List[str]: 49 | return ["timbre transfer", "RAVE"] 50 | 51 | def get_model_version(self) -> str: 52 | return "1.0.0" 53 | 54 | def is_experimental(self) -> bool: 55 | """ 56 | set to True for models in experimental stage 57 | (status shown on the website) 58 | """ 59 | return True # <-EDIT THIS 60 | 61 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 62 | return [ 63 | ContinuousNeutoneParameter( 64 | name="Chaos", description="Magnitude of latent noise", default_value=0.0 65 | ), 66 | ContinuousNeutoneParameter( 67 | name="Z edit index", 68 | description="Index of latent dimension to edit", 69 | default_value=0.0, 70 | ), 71 | ContinuousNeutoneParameter( 72 | name="Z scale", 73 | description="Scale of latent variable", 74 | default_value=0.5, 75 | ), 76 | ContinuousNeutoneParameter( 77 | name="Z offset", 78 | description="Offset of latent variable", 79 | default_value=0.5, 80 | ), 81 | ] 82 | 83 | def is_input_mono(self) -> bool: 84 | return True # <-Set to False for stereo (each channel processed separately) 85 | 86 | def is_output_mono(self) -> bool: 87 | return True # <-Set to False for stereo (each channel processed separately) 88 | 89 | def get_native_sample_rates(self) -> List[int]: 90 | return [48000] # <-EDIT THIS 91 | 92 | def get_native_buffer_sizes(self) -> List[int]: 93 | return [2048] 94 | 95 | def get_citation(self) -> str: 96 | return """Caillon, A., & Esling, P. (2021). RAVE: A variational autoencoder for fast and high-quality neural audio synthesis. arXiv preprint arXiv:2111.05011.""" 97 | 98 | def calc_model_delay_samples(self) -> int: 99 | return 2048 100 | 101 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 102 | # parameters edit the latent variable 103 | z = self.model.encode(x.unsqueeze(1)) 104 | noise_amp = params["Chaos"] 105 | z = torch.randn_like(z) * noise_amp + z 106 | # add offset / scale 107 | idx_z = int( 108 | torch.clamp(params["Z edit index"], min=0.0, max=0.99) 109 | * self.model.latent_size 110 | ) 111 | z_scale = params["Z scale"] * 2 # 0~1 -> 0~2 112 | z_offset = params["Z offset"] * 2 - 1 # 0~1 -> -1~1 113 | z[:, idx_z] = z[:, idx_z] * z_scale + z_offset 114 | out = self.model.decode(z) 115 | out = out.squeeze(1) 116 | return out # (n_channels=1, sample_size) 117 | 118 | 119 | if __name__ == "__main__": 120 | parser = ArgumentParser() 121 | parser.add_argument( 122 | "-i", 123 | "--input", 124 | default="./models/rave/rave_cached.ts", 125 | help="exported RAVE torchscript file", 126 | ) 127 | parser.add_argument("-o", "--output", default="ravemodel", help="model output name") 128 | parser.add_argument("-f", "--folder", default="./exports", help="output folder") 129 | parser.add_argument( 130 | "-s", 131 | "--sounds", 132 | nargs="*", 133 | type=str, 134 | default=None, 135 | help="directory of sounds to use as example input.", 136 | ) 137 | args = parser.parse_args() 138 | root_dir = Path(args.folder) / args.output 139 | 140 | # wrap it 141 | model = torch.jit.load(args.input) 142 | wrapper = RAVEModelWrapper(model) 143 | 144 | soundpairs = None 145 | if args.sounds is not None: 146 | soundpairs = [] 147 | for sound in args.sounds: 148 | wave, sr = torchaudio.load(sound) 149 | input_sample = AudioSample(wave, sr) 150 | rendered_sample = render_audio_sample(wrapper, input_sample) 151 | 
soundpairs.append(AudioSamplePair(input_sample, rendered_sample)) 152 | 153 | save_neutone_model( 154 | wrapper, 155 | root_dir, 156 | freeze=False, 157 | dump_samples=True, 158 | submission=True, 159 | audio_sample_pairs=soundpairs, 160 | ) 161 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_rave_prefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import torch 8 | import torchaudio 9 | from torch import Tensor, nn 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.audio import ( 13 | AudioSample, 14 | AudioSamplePair, 15 | render_audio_sample, 16 | ) 17 | from neutone_sdk.filters import FIRFilter, FilterType 18 | from neutone_sdk.utils import save_neutone_model 19 | 20 | logging.basicConfig() 21 | log = logging.getLogger(__name__) 22 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | 25 | class FilteredRAVEModelWrapper(WaveformToWaveformBase): 26 | def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None: 27 | super().__init__(model, use_debug_mode) 28 | # filter to be applied before model 29 | # cut below 500 and above 4000 Hz 30 | self.pre_filter = FIRFilter( 31 | FilterType.BANDPASS, cutoffs=[500.0, 4000.0], filt_size=257 32 | ) 33 | 34 | def get_model_name(self) -> str: 35 | return "RAVE.example" # <-EDIT THIS 36 | 37 | def get_model_authors(self) -> List[str]: 38 | return ["Author Name"] # <-EDIT THIS 39 | 40 | def get_model_short_description(self) -> str: 41 | return "RAVE model trained on xxx sounds." # <-EDIT THIS 42 | 43 | def get_model_long_description(self) -> str: 44 | return ( # <-EDIT THIS 45 | "RAVE timbre transfer model trained on xxx sounds. Useful for xxx sounds." 46 | ) 47 | 48 | def get_technical_description(self) -> str: 49 | return "RAVE model proposed by Caillon, Antoine et al." 
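    # Editor's note (added, hedged): a linear-phase FIR filter with N taps has a
    # group delay of (N - 1) / 2 samples, so the 257-tap bandpass above should
    # report a delay of 128 samples via pre_filter.delay. calc_model_delay_samples
    # below adds this to the model's own 2048-sample latency so the host can
    # compensate for the total delay.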
50 | 51 | def get_technical_links(self) -> Dict[str, str]: 52 | return { 53 | "Paper": "https://arxiv.org/abs/2111.05011", 54 | "Code": "https://github.com/acids-ircam/RAVE", 55 | } 56 | 57 | def get_tags(self) -> List[str]: 58 | return ["timbre transfer", "RAVE"] 59 | 60 | def get_model_version(self) -> str: 61 | return "1.0.0" 62 | 63 | def is_experimental(self) -> bool: 64 | """ 65 | set to True for models in experimental stage 66 | (status shown on the website) 67 | """ 68 | return True # <-EDIT THIS 69 | 70 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 71 | return [ 72 | ContinuousNeutoneParameter( 73 | name="Chaos", description="Magnitude of latent noise", default_value=0.0 74 | ), 75 | ContinuousNeutoneParameter( 76 | name="Z edit index", 77 | description="Index of latent dimension to edit", 78 | default_value=0.0, 79 | ), 80 | ContinuousNeutoneParameter( 81 | name="Z scale", 82 | description="Scale of latent variable", 83 | default_value=0.5, 84 | ), 85 | ContinuousNeutoneParameter( 86 | name="Z offset", 87 | description="Offset of latent variable", 88 | default_value=0.5, 89 | ), 90 | ] 91 | 92 | def is_input_mono(self) -> bool: 93 | return True # <-Set to False for stereo (each channel processed separately) 94 | 95 | def is_output_mono(self) -> bool: 96 | return True # <-Set to False for stereo (each channel processed separately) 97 | 98 | def get_native_sample_rates(self) -> List[int]: 99 | return [48000] # <-Set to model sr during training 100 | 101 | def get_native_buffer_sizes(self) -> List[int]: 102 | return [2048] 103 | 104 | def calc_model_delay_samples(self) -> int: 105 | # model latency should also be added if non-causal 106 | return self.pre_filter.delay + 2048 107 | 108 | def set_model_sample_rate_and_buffer_size( 109 | self, sample_rate: int, n_samples: int 110 | ) -> bool: 111 | # Set prefilter samplerate to current sample rate 112 | self.pre_filter.set_parameters(sample_rate=sample_rate) 113 | return True 114 | 115 | def get_citation(self) -> str: 116 | return """Caillon, A., & Esling, P. (2021). RAVE: A variational autoencoder for fast and high-quality neural audio synthesis. 
arXiv preprint arXiv:2111.05011.""" 117 | 118 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 119 | # Apply pre-filter 120 | x = self.pre_filter(x) 121 | # parameters edit the latent variable 122 | z = self.model.encode(x.unsqueeze(1)) 123 | noise_amp = params["Chaos"] * 2 124 | z = torch.randn_like(z) * noise_amp + z 125 | # add offset / scale 126 | idx_z = int( 127 | torch.clamp(params["Z edit index"], min=0.0, max=0.99) 128 | * self.model.latent_size 129 | ) 130 | z_scale = params["Z scale"] * 2 # 0~1 -> 0~2 131 | z_offset = params["Z offset"] * 2 - 1 # 0~1 -> -1~1 132 | z[:, idx_z] = z[:, idx_z] * z_scale + z_offset 133 | out = self.model.decode(z) 134 | out = out.squeeze(1) 135 | return out 136 | 137 | 138 | if __name__ == "__main__": 139 | parser = ArgumentParser() 140 | parser.add_argument( 141 | "-i", 142 | "--input", 143 | default="./models/rave/rave_cached.ts", 144 | help="exported RAVE torchscript file", 145 | ) 146 | parser.add_argument("-o", "--output", default="ravemodel", help="model output name") 147 | parser.add_argument("-f", "--folder", default="./exports", help="output folder") 148 | parser.add_argument( 149 | "-s", 150 | "--sounds", 151 | nargs="*", 152 | type=str, 153 | default=None, 154 | help="directory of sounds to use as example input.", 155 | ) 156 | args = parser.parse_args() 157 | root_dir = Path(args.folder) / args.output 158 | 159 | # wrap it 160 | model = torch.jit.load(args.input) 161 | wrapper = FilteredRAVEModelWrapper(model) 162 | 163 | soundpairs = None 164 | if args.sounds is not None: 165 | soundpairs = [] 166 | for sound in args.sounds: 167 | wave, sr = torchaudio.load(sound) 168 | input_sample = AudioSample(wave, sr) 169 | rendered_sample = render_audio_sample(wrapper, input_sample) 170 | soundpairs.append(AudioSamplePair(input_sample, rendered_sample)) 171 | 172 | save_neutone_model( 173 | wrapper, 174 | root_dir, 175 | freeze=False, 176 | dump_samples=True, 177 | submission=True, 178 | audio_sample_pairs=soundpairs, 179 | ) 180 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_rave_v1_prefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import torch 8 | import torchaudio 9 | from torch import Tensor, nn 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.audio import ( 13 | AudioSample, 14 | AudioSamplePair, 15 | render_audio_sample, 16 | ) 17 | from neutone_sdk.filters import FIRFilter, FilterType 18 | from neutone_sdk.utils import save_neutone_model 19 | 20 | logging.basicConfig() 21 | log = logging.getLogger(__name__) 22 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | 25 | class FilteredRAVEv1ModelWrapper(WaveformToWaveformBase): 26 | def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None: 27 | super().__init__(model, use_debug_mode) 28 | self.pre_filter = FIRFilter( 29 | FilterType.BANDPASS, cutoffs=[500.0, 4000.0], filt_size=257 30 | ) 31 | 32 | def get_model_name(self) -> str: 33 | return "RAVE.example" 34 | 35 | def get_model_authors(self) -> List[str]: 36 | return ["Author Name"] 37 | 38 | def get_model_short_description(self) -> str: 39 | return "stereo RAVE model trained on ..." 
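    # Editor's note (added, hedged): unlike the v2 example, this v1 wrapper calls
    # encode_amortized() in do_forward_pass below, which returns a posterior mean
    # and std. "Chaos" scales noise by z_std, and a single noise vector per latent
    # dimension is broadcast across all time frames, giving a static rather than
    # per-frame perturbation of the latent trajectory.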
40 | 41 | def get_model_long_description(self) -> str: 42 | return ( # <-EDIT THIS 43 | "RAVE timbre transfer model trained on xxx sounds. Useful for xxx sounds." 44 | ) 45 | 46 | def get_technical_description(self) -> str: 47 | return "RAVE model proposed by Caillon, Antoine et al." 48 | 49 | def get_technical_links(self) -> Dict[str, str]: 50 | return { 51 | "Paper": "https://arxiv.org/abs/2111.05011", 52 | "Code": "https://github.com/acids-ircam/RAVE", 53 | } 54 | 55 | def get_tags(self) -> List[str]: 56 | return ["timbre transfer", "RAVE"] 57 | 58 | def get_model_version(self) -> str: 59 | return "1.0.0" 60 | 61 | def is_experimental(self) -> bool: 62 | """ 63 | set to True for models in experimental stage 64 | (status shown on the website) 65 | """ 66 | return False 67 | 68 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 69 | return [ 70 | ContinuousNeutoneParameter( 71 | name="Chaos", 72 | description="Magnitude of latent noise", 73 | default_value=0.0, 74 | ), 75 | ContinuousNeutoneParameter( 76 | name="Z edit index", 77 | description="Index of latent dimension to edit", 78 | default_value=0.0, 79 | ), 80 | ContinuousNeutoneParameter( 81 | name="Z scale", 82 | description="Scale of latent variable", 83 | default_value=0.5, 84 | ), 85 | ContinuousNeutoneParameter( 86 | name="Z offset", 87 | description="Offset of latent variable", 88 | default_value=0.5, 89 | ), 90 | ] 91 | 92 | def is_input_mono(self) -> bool: 93 | return True # <-Set to False for stereo (each channel processed separately) 94 | 95 | def is_output_mono(self) -> bool: 96 | return True # <-Set to False for stereo (each channel processed separately) 97 | 98 | def get_native_sample_rates(self) -> List[int]: 99 | return [48000] # <-Set to model sr during training 100 | 101 | def get_native_buffer_sizes(self) -> List[int]: 102 | return [2048] 103 | 104 | def calc_model_delay_samples(self) -> int: 105 | # model latency should also be added if non-causal 106 | return self.pre_filter.delay 107 | 108 | def set_model_sample_rate_and_buffer_size( 109 | self, sample_rate: int, n_samples: int 110 | ) -> bool: 111 | # Set prefilter samplerate to current sample rate 112 | self.pre_filter.set_parameters(sample_rate=sample_rate) 113 | return True 114 | 115 | def get_citation(self) -> str: 116 | return """Caillon, A., & Esling, P. (2021). RAVE: A variational autoencoder for fast and high-quality neural audio synthesis. 
arXiv preprint arXiv:2111.05011.""" 117 | 118 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 119 | # Apply pre-filter 120 | x = self.pre_filter(x) 121 | ## parameters edit the latent variable 122 | z_mean, z_std = self.model.encode_amortized(x.unsqueeze(1)) 123 | noise_amp = z_std * params["Chaos"] * 4 124 | batch, latent_dim, time = z_std.shape 125 | z = ( 126 | torch.randn(1, latent_dim, 1, device=z_std.device).expand(batch, -1, time) 127 | * noise_amp 128 | + z_mean 129 | ) 130 | # add offset / scale 131 | idx_z = int( 132 | torch.clamp(params["Z edit index"], min=0.0, max=0.99) 133 | * self.model.cropped_latent_size 134 | ) 135 | z_scale = params["Z scale"] * 2 # 0~1 -> 0~2 136 | z_offset = params["Z offset"] * 2 - 1 # 0~1 -> -1~1 137 | z[:, idx_z] = z[:, idx_z] * z_scale + z_offset 138 | out = self.model.decode(z) 139 | out = out.squeeze(1) 140 | return out 141 | 142 | 143 | if __name__ == "__main__": 144 | parser = ArgumentParser() 145 | parser.add_argument( 146 | "-i", 147 | "--input", 148 | default="./models/rave/rave_cached.ts", 149 | help="exported RAVE torchscript file", 150 | ) 151 | parser.add_argument("-o", "--output", default="ravemodel", help="model output name") 152 | parser.add_argument("-f", "--folder", default="./exports", help="output folder") 153 | parser.add_argument( 154 | "-s", 155 | "--sounds", 156 | nargs="*", 157 | type=str, 158 | default=None, 159 | help="directory of sounds to use as example input.", 160 | ) 161 | args = parser.parse_args() 162 | root_dir = Path(args.folder) / args.output 163 | 164 | # wrap it 165 | model = torch.jit.load(args.input) 166 | wrapper = FilteredRAVEv1ModelWrapper(model) 167 | 168 | soundpairs = None 169 | if args.sounds is not None: 170 | soundpairs = [] 171 | for sound in args.sounds: 172 | wave, sr = torchaudio.load(sound) 173 | input_sample = AudioSample(wave, sr) 174 | rendered_sample = render_audio_sample(wrapper, input_sample) 175 | soundpairs.append(AudioSamplePair(input_sample, rendered_sample)) 176 | 177 | save_neutone_model( 178 | wrapper, 179 | root_dir, 180 | freeze=False, 181 | dump_samples=True, 182 | submission=True, 183 | audio_sample_pairs=soundpairs, 184 | ) 185 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_spectral_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.realtime_stft import RealtimeSTFT 13 | from neutone_sdk.utils import save_neutone_model 14 | 15 | logging.basicConfig() 16 | log = logging.getLogger(__name__) 17 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 18 | 19 | 20 | class SpectralFilter(nn.Module): 21 | def __init__(self) -> None: 22 | """ 23 | Creates a spectral notch filter, where the bandwidth of the filter also changes as the center frequency changes. 
24 | """ 25 | super().__init__() 26 | self.base_constant = tr.tensor( 27 | 1025 / tr.e 28 | ) # Used to scale the controls somewhat to the STFT 29 | self.half_constant = tr.tensor(0.5) # Prevent dynamic memory allocations 30 | 31 | def _map_0to1_val_to_log_bin_idx(self, val: Tensor, max_bin: int) -> int: 32 | """ 33 | Maps a float tensor between [0.0, 1.0] to an integer between [0, max_bins] with the assumption that the 34 | bin indices follow a logarithmic spacing. 35 | """ 36 | idx = ( 37 | (tr.pow(self.base_constant, val) - 1.0) 38 | / (self.base_constant - 1.0) 39 | * max_bin 40 | ) 41 | idx = int(tr.clip(tr.round(idx), 0, max_bin)) 42 | return idx 43 | 44 | def forward( 45 | self, x: Tensor, center: Tensor, width: Tensor, amount: Tensor 46 | ) -> Tensor: 47 | """ 48 | Filters a positive valued magnitude spectrogram using a notch filter with controllable center, width, 49 | and amount of attenuation. 50 | 51 | Args: 52 | x: a magnitude spectrogram with shape (n_ch, n_bins, n_frames) 53 | center: 1D control value between [0.0, 1.0] for the center frequency of the filter. 54 | width: 1D control value between [0.0, 1.0] for the bandwidth of the filter. 55 | amount: 1D control value between [0.0, 1.0] for the amount of attenuation. 56 | """ 57 | if amount == 0.0: 58 | return x 59 | n_bins = x.size(1) # Figure out how many bins we have to work with 60 | # Find the center freq bin 61 | center_bin_idx = self._map_0to1_val_to_log_bin_idx(center, n_bins - 1) 62 | # Find the lowest freq bin 63 | lo_bin_idx = self._map_0to1_val_to_log_bin_idx( 64 | center * (1.0 - width), n_bins - 1 65 | ) 66 | lo_bin_idx = max(0, lo_bin_idx) 67 | # Find the highest freq bin 68 | hi_bin_idx = self._map_0to1_val_to_log_bin_idx( 69 | center + ((1.0 - center) * width), n_bins - 1 70 | ) 71 | hi_bin_idx = min(n_bins - 1, hi_bin_idx) 72 | # If the filter has 0 width, we don't need to do anything 73 | if hi_bin_idx - lo_bin_idx == 0: 74 | return x 75 | # Filter the low bins of the notch 76 | if center_bin_idx - lo_bin_idx > 0: 77 | # Using a linear spacing here is not ideal since the frequency bins are not linearly spaced, 78 | # but this is just an example 79 | lo_filter = 1.0 - ( 80 | tr.linspace(0.0, 1.0, center_bin_idx - lo_bin_idx + 2)[1:-1] * amount 81 | ) 82 | lo_filter = lo_filter.view(1, -1, 1) 83 | x[:, lo_bin_idx:center_bin_idx, :] *= lo_filter 84 | # Filter the high bins of the notch 85 | if hi_bin_idx - center_bin_idx > 0: 86 | # Using a linear spacing here is not ideal since the frequency bins are not linearly spaced, 87 | # but this is just an example 88 | hi_filter = 1.0 - ( 89 | tr.linspace(1.0, 0.0, hi_bin_idx - center_bin_idx + 1)[:-1] * amount 90 | ) 91 | hi_filter = hi_filter.view(1, -1, 1) 92 | x[:, center_bin_idx:hi_bin_idx, :] *= hi_filter 93 | return x 94 | 95 | 96 | class SpectralFilterWrapper(WaveformToWaveformBase): 97 | def __init__( 98 | self, 99 | spectral_filter_model: nn.Module, 100 | model_io_n_frames: int = 16, 101 | n_fft: int = 2048, 102 | hop_len: int = 512, 103 | fade_n_samples: int = 384, # Cross-fade for 3/4 of the hop_len to ensure no buzzing in the wet audio 104 | use_debug_mode: bool = True, 105 | ) -> None: 106 | """ 107 | Creates a modified WaveformToWaveformBase wrapper that can be used to create spectral neural audio effects. 108 | Feel free to use this as a starting point to make your own spectral effects! 109 | 110 | Args: 111 | spectral_filter_model: a spectral model, in this example a filter (could be replaced with anything). 
112 | model_io_n_frames: the number of STFT frames the spectral model expects as input and output. 113 | n_fft: n_fft to use for the STFT. 114 | hop_len: hop_len in samples to use for the STFT. 115 | fade_n_samples: no. of samples to crossfade between output buffers of audio after the inverse STFT. Adds a 116 | slight delay, but prevents clicks and pops in the output audio. 117 | use_debug_mode: makes debugging easier, is turned off automatically before the model is exported. 118 | """ 119 | super().__init__(spectral_filter_model, use_debug_mode) 120 | in_ch = 1 if self.is_input_mono() else 2 121 | self.stft = RealtimeSTFT( 122 | model_io_n_frames=model_io_n_frames, 123 | io_n_ch=in_ch, 124 | n_fft=n_fft, 125 | hop_len=hop_len, 126 | power=1.0, # Ensures an energy spectrogram 127 | logarithmize=False, # We don't need a log-magnitude spectrogram for this filter 128 | ensure_pos_spec=True, # Ensures a positive-valued spectrogram 129 | use_phase_info=True, # Keep the phase information for the inverse STFT 130 | fade_n_samples=fade_n_samples, 131 | use_debug_mode=use_debug_mode, 132 | ) 133 | self.stft.set_buffer_size(self.stft.calc_min_buffer_size()) 134 | if use_debug_mode: 135 | log.info(f"Supported buffer sizes = {self.get_native_buffer_sizes()}") 136 | log.info(f"Supported sample rate = {self.get_native_sample_rates()}") 137 | log.info(f"STFT delay = {self.calc_model_delay_samples()}") 138 | 139 | def get_model_name(self) -> str: 140 | return "spectral.filter" 141 | 142 | def get_model_authors(self) -> List[str]: 143 | return ["Christopher Mitcheltree"] 144 | 145 | def get_model_short_description(self) -> str: 146 | return "Spectral notch filter." 147 | 148 | def get_model_long_description(self) -> str: 149 | return ( 150 | "Filters the audio in the spectral domain using a central frequency, bandwidth, and amount. " 151 | "The bandwidth changes as the central frequency changes." 152 | ) 153 | 154 | def get_technical_description(self) -> str: 155 | return ( 156 | "Filters the audio in the spectral domain using a central frequency, bandwidth, and amount. " 157 | "The bandwidth changes as the central frequency changes." 158 | ) 159 | 160 | def get_technical_links(self) -> Dict[str, str]: 161 | return {} 162 | 163 | def get_tags(self) -> List[str]: 164 | return ["spectral", "filter", "notch filter", "stft", "template"] 165 | 166 | def get_model_version(self) -> str: 167 | return "1.0.0" 168 | 169 | def is_experimental(self) -> bool: 170 | return True 171 | 172 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 173 | return [ 174 | ContinuousNeutoneParameter( 175 | "center", "center frequency of the filter", default_value=0.3 176 | ), 177 | ContinuousNeutoneParameter("width", "width of the filter", default_value=0.5), 178 | ContinuousNeutoneParameter( 179 | "amount", "spectral attenuation amount", default_value=0.9 180 | ), 181 | ] 182 | 183 | @tr.jit.export 184 | def is_input_mono(self) -> bool: 185 | return False 186 | 187 | @tr.jit.export 188 | def is_output_mono(self) -> bool: 189 | return False 190 | 191 | @tr.jit.export 192 | def get_native_sample_rates(self) -> List[int]: 193 | # For consistent filtering across different sampling rates, a native sampling rate must be given. Feel free to 194 | # change this to your required sampling rate. 
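        # Editor's note (added, hedged): when a native rate is declared here, the
        # SDK's sample-queue wrapper is expected to resample between the host's
        # rate and 44.1 kHz on the model's behalf (at the cost of extra latency);
        # returning an empty list would instead pass the host's rate through
        # unchanged, as in the clipper examples.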
195 | return [44100] 196 | 197 | @tr.jit.export 198 | def get_native_buffer_sizes(self) -> List[int]: 199 | return ( 200 | self.stft.calc_supported_buffer_sizes() 201 | ) # Possible buffer sizes are determined by the STFT parameters 202 | 203 | @tr.jit.export 204 | def calc_model_delay_samples(self) -> int: 205 | # TODO(cm): make a model specific version of this method? 206 | return self.stft.calc_model_delay_samples() # This is equal to `fade_n_samples` 207 | 208 | def set_model_buffer_size(self, n_samples: int) -> bool: 209 | self.stft.set_buffer_size(n_samples) 210 | return True 211 | 212 | def reset_model(self) -> bool: 213 | self.stft.reset() 214 | return True 215 | 216 | def prepare_for_inference(self) -> None: 217 | super().prepare_for_inference() 218 | # This needs to be done explicitly until we have dedicated wrapper base class for spectral models 219 | self.stft.use_debug_mode = False 220 | self.stft.eval() 221 | 222 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 223 | center, width, amount = params["center"], params["width"], params["amount"] 224 | x = self.stft.audio_to_spec( 225 | x 226 | ) # Convert the audio to a spectrogram (n_ch, n_bins, n_frames) 227 | x = self.model.forward( 228 | x, center, width, amount 229 | ) # Apply the spectral filter and receive an altered spectrogram 230 | x = self.stft.spec_to_audio( 231 | x 232 | ) # Convert the filtered spectrogram back to audio (n_ch, n_samples) 233 | return x 234 | 235 | 236 | if __name__ == "__main__": 237 | parser = ArgumentParser() 238 | parser.add_argument("-o", "--output", default="export_model") 239 | args = parser.parse_args() 240 | root_dir = pathlib.Path(args.output) 241 | 242 | model = SpectralFilter() 243 | wrapper = SpectralFilterWrapper(model) 244 | save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True) 245 | -------------------------------------------------------------------------------- /examples/neutone_gen/example_clipper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.non_realtime_sqw import NonRealtimeSampleQueueWrapper 13 | from neutone_sdk.non_realtime_wrapper import NonRealtimeBase 14 | 15 | logging.basicConfig() 16 | log = logging.getLogger(__name__) 17 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 18 | 19 | 20 | class ClipperModel(nn.Module): 21 | def forward( 22 | self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor 23 | ) -> Tensor: 24 | tr.neg(min_val, out=min_val) 25 | tr.mul(gain, min_val, out=min_val) 26 | tr.mul(gain, max_val, out=max_val) 27 | tr.clip(x, min=min_val, max=max_val, out=x) 28 | return x 29 | # return x[:, :-4] 30 | # return tr.rand(2, 2048).fill_(0.5) 31 | 32 | 33 | class NonRealtimeClipperModelWrapper(NonRealtimeBase): 34 | def get_model_name(self) -> str: 35 | return "clipper" 36 | 37 | def get_model_authors(self) -> List[str]: 38 | return ["Christopher Mitcheltree"] 39 | 40 | def get_model_short_description(self) -> str: 41 | return "Audio clipper." 42 | 43 | def get_model_long_description(self) -> str: 44 | return "Clips the input audio between -1 and 1." 45 | 46 | def get_technical_description(self) -> str: 47 | return "Clips the input audio between -1 and 1." 
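    # Editor's note (added, hedged): compared with the realtime
    # WaveformToWaveformBase clipper, this non-realtime API declares its audio I/O
    # channel counts explicitly (get_audio_in_channels / get_audio_out_channels
    # below), receives a list of input tensors in do_forward_pass, and can declare
    # itself a one-shot generator via is_one_shot_model.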
48 | 
49 |     def get_technical_links(self) -> Dict[str, str]:
50 |         return {
51 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_gen/example_clipper.py"
52 |         }
53 | 
54 |     def get_tags(self) -> List[str]:
55 |         return ["clipper"]
56 | 
57 |     def get_model_version(self) -> str:
58 |         return "1.0.0"
59 | 
60 |     def is_experimental(self) -> bool:
61 |         return False
62 | 
63 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
64 |         return [
65 |             ContinuousNeutoneParameter("min", "min clip threshold", default_value=0.15),
66 |             ContinuousNeutoneParameter("max", "max clip threshold", default_value=0.15),
67 |             ContinuousNeutoneParameter(
68 |                 "gain", "scale clip threshold", default_value=1.0
69 |             ),
70 |         ]
71 | 
72 |     @tr.jit.export
73 |     def get_audio_in_channels(self) -> List[int]:
74 |         return [2]
75 | 
76 |     @tr.jit.export
77 |     def get_audio_out_channels(self) -> List[int]:
78 |         return [2]
79 | 
80 |     @tr.jit.export
81 |     def get_native_sample_rates(self) -> List[int]:
82 |         return []  # Supports all sample rates
83 | 
84 |     @tr.jit.export
85 |     def get_native_buffer_sizes(self) -> List[int]:
86 |         return []  # Supports all buffer sizes
87 | 
88 |     @tr.jit.export
89 |     def is_one_shot_model(self) -> bool:
90 |         return False
91 | 
92 |     def aggregate_continuous_params(self, cont_params: Tensor) -> Tensor:
93 |         return cont_params  # We want sample-level control, so no aggregation
94 | 
95 |     def do_forward_pass(
96 |         self,
97 |         curr_block_idx: int,
98 |         audio_in: List[Tensor],
99 |         numerical_params: Dict[str, Tensor],
100 |         text_params: List[str],
101 |         tokens_params: List[List[int]],
102 |     ) -> List[Tensor]:
103 |         min_val, max_val, gain = (
104 |             numerical_params["min"],
105 |             numerical_params["max"],
106 |             numerical_params["gain"],
107 |         )
108 |         audio_out = []
109 |         for x in audio_in:
110 |             x = self.model.forward(x, min_val, max_val, gain)
111 |             audio_out.append(x)
112 |         return audio_out
113 |         # return [self.model.forward(min_val, min_val, max_val, gain)]
114 | 
115 | 
116 | if __name__ == "__main__":
117 |     parser = ArgumentParser()
118 |     parser.add_argument("-o", "--output", default="export_model")
119 |     args = parser.parse_args()
120 |     root_dir = pathlib.Path(args.output)
121 | 
122 |     model = ClipperModel()
123 |     wrapper = NonRealtimeClipperModelWrapper(model)
124 |     sqw = NonRealtimeSampleQueueWrapper(wrapper)
125 | 
126 |     in_n_samples = 2048
127 |     audio_in = [tr.rand(1, in_n_samples)]
128 |     # audio_in = []
129 |     numerical_params = tr.rand(3, in_n_samples)
130 |     # numerical_params = None
131 | 
132 |     out = sqw.forward_non_realtime(audio_in, numerical_params)
133 |     log.info(f" out[0].shape: {out[0].shape}")
134 |     log.info(f" out: {out}")
135 | 
136 |     sqw.reset()
137 |     sqw.prepare_for_inference()
138 |     # TODO(cm): write export method for nonrealtime models
139 |     ts = tr.jit.script(sqw)
140 | 
141 |     out_ts = ts.forward_non_realtime(audio_in, numerical_params)
142 |     log.info(f"out_ts[0].shape: {out_ts[0].shape}")
143 |     log.info(f"out_ts: {out_ts}")
144 | 
--------------------------------------------------------------------------------
/examples/neutone_gen/example_musicgen_load.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import os
5 | import base64, io, tempfile
6 | from typing import List, Dict
7 | 
8 | import torch
9 | import torchaudio
10 | 
11 | from neutone_sdk import (
12 |     NeutoneParameter,
13 |     DiscreteTokensNeutoneParameter,
14 |     ContinuousNeutoneParameter,
15 | )
16
16 | from neutone_sdk.non_realtime_sqw import NonRealtimeSampleQueueWrapper
17 | from neutone_sdk.non_realtime_wrapper import NonRealtimeTokenizerBase, TokenizerType
18 | 
19 | """
20 | To run this script, you will need to install the tokenizers library, and also
21 | protobuf if you are using the sentencepiece tokenizer.
22 | """
23 | 
24 | TOK_TYPE = TokenizerType.SENTENCEPIECE
25 | 
26 | logging.basicConfig()
27 | log = logging.getLogger(__name__)
28 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
29 | 
30 | # class MusicGenWrapperNoTok(nn.Module):
31 | #     def __init__(self, text_encoder, lm, audio_decoder, enc_to_dec_proj, logits_processor, pad_token_id: int, decoder_start_token_id: int, delay_mask_fn, num_codebooks: int, audio_channels: int):
32 | #         super().__init__()
33 | #         self.text_encoder = text_encoder
34 | #         self.audio_decoder = audio_decoder
35 | #         self.lm = lm
36 | #         self.decoder_start_token_id = decoder_start_token_id
37 | #         self.delay_mask_fn = delay_mask_fn
38 | #         self.num_codebooks = num_codebooks
39 | #         self.audio_channels = audio_channels
40 | #         self.enc_to_dec_proj = enc_to_dec_proj
41 | #         self.logits_processor = logits_processor
42 | #         self.pad_token_id = pad_token_id
43 | 
44 | #     def prepare_text_encoder_kwargs_for_generation(self, input_ids):
45 | #         encoder_attention_mask = torch.where(input_ids==0, 0, 1)
46 | #         encoder_outputs = self.text_encoder(
47 | #             input_ids=input_ids,
48 | #             attention_mask=encoder_attention_mask,
49 | #         )['last_hidden_state']
50 | #         encoder_outputs = torch.concatenate([encoder_outputs, torch.zeros_like(encoder_outputs)], dim=0)
51 | #         encoder_attention_mask = torch.concatenate(
52 | #             [encoder_attention_mask, torch.zeros_like(encoder_attention_mask)], dim=0
53 | #         )
54 | #         return encoder_outputs, encoder_attention_mask
55 | 
56 | #     def apply_delay_pattern_mask(self, input_ids, decoder_pad_token_mask):
57 | #         """Apply a delay pattern mask to the decoder input ids, only preserving predictions where
58 | #         the mask is set to -1, and otherwise setting to the value detailed in the mask."""
59 | #         seq_len = input_ids.shape[-1]
60 | #         decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
61 | #         input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
62 | #         return input_ids
63 | 
64 | #     def prepare_inputs_for_generation(self, input_ids, encoder_outputs, delay_pattern_mask):
65 | #         input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
66 | #         # for classifier free guidance we need to replicate the decoder args across the batch dim (we'll split these
67 | #         # before sampling)
68 | #         input_ids = input_ids.repeat((2, 1))
69 | #         return input_ids, encoder_outputs
70 | 
71 | #     def prepare_decoder_input_ids_for_generation(self, batch_size: int):
72 | #         return torch.ones(batch_size * self.num_codebooks, 1, dtype=torch.long) * self.decoder_start_token_id
73 | 
74 | #     def preprocess(self, text_ids: torch.Tensor, max_length: int)-> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
75 | #         with torch.no_grad():
76 | #             batch_size = text_ids.shape[0]
77 | #             encoder_outputs, encoder_attention_mask = self.prepare_text_encoder_kwargs_for_generation(text_ids)
78 | #             encoder_outputs = self.enc_to_dec_proj(encoder_outputs)
79 | #             input_ids = self.prepare_decoder_input_ids_for_generation(batch_size)
80 | #             input_ids, delay_pattern_mask = self.delay_mask_fn(input_ids, self.decoder_start_token_id, max_length, self.num_codebooks, self.audio_channels)
81 | #         return input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask
82 | 
83 | #     def sample_step(self, input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask):
84 | #         i_ids, enc_out = self.prepare_inputs_for_generation(input_ids, encoder_outputs, delay_pattern_mask)
85 | #         outputs = self.lm(input_ids=i_ids, encoder_hidden_states=enc_out, encoder_attention_mask=encoder_attention_mask)
86 | #         next_token_logits = outputs['logits'][:, -1, :]
87 | #         # TODO temperature
88 | #         next_token_scores = self.logits_processor(input_ids, next_token_logits)
89 | #         probs = nn.functional.softmax(next_token_scores, dim=-1)
90 | #         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
91 | #         # update generated ids, model inputs, and length for next step
92 | #         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
93 | #         return input_ids # update input_ids in the next call
94 | 
95 | #     def postprocess(self, input_ids: torch.Tensor, delay_pattern_mask: torch.Tensor, text_ids: torch.Tensor):
96 | #         batch_size = text_ids.shape[0]
97 | #         output_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
98 | #         output_ids = output_ids[output_ids != self.decoder_start_token_id].reshape(
99 | #             batch_size, self.num_codebooks, -1
100 | #         )
101 | #         # append the frame dimension back to the audio codes
102 | #         output_ids = output_ids[None, ...]
103 | #         output_values = self.audio_decoder(output_ids)
104 | #         return output_values # update input_ids in the next call
105 | 
106 | #     def forward(self, text_ids: torch.Tensor, max_length: int):
107 | #         with torch.no_grad():
108 | #             input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask = self.preprocess(text_ids, max_length)
109 | #             # sample
110 | #             for _ in range(max_length-1):
111 | #                 input_ids = self.sample_step(input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask)
112 | #             output_values = self.postprocess(input_ids, delay_pattern_mask, text_ids)
113 | #         return output_values
114 | 
115 | 
116 | class NonRealtimeMusicGenModelWrapper(NonRealtimeTokenizerBase):
117 |     def get_model_name(self) -> str:
118 |         return "MusicGen"
119 | 
120 |     def get_model_authors(self) -> List[str]:
121 |         return ["Naotake Masuda"]
122 | 
123 |     def get_model_short_description(self) -> str:
124 |         return ""
125 | 
126 |     def get_model_long_description(self) -> str:
127 |         return ""
128 | 
129 |     def get_technical_description(self) -> str:
130 |         return ""
131 | 
132 |     def get_technical_links(self) -> Dict[str, str]:
133 |         return {
134 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_gen/example_musicgen_load.py"
135 |         }
136 | 
137 |     def get_tags(self) -> List[str]:
138 |         return ["musicgen"]
139 | 
140 |     def get_model_version(self) -> str:
141 |         return "1.0.0"
142 | 
143 |     def is_experimental(self) -> bool:
144 |         return False
145 | 
146 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
147 |         return [
148 |             DiscreteTokensNeutoneParameter(
149 |                 "texttokens",
150 |                 "tokens from a text tokenizer",
151 |                 default_value=[
152 |                     2775,
153 |                     7,
154 |                     2783,
155 |                     1463,
156 |                     28,
157 |                     7981,
158 |                     63,
159 |                     5253,
160 |                     7,
161 |                     11,
162 |                     13353,
163 |                     1,
164 |                 ],
165 |             ),
166 |             ContinuousNeutoneParameter(
167 |                 "outputlength", "number of output tokens", default_value=0.5
168 |             ),
169 |         ]
170 | 
171 |     @torch.jit.export
172 |     def get_audio_in_channels(self) -> List[int]:
173 |         return []
174 | 
175 |     @torch.jit.export
176 |     def get_audio_out_channels(self) -> List[int]:
177 |         return [1]
178 | 
179 |     @torch.jit.export
180 |     def get_native_sample_rates(self) -> List[int]:
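        # Note: MusicGen decodes audio at 32 kHz via its EnCodec-based codec
        # (an assumption inferred from the hardcoded rate below), hence the
        # single advertised native sample rate.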
181 |         return [32000]
182 | 
183 |     @torch.jit.export
184 |     def get_native_buffer_sizes(self) -> List[int]:
185 |         return []  # Supports all buffer sizes
186 | 
187 |     @torch.jit.export
188 |     def is_one_shot_model(self) -> bool:
189 |         return True
190 | 
191 |     @torch.jit.export
192 |     def has_progress_percentage(self) -> bool:
193 |         return True
194 | 
195 |     def aggregate_continuous_params(self, cont_params: torch.Tensor) -> torch.Tensor:
196 |         return cont_params  # We want sample-level control, so no aggregation
197 | 
198 |     def do_forward_pass(
199 |         self,
200 |         curr_block_idx: int,
201 |         audio_in: List[torch.Tensor],
202 |         knob_params: Dict[str, torch.Tensor],
203 |         text_params: List[str],
204 |         tokens_params: List[List[int]],
205 |     ) -> List[torch.Tensor]:
206 |         audio_out = []
207 |         output_length = int(knob_params["outputlength"].mean() * 500)
208 |         tokens = tokens_params[0]
209 |         # Convert to LongTensor with batch size of 1
210 |         tokens = torch.LongTensor(tokens).unsqueeze(0)
211 |         with torch.no_grad():
212 |             input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask = (
213 |                 self.model.preprocess(tokens, output_length)
214 |             )
215 |             for i in range(output_length - 1):
216 |                 input_ids = self.model.sample_step(
217 |                     input_ids,
218 |                     encoder_outputs,
219 |                     delay_pattern_mask,
220 |                     encoder_attention_mask,
221 |                 )
222 |                 self.set_progress_percentage(float(i + 1) / output_length * 100)
223 |                 if self.should_cancel_forward_pass():
224 |                     # Can't return empty list for some reason
225 |                     break
226 |             x = self.model.postprocess(input_ids, delay_pattern_mask, tokens)
227 |             audio_out.append(x.squeeze(1))
228 |         return audio_out
229 | 
230 | 
231 | 
232 | if __name__ == "__main__":
233 |     from tokenizers import Tokenizer, SentencePieceUnigramTokenizer
234 | 
235 |     parser = argparse.ArgumentParser()
236 |     parser.add_argument("--model", default="musicgen-model", type=str)
237 |     args = parser.parse_args()
238 |     model = torch.jit.load("../../out/musicgen_scripted_notok.ts")
239 |     if TOK_TYPE == TokenizerType.SENTENCEPIECE:
240 |         tok_path = str("../../out/spiece.model")
241 |         with open(tok_path, mode="rb") as f:
242 |             tok_string = base64.b64encode(f.read()).decode()
243 |         tokenizer = SentencePieceUnigramTokenizer.from_spm(tok_path)
244 |     elif TOK_TYPE == TokenizerType.JSON:
245 |         tok_path = str("../../out/tokenizer.json")
246 |         with open(tok_path, "r", encoding="utf-8") as f:
247 |             tok_string = json.dumps(json.load(f), ensure_ascii=True)
248 |         tokenizer = Tokenizer.from_file(tok_path)
249 | 
250 |     wrapped = NonRealtimeMusicGenModelWrapper(model, tok_string, TOK_TYPE)
251 |     tokens = tokenizer.encode("80s pop track with bassy drums and synth").ids
252 | 
253 |     sqw = NonRealtimeSampleQueueWrapper(wrapped)
254 |     out = sqw.forward_non_realtime(
255 |         [],
256 |         torch.ones(1, 2048) * 0.2,
257 |         tokens_params=[tokens],
258 |     )
259 |     sqw.reset()
260 |     sqw.prepare_for_inference()
261 |     log.info(f" out[0].shape: {out[0].shape}")
262 |     log.info(f" out: {out}")
263 |     ts = torch.jit.script(sqw)
264 |     log.info("Scripting successful")
265 |     n_samples = 2048
266 |     tokens = tokenizer.encode("90s rock song with loud guitars and heavy drums").ids
267 |     out_ts = ts.forward_non_realtime(
268 |         [],
269 |         torch.ones(1, 2048) * 0.2,
270 |         tokens_params=[tokens],
271 |     )
272 |     log.info(f"out_ts[0].shape: {out_ts[0].shape}")
273 |     log.info(f"out_ts: {out_ts}")
274 |     torchaudio.save("../../out/out_ts.wav", out_ts[0], sample_rate=32000)
275 |     torch.jit.save(ts, 
"../../out/wrapped-musicgen.ts") 276 | model = torch.jit.load("../../out/wrapped-musicgen.ts") 277 | # test saved tokenizer 278 | print(f"saved with {model.get_tokenizer_type()} tokenizer") 279 | if TOK_TYPE == TokenizerType.SENTENCEPIECE: 280 | tok_bin = base64.b64decode(model.get_tokenizer_str()) 281 | # Create a named temporary file that is deleted when closed 282 | with tempfile.NamedTemporaryFile( 283 | mode="wb", delete=False, suffix=".model" 284 | ) as temp_model_file: 285 | temp_model_file.write(tok_bin) 286 | temp_model_file_path = temp_model_file.name 287 | tokenizer = SentencePieceUnigramTokenizer.from_spm(temp_model_file_path) 288 | elif TOK_TYPE == TokenizerType.JSON: 289 | tokenizer = Tokenizer.from_str(model.get_tokenizer_str()) 290 | print(tokenizer.decode(tokens)) 291 | -------------------------------------------------------------------------------- /neutone_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from .parameter import * 3 | from .wavform_to_wavform import * 4 | from .sqw import * 5 | from . import utils 6 | -------------------------------------------------------------------------------- /neutone_sdk/assets/default_samples/sample_ambience.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/assets/default_samples/sample_ambience.mp3 -------------------------------------------------------------------------------- /neutone_sdk/assets/default_samples/sample_drums.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/assets/default_samples/sample_drums.mp3 -------------------------------------------------------------------------------- /neutone_sdk/assets/default_samples/sample_rhodes.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/assets/default_samples/sample_rhodes.mp3 -------------------------------------------------------------------------------- /neutone_sdk/audio.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from cffi import FFI 3 | from dataclasses import dataclass 4 | import logging 5 | import math 6 | import io 7 | import pkgutil 8 | from typing import Optional, List, Union 9 | from typing_extensions import Self 10 | 11 | import numpy as np 12 | import torch as tr 13 | from torch import nn, Tensor 14 | import torchaudio 15 | import soundfile as sf 16 | from torch.jit import ScriptModule 17 | from tqdm import tqdm 18 | 19 | import neutone_sdk 20 | 21 | logging.basicConfig() 22 | log = logging.getLogger(__name__) 23 | 24 | 25 | def write_mp3(buffer: io.BytesIO, y: tr.Tensor, sr: int, quality: float = 0): 26 | """ 27 | We're using this instead of sf.write in order to change the bitrate, 28 | where quality goes from 0 (high) to 1 (low). 29 | 30 | The API is similar to torchaudio.save, so y should be (num_channels, num_samples). 
31 | """ 32 | assert 0 <= quality <= 1 33 | assert ( 34 | y.shape[0] < y.shape[1] 35 | ), "Expecting audio to have a shape of (num_channels, num_samples), try swapping the dimensions" 36 | ffi = FFI() 37 | quality = ffi.new("double *") 38 | vbr_set = ffi.new("int *") 39 | with sf.SoundFile( 40 | buffer, "w", channels=y.shape[0], samplerate=sr, format="mp3" 41 | ) as f: 42 | quality[0] = 0 # 0[high]~1[low] 43 | # 0x1301 - SFC_SET_COMPRESSION_LEVEL 44 | c = sf._snd.sf_command(f._file, 0x1301, quality, 8) 45 | assert c == sf._snd.SF_TRUE, "Couldn't set bitrate on MP3" 46 | 47 | # 0x1305 - SFC_SET_BITRATE_MODE 48 | vbr_set[0] = 2 # 0 - CONSTANT, 1 - AVERAGE, 2 - VARIABLE 49 | c = sf._snd.sf_command(f._file, 0x1305, vbr_set, 4) 50 | assert c == sf._snd.SF_TRUE, "Couldn't set MP3 to VBR" 51 | 52 | f.write(y.T.numpy()) 53 | assert f.closed 54 | 55 | 56 | @dataclass 57 | class AudioSample: 58 | """ 59 | AudioSample is simply a pair of (audio, sample_rate) that is easier to work 60 | with within the SDK. We recommend users to read and write to mp3 files as 61 | they are better supported and formats like ogg can have subtle bugs when 62 | reading and writing using the current backend (soundfile). 63 | """ 64 | 65 | audio: Tensor 66 | sr: int 67 | 68 | def __post_init__(self): 69 | assert self.audio.ndim == 2 70 | assert ( 71 | self.audio.size(0) == 1 or self.audio.size(0) == 2 72 | ), "Audio sample audio should be 1 or 2 channels, channels first" 73 | 74 | def is_mono(self) -> bool: 75 | return self.audio.size(0) == 1 76 | 77 | def to_mp3_bytes(self) -> bytes: 78 | buff = io.BytesIO() 79 | write_mp3(buff, self.audio, self.sr) 80 | buff.seek(0) 81 | return buff.read() 82 | 83 | def to_mp3_b64(self) -> str: 84 | return base64.b64encode(self.to_mp3_bytes()).decode() 85 | 86 | @classmethod 87 | def from_bytes(cls, bytes_: bytes) -> Self: 88 | y, sr = sf.read(io.BytesIO(bytes_), always_2d=True) 89 | return cls(tr.from_numpy(y.T.astype(np.float32)), sr) 90 | 91 | @classmethod 92 | def from_file(cls, path: str) -> Self: 93 | with open(path, "rb") as f: 94 | return cls.from_bytes(f.read()) 95 | 96 | @classmethod 97 | def from_b64(cls, b64_sample: str) -> Self: 98 | return cls.from_bytes(base64.b64decode(b64_sample)) 99 | 100 | 101 | @dataclass 102 | class AudioSamplePair: 103 | input: AudioSample 104 | output: AudioSample 105 | 106 | def to_metadata_format(self): 107 | return { 108 | "in": self.input.to_mp3_b64(), 109 | "out": self.output.to_mp3_b64(), 110 | } 111 | 112 | 113 | def get_default_audio_samples() -> List[AudioSample]: 114 | """ 115 | Returns a list of audio samples to be displayed on the website. 116 | 117 | The SDK provides one sample by default, but this method can be used to 118 | provide different samples. 119 | 120 | By default the outputs of this function will be ran through the model 121 | and the prerendered samples will be stored inside the saved object. 122 | 123 | See get_prerendered_audio_samples and render_audio_sample for more details. 124 | """ 125 | log.info( 126 | "Using default sample... 
Please consider using your own audio samples by overriding the get_audio_samples method" 127 | ) 128 | sample_ambience = AudioSample.from_bytes( 129 | pkgutil.get_data(__package__, "assets/default_samples/sample_ambience.mp3"), 130 | ) 131 | sample_drums = AudioSample.from_bytes( 132 | pkgutil.get_data(__package__, "assets/default_samples/sample_drums.mp3"), 133 | ) 134 | sample_rhodes = AudioSample.from_bytes( 135 | pkgutil.get_data(__package__, "assets/default_samples/sample_rhodes.mp3"), 136 | ) 137 | 138 | return [sample_rhodes, sample_drums, sample_ambience] 139 | 140 | 141 | def render_audio_sample( 142 | model: Union["SampleQueueWrapper", "WaveformToWaveformBase", ScriptModule], 143 | input_sample: AudioSample, 144 | params: Optional[Tensor] = None, 145 | output_sr: int = 44100, 146 | ) -> AudioSample: 147 | """ 148 | params: either [model.MAX_N_PARAMS] 1d tensor of constant parameter values 149 | or [model.MAX_N_PARAMS, input_sample.audio.size(1)] 2d tensor of parameter values for every input audio sample 150 | """ 151 | 152 | with tr.no_grad(): 153 | model.use_debug_mode = True # Turn on debug mode to catch common mistakes when rendering sample audio 154 | 155 | preferred_sr = neutone_sdk.SampleQueueWrapper.select_best_model_sr( 156 | input_sample.sr, model.get_native_sample_rates() 157 | ) 158 | if len(model.get_native_buffer_sizes()) > 0: 159 | buffer_size = model.get_native_buffer_sizes()[0] 160 | else: 161 | buffer_size = 512 162 | 163 | audio = input_sample.audio 164 | if input_sample.sr != preferred_sr: 165 | audio = torchaudio.transforms.Resample(input_sample.sr, preferred_sr)(audio) 166 | 167 | if model.is_input_mono() and not input_sample.is_mono(): 168 | audio = tr.mean(audio, dim=0, keepdim=True) 169 | elif not model.is_input_mono() and input_sample.is_mono(): 170 | audio = audio.repeat(2, 1) 171 | 172 | audio_len = audio.size(1) 173 | padding_amount = math.ceil(audio_len / buffer_size) * buffer_size - audio_len 174 | padded_audio = nn.functional.pad(audio, [0, padding_amount]) 175 | audio_chunks = padded_audio.split(buffer_size, dim=1) 176 | 177 | model.set_daw_sample_rate_and_buffer_size( 178 | preferred_sr, buffer_size, preferred_sr, buffer_size 179 | ) 180 | 181 | # make sure the shape of params is compatible with the model calls. 
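        # For example (hypothetical values), a model with MAX_N_PARAMS == 4 could
        # be rendered with constant knob settings:
        #     params = tr.tensor([0.5, 0.25, 1.0, 0.0])
        # or with per-sample automation curves matching the input length:
        #     params = tr.rand(4, input_sample.audio.size(1))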
182 | if params is not None: 183 | assert params.shape[0] == model.MAX_N_PARAMS 184 | 185 | # if constant values, copy across audio dimension 186 | if params.dim() == 1: 187 | params = params.repeat([audio_len, 1]).T 188 | 189 | # otherwise resample to match audio 190 | else: 191 | assert params.shape == (model.MAX_N_PARAMS, input_sample.audio.size(1)) 192 | params = torchaudio.transforms.Resample(input_sample.sr, preferred_sr)( 193 | params 194 | ) 195 | params = tr.clamp(params, 0, 1) 196 | 197 | # padding and chunking parameters to match audio 198 | padded_params = nn.functional.pad( 199 | params, [0, padding_amount], mode="replicate" 200 | ) 201 | param_chunks = padded_params.split(buffer_size, dim=1) 202 | 203 | out_chunks = [ 204 | model.forward(audio_chunk, param_chunk).clone() 205 | for audio_chunk, param_chunk in tqdm( 206 | zip(audio_chunks, param_chunks), total=len(audio_chunks) 207 | ) 208 | ] 209 | 210 | else: 211 | out_chunks = [ 212 | model.forward(audio_chunk, None).clone() 213 | for audio_chunk in tqdm(audio_chunks) 214 | ] 215 | 216 | audio_out = tr.hstack(out_chunks)[:, :audio_len] 217 | 218 | model.reset() 219 | 220 | if preferred_sr != output_sr: 221 | audio_out = torchaudio.transforms.Resample(preferred_sr, output_sr)( 222 | audio_out 223 | ) 224 | 225 | # Make the output audio consistent with the input audio 226 | if audio_out.size(0) == 1 and not input_sample.is_mono(): 227 | audio_out = audio_out.repeat(2, 1) 228 | elif audio_out.size(0) == 2 and input_sample.is_mono(): 229 | audio_out = tr.mean(audio_out, dim=0, keepdim=True) 230 | 231 | return AudioSample(audio_out, output_sr) 232 | -------------------------------------------------------------------------------- /neutone_sdk/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import timeit 4 | import itertools 5 | from typing import List 6 | import click 7 | import torch 8 | from torch.autograd.profiler import record_function 9 | from neutone_sdk import constants 10 | from neutone_sdk.sqw import SampleQueueWrapper 11 | from neutone_sdk.utils import load_neutone_model, model_to_torchscript 12 | import numpy as np 13 | from tqdm import tqdm 14 | 15 | logging.basicConfig() 16 | log = logging.getLogger(__name__) 17 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 18 | 19 | 20 | @click.group() 21 | def cli(): 22 | """This is needed to make a group command with click.""" 23 | pass 24 | 25 | 26 | @cli.command() 27 | @click.option("--model_file", help="Path to model file") 28 | @click.option( 29 | "--buffer_size", 30 | default=(128, 256, 512, 1024, 2048), 31 | multiple=True, 32 | help="Buffer sizes to benchmark", 33 | ) 34 | @click.option( 35 | "--sample_rate", 36 | default=(48000,), 37 | multiple=True, 38 | help="Sample rates to benchmark", 39 | ) 40 | @click.option("--repeat", default=10, help="How many times to repeat the benchmark") 41 | @click.option( 42 | "--n_iters", 43 | default=30, 44 | help="How many forward passes to run for each repetition", 45 | ) 46 | @click.option( 47 | "--daw_is_mono", 48 | default=False, 49 | help="Whether to assume daw is mono or not during the benchmark", 50 | ) 51 | @click.option("--num_threads", default=1, help="num_threads to use for the benchmark") 52 | @click.option( 53 | "--num_interop_threads", 54 | default=1, 55 | help="num_interop_threads to use for the benchmark", 56 | ) 57 | def benchmark_speed( 58 | model_file: str, 59 | buffer_size: List[int], 60 | sample_rate: List[int], 61 | repeat: 
int, 62 | n_iters: int, 63 | daw_is_mono: bool, 64 | num_threads: int, 65 | num_interop_threads: int, 66 | ) -> None: 67 | return benchmark_speed_( 68 | model_file, 69 | buffer_size, 70 | sample_rate, 71 | repeat, 72 | n_iters, 73 | daw_is_mono, 74 | num_threads, 75 | num_interop_threads, 76 | ) 77 | 78 | 79 | def benchmark_speed_( 80 | model_file: str, 81 | buffer_size: List[int] = (128, 256, 512, 1024, 2048), 82 | sample_rate: List[int] = (48000,), 83 | repeat: int = 10, 84 | n_iters: int = 30, 85 | daw_is_mono: bool = False, 86 | num_threads: int = 1, 87 | num_interop_threads: int = 1, 88 | ) -> None: 89 | daw_n_ch = 1 if daw_is_mono else 2 90 | np.set_printoptions(precision=3) 91 | torch.set_num_threads(num_threads) 92 | torch.set_num_interop_threads(num_interop_threads) 93 | with torch.no_grad(): 94 | m, _ = load_neutone_model(model_file) 95 | log.info( 96 | f"Running benchmark for buffer sizes {buffer_size} and sample rates {sample_rate}. Outliers will be removed from the calculation of mean and std and displayed separately if existing." 97 | ) 98 | for sr, bs in itertools.product(sample_rate, buffer_size): 99 | m.set_daw_sample_rate_and_buffer_size(sr, bs) 100 | for _ in range(n_iters): # Warmup 101 | m.forward(torch.rand((daw_n_ch, bs))) 102 | m.reset() 103 | 104 | # Pregenerate random buffers to more accurately benchmark the model itself 105 | def get_random_buffer_generator(): 106 | buffers = torch.rand(100, daw_n_ch, bs) 107 | i = 0 108 | 109 | def return_next_random_buffer(): 110 | nonlocal i 111 | i = (i + 1) % 100 112 | return buffers[i] 113 | 114 | return return_next_random_buffer 115 | 116 | rbg = get_random_buffer_generator() 117 | 118 | durations = np.array( 119 | timeit.repeat(lambda: m.forward(rbg()), repeat=repeat, number=n_iters) 120 | ) 121 | m.reset() 122 | mean, std = np.mean(durations), np.std(durations) 123 | outlier_mask = np.abs(durations - mean) > 2 * std 124 | outliers = durations[outlier_mask] 125 | # Remove outliers from general benchmark 126 | durations = durations[~outlier_mask] 127 | mean, std = np.mean(durations), np.std(durations) 128 | log.info( 129 | f"Sample rate: {sr: 6} | Buffer size: {bs: 6} | duration: {mean: 6.3f}±{std:.3f} | 1/RTF: {bs/(mean/n_iters*sr): 6.3f} | Outliers: {outliers[:3]}" 130 | ) 131 | 132 | 133 | @cli.command() 134 | @click.option("--model_file", help="Path to model file") 135 | @click.option( 136 | "--buffer_size", 137 | default=(128, 256, 512, 1024, 2048), 138 | multiple=True, 139 | help="Buffer sizes to benchmark", 140 | ) 141 | @click.option( 142 | "--sample_rate", 143 | default=( 144 | 44100, 145 | 48000, 146 | ), 147 | multiple=True, 148 | help="Sample rates to benchmark", 149 | ) 150 | def benchmark_latency( 151 | model_file: str, buffer_size: List[int], sample_rate: List[int] 152 | ) -> None: 153 | return benchmark_latency_(model_file, buffer_size, sample_rate) 154 | 155 | 156 | def benchmark_latency_( 157 | model_file: str, 158 | buffer_size: List[int] = (128, 256, 512, 1024, 2048), 159 | sample_rate: List[int] = (48000,), 160 | ) -> None: 161 | m, _ = load_neutone_model(model_file) 162 | nbs, nsr = m.get_native_buffer_sizes(), m.get_native_sample_rates() 163 | log.info(f"Native buffer sizes: {nbs[:10]}, Native sample rates: {nsr[:10]}") 164 | if len(nbs) > 10 or len(nsr) > 10: 165 | log.info(f"Showing only the first 10 values in case there are more.") 166 | with torch.no_grad(): 167 | delays = [] 168 | for sr, bs in itertools.product(sample_rate, buffer_size): 169 | m.set_daw_sample_rate_and_buffer_size(sr, bs) 170 | 
m.reset() 171 | delays += [ 172 | [ 173 | sr, 174 | bs, 175 | m.calc_buffering_delay_samples(), 176 | m.calc_model_delay_samples(), 177 | ] 178 | ] 179 | delays = sorted(delays, key=lambda x: x[2] + x[3]) 180 | log.info( 181 | f"Model {model_file} has the following delays for each sample rate / buffer size combination (lowest delay first):" 182 | ) 183 | for sr, bs, bds, mds in delays: 184 | log.info( 185 | f"Sample rate: {sr: 6} | Buffer size: {bs: 6} | Total delay: {bds+mds: 6} | (Buffering delay: {bds: 6} | Model delay: {mds: 6})" 186 | ) 187 | log.info( 188 | f"The recommended sample rate / buffer size combination is sample rate {delays[0][0]}, buffer size {delays[0][1]}" 189 | ) 190 | 191 | 192 | def profile_sqw( 193 | sqw: SampleQueueWrapper, 194 | daw_sr: int = 48000, 195 | daw_bs: int = 512, 196 | daw_is_mono: bool = False, 197 | use_params: bool = True, 198 | convert_to_torchscript: bool = False, 199 | n_iters: int = 100, 200 | ) -> None: 201 | daw_n_ch = 1 if daw_is_mono else 2 202 | audio_buffers = [torch.rand((daw_n_ch, daw_bs)) for _ in range(n_iters)] 203 | if use_params: 204 | param_buffers = [ 205 | torch.rand((constants.MAX_N_PARAMS, daw_bs)) for _ in range(n_iters) 206 | ] 207 | else: 208 | param_buffers = [None for _ in range(n_iters)] 209 | 210 | sqw.set_daw_sample_rate_and_buffer_size(daw_sr, daw_bs) 211 | if hasattr(sqw, "prepare_for_inference"): 212 | sqw.prepare_for_inference() 213 | if convert_to_torchscript: 214 | log.info("Converting to TorchScript") 215 | with torch.no_grad(): 216 | sqw = model_to_torchscript(sqw, freeze=False, optimize=False) 217 | 218 | with torch.inference_mode(): 219 | with torch.profiler.profile( 220 | activities=[torch.profiler.ProfilerActivity.CPU], 221 | with_stack=True, 222 | profile_memory=True, 223 | record_shapes=False, 224 | ) as prof: 225 | with record_function("forward"): 226 | for audio_buff, param_buff in tqdm(zip(audio_buffers, param_buffers)): 227 | out_buff = sqw.forward(audio_buff, param_buff) 228 | 229 | log.info("Displaying Total CPU Time") 230 | log.info(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)) 231 | # log.info(prof.key_averages(group_by_stack_n=5).table(sort_by="cpu_time_total", row_limit=10)) 232 | log.info("Displaying CPU Memory Usage") 233 | log.info( 234 | prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10) 235 | ) 236 | log.info("Displaying Grouped CPU Memory Usage") 237 | log.info( 238 | prof.key_averages(group_by_stack_n=5).table( 239 | sort_by="self_cpu_memory_usage", row_limit=5 240 | ) 241 | ) 242 | 243 | 244 | @cli.command() 245 | @click.option("--model_file", help="Path to model file") 246 | @click.option( 247 | "--buffer_size", 248 | default=(128,), 249 | multiple=True, 250 | help="Buffer sizes to benchmark", 251 | ) 252 | @click.option( 253 | "--sample_rate", 254 | default=(48000,), 255 | multiple=True, 256 | help="Sample rates to benchmark", 257 | ) 258 | @click.option( 259 | "--daw_is_mono", 260 | default=False, 261 | help="Whether to assume daw is mono or not during the benchmark", 262 | ) 263 | @click.option( 264 | "--use_params", 265 | default=False, 266 | help="Whether to pass parameters to the model during profiling", 267 | ) 268 | @click.option( 269 | "--n_iters", 270 | default=30, 271 | help="How many forward passes to run while profiling", 272 | ) 273 | @click.option("--num_threads", default=1, help="num_threads to use for the benchmark") 274 | @click.option( 275 | "--num_interop_threads", 276 | default=1, 277 | help="num_interop_threads to use for 
the benchmark", 278 | ) 279 | def profile( 280 | model_file: str, 281 | buffer_size: List[int], 282 | sample_rate: List[int], 283 | daw_is_mono: bool = False, 284 | use_params: bool = True, 285 | n_iters: int = 100, 286 | num_threads: int = 1, 287 | num_interop_threads: int = 1, 288 | ): 289 | torch.set_num_threads(num_threads) 290 | torch.set_num_interop_threads(num_interop_threads) 291 | m, _ = load_neutone_model(model_file) 292 | for sr, bs in itertools.product(sample_rate, buffer_size): 293 | log.info( 294 | f"Profiling model {model_file} at sample rate {sr} and buffer size {bs}" 295 | ) 296 | profile_sqw( 297 | m, 298 | sr, 299 | bs, 300 | daw_is_mono, 301 | use_params, 302 | False, 303 | n_iters, 304 | ) 305 | 306 | 307 | if __name__ == "__main__": 308 | cli() 309 | -------------------------------------------------------------------------------- /neutone_sdk/cached_mel_spec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional, Callable 4 | 5 | import torch as tr 6 | from torch import Tensor 7 | from torch import nn 8 | from torchaudio.transforms import MelSpectrogram 9 | 10 | from neutone_sdk import CircularInplaceTensorQueue 11 | 12 | logging.basicConfig() 13 | log = logging.getLogger(__name__) 14 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 15 | 16 | 17 | class CachedMelSpec(nn.Module): 18 | def __init__( 19 | self, 20 | sr: int, 21 | n_ch: int, 22 | n_fft: int = 2048, 23 | hop_len: int = 512, 24 | f_min: float = 0.0, 25 | f_max: Optional[float] = None, 26 | n_mels: int = 128, 27 | window_fn: Callable[..., Tensor] = tr.hann_window, 28 | power: float = 2.0, 29 | normalized: bool = False, 30 | center: bool = True, 31 | use_debug_mode: bool = True, 32 | ) -> None: 33 | """ 34 | Creates a Mel spectrogram that supports streaming of a centered, non-causal 35 | Mel spectrogram operation that uses zero padding. Using this will result in 36 | audio being delayed by (n_fft / 2) - hop_len samples. When calling forward, 37 | the input audio block length must be a multiple of the hop length. 
38 | 39 | Parameters: 40 | sr (int): Sample rate of the audio 41 | n_ch (int): Number of audio channels 42 | n_fft (int): STFT n_fft (must be even) 43 | hop_len (int): STFT hop length (must divide into n_fft // 2) 44 | f_min (float): Minimum frequency of the Mel filterbank 45 | f_max (float): Maximum frequency of the Mel filterbank 46 | n_mels (int): Number of mel filterbank bins 47 | window_fn (Callable[..., Tensor]): A function to create a window tensor 48 | power (float): Exponent for the magnitude spectrogram (must be > 0) 49 | normalized (bool): Whether to normalize the mel spectrogram or not 50 | center (bool): Whether to center the mel spectrogram (must be True) 51 | use_debug_mode (bool): Whether to use debug mode or not 52 | """ 53 | super().__init__() 54 | assert center, "center must be True, causal mode is not supported yet" 55 | assert n_fft % 2 == 0, "n_fft must be even" 56 | assert (n_fft // 2) % hop_len == 0, "n_fft // 2 must be divisible by hop_len" 57 | self.n_ch = n_ch 58 | self.n_fft = n_fft 59 | self.hop_len = hop_len 60 | self.use_debug_mode = use_debug_mode 61 | self.mel_spec = MelSpectrogram( 62 | sample_rate=sr, 63 | n_fft=n_fft, 64 | hop_length=hop_len, 65 | f_min=f_min, 66 | f_max=f_max, 67 | n_mels=n_mels, 68 | window_fn=window_fn, 69 | power=power, 70 | normalized=normalized, 71 | center=False, # We use a causal STFT since we do the padding ourselves 72 | ) 73 | self.padding_n_samples = self.n_fft - self.hop_len 74 | self.cache = CircularInplaceTensorQueue( 75 | n_ch, self.padding_n_samples, use_debug_mode 76 | ) 77 | self.register_buffer("padding", tr.zeros((n_ch, self.padding_n_samples))) 78 | self.cache.push(self.padding) 79 | 80 | def forward(self, x: Tensor) -> Tensor: 81 | """ 82 | Computes the Mel spectrogram of the input audio tensor. Supports streaming as 83 | long as the input audio tensor is a multiple of the hop length. 84 | """ 85 | if self.use_debug_mode: 86 | assert x.ndim == 2, "input audio must have shape (n_ch, n_samples)" 87 | assert x.size(0) == self.n_ch, "input audio n_ch is incorrect" 88 | assert ( 89 | x.size(1) % self.hop_len == 0 90 | ), "input audio n_samples must be divisible by hop_len" 91 | # Compute the Mel spec 92 | n_samples = x.size(1) 93 | n_frames = n_samples // self.hop_len 94 | padded_x = tr.cat([self.padding, x], dim=1) 95 | padded_spec = self.mel_spec(padded_x) 96 | spec = padded_spec[:, :, -n_frames:] 97 | 98 | # Update the cache and padding 99 | padding_idx = min(n_samples, self.padding_n_samples) 100 | self.cache.push(x[:, -padding_idx:]) 101 | self.cache.fill(self.padding) 102 | return spec 103 | 104 | def prepare_for_inference(self) -> None: 105 | """ 106 | Prepares the cached Mel spectrogram for inference by disabling debug mode. 107 | """ 108 | self.cache.use_debug_mode = False 109 | self.use_debug_mode = False 110 | 111 | @tr.jit.export 112 | def get_delay_samples(self) -> int: 113 | """ 114 | Returns the number of samples of delay of the cached Mel spectrogram. 115 | """ 116 | return (self.n_fft // 2) - self.hop_len 117 | 118 | @tr.jit.export 119 | def get_delay_frames(self) -> int: 120 | """ 121 | Returns the number of frames of delay of the cached Mel spectrogram. 122 | """ 123 | return self.get_delay_samples() // self.hop_len 124 | 125 | @tr.jit.export 126 | def reset(self) -> None: 127 | """ 128 | Resets the cache and padding of the cached Mel spectrogram. 
129 | """ 130 | self.cache.reset() 131 | self.padding.zero_() 132 | self.cache.push(self.padding) 133 | -------------------------------------------------------------------------------- /neutone_sdk/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | SDK_VERSION = "1.4.3" 4 | 5 | MAX_N_PARAMS = 4 6 | MAX_N_CATEGORICAL_VALUES = 20 7 | MAX_N_CATEGORICAL_LABEL_CHARS = 20 8 | MAX_N_AUDIO_SAMPLES = 3 9 | 10 | DEFAULT_DAW_SR = 48000 11 | DEFAULT_DAW_BS = 2048 12 | 13 | NEUTONE_GEN_N_NUMERICAL_PARAMS = 4 14 | NEUTONE_GEN_N_TEXT_PARAMS = 1 15 | NEUTONE_GEN_N_TOKENS_PARAMS = 1 16 | -------------------------------------------------------------------------------- /neutone_sdk/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from abc import ABC, abstractmethod 5 | from typing import Dict, List, Tuple, Union, Any 6 | 7 | import torch as tr 8 | from torch import nn, Tensor 9 | 10 | from neutone_sdk import constants 11 | from neutone_sdk.parameter import NeutoneParameter 12 | 13 | logging.basicConfig() 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 16 | 17 | 18 | class NeutoneModel(ABC, nn.Module): 19 | # TorchScript typing does not support instance attributes, so we need to type them 20 | # as class attributes. This is required for supporting models with no parameters. 21 | # (https://github.com/pytorch/pytorch/issues/51041#issuecomment-767061194) 22 | neutone_parameters_metadata: Dict[ 23 | str, Dict[str, Union[int, float, str, bool, List[str], List[int]]] 24 | ] 25 | neutone_parameter_names: List[str] 26 | 27 | def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None: 28 | """ 29 | Creates an Neutone model, wrapping a child model (that does the real 30 | work). 31 | """ 32 | super().__init__() 33 | 34 | # Save and prepare model. This should be done at the very beginning of the 35 | # constructor to enable accessing the model in other methods of this class. 36 | model.eval() 37 | self.model = model 38 | 39 | self.MAX_N_PARAMS = self._get_max_n_params() 40 | self.SDK_VERSION = constants.SDK_VERSION 41 | self.CURRENT_TIME = time.time() 42 | self.use_debug_mode = use_debug_mode 43 | self.n_neutone_parameters = len(self.get_neutone_parameters()) 44 | 45 | # Ensure the number of parameters is within the allowed limit 46 | assert self.n_neutone_parameters <= self.MAX_N_PARAMS, ( 47 | f"Number of parameters ({self.n_neutone_parameters}) exceeds the maximum " 48 | f"allowed ({self.MAX_N_PARAMS})." 49 | ) 50 | # Ensure parameter names are unique 51 | assert len(set([p.name for p in self.get_neutone_parameters()])) == len( 52 | self.get_neutone_parameters() 53 | ) 54 | 55 | # Save parameter metadata 56 | self.neutone_parameters_metadata = { 57 | f"p{idx + 1}": p.to_metadata() 58 | for idx, p in enumerate(self.get_neutone_parameters()) 59 | } 60 | 61 | # Allocate default params buffer to prevent dynamic allocations later 62 | default_vals_0to1 = self._get_numerical_params_default_values_0to1() 63 | n_numerical_params = default_vals_0to1.size(0) 64 | assert n_numerical_params <= self.MAX_N_PARAMS, ( 65 | f"Number of default param values ({n_numerical_params}) " 66 | f"exceeds the maximum allowed ({self.MAX_N_PARAMS})." 
67 |         )
68 |         default_vals_0to1 = default_vals_0to1.view(n_numerical_params, 1)
69 |         self.register_buffer("numerical_params_default_values_0to1", default_vals_0to1)
70 | 
71 |         # Save parameter information
72 |         self.neutone_parameter_names = [p.name for p in self.get_neutone_parameters()]
73 | 
74 |     @abstractmethod
75 |     def _get_max_n_params(self) -> int:
76 |         """
77 |         Sets the maximum number of parameters that the model can have.
78 |         This should not be overridden by SDK users.
79 |         """
80 |         pass
81 | 
82 |     @abstractmethod
83 |     def _get_numerical_params_default_values_0to1(
84 |         self,
85 |     ) -> Tensor:
86 |         """
87 |         Returns a float tensor with the default values of the numerical parameters
88 |         in the range [0, 1].
89 |         This should not be overridden by SDK users.
90 |         """
91 |         pass
92 | 
93 |     @abstractmethod
94 |     def get_model_name(self) -> str:
95 |         """
96 |         Used to set the model name. This will be displayed on both the
97 |         website and the plugin.
98 | 
99 |         Maximum length of 30 characters.
100 |         """
101 |         pass
102 | 
103 |     @abstractmethod
104 |     def get_model_authors(self) -> List[str]:
105 |         """
106 |         Used to set the model authors. This will be displayed on both the
107 |         website and the plugin.
108 | 
109 |         Should reflect the name of the people that developed the wrapper
110 |         of the model using the SDK. Can be different from the authors of
111 |         the original model.
112 | 
113 |         Maximum of 5 authors.
114 |         """
115 |         pass
116 | 
117 |     @abstractmethod
118 |     def get_model_short_description(self) -> str:
119 |         """
120 |         Used to set the model short description. This will be displayed on both
121 |         the website and the plugin.
122 | 
123 |         This is meant to be seen by the audio creators and should give a summary
124 |         of what the model does.
125 | 
126 |         Maximum of 150 characters.
127 |         """
128 |         pass
129 | 
130 |     @abstractmethod
131 |     def get_model_long_description(self) -> str:
132 |         """
133 |         Used to set the model long description. This will be displayed only on
134 |         the website.
135 | 
136 |         This is meant to be seen by the audio creators and should give an extensive
137 |         description of what the model does. Could describe interesting uses of the
138 |         model, good combinations of parameters, what types of audio it has been
139 |         tested with, etc.
140 | 
141 |         Maximum of 500 characters.
142 |         """
143 |         pass
144 | 
145 |     @abstractmethod
146 |     def get_technical_description(self) -> str:
147 |         """
148 |         Used to set the model technical description. This will be displayed only on
149 |         the website.
150 | 
151 |         This is meant to be seen by other researchers or people that want to develop
152 |         similar models. It could present a summary of the internals of the model:
153 |         what architecture it is based on, what kind of data it was trained with,
154 |         on what kind of hardware.
155 | 
156 |         If the authors of the plugin are different from the authors of the model(s)
157 |         included, this section, along with the citation and technical links, is the
158 |         place to provide appropriate credits.
159 | 
160 |         Maximum of 500 characters.
161 |         """
162 |         pass
163 | 
164 |     @abstractmethod
165 |     def get_tags(self) -> List[str]:
166 |         """
167 |         Used to provide a list of tags. This will be displayed on the website and will
168 |         be used later on for filtering of similar models.
169 | 
170 |         Maximum of 7 tags of 15 characters each.
171 |         """
172 |         pass
173 | 
174 |     @abstractmethod
175 |     def get_model_version(self) -> str:
176 |         """
177 |         Used to set the model version. This will be displayed on both the website and the plugin.
178 | 
179 |         We suggest people use semantic versioning for their models, but in a lot of cases it can
180 |         be overkill. For now we only support showing the latest version of the model.
181 | 
182 |         Please provide a string like "1", "1.0", "1.0.0", "0.1.0" etc.
183 |         """
184 |         pass
185 | 
186 |     @abstractmethod
187 |     def is_experimental(self) -> bool:
188 |         """
189 |         Used to set the experimental flag. This will be displayed on both the website and the plugin.
190 | 
191 |         If this flag is set, the model will have a special icon next to it, signaling to the
192 |         users of the plugin that this is an experimental release.
193 |         """
194 |         pass
195 | 
196 |     def get_technical_links(self) -> Dict[str, str]:
197 |         """
198 |         Used to set the technical links. These will be displayed only on the website.
199 | 
200 |         Under the technical description field, the following links can be displayed as buttons.
201 |         This can be used to provide links to the implementation, to a scientific paper, to personal websites etc.
202 | 
203 |         While any key-value pair can be provided, we strongly encourage users to provide a dictionary
204 |         with keys such as Paper, Code, Personal, GitHub, Blog, Twitter, Instagram etc.
205 | 
206 |         Maximum of 3 links.
207 |         """
208 |         return {}
209 | 
210 |     def get_citation(self) -> str:
211 |         """
212 |         Used to set the citation. This will be displayed only on the website.
213 | 
214 |         This field is specifically meant to display the citation for a scientific paper that the model
215 |         is based on, if any. Will be displayed under the technical links. Can be left empty.
216 | 
217 |         Maximum of 150 characters.
218 |         """
219 |         return ""
220 | 
221 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
222 |         return []
223 | 
224 |     def prepare_for_inference(self) -> None:
225 |         """Prepare a model for inference and to be converted to torchscript."""
226 |         self.use_debug_mode = False
227 |         self.model.eval()
228 |         self.eval()
229 | 
230 |     @tr.jit.export
231 |     def get_neutone_parameters_metadata(
232 |         self,
233 |     ) -> Dict[str, Dict[str, Union[int, float, str, bool, List[str], List[int]]]]:
234 |         """
235 |         Returns the metadata of the parameters as a dictionary of ParameterMetadata
236 |         named tuples.
237 |         """
238 |         return self.neutone_parameters_metadata
239 | 
240 |     @tr.jit.export
241 |     def get_numerical_params_default_values_0to1(self) -> Tensor:
242 |         """
243 |         Returns the default parameter values as a tensor of shape
244 |         (n_numerical_params, 1).
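
        For example (hypothetical values), a model whose three continuous
        parameters default to 0.15, 0.15 and 1.0 would return
        tensor([[0.15], [0.15], [1.0]]), i.e. shape (3, 1).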
245 | """ 246 | return self.numerical_params_default_values_0to1 247 | 248 | @tr.jit.export 249 | def get_wet_default_value(self) -> float: 250 | return 1.0 251 | 252 | @tr.jit.export 253 | def get_dry_default_value(self) -> float: 254 | return 0.0 255 | 256 | @tr.jit.export 257 | def get_input_gain_default_value(self) -> float: 258 | """[0.0, 1.0] here maps to [-30.0db, +30.0db]""" 259 | return 0.5 260 | 261 | @tr.jit.export 262 | def get_output_gain_default_value(self) -> float: 263 | """[0.0, 1.0] here maps to [-30.0db, +30.0db]""" 264 | return 0.5 265 | 266 | @tr.jit.export 267 | def get_core_preserved_attributes(self) -> List[str]: 268 | return [ 269 | "model", # nn.Module 270 | "get_neutone_parameters_metadata", 271 | "get_numerical_params_default_values_0to1", 272 | "get_wet_default_value", 273 | "get_dry_default_value", 274 | "get_input_gain_default_value", 275 | "get_output_gain_default_value", 276 | "get_core_preserved_attributes", 277 | "to_core_metadata", 278 | ] 279 | 280 | @tr.jit.export 281 | def to_core_metadata(self) -> Dict[str, Any]: 282 | return { 283 | "model_name": self.get_model_name(), 284 | "model_authors": self.get_model_authors(), 285 | "model_short_description": self.get_model_short_description(), 286 | "model_long_description": self.get_model_long_description(), 287 | "neutone_parameters": self.get_neutone_parameters_metadata(), 288 | "wet_default_value": self.get_wet_default_value(), 289 | "dry_default_value": self.get_dry_default_value(), 290 | "input_gain_default_value": self.get_input_gain_default_value(), 291 | "output_gain_default_value": self.get_output_gain_default_value(), 292 | "technical_description": self.get_technical_description(), 293 | "technical_links": self.get_technical_links(), 294 | "tags": self.get_tags(), 295 | "model_version": self.get_model_version(), 296 | "sdk_version": self.SDK_VERSION, 297 | "pytorch_version": tr.__version__, 298 | "date_created": self.CURRENT_TIME, 299 | "citation": self.get_citation(), 300 | "is_experimental": self.is_experimental(), 301 | } 302 | -------------------------------------------------------------------------------- /neutone_sdk/filters.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List, Optional 3 | from enum import Enum 4 | import warnings 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | """ 11 | Filters for pre-filtering inputs to models such as RAVE. 12 | """ 13 | 14 | 15 | class FilterType(Enum): 16 | LOWPASS = "lowpass" 17 | HIGHPASS = "highpass" 18 | BANDPASS = "bandpass" 19 | BANDSTOP = "bandstop" 20 | 21 | 22 | class FIRFilter(nn.Module): 23 | def __init__( 24 | self, 25 | filt_type: FilterType, 26 | cutoffs: List[float], 27 | filt_size: int = 257, 28 | ): 29 | """Streamable FIR filter for pre-filtering of model inputs, etc. 30 | 31 | Args: 32 | filt_type (FilterType): Type of the filter (FilterType.LOWPASS/HIGHPASS/BANDPASS/BANDSTOP). 33 | cutoffs (List[float]): Cutoff frequencies (in Hz). 2 should be given if bandpass/stop 34 | sample_rate (int): Sampling rate 35 | filt_size (int, optional): Length of the FIR. Defaults to 257. 
36 | """ 37 | super().__init__() 38 | # register buffer only allowed once 39 | self.register_buffer("cache", torch.zeros(2, filt_size - 1)) 40 | self.register_buffer("ir_windowed", torch.empty(1, 1, filt_size)) 41 | # Pass in fake sample rate for filter 42 | # Sample rate should be automatically overwritten by calling 43 | # set_parameters() from w2wbase.set_model_sample_rate_and_buffer_size() 44 | self.set_parameters(filt_type, cutoffs, 48000, filt_size) 45 | 46 | def set_parameters( 47 | self, 48 | filt_type: Optional[FilterType] = None, 49 | cutoffs: Optional[List[float]] = None, 50 | sample_rate: Optional[int] = None, 51 | filt_size: Optional[int] = None, 52 | ): 53 | filt_type = self.filt_type if filt_type is None else filt_type 54 | cutoffs = self.cutoffs if cutoffs is None else cutoffs 55 | sample_rate = self.sample_rate if sample_rate is None else sample_rate 56 | filt_size = self.filt_size if filt_size is None else filt_size 57 | if len(cutoffs) == 2: 58 | if filt_type.value in [FilterType.HIGHPASS.value, FilterType.LOWPASS.value]: 59 | raise ValueError( 60 | f"only 1 cutoff value supported for filter type: {filt_type}" 61 | ) 62 | else: 63 | if filt_type.value in [ 64 | FilterType.BANDPASS.value, 65 | FilterType.BANDSTOP.value, 66 | ]: 67 | raise ValueError( 68 | f"2 cutoff values (low, high) needed for filter type: {filt_type}" 69 | ) 70 | # create frequency response by frequency sampling 71 | freqs = torch.fft.rfftfreq(filt_size, 1 / sample_rate) 72 | 73 | if filt_type == FilterType.HIGHPASS: 74 | freq_resp = torch.where((freqs > cutoffs[0]), 1.0, 0.0).float() 75 | elif filt_type == FilterType.LOWPASS: 76 | freq_resp = torch.where((freqs < cutoffs[0]), 1.0, 0.0).float() 77 | elif filt_type == FilterType.BANDPASS: 78 | freq_resp = torch.where( 79 | torch.logical_and(freqs > cutoffs[0], freqs < cutoffs[1]), 1.0, 0.0 80 | ).float() 81 | elif filt_type == FilterType.BANDSTOP: 82 | freq_resp = torch.where( 83 | torch.logical_or(freqs < cutoffs[0], freqs > cutoffs[1]), 1.0, 0.0 84 | ).float() 85 | else: 86 | raise ValueError(f"Unrecognized filter type: {filt_type.value}") 87 | # create impulse response by windowing 88 | ir = torch.fft.irfft(freq_resp, n=filt_size, dim=-1) 89 | filter_window = torch.kaiser_window(filt_size, dtype=torch.float32).roll( 90 | filt_size // 2, -1 91 | ) 92 | self.ir_windowed = (filter_window * ir)[None, None, :].to( 93 | self.ir_windowed.device 94 | ) 95 | self.filt_type = filt_type 96 | self.cutoffs = cutoffs 97 | self.sample_rate = sample_rate 98 | self.filt_size = filt_size 99 | self.delay = filt_size // 2 # constant group delay 100 | 101 | def forward( 102 | self, 103 | audio: torch.Tensor, 104 | ): 105 | """Process audio with filter 106 | 107 | Args: 108 | audio (torch.Tensor): input audio [n_channels, n_samples] 109 | 110 | Returns: 111 | torch.Tensor: filtered audio 112 | """ 113 | n_channels, orig_len = audio.shape 114 | # standard convolution implementation 115 | # pad input with cache 116 | audio = torch.cat([self.cache[:n_channels], audio], dim=-1) 117 | self.cache = audio[:, -(self.filt_size - 1) :] 118 | filtered = F.conv1d( 119 | audio[:, None, :], 120 | self.ir_windowed, 121 | padding="valid", 122 | ).squeeze(1) 123 | return filtered 124 | 125 | 126 | class IIRFilter(nn.Module): 127 | def __init__( 128 | self, 129 | filt_type: FilterType, 130 | cutoff: float, 131 | resonance: float, 132 | ): 133 | """Time-invariant IIR filter 134 | 135 | Args: 136 | filt_type (FilterType): Type of the filter (FilterType.LOWPASS/HIGHPASS/BANDPASS). 
137 |             cutoff (float): Cutoff frequency in Hz (0 < cutoff < f_nyq)
138 |             resonance (float): Filter resonance, controls bandwidth in case of bandpass
139 |                 (Note: the sample rate is not a constructor argument; it is set via set_parameters().)
140 |         """
141 |         super().__init__()
142 |         # register buffer only allowed once
143 |         self.register_buffer("g", torch.empty(1, 1, 1))
144 |         self.register_buffer("twoR", torch.empty(1, 1, 1) / resonance)
145 |         self.register_buffer("mix", torch.empty(1, 1, 3))
146 |         # Pass in fake sample rate for filter
147 |         # Sample rate should be automatically overwritten by calling
148 |         # set_parameters() from w2wbase.set_model_sample_rate_and_buffer_size()
149 |         self.set_parameters(filt_type, cutoff, resonance, 48000)
150 |         self.svf = _SVFLayer()
151 | 
152 |     def set_parameters(
153 |         self,
154 |         filt_type: Optional[FilterType] = None,
155 |         cutoff: Optional[float] = None,
156 |         resonance: Optional[float] = None,
157 |         sample_rate: Optional[int] = None,
158 |     ):
159 |         filt_type = self.filt_type if filt_type is None else filt_type
160 |         cutoff = self.cutoff if cutoff is None else cutoff
161 |         resonance = self.resonance if resonance is None else resonance
162 |         sample_rate = self.sample_rate if sample_rate is None else sample_rate
163 | 
164 |         cutoff = max(min(cutoff, sample_rate / 2 - 1e-4), 1e-4)
165 |         resonance = max(resonance, 1e-4)
166 |         # frequency warping
167 |         self.g = torch.ones(1, 1, 1, device=self.g.device) * math.tan(
168 |             math.pi / sample_rate * cutoff
169 |         )
170 |         self.twoR = torch.ones(1, 1, 1, device=self.twoR.device) / resonance
171 |         if filt_type == FilterType.LOWPASS:
172 |             self.mix = torch.tensor([[[0.0, 1.0, 0.0]]], device=self.mix.device)
173 |         elif filt_type == FilterType.HIGHPASS:
174 |             self.mix = torch.tensor([[[0.0, 0.0, 1.0]]], device=self.mix.device)
175 |         elif filt_type == FilterType.BANDPASS:
176 |             self.mix = torch.tensor([[[1.0, 0.0, 0.0]]], device=self.mix.device)
177 |         else:
178 |             raise ValueError(f"Unrecognized filter type: {filt_type}")
179 |         self.filt_type = filt_type
180 |         self.cutoff = cutoff
181 |         self.resonance = resonance
182 |         self.sample_rate = sample_rate
183 |         self.delay = 0
184 | 
185 |     def forward(self, audio: torch.Tensor):
186 |         """Pass audio through the filter (lowpass/highpass/bandpass).
187 | 
188 |         Args:
189 |             audio (torch.Tensor): [batch_size (or n_channels), n_samples]
190 |         """
191 |         batch_size, n_samples = audio.shape
192 |         g = self.g.expand(n_samples, batch_size, -1)
193 |         twoR = self.twoR.expand(n_samples, batch_size, -1)
194 |         mix = self.mix.expand(n_samples, batch_size, -1)
195 |         return self.svf(audio.permute(1, 0), g, twoR, mix)
196 | 
197 | 
198 | class IIRSVF(nn.Module):
199 |     def __init__(self):
200 |         """
201 |         Time-varying SVF with IIRs
202 |         """
203 |         super().__init__()
204 |         self.svf = _SVFLayer()
205 |         self.delay = 0
206 | 
207 |     def forward(
208 |         self,
209 |         audio: torch.Tensor,
210 |         cutoff: torch.Tensor,
211 |         resonance: torch.Tensor,
212 |         mix: torch.Tensor,
213 |         sample_rate: int,
214 |     ):
215 |         """Feed into time-varying svf
216 | 
217 |         Args:
218 |             audio (torch.Tensor): Input audio [batch_size (or n_channels), n_samples]
219 |             cutoff (torch.Tensor): Cutoff frequency [batch_size, n_samples, 1]
220 |             resonance (torch.Tensor): Resonance (0 ~ 1), [batch_size, n_samples, 1]
221 |             mix (torch.Tensor): Mix coeff. bp, lp and hp [batch_size, n_samples, 3], e.g.
[[[1.0, 0.0, 0.0]]] = bandpass 222 | 223 | Returns: 224 | audio (torch.Tensor): [n_channels, n_samples] 225 | """ 226 | cutoff = torch.clamp(cutoff, min=1e-4, max=sample_rate / 2 - 1e-4) 227 | resonance = torch.clamp(resonance, min=1e-4) 228 | g = torch.tan(math.pi / sample_rate * cutoff).permute(1, 0, 2) 229 | twoR = 1 / resonance.permute(1, 0, 2) 230 | mix = mix.permute(1, 0, 2) 231 | return self.svf(audio.permute(1, 0), g, twoR, mix) 232 | 233 | 234 | class _SVFLayer(nn.Module): 235 | """ 236 | SVF implementation based on "Time-varying filters for musical applications" [Wishnick, 2014] 237 | NOTE: This SVF is slow for use in training due to recurrent operations 238 | """ 239 | 240 | def __init__(self): 241 | super().__init__() 242 | self.register_buffer("state", torch.zeros(1, 2)) 243 | self.register_buffer("Y", torch.empty(4096, 2, 2)) 244 | 245 | def forward( 246 | self, 247 | audio: torch.Tensor, 248 | g: torch.Tensor, 249 | twoR: torch.Tensor, 250 | mix: torch.Tensor, 251 | ): 252 | """pass audio through SVF 253 | Args: 254 | *** time-first, batch-second *** 255 | audio (torch.Tensor): [n_samples, batch_size] 256 | All filter parameters are [n_samples, batch_size, 1 or 3 (mix)] 257 | g (torch.Tensor): Normalized cutoff parameter 258 | twoR (torch.Tensor): Damping parameter 259 | mix (torch.Tensor): Mixing coefficient of bp, lp and hp 260 | 261 | Returns: 262 | [torch.Tensor]: Filtered audio. Shape [batch, n_samples] 263 | """ 264 | seq_len, batch_size = audio.shape 265 | T = 1.0 / (1.0 + g * (g + twoR)) 266 | H = T.unsqueeze(-1) * torch.cat( 267 | [torch.ones_like(g), -g, g, twoR * g + 1], dim=-1 268 | ).reshape(seq_len, batch_size, 2, 2) 269 | 270 | # Y = gHBx + Hs 271 | gHB = g * T * torch.cat([torch.ones_like(g), g], dim=-1) 272 | # [n_samples, batch_size, 2] 273 | gHBx = gHB * audio.unsqueeze(-1) 274 | if seq_len > self.Y.shape[0]: 275 | self.Y = torch.empty(seq_len, 2, 2, device=self.Y.device) 276 | Y = self.Y[:seq_len, :batch_size, :] 277 | # initialize filter state 278 | state = self.state.expand(batch_size, -1) 279 | for t in range(seq_len): 280 | Y[t] = gHBx[t] + torch.bmm(H[t], state.unsqueeze(-1)).squeeze(-1) 281 | state = 2 * Y[t] - state 282 | self.state = state 283 | 284 | # HP = x - 2R*BP - LP 285 | y_hps = audio - twoR.squeeze(-1) * Y[:, :, 0] - Y[:, :, 1] 286 | 287 | y_mixed = ( 288 | twoR.squeeze(-1) * mix[:, :, 0] * Y[:, :, 0] 289 | + mix[:, :, 1] * Y[:, :, 1] 290 | + mix[:, :, 2] * y_hps 291 | ) 292 | y_mixed = y_mixed.permute(1, 0) 293 | return y_mixed 294 | -------------------------------------------------------------------------------- /neutone_sdk/gcn_1d.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional 4 | 5 | import torch 6 | from torch import Tensor 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | logging.basicConfig() 11 | log = logging.getLogger(__name__) 12 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 13 | 14 | 15 | class TFiLM(nn.Module): 16 | """Temporal Feature-wise Linear Modulation (TFiLM) layer. 17 | 18 | Parameters: 19 | n_channels (int): Number of channels in the input signal. 20 | cond_dim (int): Dimensionality of the conditional input. 21 | tfilm_block_size (int): Size of the temporal blocks. 22 | rnn_type (str, optional): Type of RNN to use for the modulation. 23 | 24 | Returns: 25 | Tensor: The output of the TFiLM layer. 
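
    Example (a minimal sketch; values are hypothetical):
        tfilm = TFiLM(n_channels=16, cond_dim=2, tfilm_block_size=128)
        y = tfilm(torch.randn(1, 16, 1024), cond=torch.randn(1, 2))  # (1, 16, 1024)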
26 | """ 27 | 28 | def __init__( 29 | self, 30 | n_channels: int, 31 | cond_dim: int, 32 | tfilm_block_size: int, 33 | rnn_type: str = "lstm", 34 | ) -> None: 35 | super().__init__() 36 | self.nchannels = n_channels 37 | self.cond_dim = cond_dim 38 | self.tfilm_block_size = tfilm_block_size 39 | self.num_layers = 1 40 | self.first_run = True 41 | self.hidden_state = ( 42 | torch.Tensor(0), 43 | torch.Tensor(0), 44 | ) # (hidden_state, cell_state) 45 | 46 | self.maxpool = torch.nn.MaxPool1d( 47 | kernel_size=tfilm_block_size, 48 | stride=None, 49 | padding=0, 50 | dilation=1, 51 | return_indices=False, 52 | ceil_mode=False, 53 | ) 54 | 55 | rnn_types = {"lstm": torch.nn.LSTM, "gru": torch.nn.GRU} 56 | 57 | try: 58 | RNN = rnn_types[rnn_type.lower()] 59 | self.rnn = RNN( 60 | input_size=n_channels + cond_dim, 61 | hidden_size=n_channels, 62 | num_layers=self.num_layers, 63 | batch_first=True, 64 | bidirectional=False, 65 | ) 66 | except KeyError: 67 | raise ValueError(f"Invalid rnn_type. Use 'lstm' or 'gru'. Got {rnn_type}") 68 | 69 | def forward(self, x: Tensor , cond: Optional[Tensor] = None) -> Tensor: 70 | x_in_shape = x.shape # (batch_size, n_channels, samples) 71 | 72 | # Pad input to be divisible by tfilm_block_size 73 | if (x_in_shape[2] % self.tfilm_block_size) != 0: 74 | padding = torch.zeros( 75 | x_in_shape[0], 76 | x_in_shape[1], 77 | self.tfilm_block_size - (x_in_shape[2] % self.tfilm_block_size), 78 | ) 79 | x = torch.cat((x, padding), dim=-1) 80 | 81 | x_shape = x.shape 82 | n_steps = int(x_shape[-1] / self.tfilm_block_size) 83 | 84 | x_down = self.maxpool(x) # (batch_size, n_channels, n_steps) 85 | 86 | if cond is not None: 87 | cond_up = cond.unsqueeze(-1) 88 | cond_up = cond_up.repeat(1, 1, n_steps) # (batch_size, cond_dim, n_steps) 89 | x_down = torch.cat( 90 | (x_down, cond_up), dim=1 91 | ) # (batch_size, n_channels + cond_dim, n_steps) 92 | 93 | # Put shape to (n_steps, batch_size, n_channels + cond_dim) 94 | x_down = x_down.permute(2, 0, 1) 95 | 96 | # Modulation 97 | if self.first_run: # Reset hidden state 98 | x_norm, self.hidden_state = self.rnn(x_down, None) 99 | self.first_run = False 100 | else: 101 | x_norm, self.hidden_state = self.rnn(x_down, self.hidden_state) 102 | 103 | # Put shape back to (batch_size, n_channels, length) 104 | x_norm = x_norm.permute(1, 2, 0) 105 | 106 | # Reshape input and modulation sequence into blocks 107 | x_in = torch.reshape( 108 | x, shape=(-1, self.nchannels, n_steps, self.tfilm_block_size) 109 | ) 110 | x_norm = torch.reshape(x_norm, shape=(-1, self.nchannels, n_steps, 1)) 111 | 112 | x_out = x_norm * x_in 113 | 114 | # Return to the original padded input shape 115 | x_out = torch.reshape(x_out, shape=(x_shape)) 116 | 117 | x_out = x_out[..., : x_in_shape[2]] # Remove padding 118 | 119 | return x_out 120 | 121 | def reset_state(self) -> None: 122 | self.first_run = True 123 | 124 | 125 | class Conv1dCausal(nn.Module): 126 | """Causal 1D convolutional layer 127 | ensures outputs depend only on current and past inputs. 128 | 129 | Parameters: 130 | in_channels (int): Number of channels in the input signal. 131 | out_channels (int): Number of channels produced by the convolution. 132 | kernel_size (int): Size of the convolving kernel. 133 | stride (int): Stride of the convolution. 134 | dilation (int, optional): Spacing between kernel elements. 135 | bias (bool, optional): If True, adds a learnable bias to the output. 136 | 137 | Returns: 138 | Tensor: The output of the causal 1D convolutional layer. 
139 | """ 140 | 141 | def __init__( 142 | self, 143 | in_channels: int, 144 | out_channels: int, 145 | kernel_size: int, 146 | stride: int, 147 | dilation: int = 1, 148 | bias: bool = True, 149 | ) -> None: 150 | super().__init__() 151 | self.padding = ( 152 | kernel_size - 1 153 | ) * dilation # input_len == output_len when stride=1 154 | self.in_channels = in_channels 155 | self.conv = nn.Conv1d( 156 | in_channels, 157 | out_channels, 158 | (kernel_size,), 159 | (stride,), 160 | padding=0, 161 | dilation=(dilation,), 162 | bias=bias, 163 | ) 164 | 165 | def forward(self, x: Tensor) -> Tensor: 166 | x = F.pad(x, (self.padding, 0)) # standard zero padding 167 | x = self.conv(x) 168 | return x 169 | 170 | 171 | class GatedAF(nn.Module): 172 | """Gated activation function 173 | applies a tanh activation to one half of the input 174 | and a sigmoid activation to the other half, and then multiplies them element-wise. 175 | 176 | Returns: 177 | Tensor: The output of the gated activation function. 178 | """ 179 | 180 | def __init__(self) -> None: 181 | super().__init__() 182 | 183 | def forward(self, x: Tensor) -> Tensor: 184 | x_tanh, x_sigmoid = x.chunk(2, dim=1) # Split the output into two halves 185 | 186 | x_tanh = torch.tanh(x_tanh) # Apply tanh activation 187 | x_sigmoid = torch.sigmoid(x_sigmoid) # Apply sigmoid activation 188 | 189 | # Element-wise multiplication of tanh and sigmoid activations 190 | x = x_tanh * x_sigmoid 191 | return x 192 | 193 | 194 | class GCN1DBlock(nn.Module): 195 | """Single block of a Gated Convolutional Network (GCN) with conditional modulation. 196 | 197 | Parameters: 198 | in_ch (int): Number of input channels. 199 | out_ch (int): Number of output channels. 200 | kernel_size (int, optional): Size of the convolution kernel. 201 | dilation (int, optional): Dilation rate for dilated convolutions. 202 | stride (int, optional): Stride for the convolution. 203 | cond_dim (int, optional): Dimensionality of the conditional input for FiLM. 
204 | """ 205 | 206 | def __init__( 207 | self, 208 | in_ch: int, 209 | out_ch: int, 210 | kernel_size: int = 3, 211 | dilation: int = 1, 212 | stride: int = 1, 213 | cond_dim: int = 0, 214 | rnn_type: str = "lstm", 215 | tfilm_block_size: int = 128, 216 | use_bias_in_conv: bool = False, 217 | ) -> None: 218 | super().__init__() 219 | 220 | self.conv = Conv1dCausal( 221 | in_channels=in_ch, 222 | out_channels=out_ch * 2, # adapt for the Gated Activation Function 223 | kernel_size=kernel_size, 224 | stride=stride, 225 | dilation=dilation, 226 | bias=use_bias_in_conv, 227 | ) 228 | 229 | self.tfilm = None 230 | if cond_dim > 0: 231 | self.tfilm = TFiLM( 232 | n_channels=out_ch * 2, 233 | cond_dim=cond_dim, 234 | tfilm_block_size=tfilm_block_size, 235 | rnn_type=rnn_type, 236 | ) 237 | 238 | self.gated_activation = GatedAF() 239 | 240 | self.res = nn.Conv1d( 241 | in_channels=in_ch, out_channels=out_ch, kernel_size=(1,), bias=False 242 | ) 243 | 244 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 245 | x_in = x 246 | x = self.conv(x) # Apply causal convolution 247 | if ( 248 | cond is not None and self.tfilm is not None 249 | ): # Apply FiLM if conditional input is given 250 | x = self.tfilm(x, cond) 251 | # Apply gated activation function 252 | x = self.gated_activation(x) 253 | # Apply residual convolution and add to output 254 | x_res = self.res(x_in) 255 | x = x + x_res 256 | return x 257 | 258 | 259 | class GCN1D(nn.Module): 260 | """Gated Convolutional Network (GCN) model, re-implemented from the paper: 261 | https://arxiv.org/abs/2211.00497 262 | 263 | Parameters: 264 | in_ch (int, optional): Number of input channels. 265 | out_ch (int, optional): Number of output channels. 266 | n_blocks (int, optional): Number of GCN blocks. 267 | n_channels (int, optional): Number of channels in the GCN blocks. 268 | dilation_growth (int, optional): Growth rate for dilation in the GCN blocks. 269 | kernel_size (int, optional): Size of the convolution kernel. 270 | cond_dim (int, optional): Dimensionality of the conditional input for FiLM. 271 | 272 | Returns: 273 | Tensor: The output of the GCN model. 
274 | """ 275 | 276 | def __init__( 277 | self, 278 | in_ch: int = 1, 279 | out_ch: int = 1, 280 | n_blocks: int = 10, 281 | n_channels: int = 64, 282 | dil_growth: int = 4, 283 | kernel_size: int = 13, 284 | cond_dim: int = 0, 285 | tfilm_block_size: int = 128, 286 | rnn_type: str = "lstm", 287 | use_act: bool = True, 288 | use_bias_in_conv: bool = False, 289 | ) -> None: 290 | super().__init__() 291 | self.kernel_size = kernel_size 292 | self.n_channels = n_channels 293 | self.dil_growth = dil_growth 294 | self.n_blocks = n_blocks 295 | self.cond_dim = cond_dim 296 | self.use_act = use_act 297 | self.use_bias_in_conv = use_bias_in_conv 298 | 299 | # Compute convolution channels and dilations 300 | self.channels = [n_channels] * n_blocks 301 | self.dilations = [dil_growth**idx for idx in range(n_blocks)] 302 | 303 | # Blocks number is given by the number of elements in the channels list 304 | self.n_blocks = len(self.channels) 305 | assert len(self.dilations) == self.n_blocks 306 | 307 | # Create a list of strides 308 | self.strides = [1] * self.n_blocks 309 | 310 | # Create a list of GCN blocks 311 | self.blocks = nn.ModuleList() 312 | block_out_ch = 0 313 | 314 | for idx, (curr_out_ch, dil, stride) in enumerate( 315 | zip(self.channels, self.dilations, self.strides) 316 | ): 317 | block_out_ch = curr_out_ch 318 | if idx == 0: 319 | block_in_ch = in_ch 320 | else: 321 | block_in_ch = block_out_ch 322 | 323 | self.blocks.append( 324 | GCN1DBlock( 325 | block_in_ch, 326 | block_out_ch, 327 | self.kernel_size, 328 | dilation=dil, 329 | stride=stride, 330 | cond_dim=cond_dim, 331 | tfilm_block_size=tfilm_block_size, 332 | rnn_type=rnn_type, 333 | use_bias_in_conv=use_bias_in_conv, 334 | ) 335 | ) 336 | 337 | # Output layer 338 | self.out_net = nn.Conv1d( 339 | self.channels[-1], out_ch, kernel_size=(1,), stride=(1,), bias=False 340 | ) 341 | 342 | # Activation function 343 | self.act = nn.Tanh() 344 | 345 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 346 | assert x.ndim == 3 # (batch_size, in_ch, samples) 347 | if cond is not None: 348 | assert cond.ndim == 2 # (batch_size, cond_dim) 349 | for block in self.blocks: # Apply GCN blocks 350 | x = block(x, cond) 351 | x = self.out_net(x) # Apply output layer 352 | 353 | if self.act is not None: 354 | x = self.act(x) # Apply tanh activation function 355 | return x 356 | 357 | def calc_receptive_field(self) -> int: 358 | """Calculate the receptive field of the model. 359 | The receptive field is the number of input samples that affect the output of a block. 360 | 361 | The receptive field of the model is the sum of the receptive fields of all layers: 362 | RF = 1 + \sum_{i=1}^{n}(kernel\_size_i - 1) \cdot dilation_i 363 | 364 | i is the layer index, n is the number of layers. 365 | 366 | Returns: 367 | int: The receptive field of the model. 
368 | """ 369 | assert all(_ == 1 for _ in self.strides) # TODO(cm): add support for dsTCN 370 | assert self.dilations[0] == 1 # TODO(cm): add support for >1 starting dilation 371 | rf = self.kernel_size 372 | for dil in self.dilations[1:]: 373 | rf = rf + ((self.kernel_size - 1) * dil) 374 | return rf 375 | 376 | -------------------------------------------------------------------------------- /neutone_sdk/metadata.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import requests 5 | from jsonschema import validate, ValidationError 6 | from jsonschema._keywords import anyOf 7 | 8 | from neutone_sdk.audio import AudioSample 9 | 10 | logging.basicConfig() 11 | log = logging.getLogger(__name__) 12 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 13 | 14 | SCHEMA = { 15 | "type": "object", 16 | "properties": { 17 | "model_name": { 18 | "type": "string", 19 | "maxLength": 30, 20 | }, 21 | "model_authors": { 22 | "type": "array", 23 | "maxItems": 5, 24 | "items": {"type": "string"}, 25 | "uniqueItems": True, 26 | }, 27 | "model_version": {"type": "string"}, 28 | "model_short_description": {"type": "string", "maxLength": 150}, 29 | "model_long_description": {"type": "string", "maxLength": 500}, 30 | "technical_description": {"type": "string", "maxLength": 500}, 31 | "technical_links": { 32 | "type": "object", 33 | "additionalProperties": { 34 | "type": "string", 35 | }, 36 | "maxItems": 3, 37 | }, 38 | "tags": { 39 | "type": "array", 40 | "maxItems": 7, 41 | "items": {"type": "string"}, 42 | "uniqueItems": True, 43 | "maxLength": 15, 44 | }, 45 | "citation": { 46 | "type": "string", 47 | "maxLength": 300, 48 | }, 49 | "is_experimental": { 50 | "type": "boolean", 51 | }, 52 | "model_id": {"type": "string"}, 53 | "file_size": {"type": "integer"}, 54 | "sample_sound_files": { 55 | "type": "array", 56 | "items": { 57 | "type": "object", 58 | "required": ["in", "out"], 59 | "properties": { 60 | "in": {"type": "string"}, 61 | "out": {"type": "string"}, 62 | }, 63 | }, 64 | "maxItems": 3, 65 | }, 66 | "neutone_parameters": { 67 | "type": "object", 68 | anyOf: [ 69 | {"required": ["p1"]}, 70 | {"required": ["p1", "p2"]}, 71 | {"required": ["p1", "p2", "p3"]}, 72 | {"required": ["p1", "p2", "p3", "p4"]}, 73 | ], 74 | "properties": { 75 | "p1": {"$ref": "#/definitions/neutoneParameter"}, 76 | "p2": {"$ref": "#/definitions/neutoneParameter"}, 77 | "p3": {"$ref": "#/definitions/neutoneParameter"}, 78 | "p4": {"$ref": "#/definitions/neutoneParameter"}, 79 | }, 80 | }, 81 | "wet_default_value": { 82 | "type": "number", 83 | "minimum": 0.0, 84 | "maximum": 1.0, 85 | }, 86 | "dry_default_value": { 87 | "type": "number", 88 | "minimum": 0.0, 89 | "maximum": 1.0, 90 | }, 91 | "input_gain_default_value": { 92 | "type": "number", 93 | "minimum": 0.0, 94 | "maximum": 1.0, 95 | }, 96 | "output_gain_default_value": { 97 | "type": "number", 98 | "minimum": 0.0, 99 | "maximum": 1.0, 100 | }, 101 | "is_input_mono": { 102 | "type": "boolean", 103 | }, 104 | "is_output_mono": { 105 | "type": "boolean", 106 | }, 107 | "model_type": { 108 | "type": "string", 109 | "enum": ["mono-mono", "mono-stereo", "stereo-mono", "stereo-stereo"], 110 | }, 111 | "native_sample_rates": { 112 | "type": "array", 113 | "items": { 114 | "type": "integer", 115 | "minimum": 0, 116 | "maximum": 384000, 117 | }, 118 | "uniqueItems": True, 119 | }, 120 | "native_buffer_sizes": { 121 | "type": "array", 122 | "items": { 123 | "type": "integer", 124 | "minimum": 1, 125 | 
"maximum": 65536, 126 | }, 127 | "uniqueItems": True, 128 | }, 129 | "look_behind_samples": { 130 | "type": "integer", 131 | "minimum": 0, 132 | }, 133 | "sdk_version": {"type": "string"}, 134 | "pytorch_version": {"type": "string"}, 135 | "date_created": {"type": "number"}, 136 | }, 137 | "definitions": { 138 | "neutoneParameter": { 139 | "type": "object", 140 | "required": ["name", "description", "default_value", "used", "type"], 141 | "properties": { 142 | "name": {"type": "string"}, 143 | "description": {"type": "string"}, 144 | "default_value": {"type": ["integer", "number", "string"]}, 145 | "used": {"type": "boolean"}, 146 | "type": {"type": "string", "enum": ["continuous"]}, 147 | "max_n_chars": {"type": ["null", "integer"], "minimum": -1}, 148 | "n_values": {"type": ["null", "integer"], "minimum": 2}, 149 | "labels": {"type": ["null", "array"], "items": {"type": "string"}}, 150 | }, 151 | } 152 | }, 153 | "required": [ 154 | "model_name", 155 | "model_authors", 156 | "model_version", 157 | "model_short_description", 158 | "model_long_description", 159 | "technical_description", 160 | "technical_links", 161 | "tags", 162 | "citation", 163 | "is_experimental", 164 | "sample_sound_files", 165 | "neutone_parameters", 166 | "wet_default_value", 167 | "dry_default_value", 168 | "input_gain_default_value", 169 | "output_gain_default_value", 170 | "is_input_mono", 171 | "is_output_mono", 172 | "model_type", 173 | "native_sample_rates", 174 | "native_buffer_sizes", 175 | "look_behind_samples", 176 | "sdk_version", 177 | "pytorch_version", 178 | "date_created", 179 | ], 180 | } 181 | 182 | 183 | def validate_metadata(metadata: dict) -> bool: 184 | try: 185 | validate(instance=metadata, schema=SCHEMA) 186 | except ValidationError as err: 187 | log.error(err) 188 | raise err 189 | 190 | # Check links return 200 191 | for link in metadata["technical_links"].values(): 192 | try: 193 | code = requests.head(link, allow_redirects=True).status_code 194 | if code != 200: 195 | log.error(f"Cannot access link {link}") 196 | except requests.exceptions.ConnectionError: 197 | log.error(f"Cannot access link {link}") 198 | 199 | # Check we can extract mp3s from the samples 200 | for audio_sample_pair in metadata["sample_sound_files"]: 201 | AudioSample.from_b64(audio_sample_pair["in"]) 202 | AudioSample.from_b64(audio_sample_pair["out"]) 203 | 204 | return True 205 | -------------------------------------------------------------------------------- /neutone_sdk/parameter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from abc import ABC 4 | from enum import Enum 5 | from typing import Union, Optional, List, Dict 6 | 7 | from torch import Tensor as T, nn 8 | import torch as tr 9 | from neutone_sdk import constants 10 | 11 | logging.basicConfig() 12 | log = logging.getLogger(__name__) 13 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 14 | 15 | 16 | class NeutoneParameterType(Enum): 17 | CONTINUOUS = "continuous" 18 | CATEGORICAL = "categorical" 19 | TEXT = "text" 20 | TOKENS = "tokens" 21 | 22 | 23 | class NeutoneParameter(ABC, nn.Module): 24 | """ 25 | Defines a Neutone Parameter abstract base class. 26 | 27 | The name and the description of the parameter will be shown as a tooltip 28 | within the UI. This parameter has no functionality and is meant to subclassed. 
29 | """ 30 | 31 | def __init__( 32 | self, 33 | name: str, 34 | description: str, 35 | default_value: Union[int, float, str, Optional[List[int]]], 36 | used: bool, 37 | param_type: NeutoneParameterType, 38 | ): 39 | super().__init__() 40 | self.name = name 41 | self.description = description 42 | self.default_value = default_value 43 | self.used = used 44 | self.type = param_type 45 | 46 | def to_metadata( 47 | self, 48 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 49 | return { 50 | "name": self.name, 51 | "description": self.description, 52 | "default_value": self.default_value, 53 | "used": self.used, 54 | "type": self.type.value, 55 | } 56 | 57 | 58 | class ContinuousNeutoneParameter(NeutoneParameter): 59 | """ 60 | Defines a continuous Neutone Parameter that the user can use to control a model. 61 | 62 | The name and the description of the parameter will be shown as a tooltip 63 | within the UI. 64 | `default_value` must be between min_value and max_value and will be used as the 65 | default in the plugin when no presets are available. 66 | """ 67 | 68 | def __init__( 69 | self, 70 | name: str, 71 | description: str, 72 | default_value: float, 73 | min_value: float = 0.0, 74 | max_value: float = 1.0, 75 | used: bool = True, 76 | ): 77 | super().__init__( 78 | name, 79 | description, 80 | default_value, 81 | used, 82 | NeutoneParameterType.CONTINUOUS, 83 | ) 84 | assert ( 85 | min_value < max_value 86 | ), "`min_value` must be less than `max_value` for continuous params" 87 | assert ( 88 | min_value <= default_value <= max_value 89 | ), f"`default_value` for continuous params must be between {min_value} and {max_value}" 90 | self.min_value = min_value 91 | self.max_value = max_value 92 | self.range = max_value - min_value 93 | self.default_value_0to1 = (default_value - min_value) / self.range 94 | 95 | def from_0to1(self, param_val: T) -> T: 96 | """ 97 | Converts a parameter value inplace from [0, 1] to [min_value, max_value]. 98 | """ 99 | tr.mul(param_val, self.range, out=param_val) 100 | tr.add(param_val, self.min_value, out=param_val) 101 | return param_val 102 | 103 | def to_metadata(self) -> Dict[str, Union[int, float, str, bool, List[str]]]: 104 | metadata = super().to_metadata() 105 | metadata["min_value"] = self.min_value 106 | metadata["max_value"] = self.max_value 107 | return metadata 108 | 109 | 110 | class CategoricalNeutoneParameter(NeutoneParameter): 111 | """ 112 | Defines a categorical Neutone Parameter that the user can use to control a model. 113 | 114 | The name and the description of the parameter will be shown as a tooltip 115 | within the UI. 116 | `n_values` must be an int greater than or equal to 2 and less than or equal to 117 | `constants.MAX_N_CATEGORICAL_VALUES`. 118 | `default_value` must be in the range [0, `n_values` - 1]. 119 | `labels` is a list of strings that will be used as the labels for the parameter. 
120 | """ 121 | 122 | def __init__( 123 | self, 124 | name: str, 125 | description: str, 126 | n_values: int, 127 | default_value: int, 128 | labels: Optional[List[str]] = None, 129 | used: bool = True, 130 | ): 131 | super().__init__( 132 | name, description, default_value, used, NeutoneParameterType.CATEGORICAL 133 | ) 134 | assert 2 <= n_values <= constants.MAX_N_CATEGORICAL_VALUES, ( 135 | f"`n_values` for categorical params must between 2 and " 136 | f"{constants.MAX_N_CATEGORICAL_VALUES}" 137 | ) 138 | assert ( 139 | 0 <= default_value <= n_values - 1 140 | ), "`default_value` for categorical params must be between 0 and `n_values`-1" 141 | self.n_values = n_values 142 | if labels is None: 143 | labels = [str(idx) for idx in range(n_values)] 144 | else: 145 | assert len(labels) == self.n_values, "labels must have `n_values` elements" 146 | assert all( 147 | len(label) < constants.MAX_N_CATEGORICAL_LABEL_CHARS for label in labels 148 | ), ( 149 | f"All labels must have length less than " 150 | f"{constants.MAX_N_CATEGORICAL_LABEL_CHARS} characters" 151 | ) 152 | self.labels = labels 153 | self.default_value_0to1 = default_value / (n_values - 1) 154 | 155 | def from_0to1(self, param_val: T) -> T: 156 | """ 157 | Converts a parameter value inplace from [0, 1] to [0, `n_values` - 1]. 158 | """ 159 | tr.mul(param_val, self.n_values - 1, out=param_val) 160 | tr.round(param_val, out=param_val) 161 | return param_val 162 | 163 | def to_metadata( 164 | self, 165 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 166 | metadata = super().to_metadata() 167 | metadata["n_values"] = self.n_values 168 | metadata["labels"] = self.labels 169 | return metadata 170 | 171 | 172 | class TextNeutoneParameter(NeutoneParameter): 173 | """ 174 | Defines a text Neutone Parameter that the user can use to control a model. 175 | 176 | The name and the description of the parameter will be shown as a tooltip 177 | within the UI. 178 | `max_n_chars` specifies the maximum number of characters that the user can input. 179 | If this value is set to -1, there is no limit on the number of characters. 180 | `default_value` is the default value to be automatically populated in the text box. 181 | """ 182 | 183 | def __init__( 184 | self, 185 | name: str, 186 | description: str, 187 | max_n_chars: int = -1, 188 | default_value: str = "", 189 | used: bool = True, 190 | ): 191 | super().__init__( 192 | name, description, default_value, used, NeutoneParameterType.TEXT 193 | ) 194 | assert max_n_chars >= -1, "`max_n_chars` must be greater than or equal to -1" 195 | if max_n_chars != -1: 196 | assert ( 197 | len(default_value) <= max_n_chars 198 | ), "`default_value` must be a string of length less than `max_n_chars`" 199 | self.max_n_chars = max_n_chars 200 | 201 | def to_metadata( 202 | self, 203 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 204 | metadata = super().to_metadata() 205 | metadata["max_n_chars"] = self.max_n_chars 206 | return metadata 207 | 208 | 209 | class DiscreteTokensNeutoneParameter(NeutoneParameter): 210 | """ 211 | Defines a discrete token tensor input to a Neutone model 212 | Should be the output of a tokenizer that processes some text input. 213 | 214 | The name and the description of the parameter will be shown as a tooltip 215 | within the UI. 
216 | """ 217 | 218 | def __init__( 219 | self, 220 | name: str, 221 | description: str, 222 | max_n_tokens: int = -1, 223 | default_value: Optional[List[int]] = None, 224 | used: bool = True, 225 | ): 226 | if default_value is None: 227 | default_value: List[int] = [] 228 | super().__init__( 229 | name, description, default_value, used, NeutoneParameterType.TOKENS 230 | ) 231 | assert max_n_tokens >= -1, "`max_n_tokens` must be greater than or equal to -1" 232 | if max_n_tokens != -1: 233 | assert ( 234 | len(default_value) <= max_n_tokens 235 | ), "`default_value` must be a list of length less than `max_n_tokens`" 236 | self.max_n_tokens = max_n_tokens 237 | 238 | def to_metadata( 239 | self, 240 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 241 | metadata = super().to_metadata() 242 | metadata["max_n_tokens"] = self.max_n_tokens 243 | return metadata 244 | -------------------------------------------------------------------------------- /neutone_sdk/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/py.typed -------------------------------------------------------------------------------- /neutone_sdk/queues.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Tuple 4 | 5 | import torch as tr 6 | from torch import Tensor 7 | 8 | logging.basicConfig() 9 | log = logging.getLogger(__name__) 10 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 11 | 12 | 13 | class CircularInplaceTensorQueue: 14 | def __init__(self, n_ch: int, max_size: int, use_debug_mode: bool = True) -> None: 15 | """ 16 | Creates a FIFO queue designed for audio data that does not allocate any memory during normal use and performs 17 | as few memory operations as possible. The queue is also compatible with converting to TorchScript. 18 | """ 19 | self.use_debug_mode = use_debug_mode 20 | self.max_size = max_size 21 | self.queue = tr.zeros((n_ch, max_size)) 22 | self.start_idx = 0 23 | self.end_idx = 0 24 | self.size = 0 25 | 26 | def _calc_push_indices(self, in_n: int) -> Tuple[int, int, int, int]: 27 | """ 28 | Calculates the indices to place new data of length in_n into the queue. Since it's a circular queue this can 29 | mean wrapping around once past the end of the queue depending on the contents of the queue at that moment in 30 | time. As a result, we define two possible index ranges for pushing data: start_1:end_1 and start_2:end_2 31 | if wrapping occurs, otherwise end_1 == start_2 == end_2 32 | 33 | Returns: 34 | Tuple[int, int, int, int]: start_1, end_1, start_2, end_2 35 | """ 36 | if self.use_debug_mode: 37 | assert 0 < in_n < self.max_size 38 | start_1 = self.end_idx 39 | if start_1 == self.max_size: 40 | start_1 = 0 41 | end_2 = start_1 + in_n 42 | if end_2 > self.max_size: 43 | end_2 = end_2 % self.max_size 44 | end_1 = end_2 45 | start_2 = end_2 46 | if end_2 < start_1: 47 | end_1 = self.max_size 48 | start_2 = 0 49 | return start_1, end_1, start_2, end_2 50 | 51 | def push(self, x: Tensor) -> None: 52 | """ 53 | Pushes the contents of x to the end of the queue. If the queue does not have adequate space left, the contents 54 | of the queue will be overwritten, starting at the head of the queue. 
55 | """ 56 | if self.use_debug_mode: 57 | assert x.ndim == self.queue.ndim 58 | assert x.size(0) == self.queue.size(0) 59 | in_n = x.size(1) 60 | if in_n >= self.max_size: 61 | self.queue[:, :] = x[:, -self.max_size :] 62 | self.start_idx = 0 63 | self.end_idx = self.max_size 64 | self.size = self.max_size 65 | return 66 | if in_n < 1: 67 | return 68 | start_1, end_1, start_2, end_2 = self._calc_push_indices(in_n) 69 | n_1 = end_1 - start_1 70 | self.queue[:, start_1:end_1] = x[:, 0:n_1] 71 | if n_1 < in_n: 72 | self.queue[:, start_2:end_2] = x[:, n_1:] 73 | self.end_idx = end_2 74 | self.size = min(self.size + in_n, self.max_size) 75 | if self.size == self.max_size: 76 | self.start_idx = self.end_idx 77 | 78 | def _calc_pop_indices(self, out_n: int) -> Tuple[int, int, int, int]: 79 | """ 80 | Calculates the indices to pop data of length out_n from the queue. Since it's a circular queue this can 81 | mean wrapping around once past the end of the queue depending on the contents of the queue at that moment in 82 | time. As a result, we define two possible index ranges for popping data: start_1:end_1 and start_2:end_2 83 | if wrapping occurs, otherwise end_1 == start_2 == end_2 84 | 85 | Returns: 86 | Tuple[int, int, int, int]: start_1, end_1, start_2, end_2 87 | """ 88 | out_n = min(out_n, self.size) 89 | if self.use_debug_mode: 90 | assert out_n > 0 91 | start_1 = self.start_idx 92 | if start_1 == self.max_size: 93 | start_1 = 0 94 | end_2 = start_1 + out_n 95 | if end_2 > self.max_size: 96 | end_2 = end_2 % self.max_size 97 | end_1 = end_2 98 | start_2 = end_2 99 | if end_2 <= start_1: 100 | end_1 = self.max_size 101 | start_2 = 0 102 | return start_1, end_1, start_2, end_2 103 | 104 | def pop(self, out: Tensor) -> int: 105 | """ 106 | Attempts to fill the out tensor with data popped from the head of the queue. Begins filling the out tensor at 107 | index 0. If the out tensor is bigger than the number of items in the queue, fills the tensor as much as 108 | possible. 109 | 110 | Returns: 111 | int: the number of items successfully popped from the queue. 112 | """ 113 | # TODO(cm): remove duplicate code using fill 114 | if self.use_debug_mode: 115 | assert out.ndim == self.queue.ndim 116 | assert out.size(0) == self.queue.size(0) 117 | if self.is_empty(): 118 | return 0 119 | out_n = out.size(1) 120 | if out_n < 1: 121 | return 0 122 | start_1, end_1, start_2, end_2 = self._calc_pop_indices(out_n) 123 | n_1 = end_1 - start_1 124 | n_2 = end_2 - start_2 125 | removed_n = n_1 + n_2 126 | if self.use_debug_mode: 127 | assert 0 < n_1 <= self.size 128 | assert 0 <= n_2 < self.size 129 | assert removed_n <= self.size 130 | out[:, 0:n_1] = self.queue[:, start_1:end_1] 131 | if n_2 > 0: 132 | out[:, n_1:removed_n] = self.queue[:, start_2:end_2] 133 | self.start_idx = end_2 134 | self.size -= removed_n 135 | if self.use_debug_mode: 136 | if self.size == 0: 137 | assert self.start_idx == self.end_idx 138 | return removed_n 139 | 140 | def fill(self, out: Tensor) -> int: 141 | """ 142 | Attempts to fill the out tensor with data from the head of the queue. Begins filling the out tensor at index 0. 143 | If the out tensor is bigger than the number of items in the queue, fills the tensor as much as possible. Does 144 | not remove any elements from the queue. 145 | 146 | Returns: 147 | int: the number of items successfully filled from the queue. 
148 | """ 149 | if self.use_debug_mode: 150 | assert out.ndim == self.queue.ndim 151 | assert out.size(0) == self.queue.size(0) 152 | if self.is_empty(): 153 | return 0 154 | out_n = out.size(1) 155 | if out_n < 1: 156 | return 0 157 | start_1, end_1, start_2, end_2 = self._calc_pop_indices(out_n) 158 | n_1 = end_1 - start_1 159 | n_2 = end_2 - start_2 160 | filled_n = n_1 + n_2 161 | if self.use_debug_mode: 162 | assert 0 < n_1 <= self.size 163 | assert 0 <= n_2 < self.size 164 | assert filled_n <= self.size 165 | out[:, 0:n_1] = self.queue[:, start_1:end_1] 166 | if n_2 > 0: 167 | out[:, n_1:filled_n] = self.queue[:, start_2:end_2] 168 | return filled_n 169 | 170 | def is_empty(self) -> bool: 171 | return self.size == 0 172 | 173 | def is_full(self) -> bool: 174 | return self.size == self.max_size 175 | 176 | def reset(self) -> None: 177 | self.start_idx = 0 178 | self.end_idx = 0 179 | self.size = 0 180 | -------------------------------------------------------------------------------- /neutone_sdk/realtime_stft.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional, List 4 | 5 | import torch as tr 6 | from torch import Tensor 7 | from torch import nn 8 | 9 | logging.basicConfig() 10 | log = logging.getLogger(__name__) 11 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 12 | 13 | 14 | # TODO(cm:) add documentation, for now please see the documentation in `examples/example_spectral_filter.py` 15 | class RealtimeSTFT(nn.Module): 16 | def __init__( 17 | self, 18 | model_io_n_frames: int = 16, 19 | io_n_ch: int = 2, 20 | io_n_samples: int = 512, 21 | n_fft: int = 2048, 22 | hop_len: int = 512, 23 | window: Optional[Tensor] = None, 24 | center: bool = True, 25 | power: Optional[float] = 1.0, 26 | logarithmize: bool = True, 27 | ensure_pos_spec: bool = True, 28 | use_phase_info: bool = True, 29 | fade_n_samples: int = 0, 30 | eps: float = 1e-8, 31 | use_debug_mode: bool = True, 32 | ) -> None: 33 | super().__init__() 34 | self.use_debug_mode = use_debug_mode 35 | if self.use_debug_mode: 36 | assert n_fft % 2 == 0 37 | assert (n_fft // 2) % hop_len == 0 38 | if window is not None: 39 | assert window.shape == (n_fft,) 40 | assert ( 41 | center 42 | ), "Behavior of center=False needs to be debugged, results in artefacts" 43 | # if center: 44 | # log.warning("STFT is not causal when center=True") 45 | assert power is None or power >= 1.0 46 | if power is None and use_phase_info: 47 | log.warning( 48 | "If power=None, `use_phase_info=True` means the imag component is saved, not the angle" 49 | ) 50 | if power is not None and power > 1.0: 51 | log.warning( 52 | "A power greater than 1.0 probably adds unnecessary " 53 | "computational complexity" 54 | ) 55 | assert fade_n_samples < io_n_samples 56 | self.model_io_n_frames = model_io_n_frames 57 | self.io_n_ch = io_n_ch 58 | self.io_n_samples = io_n_samples 59 | self.n_fft = n_fft 60 | self.hop_len = hop_len 61 | self.center = center 62 | self.power = power 63 | self.logarithmize = logarithmize 64 | self.ensure_pos_spec = ensure_pos_spec 65 | self.use_phase_info = use_phase_info 66 | self.fade_n_samples = fade_n_samples 67 | self.eps = eps 68 | 69 | # Derived parameters 70 | self.io_n_frames = None 71 | self.overlap_n_frames = None 72 | self.in_buf_n_frames = None 73 | self.n_bins = None 74 | self.stft_out_shape = None 75 | self.istft_in_n_frames = None 76 | self.istft_length = None 77 | self.model_io_shape = None 78 | self.out_buf_n_samples = None 
79 | 80 | # Internal buffers 81 | self.in_buf = None 82 | self.in_buf_tmp = None 83 | self.stft_mag_buf = None 84 | self.mag_buf = None 85 | self.mag_buf_tmp = None 86 | self.spec_out_buf = None 87 | self.stft_phase_buf = None 88 | self.phase_buf = None 89 | self.phase_buf_tmp = None 90 | self.out_frames_buf = None 91 | self.out_buf = None 92 | 93 | # Sets derived parameters and allocates buffers 94 | self.set_buffer_size(io_n_samples) 95 | 96 | # Internal tensors 97 | if window is None: 98 | window = tr.hann_window(self.n_fft) 99 | if not center: 100 | # Ensures the NOLA constraint is met for the hann_window 101 | # See https://github.com/pytorch/pytorch/issues/62323 102 | # 1e-5 is chosen based on the torchaudio implementation 103 | window = tr.clamp(window, min=1e-5) 104 | self.register_buffer("window", window, persistent=True) 105 | log10_eps = tr.log10(tr.tensor([self.eps])) 106 | self.register_buffer("log10_eps", log10_eps, persistent=False) 107 | fade_up = tr.linspace(0, 1, max(self.fade_n_samples, 1)) 108 | self.register_buffer("fade_up", fade_up, persistent=False) 109 | fade_down = tr.linspace(1, 0, max(self.fade_n_samples, 1)) 110 | self.register_buffer("fade_down", fade_down, persistent=False) 111 | zero_phase = tr.zeros(self.model_io_shape) 112 | self.register_buffer("zero_phase", zero_phase, persistent=False) 113 | self.ten_constant = tr.tensor(10.0) 114 | 115 | def _set_derived_params(self) -> None: 116 | self.io_n_frames = self.io_n_samples // self.hop_len 117 | if self.use_debug_mode: 118 | assert self.io_n_frames <= self.model_io_n_frames 119 | self.overlap_n_frames = self.n_fft // 2 // self.hop_len 120 | self.in_buf_n_frames = (2 * self.overlap_n_frames) + self.io_n_frames - 1 121 | self.n_bins = (self.n_fft // 2) + 1 122 | if self.center: 123 | self.stft_out_shape = ( 124 | self.io_n_ch, 125 | self.n_bins, 126 | (2 * self.overlap_n_frames) + self.io_n_frames, 127 | ) 128 | self.istft_in_n_frames = self.overlap_n_frames + self.io_n_frames 129 | self.istft_length = (self.istft_in_n_frames - 1) * self.hop_len 130 | else: 131 | self.stft_out_shape = (self.io_n_ch, self.n_bins, self.io_n_frames) 132 | self.istft_in_n_frames = self.io_n_frames 133 | self.istft_length = self.in_buf_n_frames * self.hop_len 134 | if self.use_debug_mode: 135 | assert self.istft_in_n_frames <= self.model_io_n_frames 136 | 137 | self.model_io_shape = (self.io_n_ch, self.n_bins, self.model_io_n_frames) 138 | self.out_buf_n_samples = self.io_n_samples + self.fade_n_samples 139 | if self.use_debug_mode: 140 | assert self.out_buf_n_samples <= self.istft_length 141 | 142 | def _allocate_buffers(self) -> None: 143 | self.in_buf = tr.full( 144 | (self.io_n_ch, self.in_buf_n_frames * self.hop_len), 145 | self.eps, 146 | ) 147 | self.in_buf_tmp = tr.clone(self.in_buf) 148 | 149 | self.stft_mag_buf = tr.full(self.stft_out_shape, self.eps) 150 | self.mag_buf = tr.full(self.model_io_shape, self.eps) 151 | self.mag_buf_tmp = tr.clone(self.mag_buf) 152 | # Required to allow inplace operations after the encoder 153 | self.spec_out_buf = tr.clone(self.mag_buf) 154 | 155 | self.stft_phase_buf = tr.zeros(self.stft_out_shape) 156 | self.phase_buf = tr.zeros(self.model_io_shape) 157 | self.phase_buf_tmp = tr.clone(self.phase_buf) 158 | 159 | self.out_frames_buf = tr.full( 160 | (self.io_n_ch, self.n_bins, self.istft_in_n_frames), 161 | self.eps, 162 | dtype=tr.complex64, 163 | ) 164 | self.out_buf = tr.full( 165 | (self.io_n_ch, self.out_buf_n_samples), 166 | self.eps, 167 | ) 168 | 169 | def _logarithmize_spec(self, 
spec: Tensor) -> None: 170 | tr.clamp(spec, min=self.eps, out=spec) 171 | tr.log10(spec, out=spec) 172 | 173 | def _unlogarithmize_spec(self, spec: Tensor) -> None: 174 | tr.pow(self.ten_constant, spec, out=spec) 175 | tr.clamp(spec, min=self.eps, out=spec) 176 | 177 | def _update_mag_or_phase_buffers( 178 | self, stft_out_buf: Tensor, frames_buf: Tensor, frames_buf_tmp: Tensor 179 | ) -> None: 180 | if self.center: 181 | # Remove overlap frames we have computed before 182 | frames = stft_out_buf[:, :, self.overlap_n_frames :] 183 | # Identify frames that are more correct due to missing prev audio 184 | fixed_prev_frames = frames[:, :, : -self.io_n_frames] 185 | if self.use_debug_mode: 186 | assert fixed_prev_frames.size(2) == self.overlap_n_frames 187 | # Identify the new frames for the input audio chunk 188 | new_frames = frames[:, :, -self.io_n_frames :] 189 | # Overwrite previous frames with more correct frames 190 | n_fixed_frames = min(self.model_io_n_frames, self.overlap_n_frames) 191 | frames_buf[:, :, -n_fixed_frames:] = fixed_prev_frames[ 192 | :, :, -n_fixed_frames: 193 | ] 194 | else: 195 | new_frames = stft_out_buf[:, :, -self.io_n_frames :] 196 | 197 | # Shift buffer left and insert new frames (this way because tr.roll allocates memory dynamically) 198 | frames_buf_tmp[:, :, : -self.io_n_frames] = frames_buf[:, :, self.io_n_frames :] 199 | frames_buf[:, :, : -self.io_n_frames] = frames_buf_tmp[ 200 | :, :, : -self.io_n_frames 201 | ] 202 | frames_buf[:, :, -self.io_n_frames :] = new_frames 203 | 204 | @tr.jit.export 205 | def set_buffer_size(self, io_n_samples: int) -> None: 206 | if self.use_debug_mode: 207 | assert io_n_samples >= self.hop_len 208 | assert io_n_samples % self.hop_len == 0 209 | assert self.fade_n_samples <= io_n_samples 210 | self.io_n_samples = io_n_samples 211 | self._set_derived_params() 212 | self._allocate_buffers() 213 | self.reset() 214 | 215 | @tr.jit.export 216 | def calc_model_delay_samples(self) -> int: 217 | return self.fade_n_samples 218 | 219 | @tr.jit.export 220 | def reset(self) -> None: 221 | self.in_buf.fill_(self.eps) 222 | self.stft_mag_buf.fill_(self.eps) 223 | self.mag_buf.fill_(self.eps) 224 | self.spec_out_buf.fill_(self.eps) 225 | self.stft_phase_buf.fill_(0) 226 | self.phase_buf.fill_(0) 227 | self.out_frames_buf.fill_(self.eps) 228 | self.out_buf.fill_(self.eps) 229 | 230 | @tr.jit.export 231 | def calc_min_buffer_size(self) -> int: 232 | return self.hop_len 233 | 234 | @tr.jit.export 235 | def calc_max_buffer_size(self) -> int: 236 | return self.model_io_n_frames * self.hop_len 237 | 238 | @tr.jit.export 239 | def calc_supported_buffer_sizes(self) -> List[int]: 240 | min_buffer_size = self.calc_min_buffer_size() 241 | max_buffer_size = self.calc_max_buffer_size() 242 | buffer_sizes = [ 243 | bs for bs in range(min_buffer_size, max_buffer_size + 1, self.hop_len) 244 | ] 245 | return buffer_sizes 246 | 247 | @tr.jit.ignore 248 | def audio_to_spec_offline(self, audio: Tensor) -> Tensor: 249 | if self.use_debug_mode: 250 | assert audio.size(0) == self.io_n_ch 251 | assert audio.size(1) >= self.n_fft 252 | assert audio.size(1) % self.hop_len == 0 253 | spec = tr.stft( 254 | audio, 255 | n_fft=self.n_fft, 256 | hop_length=self.hop_len, 257 | window=self.window, 258 | center=self.center, 259 | return_complex=True, 260 | ) 261 | if self.power is None: 262 | spec = spec.real 263 | else: 264 | spec = spec.abs() 265 | if self.power != 1.0: 266 | spec = spec.pow(self.power) 267 | 268 | if self.logarithmize: 269 | spec = tr.clamp(spec, 
min=self.eps) 270 | spec = tr.log10(spec) 271 | if self.ensure_pos_spec: 272 | spec -= self.log10_eps 273 | 274 | return spec 275 | 276 | @tr.jit.export 277 | def audio_to_spec(self, audio: Tensor) -> Tensor: 278 | if self.use_debug_mode: 279 | assert audio.shape == (self.io_n_ch, self.io_n_samples) 280 | # Shift buffer left and insert audio chunk (this way because tr.roll allocates memory dynamically) 281 | self.in_buf_tmp[:, : -self.io_n_samples] = self.in_buf[:, self.io_n_samples :] 282 | self.in_buf[:, : -self.io_n_samples] = self.in_buf_tmp[:, : -self.io_n_samples] 283 | self.in_buf[:, -self.io_n_samples :] = audio 284 | 285 | # TODO(cm): allow pad_mode to be selected 286 | complex_frames = tr.stft( 287 | self.in_buf, 288 | n_fft=self.n_fft, 289 | hop_length=self.hop_len, 290 | window=self.window, 291 | center=self.center, 292 | return_complex=True, 293 | ) 294 | if self.power is None: 295 | self.stft_mag_buf = complex_frames.real 296 | else: 297 | tr.abs(complex_frames, out=self.stft_mag_buf) 298 | if self.power != 1.0: 299 | tr.pow(self.stft_mag_buf, self.power, out=self.stft_mag_buf) 300 | if self.logarithmize: 301 | self._logarithmize_spec(self.stft_mag_buf) 302 | if self.ensure_pos_spec: 303 | self.stft_mag_buf -= self.log10_eps 304 | 305 | self._update_mag_or_phase_buffers( 306 | self.stft_mag_buf, self.mag_buf, self.mag_buf_tmp 307 | ) 308 | 309 | if self.use_phase_info: 310 | if self.power is None: 311 | self.stft_phase_buf = complex_frames.imag 312 | else: 313 | tr.angle(complex_frames, out=self.stft_phase_buf) 314 | self._update_mag_or_phase_buffers( 315 | self.stft_phase_buf, self.phase_buf, self.phase_buf_tmp 316 | ) 317 | 318 | # Prevent future inplace operations from mutating self.mag_buf 319 | self.spec_out_buf[:, :] = self.mag_buf 320 | return self.spec_out_buf 321 | 322 | @tr.jit.export 323 | def spec_to_audio(self, spec: Tensor) -> Tensor: 324 | if self.use_debug_mode: 325 | assert spec.shape == self.model_io_shape 326 | spec = spec[:, :, -self.istft_in_n_frames :] 327 | if self.use_phase_info: 328 | phase = self.phase_buf[:, :, -self.istft_in_n_frames :] 329 | else: 330 | phase = self.zero_phase[:, :, -self.istft_in_n_frames :] 331 | 332 | if self.logarithmize: 333 | if self.ensure_pos_spec: 334 | spec += self.log10_eps 335 | self._unlogarithmize_spec(spec) 336 | 337 | if self.power is None: 338 | self.out_frames_buf.real = spec 339 | self.out_frames_buf.imag = phase 340 | else: 341 | if self.power != 1.0: 342 | tr.pow(spec, 1 / self.power, out=spec) 343 | tr.polar(spec, phase, out=self.out_frames_buf) 344 | 345 | # TODO(cm): allow pad_mode to be selected 346 | rec_audio = tr.istft( 347 | self.out_frames_buf, 348 | n_fft=self.n_fft, 349 | hop_length=self.hop_len, 350 | window=self.window, 351 | center=self.center, 352 | length=self.istft_length, 353 | ) 354 | rec_audio = rec_audio[:, -self.out_buf_n_samples :] 355 | if self.fade_n_samples == 0: 356 | return rec_audio 357 | 358 | self.out_buf[:, -self.fade_n_samples :] *= self.fade_down 359 | rec_audio[:, : self.fade_n_samples] *= self.fade_up 360 | rec_audio[:, : self.fade_n_samples] += self.out_buf[:, -self.fade_n_samples :] 361 | audio_out = rec_audio[:, : self.io_n_samples] 362 | self.out_buf = rec_audio 363 | return audio_out 364 | -------------------------------------------------------------------------------- /neutone_sdk/tcn_1d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based off 3 | 
https://github.com/csteinmetz1/steerable-nafx/blob/main/steerable-nafx.ipynb 4 | """ 5 | import logging 6 | import os 7 | from typing import Optional 8 | 9 | import torch as tr 10 | from torch import Tensor 11 | from torch import nn 12 | 13 | logging.basicConfig() 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 16 | 17 | 18 | def causal_crop(x: Tensor, length: int) -> Tensor: 19 | if x.shape[-1] != length: 20 | stop = x.shape[-1] - 1 21 | start = stop - length 22 | x = x[..., start:stop] 23 | return x 24 | 25 | 26 | class FiLM(nn.Module): 27 | def __init__( 28 | self, 29 | cond_dim: int, # dim of conditioning input 30 | num_features: int, # dim of the conv channel 31 | use_bn: bool = True, 32 | ) -> None: 33 | super().__init__() 34 | self.num_features = num_features 35 | self.use_bn = use_bn 36 | if self.use_bn: 37 | self.bn = nn.BatchNorm1d(num_features, affine=False) 38 | self.adaptor = nn.Linear(cond_dim, 2 * num_features) 39 | 40 | def forward(self, x: Tensor, cond: Tensor) -> Tensor: 41 | assert cond.ndim == 2 42 | cond = self.adaptor(cond) 43 | g, b = tr.chunk(cond, 2, dim=-1) 44 | g = g.unsqueeze(-1) 45 | b = b.unsqueeze(-1) 46 | 47 | if self.use_bn: 48 | x = self.bn(x) # Apply batchnorm without affine 49 | x = (x * g) + b # Then apply conditional affine 50 | 51 | return x 52 | 53 | 54 | class TCN1DBlock(nn.Module): 55 | def __init__( 56 | self, 57 | in_ch: int, 58 | out_ch: int, 59 | kernel_size: int, 60 | dilation: int, 61 | padding: Optional[int] = None, 62 | cond_dim: int = 0, 63 | use_bias_in_conv: bool = True, 64 | use_bn: bool = True, 65 | use_act: bool = True, 66 | use_res: bool = True, 67 | act: Optional[nn.Module] = None, 68 | prelu_ch: int = 1, 69 | res_groups: int = 1, 70 | ) -> None: 71 | super().__init__() 72 | self.padding = padding 73 | if self.padding is None: 74 | self.padding = ((kernel_size - 1) // 2) * dilation 75 | if act is None: 76 | act = nn.PReLU(prelu_ch) 77 | 78 | self.act = None 79 | if use_act: 80 | self.act = act 81 | 82 | self.conv = nn.Conv1d( 83 | in_ch, 84 | out_ch, 85 | (kernel_size,), 86 | dilation=(dilation,), 87 | padding=self.padding, 88 | bias=use_bias_in_conv, 89 | ) 90 | 91 | self.film = None 92 | if cond_dim > 0: 93 | self.film = FiLM(cond_dim, out_ch, use_bn=use_bn) 94 | 95 | self.bn = None 96 | if use_bn and self.film is None: 97 | self.bn = nn.BatchNorm1d(out_ch) 98 | 99 | self.res = None 100 | if use_res: 101 | self.res = nn.Conv1d(in_ch, out_ch, (1,), groups=res_groups, bias=False) 102 | 103 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 104 | x_in = x 105 | x = self.conv(x) 106 | if cond is not None and self.film is not None: 107 | x = self.film(x, cond) 108 | elif self.bn is not None: 109 | x = self.bn(x) 110 | 111 | if self.act is not None: 112 | x = self.act(x) 113 | 114 | if self.res is not None: 115 | res = self.res(x_in) 116 | x_res = causal_crop(res, x.shape[-1]) 117 | x += x_res 118 | 119 | return x 120 | 121 | 122 | class TCN1D(nn.Module): 123 | def __init__( 124 | self, 125 | in_ch: int = 1, 126 | out_ch: int = 1, 127 | n_blocks: int = 10, 128 | kernel_size: int = 13, 129 | n_channels: int = 64, 130 | dil_growth: int = 4, 131 | padding: Optional[int] = None, 132 | cond_dim: int = 0, 133 | use_act: bool = True, 134 | use_bn: bool = False, 135 | use_bias_in_conv: bool = True, 136 | ) -> None: 137 | super().__init__() 138 | self.kernel_size = kernel_size 139 | self.n_channels = n_channels 140 | self.dil_growth = dil_growth 141 | self.n_blocks = 
n_blocks 142 | self.stack_size = n_blocks 143 | self.cond_dim = cond_dim 144 | self.use_act = use_act 145 | self.use_bn = use_bn 146 | self.use_bias_in_conv = use_bias_in_conv 147 | 148 | self.blocks = nn.ModuleList() 149 | for n in range(self.n_blocks): 150 | if n == 0: 151 | block_in_ch = in_ch 152 | block_out_ch = self.n_channels 153 | elif n == self.n_blocks - 1: 154 | block_in_ch = self.n_channels 155 | block_out_ch = out_ch 156 | else: 157 | block_in_ch = self.n_channels 158 | block_out_ch = self.n_channels 159 | 160 | dilation = self.dil_growth**n 161 | self.blocks.append( 162 | TCN1DBlock( 163 | block_in_ch, 164 | block_out_ch, 165 | self.kernel_size, 166 | dilation, 167 | padding=padding, 168 | cond_dim=self.cond_dim, 169 | use_act=self.use_act, 170 | use_bn=self.use_bn, 171 | use_bias_in_conv=self.use_bias_in_conv, 172 | ) 173 | ) 174 | 175 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 176 | assert x.ndim == 3 # (batch_size, in_ch, samples) 177 | if cond is not None: 178 | assert cond.ndim == 2 # (batch_size, cond_dim) 179 | for block in self.blocks: 180 | x = block(x, cond) 181 | return x 182 | 183 | def calc_receptive_field(self) -> int: 184 | """Compute the receptive field in samples.""" 185 | rf = self.kernel_size 186 | for idx in range(1, self.n_blocks): 187 | dilation = self.dil_growth ** (idx % self.stack_size) 188 | rf = rf + ((self.kernel_size - 1) * dilation) 189 | return rf 190 | 191 | 192 | if __name__ == "__main__": 193 | tcn = TCN1D(n_blocks=4, cond_dim=3, use_bn=True) 194 | log.info(tcn.calc_receptive_field()) 195 | audio = tr.rand((1, 1, 65536)) 196 | cond = tr.rand((1, 3)) 197 | # cond = None 198 | out = tcn.forward(audio, cond) 199 | log.info(out.shape) 200 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "neutone_sdk" 3 | version = "1.4.3" 4 | description = "SDK for wrapping deep learning models for usage in the Neutone plugin" 5 | readme = "README.md" 6 | authors = ["Qosmo "] 7 | homepage = "https://github.com/QosmoInc/neutone_sdk.git" 8 | license = "LGPL" 9 | packages = [{include = "neutone_sdk"}] 10 | 11 | 12 | [tool.poetry.dependencies] 13 | click = ">=8.1.7,<9.0.0" 14 | python = ">=3.8,<4.0" 15 | numpy = "^1.21.6" 16 | torch = ">=1.11.0,<2.2.0" 17 | torchaudio = ">=0.11.0,<2.2.0" 18 | soundfile = ">=0.12.1" 19 | jsonschema = "^4.21.0" 20 | requests = "^2.27.1" 21 | tqdm = "^4.63.1" 22 | 23 | [tool.poetry.dev-dependencies] 24 | black = "22.3.0" 25 | pytest = "*" 26 | 27 | [build-system] 28 | requires = ["poetry-core>=1.1.0"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /testing/test_cached_mel_spec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch as tr 5 | from torchaudio.transforms import MelSpectrogram 6 | 7 | from neutone_sdk.cached_mel_spec import CachedMelSpec 8 | 9 | logging.basicConfig() 10 | log = logging.getLogger(__name__) 11 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 12 | 13 | 14 | def test_cached_mel_spec(): 15 | # Setup 16 | tr.set_printoptions(precision=1) 17 | tr.random.manual_seed(42) 18 | 19 | sr = 44100 20 | n_ch = 1 21 | n_fft = 2048 22 | hop_len = 128 23 | n_mels = 16 24 | total_n_samples = 1000 * hop_len 25 | 26 | audio = tr.rand(n_ch, total_n_samples) 27 | # 
log.info(f"audio = {audio}") 28 | mel_spec = MelSpectrogram( 29 | sample_rate=sr, 30 | n_fft=n_fft, 31 | hop_length=hop_len, 32 | n_mels=n_mels, 33 | center=True, 34 | pad_mode="constant", 35 | ) 36 | cached_mel_spec = CachedMelSpec( 37 | sr=sr, n_ch=n_ch, n_fft=n_fft, hop_len=hop_len, n_mels=n_mels 38 | ) 39 | 40 | # Test delay 41 | delay_samples = cached_mel_spec.get_delay_samples() 42 | assert delay_samples == n_fft // 2 - hop_len 43 | 44 | # Test processing all audio at once 45 | spec = mel_spec(audio) 46 | delay_frames = cached_mel_spec.get_delay_frames() 47 | cached_spec = cached_mel_spec(audio) 48 | cached_spec = cached_spec[:, :, delay_frames:] 49 | # log.info(f" spec = {spec}") 50 | # log.info(f"cached_spec = {cached_spec}") 51 | assert tr.allclose(spec[:, :, : cached_spec.size(2)], cached_spec) 52 | cached_mel_spec.reset() 53 | 54 | # Test processing audio in chunks (random chunk size) 55 | chunks = [] 56 | min_chunk_size = 1 57 | max_chunk_size = 100 58 | curr_idx = 0 59 | while curr_idx < total_n_samples - max_chunk_size: 60 | chunk_size = ( 61 | tr.randint(min_chunk_size, max_chunk_size + 1, (1,)).item() * hop_len 62 | ) 63 | chunks.append(audio[:, curr_idx : curr_idx + chunk_size]) 64 | curr_idx += chunk_size 65 | if curr_idx < total_n_samples: 66 | chunks.append(audio[:, curr_idx:]) 67 | chunks.append( 68 | tr.zeros(n_ch, cached_mel_spec.get_delay_samples() + cached_mel_spec.hop_len) 69 | ) 70 | 71 | spec_chunks = [] 72 | for chunk in chunks: 73 | spec_chunk = cached_mel_spec(chunk) 74 | spec_chunks.append(spec_chunk) 75 | chunked_spec = tr.cat(spec_chunks, dim=2) 76 | chunked_spec = chunked_spec[:, :, delay_frames:] 77 | # log.info(f" spec = {spec}") 78 | # log.info(f"chunked_spec = {chunked_spec}") 79 | assert tr.allclose(spec, chunked_spec) 80 | log.info("test_cached_mel_spec passed!") 81 | 82 | 83 | if __name__ == "__main__": 84 | test_cached_mel_spec() 85 | -------------------------------------------------------------------------------- /testing/test_conv.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import os 4 | import random 5 | from typing import Union, Tuple 6 | 7 | import torch as tr 8 | from torch import nn 9 | from tqdm import tqdm 10 | 11 | from neutone_sdk.conv import Conv1dGeneral 12 | 13 | logging.basicConfig() 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 16 | 17 | 18 | def test_dynamic_bs() -> None: 19 | conv_gen = Conv1dGeneral(in_channels=2, 20 | out_channels=16, 21 | kernel_size=5, 22 | padding="same", 23 | dilation=2, 24 | causal=False, 25 | cached=True, 26 | use_dynamic_bs=True) 27 | for bs in range(64): 28 | audio = tr.rand((bs, 2, 128)) 29 | out = conv_gen(audio) 30 | assert out.shape == (bs, 16, 128) 31 | 32 | 33 | def _test_against_conv_torch(in_channels: int, 34 | out_channels: int, 35 | kernel_size: int, 36 | padding: Union[str, int, Tuple[int]], 37 | dilation: int, 38 | causal: bool, 39 | padding_mode: str = "zeros", 40 | batch_size: int = 1, 41 | block_size: int = 128, 42 | n_blocks: int = 32) -> None: 43 | conv_gen = Conv1dGeneral(in_channels, 44 | out_channels, 45 | kernel_size, 46 | padding=padding, 47 | dilation=dilation, 48 | padding_mode=padding_mode, 49 | causal=causal, 50 | cached=False) 51 | padding_torch = padding 52 | if causal and padding == "same": 53 | # torch.nn.Conv1d doesn't support causal convs so we need to add the causal 54 | # padding to both sides and then remove it from the right side 
later 55 | assert conv_gen.padding_r == 0 56 | padding_torch = conv_gen.padding_l 57 | conv_torch = nn.Conv1d(in_channels, 58 | out_channels, 59 | kernel_size, 60 | padding=padding_torch, 61 | dilation=dilation, 62 | padding_mode=padding_mode) 63 | 64 | # Copy weights and biases for testing 65 | conv_torch.weight = nn.Parameter(conv_gen.conv1d.weight.clone()) 66 | if conv_torch.bias is not None: 67 | conv_torch.bias = nn.Parameter(conv_gen.conv1d.bias.clone()) 68 | 69 | audio = tr.rand((batch_size, in_channels, n_blocks * block_size)) 70 | out_torch = conv_torch(audio) 71 | out_gen = conv_gen(audio) 72 | # torch.nn.Conv1d doesn't support causal convs so get rid of the extra right samples 73 | if causal and padding != "valid": 74 | if conv_gen.padding_l > 0: 75 | out_torch = out_torch[..., :-conv_gen.padding_l] 76 | assert out_gen.shape == out_torch.shape 77 | assert tr.allclose(out_gen, out_torch) 78 | 79 | conv_gen.set_cached(True) 80 | out_blocks = [] 81 | for idx in range(n_blocks): 82 | audio_block = audio[..., idx * block_size:(idx + 1) * block_size] 83 | out_block = conv_gen(audio_block) 84 | out_blocks.append(out_block) 85 | assert all(b.size(-1) == block_size for b in out_blocks) 86 | out_cached = tr.cat(out_blocks, dim=-1) 87 | 88 | delay_samples = conv_gen.get_delay_samples() 89 | if delay_samples > 0: 90 | # Remove the delay samples from the beginning of the cached output to align 91 | # it with not cached output 92 | out_cached = out_cached[..., delay_samples:] 93 | # Remove the delay samples from the end of the not cached output since they were 94 | # never computed by the cached convolution 95 | out_torch = out_torch[..., :-delay_samples] 96 | # Different padding modes can result in different output lengths of out_torch, 97 | # so we need to crop the longer one to align it with the shorter one 98 | if out_cached.size(-1) > out_torch.size(-1): 99 | out_cached = Conv1dGeneral.causal_crop(out_cached, out_torch.size(-1)) 100 | else: 101 | out_torch = Conv1dGeneral.causal_crop(out_torch, out_cached.size(-1)) 102 | assert out_cached.shape == out_torch.shape 103 | assert tr.allclose(out_cached, out_torch) 104 | 105 | 106 | def test_conv1d_general(): 107 | causal_flags = [False, True] 108 | in_channels = [1, 2] 109 | out_ch = 1 110 | kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8] 111 | dilations = [1, 2, 3, 4, 5, 6, 7, 8] 112 | max_rand_padding = 32 113 | 114 | for causal, in_ch, kernel_size, dil in tqdm(itertools.product(causal_flags, 115 | in_channels, 116 | kernel_sizes, 117 | dilations)): 118 | rand_pad = random.randint(1, max_rand_padding) 119 | log.info(f"Testing causal={causal}, " 120 | f"in_ch={in_ch}, " 121 | f"kernel_size={kernel_size}, " 122 | f"dil={dil}, " 123 | f"rand_pad={rand_pad}") 124 | _test_against_conv_torch( 125 | in_ch, out_ch, kernel_size, padding="same", dilation=dil, causal=causal) 126 | _test_against_conv_torch( 127 | in_ch, out_ch, kernel_size, padding="valid", dilation=dil, causal=causal) 128 | _test_against_conv_torch( 129 | in_ch, out_ch, kernel_size, padding=0, dilation=dil, causal=causal) 130 | _test_against_conv_torch( 131 | in_ch, out_ch, kernel_size, padding=rand_pad, dilation=dil, causal=causal) 132 | 133 | 134 | def _test_get_delay_samples(in_channels: int, 135 | kernel_size: int, 136 | dilation: int, 137 | causal: bool, 138 | padding_mode: str = "zeros", 139 | batch_size: int = 1, 140 | block_size: int = 128, 141 | n_blocks: int = 32) -> None: 142 | # This needs to be 1 for the asserts to work, but shouldn't affect generalization 143 | out_channels = 1 
    conv_gen = Conv1dGeneral(in_channels,
                             out_channels=out_channels,
                             kernel_size=kernel_size,
                             padding="same",
                             dilation=dilation,
                             padding_mode=padding_mode,
                             bias=False,
                             causal=causal)

    # Create an audio signal consisting of 50% silence and then 50% random noise
    n_samples = n_blocks * block_size
    mid_idx = n_samples // 2
    n_samples_b = n_samples - mid_idx
    audio = tr.zeros((batch_size, in_channels, n_samples))
    audio[..., mid_idx:] = tr.rand((batch_size, in_channels, n_samples_b))

    # Measure the index of the first non-zero sample of the uncached convolution
    out_uncached = conv_gen(audio)
    assert out_uncached.shape == (batch_size, out_channels, n_samples)
    nonzero_idx_uncached = (out_uncached != 0).nonzero()[:, -1][0].item()

    # Measure the index of the first non-zero sample of the cached convolution
    conv_gen.set_cached(True)
    out_blocks = []
    for idx in range(n_blocks):
        audio_block = audio[..., idx * block_size:(idx + 1) * block_size]
        out_block = conv_gen(audio_block)
        out_blocks.append(out_block)
    assert all(b.size(-1) == block_size for b in out_blocks)
    out_cached = tr.cat(out_blocks, dim=-1)
    assert out_cached.shape == (batch_size, out_channels, n_samples)
    nonzero_idx_cached = (out_cached != 0).nonzero()[:, -1][0].item()

    # Compare the reported delay to the measured delay
    delay_samples = conv_gen.get_delay_samples()
    measured_delay_samples = nonzero_idx_cached - nonzero_idx_uncached
    assert measured_delay_samples == delay_samples
    assert (out_uncached[..., nonzero_idx_uncached] ==
            out_cached[..., nonzero_idx_cached])


def test_get_delay_samples() -> None:
    causal_flags = [False, True]
    in_channels = [1, 2]
    kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
    dilations = [1, 2, 3, 4, 5, 6, 7, 8]

    for causal, in_ch, kernel_size, dil in tqdm(itertools.product(causal_flags,
                                                                  in_channels,
                                                                  kernel_sizes,
                                                                  dilations)):
        log.info(f"Testing causal={causal}, "
                 f"in_ch={in_ch}, "
                 f"kernel_size={kernel_size}, "
                 f"dil={dil}")
        _test_get_delay_samples(in_ch, kernel_size, dilation=dil, causal=causal)


if __name__ == "__main__":
    test_dynamic_bs()
    test_conv1d_general()
    test_get_delay_samples()
--------------------------------------------------------------------------------
/testing/test_profiling.py:
--------------------------------------------------------------------------------
import logging
import os
from typing import Dict, List

import torch as tr
import torch.nn as nn
from torch import Tensor

from neutone_sdk import (
    WaveformToWaveformBase,
    NeutoneParameter,
    SampleQueueWrapper,
)
from neutone_sdk.benchmark import profile_sqw

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


class ProfilingModel(nn.Module):
    def forward(
        self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor
    ) -> Tensor:
        # tr.neg(min_val, out=min_val)
        # tr.mul(gain, min_val, out=min_val)
        # tr.mul(gain, max_val, out=max_val)
        # tr.clip(x, min=min_val, max=max_val, out=x)
        # tr.clip(x, min=gain * -min_val, max=gain * max_val, out=x)
        return x
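

# Note: ProfilingModel.forward() currently passes the audio through unchanged (the
# in-place clipping variants are left commented out above), so the profile below
# presumably measures SampleQueueWrapper and wrapper overhead rather than DSP cost;
# uncommenting one of the variants would profile the clipper itself.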


class ProfilingModelWrapper(WaveformToWaveformBase):
    def get_model_name(self) -> str:
        return "clipper"

    def get_model_authors(self) -> List[str]:
        return ["Andrew Fyfe"]

    def get_model_short_description(self) -> str:
        return "Audio clipper."

    def get_model_long_description(self) -> str:
        return "Clips the input audio between -1 and 1."

    def get_technical_description(self) -> str:
        return "Clips the input audio between -1 and 1."

    def get_technical_links(self) -> Dict[str, str]:
        return {
            "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_fx/example_clipper.py"
        }

    def get_tags(self) -> List[str]:
        return ["clipper"]

    def get_model_version(self) -> str:
        return "1.0.0"

    def is_experimental(self) -> bool:
        return False

    def get_neutone_parameters(self) -> List[NeutoneParameter]:
        return [
            NeutoneParameter("min", "min clip threshold", default_value=0.15),
            NeutoneParameter("max", "max clip threshold", default_value=0.15),
            NeutoneParameter("gain", "scale clip threshold", default_value=1.0),
        ]

    @tr.jit.export
    def is_input_mono(self) -> bool:
        return False

    @tr.jit.export
    def is_output_mono(self) -> bool:
        return False

    @tr.jit.export
    def get_native_sample_rates(self) -> List[int]:
        return [48000]

    @tr.jit.export
    def get_native_buffer_sizes(self) -> List[int]:
        return [512]

    def get_look_behind_samples(self) -> int:
        return 0

    # def aggregate_params(self, param: Tensor) -> Tensor:
    #     return param  # We want sample-level control, so no aggregation

    def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
        min_val, max_val, gain = params["min"], params["max"], params["gain"]
        x = self.model.forward(x, min_val, max_val, gain)
        x = x[:, self.get_look_behind_samples() :]
        return x


if __name__ == "__main__":
    model = ProfilingModel()
    wrapper = ProfilingModelWrapper(model)
    sqw = SampleQueueWrapper(wrapper)
    profile_sqw(sqw, daw_sr=48000, n_iters=100, convert_to_torchscript=True)
--------------------------------------------------------------------------------
/testing/test_queues.py:
--------------------------------------------------------------------------------
import logging
import os
import random

import torch as tr
from tqdm import tqdm

from neutone_sdk import CircularInplaceTensorQueue

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


def test_circular_queue() -> None:
    trials = 100
    iters = 100
    max_queue_len = 19
    random.seed(26)
    for _ in tqdm(range(trials)):
        in_list = []
        out_list = []
        queue_len = random.randint(1, max_queue_len)
        queue = CircularInplaceTensorQueue(1, queue_len)
        for idx in range(iters):
            if not queue.is_full():
                block = tr.full((1, random.randint(1, queue_len - queue.size)), idx + 1)
                queue.push(block)
                in_list += block[0, :].tolist()

            if not queue.is_empty():
                block = tr.zeros((1, random.randint(1, queue.size)))
                queue.pop(block)
                out_list += block[0, :].int().tolist()

            assert len(in_list) >= len(out_list)
            assert in_list[: len(out_list)] == out_list
            assert queue.size == len(in_list) - len(out_list)
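

# A minimal usage sketch of the queue API exercised above (illustrative, not part
# of the original test suite). It assumes only behavior visible in these tests:
# the constructor takes (num_channels, max_size) positionally, push() appends
# samples in place, and pop() fills a caller-provided buffer and returns the
# number of samples actually written.
def _circular_queue_usage_sketch() -> None:
    queue = CircularInplaceTensorQueue(1, 8)
    queue.push(tr.full((1, 3), 7))  # queue now holds 3 samples
    assert queue.size == 3
    out = tr.zeros((1, 2))
    n_popped = queue.pop(out)  # pops the 2 oldest samples into `out`
    assert n_popped == 2
    assert queue.size == 1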
--------------------------------------------------------------------------------
/testing/test_sandwiches.py:
--------------------------------------------------------------------------------
import logging
import math
import os
import random

import torch as tr
import torch.nn.functional as F
from tqdm import tqdm

from neutone_sdk.sandwich import (
    LinearResampler,
    InplaceLinearResampler,
    Inplace4pHermiteResampler,
)

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


def test_linear_resamplers(
    n_trials: int = 1000, in_n_ch: int = 2, out_n_ch: int = 2
) -> None:
    random.seed(42)
    tr.manual_seed(42)
    sampling_rates = [16000, 22050, 32000, 44100, 48000, 88200, 96000]
    buffer_sizes = [64, 128, 256, 512, 1024, 2048]

    resampler = LinearResampler(48000, 48000, 512)
    inplace_resampler = InplaceLinearResampler(in_n_ch, out_n_ch, 48000, 48000, 512)

    for _ in tqdm(range(n_trials)):
        sr_a = random.choice(sampling_rates)
        sr_b = random.choice(sampling_rates)
        in_bs = random.choice(buffer_sizes)

        resampler.set_sample_rates(sr_a, sr_b, in_bs)
        inplace_resampler.set_sample_rates(sr_a, sr_b, in_bs)
        # Check inplace linear resampler internal values are correct for matching the ends exactly
        assert inplace_resampler.x_in[0] == 0.0
        assert inplace_resampler.x_in[-1] == 0.0 or inplace_resampler.x_in[-1] == 1.0

        in_audio = tr.rand((in_n_ch, in_bs))
        in_linear = resampler.process_in(in_audio)
        in_linear_inplace = inplace_resampler.process_in(in_audio)
        out_bs = in_linear.size(1)
        assert in_linear.shape == in_linear_inplace.shape

        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(in_linear[:, 1:-1], in_linear_inplace[:, 1:-1], atol=1e-6)
        assert tr.allclose(
            in_linear[:, [0, -1]], in_linear_inplace[:, [0, -1]], atol=1e-3
        )
        in_interpolated = F.interpolate(
            in_audio.unsqueeze(0), out_bs, mode="linear", align_corners=True
        ).squeeze(0)
        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(
            in_linear_inplace[:, 1:-1], in_interpolated[:, 1:-1], atol=1e-6
        )
        assert tr.allclose(
            in_linear_inplace[:, [0, -1]], in_interpolated[:, [0, -1]], atol=1e-3
        )
        # Check that the ends match exactly
        assert tr.equal(in_linear_inplace[:, [0, -1]], in_audio[:, [0, -1]])

        out_audio = tr.rand((out_n_ch, out_bs))
        out_linear = resampler.process_out(out_audio)
        out_linear_inplace = inplace_resampler.process_out(out_audio)
        assert out_linear.shape == out_linear_inplace.shape
        assert out_linear.size(1) == in_bs

        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(out_linear[:, 1:-1], out_linear_inplace[:, 1:-1], atol=1e-6)
        assert tr.allclose(
            out_linear[:, [0, -1]], out_linear_inplace[:, [0, -1]], atol=1e-3
        )
        out_interpolated = F.interpolate(
            out_audio.unsqueeze(0), in_bs, mode="linear", align_corners=True
        ).squeeze(0)
        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(
            out_linear_inplace[:, 1:-1], out_interpolated[:, 1:-1], atol=1e-6
        )
        assert tr.allclose(
            out_linear_inplace[:, [0, -1]], out_interpolated[:, [0, -1]], atol=1e-3
        )
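        # (Added clarification.) The inplace resampler pins its interpolation grid
        # to the endpoints (x_in[0] == 0.0 and x_in[-1] in {0.0, 1.0}, asserted at
        # the top of this loop), so unlike F.interpolate the first and last samples
        # should pass through bit-exactly, which the next assert verifies.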
        # Check that the ends match exactly
        assert tr.equal(out_linear_inplace[:, [0, -1]], out_audio[:, [0, -1]])


def _calc_4p_hermite(x: float, y_m1: float, y0: float, y1: float, y2: float) -> float:
    # Slow, sample-by-sample reference implementation of 4-point cubic Hermite
    # interpolation; the fast vectorized version is tested against it below
    c0 = y0
    c1 = 0.5 * (y1 - y_m1)
    c2 = y_m1 - 2.5 * y0 + 2.0 * y1 - 0.5 * y2
    c3 = 0.5 * (y2 - y_m1) + 1.5 * (y0 - y1)
    return ((c3 * x + c2) * x + c1) * x + c0
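

# (Added clarification.) The coefficients above are the standard 4-point,
# 3rd-order Hermite solution: the cubic p(x) = ((c3 * x + c2) * x + c1) * x + c0
# satisfies p(0) = y0 and p(1) = y1 (it passes through the two middle points),
# with slopes p'(0) = (y1 - y_m1) / 2 and p'(1) = (y2 - y0) / 2
# (central differences), which is what makes consecutive interpolation
# segments join smoothly.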


def test_4p_hermite_resampler(
    n_trials: int = 50, in_n_ch: int = 2, out_n_ch: int = 2
) -> None:
    random.seed(42)
    tr.manual_seed(42)
    sampling_rates = [16000, 22050, 32000, 44100, 48000, 88200, 96000]
    buffer_sizes = [64, 128, 256, 512, 1024, 2048]

    resampler = Inplace4pHermiteResampler(in_n_ch, out_n_ch, 48000, 48000, 512)

    for _ in tqdm(range(n_trials)):
        sr_a = random.choice(sampling_rates)
        sr_b = random.choice(sampling_rates)
        in_bs = random.choice(buffer_sizes)

        resampler.set_sample_rates(sr_a, sr_b, in_bs)
        out_bs = resampler.out_bs

        # Check inplace resampler internal values are correct for matching the first sample
        assert resampler.x_in[0] == 0.0
        assert resampler.x_in[-1] == 0.0 or resampler.x_in[-1] == 1.0

        # Check process_in()
        in_audio = tr.rand((in_n_ch, in_bs))
        in_resampled = resampler.process_in(in_audio)
        assert in_resampled.size(0) == in_n_ch
        # Check that the first sample is equal to the input audio
        assert tr.equal(in_resampled[:, 0], in_audio[:, 0])
        # Check that the last sample is reasonably close to the input audio
        assert tr.allclose(in_resampled[:, -1], in_audio[:, -1], atol=1e-3)

        # Check the 4p cubic hermite spline calculation element-wise
        x = resampler.x_in
        y_m1 = tr.index_select(in_audio, dim=1, index=resampler.y_m1_idx_in)
        y0 = tr.index_select(in_audio, dim=1, index=resampler.y0_idx_in)
        y1 = tr.index_select(in_audio, dim=1, index=resampler.y1_idx_in)
        y2 = tr.index_select(in_audio, dim=1, index=resampler.y2_idx_in)

        for ch_idx in range(in_n_ch):
            for x_idx in range(out_bs):
                y_calc = _calc_4p_hermite(
                    x[x_idx],
                    y_m1[ch_idx, x_idx],
                    y0[ch_idx, x_idx],
                    y1[ch_idx, x_idx],
                    y2[ch_idx, x_idx],
                )
                assert math.isclose(y_calc, in_resampled[ch_idx, x_idx], abs_tol=1e-6)

        # TODO(cm): remove duplication
        # Check process_out()
        out_audio = tr.rand((out_n_ch, out_bs))
        out_resampled = resampler.process_out(out_audio)
        assert out_resampled.size(0) == out_n_ch
        # Check that the first sample is equal to the input audio
        assert tr.equal(out_resampled[:, 0], out_audio[:, 0])
        # Check that the last sample is reasonably close to the input audio
        assert tr.allclose(out_resampled[:, -1], out_audio[:, -1], atol=1e-3)

        # Check the 4p cubic hermite spline calculation element-wise
        x = resampler.x_out
        y_m1 = tr.index_select(out_audio, dim=1, index=resampler.y_m1_idx_out)
        y0 = tr.index_select(out_audio, dim=1, index=resampler.y0_idx_out)
        y1 = tr.index_select(out_audio, dim=1, index=resampler.y1_idx_out)
        y2 = tr.index_select(out_audio, dim=1, index=resampler.y2_idx_out)

        for ch_idx in range(out_n_ch):
            for x_idx in range(in_bs):
                y_calc = _calc_4p_hermite(
                    x[x_idx],
                    y_m1[ch_idx, x_idx],
                    y0[ch_idx, x_idx],
                    y1[ch_idx, x_idx],
                    y2[ch_idx, x_idx],
                )
                assert math.isclose(y_calc, out_resampled[ch_idx, x_idx], abs_tol=1e-6)
--------------------------------------------------------------------------------
/testing/test_sqw.py:
--------------------------------------------------------------------------------
import itertools
import logging
import math
import os
import random
from typing import Dict, List, Optional

import torch as tr
import torch.nn as nn
from torch import Tensor
from tqdm import tqdm

from neutone_sdk import WaveformToWaveformBase, SampleQueueWrapper

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


class TestModel(nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        return x


class TestModelWrapper(WaveformToWaveformBase):
    def __init__(
        self,
        model: nn.Module = TestModel(),
        model_sr: int = 48000,
        model_bs: int = 512,
        use_debug_mode: bool = True,
    ) -> None:
        super().__init__(model, use_debug_mode)
        self.model_sr = model_sr
        self.model_bs = model_bs

    def get_model_name(self) -> str:
        return "test"

    def get_model_authors(self) -> List[str]:
        return ["Christopher Mitcheltree"]

    def get_model_short_description(self) -> str:
        return "Testing."

    def get_model_long_description(self) -> str:
        return "Testing."

    def get_technical_description(self) -> str:
        return "Testing."

    def get_tags(self) -> List[str]:
        return ["test"]

    def get_model_version(self) -> str:
        return "1.0.0"

    def is_experimental(self) -> bool:
        return True

    @tr.jit.export
    def is_input_mono(self) -> bool:
        return False

    @tr.jit.export
    def is_output_mono(self) -> bool:
        return False

    @tr.jit.export
    def get_native_sample_rates(self) -> List[int]:
        return [self.model_sr]

    @tr.jit.export
    def get_native_buffer_sizes(self) -> List[int]:
        return [self.model_bs]

    def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
        return self.model.forward(x)


def check_saturation_n(io_bs: int, model_bs: int, saturation_n: int) -> bool:
    size_in = saturation_n
    size_out = 0
    for _ in range(math.lcm(io_bs, model_bs)):
        while size_in >= model_bs:
            size_in -= model_bs
            size_out += model_bs
        if size_out < io_bs:
            return False
        else:
            size_out -= io_bs
        assert size_in >= 0
        assert size_out >= 0
        size_in += io_bs
    return True


def find_saturation_n(io_bs: int, model_bs: int) -> Optional[int]:
    lcm = math.lcm(io_bs, model_bs)
    for n in range(io_bs, lcm + 1, io_bs):
        if check_saturation_n(io_bs, model_bs, n):
            return n
    return None


def check_queue_saturation(io_bs: int, model_bs: int, saturation_n: int) -> bool:
    sr = 48000
    wrapper = TestModelWrapper(model_sr=sr, model_bs=model_bs)
    sqw = SampleQueueWrapper(
        wrapper, daw_sr=sr, daw_bs=io_bs, model_sr=sr, model_bs=model_bs
    )
    in_queue = sqw.in_queue
    out_queue = sqw.out_queue

    io_buffer = tr.zeros((2, io_bs))
    model_buffer = tr.zeros((2, model_bs))

    is_saturated = False
    audio_in = tr.rand((2, (io_bs * model_bs) + (2 * saturation_n)))
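    # (Added clarification.) io_bs * model_bs samples are at least
    # lcm(io_bs, model_bs) samples, i.e. one full phase cycle of the two block
    # sizes, and the 2 * saturation_n term adds headroom around the saturation
    # point being verified.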
    blocks_in = tr.split(audio_in, io_bs, dim=1)

    for block_in in blocks_in:
        if block_in.size(1) != io_bs:
            break

        assert in_queue.max_size - in_queue.size >= io_bs
        in_queue.push(block_in)

        if in_queue.size >= saturation_n:
            is_saturated = True

        while in_queue.size >= model_bs:
            in_popped_n = in_queue.pop(model_buffer)
            assert in_popped_n == model_bs
            assert out_queue.max_size - out_queue.size >= model_bs
            out_queue.push(model_buffer)

        if is_saturated:
            out_popped_n = out_queue.pop(io_buffer)
            if out_popped_n != io_bs:
                return False

    return True


def delay_test(
    wrapper: TestModelWrapper,
    sqw: SampleQueueWrapper,
    daw_sr: int,
    daw_bs: int,
    model_sr: int,
    model_bs: int,
) -> None:
    wrapper.model_sr = model_sr
    wrapper.model_bs = model_bs
    sqw.set_daw_sample_rate_and_buffer_size(daw_sr, daw_bs)
    expected_delay = sqw.calc_buffering_delay_samples()
    assert expected_delay >= 0

    n_samples = expected_delay + (2 * max(daw_bs, model_bs))
    audio_in = tr.rand((2, n_samples))
    blocks_in = tr.split(audio_in, daw_bs, dim=1)
    blocks_out = []

    for block_in in blocks_in:
        if block_in.size(1) != daw_bs:
            break
        block_out = sqw.forward(block_in)
        block_out = tr.clone(block_out)
        blocks_out.append(block_out)

    audio_out = tr.cat(blocks_out, dim=1)

    actual_delay_l = tr.nonzero(audio_out[0, :])[0].item()
    actual_delay_r = tr.nonzero(audio_out[1, :])[0].item()
    assert actual_delay_l == actual_delay_r
    actual_delay = actual_delay_r
    assert expected_delay == actual_delay, (
        f"expected = {expected_delay}, actual_delay = {actual_delay} | "
        f"{daw_sr}, {daw_bs}, {model_sr}, {model_bs}"
    )


def test_calc_saturation_n() -> None:
    # random.seed(42)
    # tr.manual_seed(42)
    # io_buffer_sizes = [random.randrange(32, 2048) for _ in range(16)]
    # model_buffer_sizes = [random.randrange(32, 2048) for _ in range(16)]

    io_buffer_sizes = list(range(2, 256))
    model_buffer_sizes = list(range(2, 256))

    log.info(f"io_buffer_sizes: {io_buffer_sizes}")
    log.info(f"model_buffer_sizes: {model_buffer_sizes}")

    for io_bs, model_bs in tqdm(itertools.product(io_buffer_sizes, model_buffer_sizes)):
        calculated_n = SampleQueueWrapper.calc_saturation_n(io_bs, model_bs)
        found_n = find_saturation_n(io_bs, model_bs)
        assert (
            found_n is not None
        ), f"Could not find a saturation_n. io_bs = {io_bs}, model_bs = {model_bs}"
        assert found_n % io_bs == 0
        assert (
            calculated_n == found_n
        ), f"io_bs = {io_bs}, model_bs = {model_bs}, calculated_n = {calculated_n}, found_n = {found_n}"
        assert check_queue_saturation(io_bs, model_bs, found_n)

    log.info("No saturation inconsistencies found")


def test_calc_buffering_delay_samples() -> None:
    wrapper = TestModelWrapper()
    sqw = SampleQueueWrapper(wrapper)

    sampling_rates = [16000, 22050, 32000, 44100, 48000, 88200, 96000]
    buffer_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096]

    # random.seed(42)
    # tr.manual_seed(42)
    # buffer_sizes = [random.randrange(32, 4096) for _ in range(50)]

    log.info(f"Sampling rates: {sampling_rates}")
    log.info(f"Buffer sizes: {buffer_sizes}")

    for daw_sr, daw_bs, model_sr, model_bs in tqdm(
        itertools.product(sampling_rates, buffer_sizes, sampling_rates, buffer_sizes)
    ):
        delay_test(wrapper, sqw, daw_sr, daw_bs, model_sr, model_bs)

    log.info("No delay inconsistencies found")
--------------------------------------------------------------------------------
/testing/torchscript_test.py:
--------------------------------------------------------------------------------
from typing import Dict, Any

import torch as tr
from torch import Tensor as T, nn


class TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.a = "hi"
        self.b = 1
        self.c = False
        self.d = 3.14

    @tr.jit.export
    def get_metadata(self) -> Dict[str, Any]:
        return {
            "a": self.a,
            "b": self.b,
            "c": self.c,
            "d": self.d,
        }

    def forward(self, x: T) -> T:
        return 2 * x


if __name__ == "__main__":
    audio = tr.randn(1, 1, 5)
    model = TestModel()
    model.eval()
    out = model(audio)
    print(out)
    print(model.get_metadata())
    scripted_model = tr.jit.script(model)
    out2 = scripted_model(audio)
    print(out2)
    print(scripted_model.get_metadata())
--------------------------------------------------------------------------------
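
# A closing note on testing/torchscript_test.py above: tr.jit.script is used rather
# than tr.jit.trace because scripting compiles the module from source, which is what
# preserves the @tr.jit.export'ed get_metadata() method (with its Dict[str, Any]
# return annotation) on the compiled module; tracing would only record the tensor
# operations performed in forward().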