├── .github └── ISSUE_TEMPLATE │ └── request-add-model.md ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── neutone_fx │ ├── example_clipper.py │ ├── example_clipper_prefilter.py │ ├── example_delayed_passthrough.py │ ├── example_overdrive-random.py │ ├── example_rave.py │ ├── example_rave_prefilter.py │ ├── example_rave_v1_prefilter.py │ └── example_spectral_filter.py └── neutone_gen │ ├── example_clipper.py │ └── example_musicgen_load.py ├── neutone_sdk ├── __init__.py ├── assets │ └── default_samples │ │ ├── sample_ambience.mp3 │ │ ├── sample_drums.mp3 │ │ └── sample_rhodes.mp3 ├── audio.py ├── benchmark.py ├── cached_mel_spec.py ├── constants.py ├── conv.py ├── core.py ├── filters.py ├── gcn_1d.py ├── metadata.py ├── non_realtime_sqw.py ├── non_realtime_wrapper.py ├── parameter.py ├── py.typed ├── queues.py ├── realtime_stft.py ├── sandwich.py ├── sqw.py ├── tcn_1d.py ├── utils.py └── wavform_to_wavform.py ├── pyproject.toml └── testing ├── test_cached_mel_spec.py ├── test_conv.py ├── test_profiling.py ├── test_queues.py ├── test_sandwiches.py ├── test_sqw.py └── torchscript_test.py /.github/ISSUE_TEMPLATE/request-add-model.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request add model 3 | about: Template for adding a model to the default list in Neutone 4 | title: "[MODEL] " 5 | labels: enhancement 6 | assignees: bogdanteleaga, christhetree 7 | 8 | --- 9 | 10 | ## A brief description of what your model does 11 | 12 | 13 | 14 | ## Checklist 15 | - [ ] I have checked the model works properly loaded locally in the Neutone plugin on my machine. 16 | - [ ] I have uploaded the .nm model file at a publicly available location: . 17 | 18 | ## Extra information 19 | 20 | 21 | ## Metadata 22 | 23 | The model export function should dump a json file. Please paste the contents here for review and discussions. 24 | 25 | ``` 26 | ``` 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### macOS template 2 | # General 3 | .DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must end with two \r 8 | Icon 9 | 10 | # Thumbnails 11 | ._* 12 | 13 | # Files that might appear in the root of a volume 14 | .DocumentRevisions-V100 15 | .fseventsd 16 | .Spotlight-V100 17 | .TemporaryItems 18 | .Trashes 19 | .VolumeIcon.icns 20 | .com.apple.timemachine.donotpresent 21 | 22 | # Directories potentially created on remote AFP share 23 | .AppleDB 24 | .AppleDesktop 25 | Network Trash Folder 26 | Temporary Items 27 | .apdisk 28 | 29 | ### Python template 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | *$py.class 34 | 35 | # C extensions 36 | *.so 37 | 38 | # Distribution / packaging 39 | .Python 40 | build/ 41 | develop-eggs/ 42 | dist/ 43 | downloads/ 44 | eggs/ 45 | .eggs/ 46 | lib/ 47 | lib64/ 48 | parts/ 49 | sdist/ 50 | var/ 51 | wheels/ 52 | pip-wheel-metadata/ 53 | share/python-wheels/ 54 | *.egg-info/ 55 | .installed.cfg 56 | *.egg 57 | MANIFEST 58 | 59 | # PyInstaller 60 | # Usually these files are written by a python script from a template 61 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
62 | *.manifest 63 | *.spec 64 | 65 | # Installer logs 66 | pip-log.txt 67 | pip-delete-this-directory.txt 68 | 69 | # Unit test / coverage reports 70 | htmlcov/ 71 | .tox/ 72 | .nox/ 73 | .coverage 74 | .coverage.* 75 | .cache 76 | nosetests.xml 77 | coverage.xml 78 | *.cover 79 | *.py,cover 80 | .hypothesis/ 81 | .pytest_cache/ 82 | cover/ 83 | 84 | # Translations 85 | *.mo 86 | *.pot 87 | 88 | # Django stuff: 89 | *.log 90 | local_settings.py 91 | db.sqlite3 92 | db.sqlite3-journal 93 | 94 | # Flask stuff: 95 | instance/ 96 | .webassets-cache 97 | 98 | # Scrapy stuff: 99 | .scrapy 100 | 101 | # Sphinx documentation 102 | docs/_build/ 103 | 104 | # PyBuilder 105 | .pybuilder/ 106 | target/ 107 | 108 | # Jupyter Notebook 109 | .ipynb_checkpoints 110 | 111 | # IPython 112 | profile_default/ 113 | ipython_config.py 114 | 115 | # pyenv 116 | # For a library or package, you might want to ignore these files since the code is 117 | # intended to run in multiple environments; otherwise, check them in: 118 | .python-version 119 | 120 | # pipenv 121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 123 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 124 | # install all needed dependencies. 125 | #Pipfile.lock 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | ### CUDA template 171 | *.i 172 | *.ii 173 | *.gpu 174 | *.ptx 175 | *.cubin 176 | *.fatbin 177 | 178 | ### VirtualEnv template 179 | # Virtualenv 180 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 181 | [Bb]in 182 | [Ii]nclude 183 | [Ll]ib 184 | [Ll]ib64 185 | [Ll]ocal 186 | [Ss]cripts 187 | pyvenv.cfg 188 | pip-selfcheck.json 189 | 190 | ### JupyterNotebooks template 191 | # gitignore template for Jupyter Notebooks 192 | # website: http://jupyter.org/ 193 | 194 | */.ipynb_checkpoints/* 195 | 196 | # IPython 197 | 198 | # Remove previous ipynb_checkpoints 199 | # git rm -r .ipynb_checkpoints/ 200 | 201 | # User added 202 | .idea/ 203 | exports/ 204 | export_model/ 205 | models/ 206 | out/ 207 | scratch.py 208 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_clipper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.utils import save_neutone_model 13 | 14 | logging.basicConfig() 15 | log = logging.getLogger(__name__) 16 | 
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
17 | 
18 | 
19 | class ClipperModel(nn.Module):
20 |     def forward(
21 |         self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor
22 |     ) -> Tensor:
23 |         tr.neg(min_val, out=min_val)  # min_val <- -min_val (in-place, no allocation in the audio path)
24 |         tr.mul(gain, min_val, out=min_val)  # min_val <- -gain * min
25 |         tr.mul(gain, max_val, out=max_val)  # max_val <- gain * max
26 |         tr.clip(x, min=min_val, max=max_val, out=x)  # clamp x in place between the scaled thresholds
27 |         return x
28 | 
29 | 
30 | class ClipperModelWrapper(WaveformToWaveformBase):
31 |     def get_model_name(self) -> str:
32 |         return "clipper"
33 | 
34 |     def get_model_authors(self) -> List[str]:
35 |         return ["Andrew Fyfe"]
36 | 
37 |     def get_model_short_description(self) -> str:
38 |         return "Audio clipper."
39 | 
40 |     def get_model_long_description(self) -> str:
41 |         return "Clips the input audio between -1 and 1."
42 | 
43 |     def get_technical_description(self) -> str:
44 |         return "Clips the input audio between -1 and 1."
45 | 
46 |     def get_technical_links(self) -> Dict[str, str]:
47 |         return {
48 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_fx/example_clipper.py"
49 |         }
50 | 
51 |     def get_tags(self) -> List[str]:
52 |         return ["clipper"]
53 | 
54 |     def get_model_version(self) -> str:
55 |         return "1.0.0"
56 | 
57 |     def is_experimental(self) -> bool:
58 |         return False
59 | 
60 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
61 |         return [
62 |             ContinuousNeutoneParameter("min", "min clip threshold", default_value=0.15),
63 |             ContinuousNeutoneParameter("max", "max clip threshold", default_value=0.15),
64 |             ContinuousNeutoneParameter("gain", "scale clip threshold", default_value=1.0),
65 |         ]
66 | 
67 |     @tr.jit.export
68 |     def is_input_mono(self) -> bool:
69 |         return False
70 | 
71 |     @tr.jit.export
72 |     def is_output_mono(self) -> bool:
73 |         return False
74 | 
75 |     @tr.jit.export
76 |     def get_native_sample_rates(self) -> List[int]:
77 |         return []  # Supports all sample rates
78 | 
79 |     @tr.jit.export
80 |     def get_native_buffer_sizes(self) -> List[int]:
81 |         return []  # Supports all buffer sizes
82 | 
83 |     def aggregate_params(self, params: Tensor) -> Tensor:
84 |         return params  # We want sample-level control, so no aggregation
85 | 
86 |     def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
87 |         min_val, max_val, gain = params["min"], params["max"], params["gain"]
88 |         x = self.model.forward(x, min_val, max_val, gain)
89 |         return x
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     parser = ArgumentParser()
94 |     parser.add_argument("-o", "--output", default="export_model")
95 |     args = parser.parse_args()
96 |     root_dir = pathlib.Path(args.output)
97 | 
98 |     model = ClipperModel()
99 |     wrapper = ClipperModelWrapper(model)
100 |     save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True)
101 | 
--------------------------------------------------------------------------------
/examples/neutone_fx/example_clipper_prefilter.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | from argparse import ArgumentParser
3 | from typing import Dict, List
4 | 
5 | import torch as tr
6 | import torch.nn as nn
7 | from torch import Tensor
8 | 
9 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter
10 | from neutone_sdk.filters import FIRFilter, FilterType
11 | from neutone_sdk.utils import save_neutone_model
12 | 
13 | """
14 | Example wrapper script for prefilter + models with variable sample rate.
15 | WaveformToWaveformBase.set_model_sample_rate_and_buffer_size() is used
16 | to change the sample rate of the filter to the sample rate actually in use.
17 | """
18 | 
19 | 
20 | class ClipperModel(nn.Module):
21 |     def forward(
22 |         self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor
23 |     ) -> Tensor:
24 |         tr.neg(min_val, out=min_val)
25 |         tr.mul(gain, min_val, out=min_val)
26 |         tr.mul(gain, max_val, out=max_val)
27 |         tr.clip(x, min=min_val, max=max_val, out=x)
28 |         return x
29 | 
30 | 
31 | class ClipperModelWrapper(WaveformToWaveformBase):
32 |     def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None:
33 |         super().__init__(model, use_debug_mode)
34 |         # filter to be applied before the model
35 |         self.pre_filter = FIRFilter(FilterType.LOWPASS, cutoffs=[1000.0], filt_size=257)
36 | 
37 |     def get_model_name(self) -> str:
38 |         return "clipper"
39 | 
40 |     def get_model_authors(self) -> List[str]:
41 |         return ["Andrew Fyfe"]
42 | 
43 |     def get_model_short_description(self) -> str:
44 |         return "Audio clipper."
45 | 
46 |     def get_model_long_description(self) -> str:
47 |         return "Clips the input audio between -1 and 1."
48 | 
49 |     def get_technical_description(self) -> str:
50 |         return "Clips the input audio between -1 and 1."
51 | 
52 |     def get_technical_links(self) -> Dict[str, str]:
53 |         return {
54 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_fx/example_clipper_prefilter.py"
55 |         }
56 | 
57 |     def get_tags(self) -> List[str]:
58 |         return ["clipper"]
59 | 
60 |     def get_model_version(self) -> str:
61 |         return "1.0.0"
62 | 
63 |     def is_experimental(self) -> bool:
64 |         return False
65 | 
66 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
67 |         return [
68 |             ContinuousNeutoneParameter("min", "min clip threshold", default_value=0.15),
69 |             ContinuousNeutoneParameter("max", "max clip threshold", default_value=0.15),
70 |             ContinuousNeutoneParameter("gain", "scale clip threshold", default_value=1.0),
71 |         ]
72 | 
73 |     @tr.jit.export
74 |     def is_input_mono(self) -> bool:
75 |         return False
76 | 
77 |     @tr.jit.export
78 |     def is_output_mono(self) -> bool:
79 |         return False
80 | 
81 |     def calc_model_delay_samples(self) -> int:
82 |         # model latency should also be added if the model is non-causal
83 |         return self.pre_filter.delay
84 | 
85 |     def set_model_sample_rate_and_buffer_size(
86 |         self, sample_rate: int, n_samples: int
87 |     ) -> bool:
88 |         # While the clipper works at any sample rate, the prefilter's sample rate must be updated
89 |         self.pre_filter.set_parameters(sample_rate=sample_rate)
90 |         return True
91 | 
92 |     @tr.jit.export
93 |     def get_native_sample_rates(self) -> List[int]:
94 |         return []  # Supports all sample rates
95 | 
96 |     @tr.jit.export
97 |     def get_native_buffer_sizes(self) -> List[int]:
98 |         return []  # Supports all buffer sizes
99 | 
100 |     def aggregate_params(self, params: Tensor) -> Tensor:
101 |         return params  # We want sample-level control, so no aggregation
102 | 
103 |     def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
104 |         min_val, max_val, gain = params["min"], params["max"], params["gain"]
105 |         # Apply pre-filter
106 |         x = self.pre_filter(x)
107 |         x = self.model.forward(x, min_val, max_val, gain)
108 |         return x
109 | 
110 | 
111 | if __name__ == "__main__":
112 |     parser = ArgumentParser()
113 |     parser.add_argument("-o", "--output", default="export_model")
114 |     args = parser.parse_args()
115 |     root_dir = pathlib.Path(args.output)
116 |     model = ClipperModel()
117 |     wrapper = ClipperModelWrapper(model)
118 |     save_neutone_model(wrapper, root_dir,
dump_samples=True, submission=True) 119 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_delayed_passthrough.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter 12 | from neutone_sdk.utils import save_neutone_model 13 | 14 | logging.basicConfig() 15 | log = logging.getLogger(__name__) 16 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 17 | 18 | 19 | class DelayedPassthroughModel(nn.Module): 20 | def __init__(self, delay_n_samples: int, in_ch: int = 2) -> None: 21 | super().__init__() 22 | self.delay_n_samples = delay_n_samples 23 | self.delay_buf = tr.zeros((in_ch, delay_n_samples)) 24 | 25 | def forward(self, x: Tensor) -> Tensor: 26 | x = tr.cat([self.delay_buf, x], dim=-1) 27 | self.delay_buf[:, :] = x[:, -self.delay_n_samples :] 28 | x = x[:, : -self.delay_n_samples] 29 | return x 30 | 31 | 32 | class DelayedPassthroughModelWrapper(WaveformToWaveformBase): 33 | def get_model_name(self) -> str: 34 | return "delayed.passthrough" 35 | 36 | def get_model_authors(self) -> List[str]: 37 | return ["Christopher Mitcheltree"] 38 | 39 | def get_model_short_description(self) -> str: 40 | return "Delayed passthrough model." 41 | 42 | def get_model_long_description(self) -> str: 43 | return "Delays the input audio by some number of samples. Should be tested with 50/50 dry/wet." 44 | 45 | def get_technical_description(self) -> str: 46 | return "Delays the input audio by some number of samples. Should be tested with 50/50 dry/wet." 
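    # Editor's note (added, hedged sketch): a minimal offline sanity check of the
    # delay logic above, assuming a (2, 2048) stereo input buffer. On the first
    # call the output should be 500 samples of silence followed by the start of
    # the input:
    #
    #   model = DelayedPassthroughModel(delay_n_samples=500)
    #   x = tr.rand(2, 2048)
    #   y = model(x)
    #   assert tr.all(y[:, :500] == 0)
    #   assert tr.equal(y[:, 500:], x[:, : 2048 - 500])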
47 | 48 | def get_technical_links(self) -> Dict[str, str]: 49 | return {} 50 | 51 | def get_tags(self) -> List[str]: 52 | return [] 53 | 54 | def get_model_version(self) -> str: 55 | return "1.0.0" 56 | 57 | def is_experimental(self) -> bool: 58 | return True 59 | 60 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 61 | return [] 62 | 63 | @tr.jit.export 64 | def is_input_mono(self) -> bool: 65 | return False 66 | 67 | @tr.jit.export 68 | def is_output_mono(self) -> bool: 69 | return False 70 | 71 | @tr.jit.export 72 | def get_native_sample_rates(self) -> List[int]: 73 | return [44100] # Change this to test different scenarios 74 | 75 | @tr.jit.export 76 | def get_native_buffer_sizes(self) -> List[int]: 77 | return [2048] # Change this to test different scenarios 78 | 79 | @tr.jit.export 80 | def reset_model(self) -> bool: 81 | self.model.delay_buf.fill_(0) 82 | return True 83 | 84 | @tr.jit.export 85 | def calc_model_delay_samples(self) -> int: 86 | return self.model.delay_n_samples 87 | 88 | @tr.jit.export 89 | def get_wet_default_value(self) -> float: 90 | return 0.5 91 | 92 | @tr.jit.export 93 | def get_dry_default_value(self) -> float: 94 | return 0.5 95 | 96 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 97 | x = self.model.forward(x) 98 | return x 99 | 100 | 101 | if __name__ == "__main__": 102 | parser = ArgumentParser() 103 | parser.add_argument("-o", "--output", default="export_model") 104 | args = parser.parse_args() 105 | root_dir = pathlib.Path(args.output) 106 | 107 | model = DelayedPassthroughModel( 108 | delay_n_samples=500 109 | ) # Change delay_n_samples to test different scenarios 110 | wrapper = DelayedPassthroughModelWrapper(model) 111 | save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True) 112 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_overdrive-random.py: -------------------------------------------------------------------------------- 1 | # This code is based on the following repository written by Christian J. 
Steinmetz 2 | # https://github.com/csteinmetz1/micro-tcn 3 | import logging 4 | import os 5 | from argparse import ArgumentParser 6 | from pathlib import Path 7 | from typing import Dict, List 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torch import Tensor 12 | 13 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 14 | from neutone_sdk.tcn_1d import FiLM 15 | from neutone_sdk.utils import save_neutone_model 16 | 17 | logging.basicConfig() 18 | log = logging.getLogger(__name__) 19 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 20 | 21 | 22 | # TODO(christhetree): integrate this into tcn_1d.py 23 | class PaddingCached(nn.Module): # to maintain signal continuity over sample windows 24 | def __init__(self, padding: int, channels: int) -> None: 25 | super().__init__() 26 | self.padding = padding 27 | self.channels = channels 28 | pad = torch.zeros(1, self.channels, self.padding) 29 | self.register_buffer("pad", pad) 30 | 31 | def forward(self, x: Tensor) -> Tensor: 32 | padded_x = torch.cat([self.pad, x], -1) # concat input signal to the cache 33 | self.pad = padded_x[..., -self.padding :] # discard old cache 34 | return padded_x 35 | 36 | 37 | # TODO(christhetree): integrate this into tcn_1d.py 38 | class Conv1dCached(nn.Module): # Conv1d with cache 39 | def __init__( 40 | self, 41 | in_chan: int, 42 | out_chan: int, 43 | kernel: int, 44 | stride: int, 45 | padding: int, 46 | dilation: int = 1, 47 | weight_norm: bool = False, 48 | bias: bool = False, 49 | ) -> None: 50 | super().__init__() 51 | self.pad = PaddingCached(padding * 2, in_chan) 52 | self.conv = nn.Conv1d( 53 | in_chan, out_chan, kernel, stride, dilation=dilation, bias=bias 54 | ) 55 | nn.init.normal_(self.conv.weight) # random initialization 56 | if weight_norm: 57 | self.conv = nn.utils.weight_norm(self.conv) 58 | 59 | def forward(self, x: Tensor) -> Tensor: 60 | x = self.pad(x) # get (cached input + current input) 61 | x = self.conv(x) 62 | return x 63 | 64 | 65 | # TODO(christhetree): integrate this into tcn_1d.py 66 | class TCNBlock(nn.Module): 67 | def __init__( 68 | self, 69 | in_ch: int, 70 | out_ch: int, 71 | kernel_size: int = 3, 72 | dilation: int = 1, 73 | cond_dim: int = 32, 74 | ) -> None: 75 | super(TCNBlock, self).__init__() 76 | self.in_ch = in_ch 77 | self.out_ch = out_ch 78 | self.kernel_size = kernel_size 79 | padding = kernel_size // 2 * dilation 80 | self.conv1 = Conv1dCached( 81 | in_ch, 82 | out_ch, 83 | kernel=kernel_size, 84 | stride=1, 85 | padding=padding, 86 | dilation=dilation, 87 | bias=True, 88 | ) 89 | self.res = nn.Conv1d( 90 | in_ch, out_ch, kernel_size=1, groups=1, bias=False 91 | ) # residual connection 92 | self.bn = nn.BatchNorm1d(out_ch) 93 | self.film = FiLM(out_ch, cond_dim) 94 | self.relu = nn.PReLU(out_ch) 95 | 96 | def forward(self, x: Tensor, p: Tensor) -> Tensor: 97 | x_in = x 98 | x = self.conv1(x) 99 | x = self.film(x, p) 100 | x = self.bn(x) 101 | x = self.relu(x) 102 | 103 | # residual 104 | x_res = self.res(x_in) 105 | start = (x_res.shape[-1] - x.shape[-1]) // 2 106 | stop = start + x.shape[-1] 107 | x = x + x_res[..., start:stop] 108 | return x 109 | 110 | 111 | class OverdriveModel(nn.Module): 112 | def __init__( 113 | self, 114 | ninputs: int = 1, 115 | noutputs: int = 1, 116 | nblocks: int = 4, 117 | channel_growth: int = 0, 118 | channel_width: int = 32, 119 | kernel_size: int = 13, 120 | dilation_growth: int = 2, 121 | ncondition: int = 2, 122 | ) -> None: 123 | super().__init__() 124 | 125 | # MLP layers 
for conditioning 126 | self.ncondition = ncondition 127 | self.condition = torch.nn.Sequential( 128 | torch.nn.Linear(ncondition, 16), 129 | torch.nn.ReLU(), 130 | torch.nn.Linear(16, 32), 131 | torch.nn.ReLU(), 132 | torch.nn.Linear(32, 32), # cond_dim = 32 133 | torch.nn.ReLU(), 134 | ) 135 | 136 | # main model 137 | self.blocks = torch.nn.ModuleList() 138 | for n in range(nblocks): 139 | in_ch = out_ch if n > 0 else ninputs 140 | out_ch = in_ch * channel_growth if channel_growth > 1 else channel_width 141 | dilation = dilation_growth**n 142 | self.blocks.append( 143 | TCNBlock(in_ch, out_ch, kernel_size, dilation, cond_dim=32) 144 | ) 145 | self.output = nn.Conv1d(out_ch, noutputs, kernel_size=1) 146 | 147 | # random initialization 148 | self.initialize_random() 149 | 150 | def forward(self, x: Tensor, c: Tensor) -> Tensor: 151 | p = self.condition(c) # conditioning 152 | for _, block in enumerate(self.blocks): 153 | x = block(x, p) 154 | y = torch.tanh(self.output(x)) # clipping 155 | return y 156 | 157 | def weights_init(self, m: nn.Module) -> None: 158 | classname = m.__class__.__name__ 159 | if classname == "Linear": 160 | nn.init.normal_(m.weight, 0, 0.40) 161 | 162 | def initialize_random(self) -> None: 163 | for n in self.blocks: 164 | nn.init.normal_(n.conv1.conv.weight, 0, 0.7) 165 | # nn.init.normal_(self.output.weight, 0, 0.25) 166 | self.condition.apply(self.weights_init) 167 | 168 | 169 | class OverdriveModelWrapper(WaveformToWaveformBase): 170 | def get_model_name(self) -> str: 171 | return "conv1d-overdrive.random" 172 | 173 | def get_model_authors(self) -> List[str]: 174 | return ["Nao Tokui"] 175 | 176 | def get_model_short_description(self) -> str: 177 | return "Neural distortion/overdrive effect" 178 | 179 | def get_model_long_description(self) -> str: 180 | return "Neural distortion/overdrive effect through randomly initialized Convolutional Neural Network" 181 | 182 | def get_technical_description(self) -> str: 183 | return "Random distortion/overdrive effect through randomly initialized Temporal-1D-convolution layers. Based on the idea proposed by Steinmetz et al." 184 | 185 | def get_tags(self) -> List[str]: 186 | return ["distortion", "overdrive"] 187 | 188 | def get_model_version(self) -> str: 189 | return "1.0.0" 190 | 191 | def is_experimental(self) -> bool: 192 | return False 193 | 194 | def get_technical_links(self) -> Dict[str, str]: 195 | return { 196 | "Paper": "https://arxiv.org/abs/2010.04237", 197 | "Code": "https://github.com/csteinmetz1/micro-tcn", 198 | } 199 | 200 | def get_citation(self) -> str: 201 | return "Steinmetz, C. J., & Reiss, J. D. (2020). Randomized overdrive neural networks. arXiv preprint arXiv:2010.04237." 
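    # Editor's note (added, hedged sketch): how the wrapper below drives the model.
    # The two macro controls are stacked into a (1, ncondition) tensor and scaled
    # by "depth" before the conditioning MLP turns them into the 32-dim FiLM
    # vector shared by every TCNBlock:
    #
    #   model = OverdriveModel()
    #   x = torch.rand(1, 1, 2048)      # (batch, channels, samples)
    #   c = torch.tensor([[0.5, 0.5]])  # (1, ncondition=2)
    #   y = model(x, c)                 # -> (1, 1, 2048), tanh-limited output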
202 | 203 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 204 | return [ 205 | ContinuousNeutoneParameter("depth", "Effect Depth", 0.0), 206 | ContinuousNeutoneParameter("P1", "Feature modulation 1", 0.0), 207 | ContinuousNeutoneParameter("P2", "Feature modulation 2", 0.0), 208 | ] 209 | 210 | @torch.jit.export 211 | def is_input_mono(self) -> bool: 212 | return False 213 | 214 | @torch.jit.export 215 | def is_output_mono(self) -> bool: 216 | return False 217 | 218 | @torch.jit.export 219 | def get_native_sample_rates(self) -> List[int]: 220 | return [] # Supports all sample rates 221 | 222 | @torch.jit.export 223 | def get_native_buffer_sizes(self) -> List[int]: 224 | return [] # Supports all buffer sizes 225 | 226 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 227 | # conditioning for FiLM layer 228 | p1 = params["P1"] 229 | p2 = params["P2"] 230 | depth = params["depth"] 231 | condition = torch.hstack([p1, p2]).reshape((1, -1)) * depth 232 | 233 | # main process 234 | for ch in range(x.shape[0]): # process channel by channel 235 | x_ = x[ch].reshape(1, 1, -1) 236 | x_ = self.model(x_, condition) 237 | x[ch] = x_.squeeze() 238 | return x 239 | 240 | 241 | if __name__ == "__main__": 242 | parser = ArgumentParser() 243 | parser.add_argument("-o", "--output", default="export_model") 244 | args = parser.parse_args() 245 | root_dir = Path(args.output) 246 | 247 | model = OverdriveModel() 248 | wrapper = OverdriveModelWrapper(model) 249 | metadata = wrapper.to_metadata() 250 | save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True) 251 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_rave.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import torch 8 | import torchaudio 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.audio import ( 13 | AudioSample, 14 | AudioSamplePair, 15 | render_audio_sample, 16 | ) 17 | from neutone_sdk.utils import save_neutone_model 18 | 19 | logging.basicConfig() 20 | log = logging.getLogger(__name__) 21 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 22 | 23 | 24 | class RAVEModelWrapper(WaveformToWaveformBase): 25 | def get_model_name(self) -> str: 26 | return "RAVE.example" # <-EDIT THIS 27 | 28 | def get_model_authors(self) -> List[str]: 29 | return ["Author Name"] # <-EDIT THIS 30 | 31 | def get_model_short_description(self) -> str: 32 | return "RAVE model trained on xxx sounds." # <-EDIT THIS 33 | 34 | def get_model_long_description(self) -> str: 35 | return ( # <-EDIT THIS 36 | "RAVE timbre transfer model trained on xxx sounds. Useful for xxx sounds." 37 | ) 38 | 39 | def get_technical_description(self) -> str: 40 | return "RAVE model proposed by Caillon, Antoine et al." 
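    # Editor's note (added, hedged): a worked example of the parameter mapping in
    # do_forward_pass below, assuming a hypothetical latent_size of 16:
    #   "Z edit index" = 0.5 -> idx_z    = int(0.5 * 16) = 8    (9th latent dim)
    #   "Z scale"      = 0.5 -> z_scale  = 0.5 * 2       = 1.0  (identity scale)
    #   "Z offset"     = 0.5 -> z_offset = 0.5 * 2 - 1   = 0.0  (no offset)
    # so the default Z scale / Z offset of 0.5 leave the chosen dimension unchanged.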
41 | 42 | def get_technical_links(self) -> Dict[str, str]: 43 | return { 44 | "Paper": "https://arxiv.org/abs/2111.05011", 45 | "Code": "https://github.com/acids-ircam/RAVE", 46 | } 47 | 48 | def get_tags(self) -> List[str]: 49 | return ["timbre transfer", "RAVE"] 50 | 51 | def get_model_version(self) -> str: 52 | return "1.0.0" 53 | 54 | def is_experimental(self) -> bool: 55 | """ 56 | set to True for models in experimental stage 57 | (status shown on the website) 58 | """ 59 | return True # <-EDIT THIS 60 | 61 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 62 | return [ 63 | ContinuousNeutoneParameter( 64 | name="Chaos", description="Magnitude of latent noise", default_value=0.0 65 | ), 66 | ContinuousNeutoneParameter( 67 | name="Z edit index", 68 | description="Index of latent dimension to edit", 69 | default_value=0.0, 70 | ), 71 | ContinuousNeutoneParameter( 72 | name="Z scale", 73 | description="Scale of latent variable", 74 | default_value=0.5, 75 | ), 76 | ContinuousNeutoneParameter( 77 | name="Z offset", 78 | description="Offset of latent variable", 79 | default_value=0.5, 80 | ), 81 | ] 82 | 83 | def is_input_mono(self) -> bool: 84 | return True # <-Set to False for stereo (each channel processed separately) 85 | 86 | def is_output_mono(self) -> bool: 87 | return True # <-Set to False for stereo (each channel processed separately) 88 | 89 | def get_native_sample_rates(self) -> List[int]: 90 | return [48000] # <-EDIT THIS 91 | 92 | def get_native_buffer_sizes(self) -> List[int]: 93 | return [2048] 94 | 95 | def get_citation(self) -> str: 96 | return """Caillon, A., & Esling, P. (2021). RAVE: A variational autoencoder for fast and high-quality neural audio synthesis. arXiv preprint arXiv:2111.05011.""" 97 | 98 | def calc_model_delay_samples(self) -> int: 99 | return 2048 100 | 101 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 102 | # parameters edit the latent variable 103 | z = self.model.encode(x.unsqueeze(1)) 104 | noise_amp = params["Chaos"] 105 | z = torch.randn_like(z) * noise_amp + z 106 | # add offset / scale 107 | idx_z = int( 108 | torch.clamp(params["Z edit index"], min=0.0, max=0.99) 109 | * self.model.latent_size 110 | ) 111 | z_scale = params["Z scale"] * 2 # 0~1 -> 0~2 112 | z_offset = params["Z offset"] * 2 - 1 # 0~1 -> -1~1 113 | z[:, idx_z] = z[:, idx_z] * z_scale + z_offset 114 | out = self.model.decode(z) 115 | out = out.squeeze(1) 116 | return out # (n_channels=1, sample_size) 117 | 118 | 119 | if __name__ == "__main__": 120 | parser = ArgumentParser() 121 | parser.add_argument( 122 | "-i", 123 | "--input", 124 | default="./models/rave/rave_cached.ts", 125 | help="exported RAVE torchscript file", 126 | ) 127 | parser.add_argument("-o", "--output", default="ravemodel", help="model output name") 128 | parser.add_argument("-f", "--folder", default="./exports", help="output folder") 129 | parser.add_argument( 130 | "-s", 131 | "--sounds", 132 | nargs="*", 133 | type=str, 134 | default=None, 135 | help="directory of sounds to use as example input.", 136 | ) 137 | args = parser.parse_args() 138 | root_dir = Path(args.folder) / args.output 139 | 140 | # wrap it 141 | model = torch.jit.load(args.input) 142 | wrapper = RAVEModelWrapper(model) 143 | 144 | soundpairs = None 145 | if args.sounds is not None: 146 | soundpairs = [] 147 | for sound in args.sounds: 148 | wave, sr = torchaudio.load(sound) 149 | input_sample = AudioSample(wave, sr) 150 | rendered_sample = render_audio_sample(wrapper, input_sample) 151 | 
soundpairs.append(AudioSamplePair(input_sample, rendered_sample)) 152 | 153 | save_neutone_model( 154 | wrapper, 155 | root_dir, 156 | freeze=False, 157 | dump_samples=True, 158 | submission=True, 159 | audio_sample_pairs=soundpairs, 160 | ) 161 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_rave_prefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import torch 8 | import torchaudio 9 | from torch import Tensor, nn 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.audio import ( 13 | AudioSample, 14 | AudioSamplePair, 15 | render_audio_sample, 16 | ) 17 | from neutone_sdk.filters import FIRFilter, FilterType 18 | from neutone_sdk.utils import save_neutone_model 19 | 20 | logging.basicConfig() 21 | log = logging.getLogger(__name__) 22 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | 25 | class FilteredRAVEModelWrapper(WaveformToWaveformBase): 26 | def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None: 27 | super().__init__(model, use_debug_mode) 28 | # filter to be applied before model 29 | # cut below 500 and above 4000 Hz 30 | self.pre_filter = FIRFilter( 31 | FilterType.BANDPASS, cutoffs=[500.0, 4000.0], filt_size=257 32 | ) 33 | 34 | def get_model_name(self) -> str: 35 | return "RAVE.example" # <-EDIT THIS 36 | 37 | def get_model_authors(self) -> List[str]: 38 | return ["Author Name"] # <-EDIT THIS 39 | 40 | def get_model_short_description(self) -> str: 41 | return "RAVE model trained on xxx sounds." # <-EDIT THIS 42 | 43 | def get_model_long_description(self) -> str: 44 | return ( # <-EDIT THIS 45 | "RAVE timbre transfer model trained on xxx sounds. Useful for xxx sounds." 46 | ) 47 | 48 | def get_technical_description(self) -> str: 49 | return "RAVE model proposed by Caillon, Antoine et al." 
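    # Editor's note (added, hedged): a linear-phase FIR filter with N taps has a
    # group delay of (N - 1) / 2 samples, so the 257-tap bandpass above should
    # report a delay of 128 samples via pre_filter.delay. calc_model_delay_samples
    # below adds this to the model's own 2048-sample latency so the host can
    # compensate for the total delay.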
50 | 51 | def get_technical_links(self) -> Dict[str, str]: 52 | return { 53 | "Paper": "https://arxiv.org/abs/2111.05011", 54 | "Code": "https://github.com/acids-ircam/RAVE", 55 | } 56 | 57 | def get_tags(self) -> List[str]: 58 | return ["timbre transfer", "RAVE"] 59 | 60 | def get_model_version(self) -> str: 61 | return "1.0.0" 62 | 63 | def is_experimental(self) -> bool: 64 | """ 65 | set to True for models in experimental stage 66 | (status shown on the website) 67 | """ 68 | return True # <-EDIT THIS 69 | 70 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 71 | return [ 72 | ContinuousNeutoneParameter( 73 | name="Chaos", description="Magnitude of latent noise", default_value=0.0 74 | ), 75 | ContinuousNeutoneParameter( 76 | name="Z edit index", 77 | description="Index of latent dimension to edit", 78 | default_value=0.0, 79 | ), 80 | ContinuousNeutoneParameter( 81 | name="Z scale", 82 | description="Scale of latent variable", 83 | default_value=0.5, 84 | ), 85 | ContinuousNeutoneParameter( 86 | name="Z offset", 87 | description="Offset of latent variable", 88 | default_value=0.5, 89 | ), 90 | ] 91 | 92 | def is_input_mono(self) -> bool: 93 | return True # <-Set to False for stereo (each channel processed separately) 94 | 95 | def is_output_mono(self) -> bool: 96 | return True # <-Set to False for stereo (each channel processed separately) 97 | 98 | def get_native_sample_rates(self) -> List[int]: 99 | return [48000] # <-Set to model sr during training 100 | 101 | def get_native_buffer_sizes(self) -> List[int]: 102 | return [2048] 103 | 104 | def calc_model_delay_samples(self) -> int: 105 | # model latency should also be added if non-causal 106 | return self.pre_filter.delay + 2048 107 | 108 | def set_model_sample_rate_and_buffer_size( 109 | self, sample_rate: int, n_samples: int 110 | ) -> bool: 111 | # Set prefilter samplerate to current sample rate 112 | self.pre_filter.set_parameters(sample_rate=sample_rate) 113 | return True 114 | 115 | def get_citation(self) -> str: 116 | return """Caillon, A., & Esling, P. (2021). RAVE: A variational autoencoder for fast and high-quality neural audio synthesis. 
arXiv preprint arXiv:2111.05011.""" 117 | 118 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 119 | # Apply pre-filter 120 | x = self.pre_filter(x) 121 | # parameters edit the latent variable 122 | z = self.model.encode(x.unsqueeze(1)) 123 | noise_amp = params["Chaos"] * 2 124 | z = torch.randn_like(z) * noise_amp + z 125 | # add offset / scale 126 | idx_z = int( 127 | torch.clamp(params["Z edit index"], min=0.0, max=0.99) 128 | * self.model.latent_size 129 | ) 130 | z_scale = params["Z scale"] * 2 # 0~1 -> 0~2 131 | z_offset = params["Z offset"] * 2 - 1 # 0~1 -> -1~1 132 | z[:, idx_z] = z[:, idx_z] * z_scale + z_offset 133 | out = self.model.decode(z) 134 | out = out.squeeze(1) 135 | return out 136 | 137 | 138 | if __name__ == "__main__": 139 | parser = ArgumentParser() 140 | parser.add_argument( 141 | "-i", 142 | "--input", 143 | default="./models/rave/rave_cached.ts", 144 | help="exported RAVE torchscript file", 145 | ) 146 | parser.add_argument("-o", "--output", default="ravemodel", help="model output name") 147 | parser.add_argument("-f", "--folder", default="./exports", help="output folder") 148 | parser.add_argument( 149 | "-s", 150 | "--sounds", 151 | nargs="*", 152 | type=str, 153 | default=None, 154 | help="directory of sounds to use as example input.", 155 | ) 156 | args = parser.parse_args() 157 | root_dir = Path(args.folder) / args.output 158 | 159 | # wrap it 160 | model = torch.jit.load(args.input) 161 | wrapper = FilteredRAVEModelWrapper(model) 162 | 163 | soundpairs = None 164 | if args.sounds is not None: 165 | soundpairs = [] 166 | for sound in args.sounds: 167 | wave, sr = torchaudio.load(sound) 168 | input_sample = AudioSample(wave, sr) 169 | rendered_sample = render_audio_sample(wrapper, input_sample) 170 | soundpairs.append(AudioSamplePair(input_sample, rendered_sample)) 171 | 172 | save_neutone_model( 173 | wrapper, 174 | root_dir, 175 | freeze=False, 176 | dump_samples=True, 177 | submission=True, 178 | audio_sample_pairs=soundpairs, 179 | ) 180 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_rave_v1_prefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import torch 8 | import torchaudio 9 | from torch import Tensor, nn 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.audio import ( 13 | AudioSample, 14 | AudioSamplePair, 15 | render_audio_sample, 16 | ) 17 | from neutone_sdk.filters import FIRFilter, FilterType 18 | from neutone_sdk.utils import save_neutone_model 19 | 20 | logging.basicConfig() 21 | log = logging.getLogger(__name__) 22 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 23 | 24 | 25 | class FilteredRAVEv1ModelWrapper(WaveformToWaveformBase): 26 | def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None: 27 | super().__init__(model, use_debug_mode) 28 | self.pre_filter = FIRFilter( 29 | FilterType.BANDPASS, cutoffs=[500.0, 4000.0], filt_size=257 30 | ) 31 | 32 | def get_model_name(self) -> str: 33 | return "RAVE.example" 34 | 35 | def get_model_authors(self) -> List[str]: 36 | return ["Author Name"] 37 | 38 | def get_model_short_description(self) -> str: 39 | return "stereo RAVE model trained on ..." 
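    # Editor's note (added, hedged): unlike the v2 example, this v1 wrapper calls
    # encode_amortized() in do_forward_pass below, which returns a posterior mean
    # and std. "Chaos" scales noise by z_std, and a single noise vector per latent
    # dimension is broadcast across all time frames, giving a static rather than
    # per-frame perturbation of the latent trajectory.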
40 | 41 | def get_model_long_description(self) -> str: 42 | return ( # <-EDIT THIS 43 | "RAVE timbre transfer model trained on xxx sounds. Useful for xxx sounds." 44 | ) 45 | 46 | def get_technical_description(self) -> str: 47 | return "RAVE model proposed by Caillon, Antoine et al." 48 | 49 | def get_technical_links(self) -> Dict[str, str]: 50 | return { 51 | "Paper": "https://arxiv.org/abs/2111.05011", 52 | "Code": "https://github.com/acids-ircam/RAVE", 53 | } 54 | 55 | def get_tags(self) -> List[str]: 56 | return ["timbre transfer", "RAVE"] 57 | 58 | def get_model_version(self) -> str: 59 | return "1.0.0" 60 | 61 | def is_experimental(self) -> bool: 62 | """ 63 | set to True for models in experimental stage 64 | (status shown on the website) 65 | """ 66 | return False 67 | 68 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 69 | return [ 70 | ContinuousNeutoneParameter( 71 | name="Chaos", 72 | description="Magnitude of latent noise", 73 | default_value=0.0, 74 | ), 75 | ContinuousNeutoneParameter( 76 | name="Z edit index", 77 | description="Index of latent dimension to edit", 78 | default_value=0.0, 79 | ), 80 | ContinuousNeutoneParameter( 81 | name="Z scale", 82 | description="Scale of latent variable", 83 | default_value=0.5, 84 | ), 85 | ContinuousNeutoneParameter( 86 | name="Z offset", 87 | description="Offset of latent variable", 88 | default_value=0.5, 89 | ), 90 | ] 91 | 92 | def is_input_mono(self) -> bool: 93 | return True # <-Set to False for stereo (each channel processed separately) 94 | 95 | def is_output_mono(self) -> bool: 96 | return True # <-Set to False for stereo (each channel processed separately) 97 | 98 | def get_native_sample_rates(self) -> List[int]: 99 | return [48000] # <-Set to model sr during training 100 | 101 | def get_native_buffer_sizes(self) -> List[int]: 102 | return [2048] 103 | 104 | def calc_model_delay_samples(self) -> int: 105 | # model latency should also be added if non-causal 106 | return self.pre_filter.delay 107 | 108 | def set_model_sample_rate_and_buffer_size( 109 | self, sample_rate: int, n_samples: int 110 | ) -> bool: 111 | # Set prefilter samplerate to current sample rate 112 | self.pre_filter.set_parameters(sample_rate=sample_rate) 113 | return True 114 | 115 | def get_citation(self) -> str: 116 | return """Caillon, A., & Esling, P. (2021). RAVE: A variational autoencoder for fast and high-quality neural audio synthesis. 
arXiv preprint arXiv:2111.05011.""" 117 | 118 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 119 | # Apply pre-filter 120 | x = self.pre_filter(x) 121 | ## parameters edit the latent variable 122 | z_mean, z_std = self.model.encode_amortized(x.unsqueeze(1)) 123 | noise_amp = z_std * params["Chaos"] * 4 124 | batch, latent_dim, time = z_std.shape 125 | z = ( 126 | torch.randn(1, latent_dim, 1, device=z_std.device).expand(batch, -1, time) 127 | * noise_amp 128 | + z_mean 129 | ) 130 | # add offset / scale 131 | idx_z = int( 132 | torch.clamp(params["Z edit index"], min=0.0, max=0.99) 133 | * self.model.cropped_latent_size 134 | ) 135 | z_scale = params["Z scale"] * 2 # 0~1 -> 0~2 136 | z_offset = params["Z offset"] * 2 - 1 # 0~1 -> -1~1 137 | z[:, idx_z] = z[:, idx_z] * z_scale + z_offset 138 | out = self.model.decode(z) 139 | out = out.squeeze(1) 140 | return out 141 | 142 | 143 | if __name__ == "__main__": 144 | parser = ArgumentParser() 145 | parser.add_argument( 146 | "-i", 147 | "--input", 148 | default="./models/rave/rave_cached.ts", 149 | help="exported RAVE torchscript file", 150 | ) 151 | parser.add_argument("-o", "--output", default="ravemodel", help="model output name") 152 | parser.add_argument("-f", "--folder", default="./exports", help="output folder") 153 | parser.add_argument( 154 | "-s", 155 | "--sounds", 156 | nargs="*", 157 | type=str, 158 | default=None, 159 | help="directory of sounds to use as example input.", 160 | ) 161 | args = parser.parse_args() 162 | root_dir = Path(args.folder) / args.output 163 | 164 | # wrap it 165 | model = torch.jit.load(args.input) 166 | wrapper = FilteredRAVEv1ModelWrapper(model) 167 | 168 | soundpairs = None 169 | if args.sounds is not None: 170 | soundpairs = [] 171 | for sound in args.sounds: 172 | wave, sr = torchaudio.load(sound) 173 | input_sample = AudioSample(wave, sr) 174 | rendered_sample = render_audio_sample(wrapper, input_sample) 175 | soundpairs.append(AudioSamplePair(input_sample, rendered_sample)) 176 | 177 | save_neutone_model( 178 | wrapper, 179 | root_dir, 180 | freeze=False, 181 | dump_samples=True, 182 | submission=True, 183 | audio_sample_pairs=soundpairs, 184 | ) 185 | -------------------------------------------------------------------------------- /examples/neutone_fx/example_spectral_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.realtime_stft import RealtimeSTFT 13 | from neutone_sdk.utils import save_neutone_model 14 | 15 | logging.basicConfig() 16 | log = logging.getLogger(__name__) 17 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 18 | 19 | 20 | class SpectralFilter(nn.Module): 21 | def __init__(self) -> None: 22 | """ 23 | Creates a spectral notch filter, where the bandwidth of the filter also changes as the center frequency changes. 
24 | """ 25 | super().__init__() 26 | self.base_constant = tr.tensor( 27 | 1025 / tr.e 28 | ) # Used to scale the controls somewhat to the STFT 29 | self.half_constant = tr.tensor(0.5) # Prevent dynamic memory allocations 30 | 31 | def _map_0to1_val_to_log_bin_idx(self, val: Tensor, max_bin: int) -> int: 32 | """ 33 | Maps a float tensor between [0.0, 1.0] to an integer between [0, max_bins] with the assumption that the 34 | bin indices follow a logarithmic spacing. 35 | """ 36 | idx = ( 37 | (tr.pow(self.base_constant, val) - 1.0) 38 | / (self.base_constant - 1.0) 39 | * max_bin 40 | ) 41 | idx = int(tr.clip(tr.round(idx), 0, max_bin)) 42 | return idx 43 | 44 | def forward( 45 | self, x: Tensor, center: Tensor, width: Tensor, amount: Tensor 46 | ) -> Tensor: 47 | """ 48 | Filters a positive valued magnitude spectrogram using a notch filter with controllable center, width, 49 | and amount of attenuation. 50 | 51 | Args: 52 | x: a magnitude spectrogram with shape (n_ch, n_bins, n_frames) 53 | center: 1D control value between [0.0, 1.0] for the center frequency of the filter. 54 | width: 1D control value between [0.0, 1.0] for the bandwidth of the filter. 55 | amount: 1D control value between [0.0, 1.0] for the amount of attenuation. 56 | """ 57 | if amount == 0.0: 58 | return x 59 | n_bins = x.size(1) # Figure out how many bins we have to work with 60 | # Find the center freq bin 61 | center_bin_idx = self._map_0to1_val_to_log_bin_idx(center, n_bins - 1) 62 | # Find the lowest freq bin 63 | lo_bin_idx = self._map_0to1_val_to_log_bin_idx( 64 | center * (1.0 - width), n_bins - 1 65 | ) 66 | lo_bin_idx = max(0, lo_bin_idx) 67 | # Find the highest freq bin 68 | hi_bin_idx = self._map_0to1_val_to_log_bin_idx( 69 | center + ((1.0 - center) * width), n_bins - 1 70 | ) 71 | hi_bin_idx = min(n_bins - 1, hi_bin_idx) 72 | # If the filter has 0 width, we don't need to do anything 73 | if hi_bin_idx - lo_bin_idx == 0: 74 | return x 75 | # Filter the low bins of the notch 76 | if center_bin_idx - lo_bin_idx > 0: 77 | # Using a linear spacing here is not ideal since the frequency bins are not linearly spaced, 78 | # but this is just an example 79 | lo_filter = 1.0 - ( 80 | tr.linspace(0.0, 1.0, center_bin_idx - lo_bin_idx + 2)[1:-1] * amount 81 | ) 82 | lo_filter = lo_filter.view(1, -1, 1) 83 | x[:, lo_bin_idx:center_bin_idx, :] *= lo_filter 84 | # Filter the high bins of the notch 85 | if hi_bin_idx - center_bin_idx > 0: 86 | # Using a linear spacing here is not ideal since the frequency bins are not linearly spaced, 87 | # but this is just an example 88 | hi_filter = 1.0 - ( 89 | tr.linspace(1.0, 0.0, hi_bin_idx - center_bin_idx + 1)[:-1] * amount 90 | ) 91 | hi_filter = hi_filter.view(1, -1, 1) 92 | x[:, center_bin_idx:hi_bin_idx, :] *= hi_filter 93 | return x 94 | 95 | 96 | class SpectralFilterWrapper(WaveformToWaveformBase): 97 | def __init__( 98 | self, 99 | spectral_filter_model: nn.Module, 100 | model_io_n_frames: int = 16, 101 | n_fft: int = 2048, 102 | hop_len: int = 512, 103 | fade_n_samples: int = 384, # Cross-fade for 3/4 of the hop_len to ensure no buzzing in the wet audio 104 | use_debug_mode: bool = True, 105 | ) -> None: 106 | """ 107 | Creates a modified WaveformToWaveformBase wrapper that can be used to create spectral neural audio effects. 108 | Feel free to use this as a starting point to make your own spectral effects! 109 | 110 | Args: 111 | spectral_filter_model: a spectral model, in this example a filter (could be replaced with anything). 
112 | model_io_n_frames: the number of STFT frames the spectral model expects as input and output. 113 | n_fft: n_fft to use for the STFT. 114 | hop_len: hop_len in samples to use for the STFT. 115 | fade_n_samples: no. of samples to crossfade between output buffers of audio after the inverse STFT. Adds a 116 | slight delay, but prevents clicks and pops in the output audio. 117 | use_debug_mode: makes debugging easier, is turned off automatically before the model is exported. 118 | """ 119 | super().__init__(spectral_filter_model, use_debug_mode) 120 | in_ch = 1 if self.is_input_mono() else 2 121 | self.stft = RealtimeSTFT( 122 | model_io_n_frames=model_io_n_frames, 123 | io_n_ch=in_ch, 124 | n_fft=n_fft, 125 | hop_len=hop_len, 126 | power=1.0, # Ensures an energy spectrogram 127 | logarithmize=False, # We don't need a log-magnitude spectrogram for this filter 128 | ensure_pos_spec=True, # Ensures a positive-valued spectrogram 129 | use_phase_info=True, # Keep the phase information for the inverse STFT 130 | fade_n_samples=fade_n_samples, 131 | use_debug_mode=use_debug_mode, 132 | ) 133 | self.stft.set_buffer_size(self.stft.calc_min_buffer_size()) 134 | if use_debug_mode: 135 | log.info(f"Supported buffer sizes = {self.get_native_buffer_sizes()}") 136 | log.info(f"Supported sample rate = {self.get_native_sample_rates()}") 137 | log.info(f"STFT delay = {self.calc_model_delay_samples()}") 138 | 139 | def get_model_name(self) -> str: 140 | return "spectral.filter" 141 | 142 | def get_model_authors(self) -> List[str]: 143 | return ["Christopher Mitcheltree"] 144 | 145 | def get_model_short_description(self) -> str: 146 | return "Spectral notch filter." 147 | 148 | def get_model_long_description(self) -> str: 149 | return ( 150 | "Filters the audio in the spectral domain using a central frequency, bandwidth, and amount. " 151 | "The bandwidth changes as the central frequency changes." 152 | ) 153 | 154 | def get_technical_description(self) -> str: 155 | return ( 156 | "Filters the audio in the spectral domain using a central frequency, bandwidth, and amount. " 157 | "The bandwidth changes as the central frequency changes." 158 | ) 159 | 160 | def get_technical_links(self) -> Dict[str, str]: 161 | return {} 162 | 163 | def get_tags(self) -> List[str]: 164 | return ["spectral", "filter", "notch filter", "stft", "template"] 165 | 166 | def get_model_version(self) -> str: 167 | return "1.0.0" 168 | 169 | def is_experimental(self) -> bool: 170 | return True 171 | 172 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 173 | return [ 174 | ContinuousNeutoneParameter( 175 | "center", "center frequency of the filter", default_value=0.3 176 | ), 177 | ContinuousNeutoneParameter("width", "width of the filter", default_value=0.5), 178 | ContinuousNeutoneParameter( 179 | "amount", "spectral attenuation amount", default_value=0.9 180 | ), 181 | ] 182 | 183 | @tr.jit.export 184 | def is_input_mono(self) -> bool: 185 | return False 186 | 187 | @tr.jit.export 188 | def is_output_mono(self) -> bool: 189 | return False 190 | 191 | @tr.jit.export 192 | def get_native_sample_rates(self) -> List[int]: 193 | # For consistent filtering across different sampling rates, a native sampling rate must be given. Feel free to 194 | # change this to your required sampling rate. 
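        # Editor's note (added, hedged): when a native rate is declared here, the
        # SDK's sample-queue wrapper is expected to resample between the host's
        # rate and 44.1 kHz on the model's behalf (at the cost of extra latency);
        # returning an empty list would instead pass the host's rate through
        # unchanged, as in the clipper examples.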
195 | return [44100] 196 | 197 | @tr.jit.export 198 | def get_native_buffer_sizes(self) -> List[int]: 199 | return ( 200 | self.stft.calc_supported_buffer_sizes() 201 | ) # Possible buffer sizes are determined by the STFT parameters 202 | 203 | @tr.jit.export 204 | def calc_model_delay_samples(self) -> int: 205 | # TODO(cm): make a model specific version of this method? 206 | return self.stft.calc_model_delay_samples() # This is equal to `fade_n_samples` 207 | 208 | def set_model_buffer_size(self, n_samples: int) -> bool: 209 | self.stft.set_buffer_size(n_samples) 210 | return True 211 | 212 | def reset_model(self) -> bool: 213 | self.stft.reset() 214 | return True 215 | 216 | def prepare_for_inference(self) -> None: 217 | super().prepare_for_inference() 218 | # This needs to be done explicitly until we have dedicated wrapper base class for spectral models 219 | self.stft.use_debug_mode = False 220 | self.stft.eval() 221 | 222 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 223 | center, width, amount = params["center"], params["width"], params["amount"] 224 | x = self.stft.audio_to_spec( 225 | x 226 | ) # Convert the audio to a spectrogram (n_ch, n_bins, n_frames) 227 | x = self.model.forward( 228 | x, center, width, amount 229 | ) # Apply the spectral filter and receive an altered spectrogram 230 | x = self.stft.spec_to_audio( 231 | x 232 | ) # Convert the filtered spectrogram back to audio (n_ch, n_samples) 233 | return x 234 | 235 | 236 | if __name__ == "__main__": 237 | parser = ArgumentParser() 238 | parser.add_argument("-o", "--output", default="export_model") 239 | args = parser.parse_args() 240 | root_dir = pathlib.Path(args.output) 241 | 242 | model = SpectralFilter() 243 | wrapper = SpectralFilterWrapper(model) 244 | save_neutone_model(wrapper, root_dir, dump_samples=True, submission=True) 245 | -------------------------------------------------------------------------------- /examples/neutone_gen/example_clipper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | import torch as tr 8 | import torch.nn as nn 9 | from torch import Tensor 10 | 11 | from neutone_sdk import NeutoneParameter, ContinuousNeutoneParameter 12 | from neutone_sdk.non_realtime_sqw import NonRealtimeSampleQueueWrapper 13 | from neutone_sdk.non_realtime_wrapper import NonRealtimeBase 14 | 15 | logging.basicConfig() 16 | log = logging.getLogger(__name__) 17 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 18 | 19 | 20 | class ClipperModel(nn.Module): 21 | def forward( 22 | self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor 23 | ) -> Tensor: 24 | tr.neg(min_val, out=min_val) 25 | tr.mul(gain, min_val, out=min_val) 26 | tr.mul(gain, max_val, out=max_val) 27 | tr.clip(x, min=min_val, max=max_val, out=x) 28 | return x 29 | # return x[:, :-4] 30 | # return tr.rand(2, 2048).fill_(0.5) 31 | 32 | 33 | class NonRealtimeClipperModelWrapper(NonRealtimeBase): 34 | def get_model_name(self) -> str: 35 | return "clipper" 36 | 37 | def get_model_authors(self) -> List[str]: 38 | return ["Christopher Mitcheltree"] 39 | 40 | def get_model_short_description(self) -> str: 41 | return "Audio clipper." 42 | 43 | def get_model_long_description(self) -> str: 44 | return "Clips the input audio between -1 and 1." 45 | 46 | def get_technical_description(self) -> str: 47 | return "Clips the input audio between -1 and 1." 
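    # Editor's note (added, hedged): compared with the realtime
    # WaveformToWaveformBase clipper, this non-realtime API declares its audio I/O
    # channel counts explicitly (get_audio_in_channels / get_audio_out_channels
    # below), receives a list of input tensors in do_forward_pass, and can declare
    # itself a one-shot generator via is_one_shot_model.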
48 | 
49 |     def get_technical_links(self) -> Dict[str, str]:
50 |         return {
51 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_gen/example_clipper.py"
52 |         }
53 | 
54 |     def get_tags(self) -> List[str]:
55 |         return ["clipper"]
56 | 
57 |     def get_model_version(self) -> str:
58 |         return "1.0.0"
59 | 
60 |     def is_experimental(self) -> bool:
61 |         return False
62 | 
63 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
64 |         return [
65 |             ContinuousNeutoneParameter("min", "min clip threshold", default_value=0.15),
66 |             ContinuousNeutoneParameter("max", "max clip threshold", default_value=0.15),
67 |             ContinuousNeutoneParameter(
68 |                 "gain", "scale clip threshold", default_value=1.0
69 |             ),
70 |         ]
71 | 
72 |     @tr.jit.export
73 |     def get_audio_in_channels(self) -> List[int]:
74 |         return [2]
75 | 
76 |     @tr.jit.export
77 |     def get_audio_out_channels(self) -> List[int]:
78 |         return [2]
79 | 
80 |     @tr.jit.export
81 |     def get_native_sample_rates(self) -> List[int]:
82 |         return []  # Supports all sample rates
83 | 
84 |     @tr.jit.export
85 |     def get_native_buffer_sizes(self) -> List[int]:
86 |         return []  # Supports all buffer sizes
87 | 
88 |     @tr.jit.export
89 |     def is_one_shot_model(self) -> bool:
90 |         return False
91 | 
92 |     def aggregate_continuous_params(self, cont_params: Tensor) -> Tensor:
93 |         return cont_params  # We want sample-level control, so no aggregation
94 | 
95 |     def do_forward_pass(
96 |         self,
97 |         curr_block_idx: int,
98 |         audio_in: List[Tensor],
99 |         numerical_params: Dict[str, Tensor],
100 |         text_params: List[str],
101 |         tokens_params: List[List[int]],
102 |     ) -> List[Tensor]:
103 |         min_val, max_val, gain = (
104 |             numerical_params["min"],
105 |             numerical_params["max"],
106 |             numerical_params["gain"],
107 |         )
108 |         audio_out = []
109 |         for x in audio_in:
110 |             x = self.model.forward(x, min_val, max_val, gain)
111 |             audio_out.append(x)
112 |         return audio_out
113 |         # return [self.model.forward(min_val, min_val, max_val, gain)]
114 | 
115 | 
116 | if __name__ == "__main__":
117 |     parser = ArgumentParser()
118 |     parser.add_argument("-o", "--output", default="export_model")
119 |     args = parser.parse_args()
120 |     root_dir = pathlib.Path(args.output)
121 | 
122 |     model = ClipperModel()
123 |     wrapper = NonRealtimeClipperModelWrapper(model)
124 |     sqw = NonRealtimeSampleQueueWrapper(wrapper)
125 | 
126 |     in_n_samples = 2048
127 |     audio_in = [tr.rand(1, in_n_samples)]
128 |     # audio_in = []
129 |     numerical_params = tr.rand(3, in_n_samples)
130 |     # numerical_params = None
131 | 
132 |     out = sqw.forward_non_realtime(audio_in, numerical_params)
133 |     log.info(f" out[0].shape: {out[0].shape}")
134 |     log.info(f" out: {out}")
135 | 
136 |     sqw.reset()
137 |     sqw.prepare_for_inference()
138 |     # TODO(cm): write export method for nonrealtime models
139 |     ts = tr.jit.script(sqw)
140 | 
141 |     out_ts = ts.forward_non_realtime(audio_in, numerical_params)
142 |     log.info(f"out_ts[0].shape: {out_ts[0].shape}")
143 |     log.info(f"out_ts: {out_ts}")
144 | 
--------------------------------------------------------------------------------
/examples/neutone_gen/example_musicgen_load.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import os
5 | import base64, io, tempfile
6 | from typing import List, Dict
7 | 
8 | import torch
9 | import torchaudio
10 | 
11 | from neutone_sdk import (
12 |     NeutoneParameter,
13 |     DiscreteTokensNeutoneParameter,
14 |     ContinuousNeutoneParameter,
15 | )
16
16 | from neutone_sdk.non_realtime_sqw import NonRealtimeSampleQueueWrapper
17 | from neutone_sdk.non_realtime_wrapper import NonRealtimeTokenizerBase, TokenizerType
18 | 
19 | """
20 | To run this script, you will need to install the tokenizers library, and also
21 | protobuf if you are using the sentencepiece tokenizer.
22 | """
23 | 
24 | TOK_TYPE = TokenizerType.SENTENCEPIECE
25 | 
26 | logging.basicConfig()
27 | log = logging.getLogger(__name__)
28 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
29 | 
30 | # class MusicGenWrapperNoTok(nn.Module):
31 | #     def __init__(self, text_encoder, lm, audio_decoder, enc_to_dec_proj, logits_processor, pad_token_id: int, decoder_start_token_id: int, delay_mask_fn, num_codebooks: int, audio_channels: int):
32 | #         super().__init__()
33 | #         self.text_encoder = text_encoder
34 | #         self.audio_decoder = audio_decoder
35 | #         self.lm = lm
36 | #         self.decoder_start_token_id = decoder_start_token_id
37 | #         self.delay_mask_fn = delay_mask_fn
38 | #         self.num_codebooks = num_codebooks
39 | #         self.audio_channels = audio_channels
40 | #         self.enc_to_dec_proj = enc_to_dec_proj
41 | #         self.logits_processor = logits_processor
42 | #         self.pad_token_id = pad_token_id
43 | 
44 | #     def prepare_text_encoder_kwargs_for_generation(self, input_ids):
45 | #         encoder_attention_mask = torch.where(input_ids==0, 0, 1)
46 | #         encoder_outputs = self.text_encoder(
47 | #             input_ids=input_ids,
48 | #             attention_mask=encoder_attention_mask,
49 | #         )['last_hidden_state']
50 | #         encoder_outputs = torch.concatenate([encoder_outputs, torch.zeros_like(encoder_outputs)], dim=0)
51 | #         encoder_attention_mask = torch.concatenate(
52 | #             [encoder_attention_mask, torch.zeros_like(encoder_attention_mask)], dim=0
53 | #         )
54 | #         return encoder_outputs, encoder_attention_mask
55 | 
56 | #     def apply_delay_pattern_mask(self, input_ids, decoder_pad_token_mask):
57 | #         """Apply a delay pattern mask to the decoder input ids, only preserving predictions where
58 | #         the mask is set to -1, and otherwise setting to the value detailed in the mask."""
59 | #         seq_len = input_ids.shape[-1]
60 | #         decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
61 | #         input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
62 | #         return input_ids
63 | 
64 | #     def prepare_inputs_for_generation(self, input_ids, encoder_outputs, delay_pattern_mask):
65 | #         input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
66 | #         # for classifier free guidance we need to replicate the decoder args across the batch dim (we'll split these
67 | #         # before sampling)
68 | #         input_ids = input_ids.repeat((2, 1))
69 | #         return input_ids, encoder_outputs
70 | 
71 | #     def prepare_decoder_input_ids_for_generation(self, batch_size: int):
72 | #         return torch.ones(batch_size * self.num_codebooks, 1, dtype=torch.long) * self.decoder_start_token_id
73 | 
74 | #     def preprocess(self, text_ids: torch.Tensor, max_length: int)-> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
75 | #         with torch.no_grad():
76 | #             batch_size = text_ids.shape[0]
77 | #             encoder_outputs, encoder_attention_mask = self.prepare_text_encoder_kwargs_for_generation(text_ids)
78 | #             encoder_outputs = self.enc_to_dec_proj(encoder_outputs)
79 | #             input_ids = self.prepare_decoder_input_ids_for_generation(batch_size)
80 | #             input_ids, delay_pattern_mask = self.delay_mask_fn(input_ids, self.decoder_start_token_id, max_length, self.num_codebooks, self.audio_channels)
81 | #         return input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask
82 | 
83 | #     def sample_step(self, input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask):
84 | #         i_ids, enc_out = self.prepare_inputs_for_generation(input_ids, encoder_outputs, delay_pattern_mask)
85 | #         outputs = self.lm(input_ids=i_ids, encoder_hidden_states=enc_out, encoder_attention_mask=encoder_attention_mask)
86 | #         next_token_logits = outputs['logits'][:, -1, :]
87 | #         # TODO temperature
88 | #         next_token_scores = self.logits_processor(input_ids, next_token_logits)
89 | #         probs = nn.functional.softmax(next_token_scores, dim=-1)
90 | #         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
91 | #         # update generated ids, model inputs, and length for next step
92 | #         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
93 | #         return input_ids # update input_ids in the next call
94 | 
95 | #     def postprocess(self, input_ids: torch.Tensor, delay_pattern_mask: torch.Tensor, text_ids: torch.Tensor):
96 | #         batch_size = text_ids.shape[0]
97 | #         output_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
98 | #         output_ids = output_ids[output_ids != self.decoder_start_token_id].reshape(
99 | #             batch_size, self.num_codebooks, -1
100 | #         )
101 | #         # append the frame dimension back to the audio codes
102 | #         output_ids = output_ids[None, ...]
103 | #         output_values = self.audio_decoder(output_ids)
104 | #         return output_values # update input_ids in the next call
105 | 
106 | #     def forward(self, text_ids: torch.Tensor, max_length: int):
107 | #         with torch.no_grad():
108 | #             input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask = self.preprocess(text_ids, max_length)
109 | #             # sample
110 | #             for _ in range(max_length-1):
111 | #                 input_ids = self.sample_step(input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask)
112 | #             output_values = self.postprocess(input_ids, delay_pattern_mask, text_ids)
113 | #         return output_values
114 | 
115 | 
116 | class NonRealtimeMusicGenModelWrapper(NonRealtimeTokenizerBase):
117 |     def get_model_name(self) -> str:
118 |         return "MusicGen"
119 | 
120 |     def get_model_authors(self) -> List[str]:
121 |         return ["Naotake Masuda"]
122 | 
123 |     def get_model_short_description(self) -> str:
124 |         return ""
125 | 
126 |     def get_model_long_description(self) -> str:
127 |         return ""
128 | 
129 |     def get_technical_description(self) -> str:
130 |         return ""
131 | 
132 |     def get_technical_links(self) -> Dict[str, str]:
133 |         return {
134 |             "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_gen/example_musicgen_load.py"
135 |         }
136 | 
137 |     def get_tags(self) -> List[str]:
138 |         return ["musicgen"]
139 | 
140 |     def get_model_version(self) -> str:
141 |         return "1.0.0"
142 | 
143 |     def is_experimental(self) -> bool:
144 |         return False
145 | 
146 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
147 |         return [
148 |             DiscreteTokensNeutoneParameter(
149 |                 "texttokens",
150 |                 "tokens from a text tokenizer",
151 |                 default_value=[
152 |                     2775,
153 |                     7,
154 |                     2783,
155 |                     1463,
156 |                     28,
157 |                     7981,
158 |                     63,
159 |                     5253,
160 |                     7,
161 |                     11,
162 |                     13353,
163 |                     1,
164 |                 ],
165 |             ),
166 |             ContinuousNeutoneParameter(
167 |                 "outputlength", "number of output tokens", default_value=0.5
168 |             ),
169 |         ]
170 | 
171 |     @torch.jit.export
172 |     def get_audio_in_channels(self) -> List[int]:
173 |         return []
174 | 
175 |     @torch.jit.export
176 |     def get_audio_out_channels(self) -> List[int]:
177 |         return [1]
178 | 
179 |     @torch.jit.export
180 |     def get_native_sample_rates(self) -> List[int]:
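        # Note: MusicGen decodes audio at 32 kHz via its EnCodec-based codec
        # (an assumption inferred from the hardcoded rate below), hence the
        # single advertised native sample rate.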
181 |         return [32000]
182 | 
183 |     @torch.jit.export
184 |     def get_native_buffer_sizes(self) -> List[int]:
185 |         return []  # Supports all buffer sizes
186 | 
187 |     @torch.jit.export
188 |     def is_one_shot_model(self) -> bool:
189 |         return True
190 | 
191 |     @torch.jit.export
192 |     def has_progress_percentage(self) -> bool:
193 |         return True
194 | 
195 |     def aggregate_continuous_params(self, cont_params: torch.Tensor) -> torch.Tensor:
196 |         return cont_params  # We want sample-level control, so no aggregation
197 | 
198 |     def do_forward_pass(
199 |         self,
200 |         curr_block_idx: int,
201 |         audio_in: List[torch.Tensor],
202 |         knob_params: Dict[str, torch.Tensor],
203 |         text_params: List[str],
204 |         tokens_params: List[List[int]],
205 |     ) -> List[torch.Tensor]:
206 |         audio_out = []
207 |         output_length = int(knob_params["outputlength"].mean() * 500)
208 |         tokens = tokens_params[0]
209 |         # Convert to LongTensor with batch size of 1
210 |         tokens = torch.LongTensor(tokens).unsqueeze(0)
211 |         with torch.no_grad():
212 |             input_ids, encoder_outputs, delay_pattern_mask, encoder_attention_mask = (
213 |                 self.model.preprocess(tokens, output_length)
214 |             )
215 |             for i in range(output_length - 1):
216 |                 input_ids = self.model.sample_step(
217 |                     input_ids,
218 |                     encoder_outputs,
219 |                     delay_pattern_mask,
220 |                     encoder_attention_mask,
221 |                 )
222 |                 self.set_progress_percentage(float(i + 1) / output_length * 100)
223 |                 if self.should_cancel_forward_pass():
224 |                     # Can't return empty list for some reason
225 |                     break
226 |             x = self.model.postprocess(input_ids, delay_pattern_mask, tokens)
227 |             audio_out.append(x.squeeze(1))
228 |         return audio_out
229 | 
230 | 
231 | 
232 | if __name__ == "__main__":
233 |     from tokenizers import Tokenizer, SentencePieceUnigramTokenizer
234 | 
235 |     parser = argparse.ArgumentParser()
236 |     parser.add_argument("--model", default="musicgen-model", type=str)
237 |     args = parser.parse_args()
238 |     model = torch.jit.load("../../out/musicgen_scripted_notok.ts")
239 |     if TOK_TYPE == TokenizerType.SENTENCEPIECE:
240 |         tok_path = str("../../out/spiece.model")
241 |         with open(tok_path, mode="rb") as f:
242 |             tok_string = base64.b64encode(f.read()).decode()
243 |         tokenizer = SentencePieceUnigramTokenizer.from_spm(tok_path)
244 |     elif TOK_TYPE == TokenizerType.JSON:
245 |         tok_path = str("../../out/tokenizer.json")
246 |         with open(tok_path, "r", encoding="utf-8") as f:
247 |             tok_string = json.dumps(json.load(f), ensure_ascii=True)
248 |         tokenizer = Tokenizer.from_file(tok_path)
249 | 
250 |     wrapped = NonRealtimeMusicGenModelWrapper(model, tok_string, TOK_TYPE)
251 |     tokens = tokenizer.encode("80s pop track with bassy drums and synth").ids
252 | 
253 |     sqw = NonRealtimeSampleQueueWrapper(wrapped)
254 |     out = sqw.forward_non_realtime(
255 |         [],
256 |         torch.ones(1, 2048) * 0.2,
257 |         tokens_params=[tokens],
258 |     )
259 |     sqw.reset()
260 |     sqw.prepare_for_inference()
261 |     log.info(f" out[0].shape: {out[0].shape}")
262 |     log.info(f" out: {out}")
263 |     ts = torch.jit.script(sqw)
264 |     log.info("Scripting successful")
265 |     n_samples = 2048
266 |     tokens = tokenizer.encode("90s rock song with loud guitars and heavy drums").ids
267 |     out_ts = ts.forward_non_realtime(
268 |         [],
269 |         torch.ones(1, 2048) * 0.2,
270 |         tokens_params=[tokens],
271 |     )
272 |     log.info(f"out_ts[0].shape: {out_ts[0].shape}")
273 |     log.info(f"out_ts: {out_ts}")
274 |     torchaudio.save("../../out/out_ts.wav", out_ts[0], sample_rate=32000)
275 |     torch.jit.save(ts, 
"../../out/wrapped-musicgen.ts") 276 | model = torch.jit.load("../../out/wrapped-musicgen.ts") 277 | # test saved tokenizer 278 | print(f"saved with {model.get_tokenizer_type()} tokenizer") 279 | if TOK_TYPE == TokenizerType.SENTENCEPIECE: 280 | tok_bin = base64.b64decode(model.get_tokenizer_str()) 281 | # Create a named temporary file that is deleted when closed 282 | with tempfile.NamedTemporaryFile( 283 | mode="wb", delete=False, suffix=".model" 284 | ) as temp_model_file: 285 | temp_model_file.write(tok_bin) 286 | temp_model_file_path = temp_model_file.name 287 | tokenizer = SentencePieceUnigramTokenizer.from_spm(temp_model_file_path) 288 | elif TOK_TYPE == TokenizerType.JSON: 289 | tokenizer = Tokenizer.from_str(model.get_tokenizer_str()) 290 | print(tokenizer.decode(tokens)) 291 | -------------------------------------------------------------------------------- /neutone_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from .parameter import * 3 | from .wavform_to_wavform import * 4 | from .sqw import * 5 | from . import utils 6 | -------------------------------------------------------------------------------- /neutone_sdk/assets/default_samples/sample_ambience.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/assets/default_samples/sample_ambience.mp3 -------------------------------------------------------------------------------- /neutone_sdk/assets/default_samples/sample_drums.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/assets/default_samples/sample_drums.mp3 -------------------------------------------------------------------------------- /neutone_sdk/assets/default_samples/sample_rhodes.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/assets/default_samples/sample_rhodes.mp3 -------------------------------------------------------------------------------- /neutone_sdk/audio.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from cffi import FFI 3 | from dataclasses import dataclass 4 | import logging 5 | import math 6 | import io 7 | import pkgutil 8 | from typing import Optional, List, Union 9 | from typing_extensions import Self 10 | 11 | import numpy as np 12 | import torch as tr 13 | from torch import nn, Tensor 14 | import torchaudio 15 | import soundfile as sf 16 | from torch.jit import ScriptModule 17 | from tqdm import tqdm 18 | 19 | import neutone_sdk 20 | 21 | logging.basicConfig() 22 | log = logging.getLogger(__name__) 23 | 24 | 25 | def write_mp3(buffer: io.BytesIO, y: tr.Tensor, sr: int, quality: float = 0): 26 | """ 27 | We're using this instead of sf.write in order to change the bitrate, 28 | where quality goes from 0 (high) to 1 (low). 29 | 30 | The API is similar to torchaudio.save, so y should be (num_channels, num_samples). 
31 | """ 32 | assert 0 <= quality <= 1 33 | assert ( 34 | y.shape[0] < y.shape[1] 35 | ), "Expecting audio to have a shape of (num_channels, num_samples), try swapping the dimensions" 36 | ffi = FFI() 37 | quality = ffi.new("double *") 38 | vbr_set = ffi.new("int *") 39 | with sf.SoundFile( 40 | buffer, "w", channels=y.shape[0], samplerate=sr, format="mp3" 41 | ) as f: 42 | quality[0] = 0 # 0[high]~1[low] 43 | # 0x1301 - SFC_SET_COMPRESSION_LEVEL 44 | c = sf._snd.sf_command(f._file, 0x1301, quality, 8) 45 | assert c == sf._snd.SF_TRUE, "Couldn't set bitrate on MP3" 46 | 47 | # 0x1305 - SFC_SET_BITRATE_MODE 48 | vbr_set[0] = 2 # 0 - CONSTANT, 1 - AVERAGE, 2 - VARIABLE 49 | c = sf._snd.sf_command(f._file, 0x1305, vbr_set, 4) 50 | assert c == sf._snd.SF_TRUE, "Couldn't set MP3 to VBR" 51 | 52 | f.write(y.T.numpy()) 53 | assert f.closed 54 | 55 | 56 | @dataclass 57 | class AudioSample: 58 | """ 59 | AudioSample is simply a pair of (audio, sample_rate) that is easier to work 60 | with within the SDK. We recommend users to read and write to mp3 files as 61 | they are better supported and formats like ogg can have subtle bugs when 62 | reading and writing using the current backend (soundfile). 63 | """ 64 | 65 | audio: Tensor 66 | sr: int 67 | 68 | def __post_init__(self): 69 | assert self.audio.ndim == 2 70 | assert ( 71 | self.audio.size(0) == 1 or self.audio.size(0) == 2 72 | ), "Audio sample audio should be 1 or 2 channels, channels first" 73 | 74 | def is_mono(self) -> bool: 75 | return self.audio.size(0) == 1 76 | 77 | def to_mp3_bytes(self) -> bytes: 78 | buff = io.BytesIO() 79 | write_mp3(buff, self.audio, self.sr) 80 | buff.seek(0) 81 | return buff.read() 82 | 83 | def to_mp3_b64(self) -> str: 84 | return base64.b64encode(self.to_mp3_bytes()).decode() 85 | 86 | @classmethod 87 | def from_bytes(cls, bytes_: bytes) -> Self: 88 | y, sr = sf.read(io.BytesIO(bytes_), always_2d=True) 89 | return cls(tr.from_numpy(y.T.astype(np.float32)), sr) 90 | 91 | @classmethod 92 | def from_file(cls, path: str) -> Self: 93 | with open(path, "rb") as f: 94 | return cls.from_bytes(f.read()) 95 | 96 | @classmethod 97 | def from_b64(cls, b64_sample: str) -> Self: 98 | return cls.from_bytes(base64.b64decode(b64_sample)) 99 | 100 | 101 | @dataclass 102 | class AudioSamplePair: 103 | input: AudioSample 104 | output: AudioSample 105 | 106 | def to_metadata_format(self): 107 | return { 108 | "in": self.input.to_mp3_b64(), 109 | "out": self.output.to_mp3_b64(), 110 | } 111 | 112 | 113 | def get_default_audio_samples() -> List[AudioSample]: 114 | """ 115 | Returns a list of audio samples to be displayed on the website. 116 | 117 | The SDK provides one sample by default, but this method can be used to 118 | provide different samples. 119 | 120 | By default the outputs of this function will be ran through the model 121 | and the prerendered samples will be stored inside the saved object. 122 | 123 | See get_prerendered_audio_samples and render_audio_sample for more details. 124 | """ 125 | log.info( 126 | "Using default sample... 
Please consider using your own audio samples by overriding the get_audio_samples method" 127 | ) 128 | sample_ambience = AudioSample.from_bytes( 129 | pkgutil.get_data(__package__, "assets/default_samples/sample_ambience.mp3"), 130 | ) 131 | sample_drums = AudioSample.from_bytes( 132 | pkgutil.get_data(__package__, "assets/default_samples/sample_drums.mp3"), 133 | ) 134 | sample_rhodes = AudioSample.from_bytes( 135 | pkgutil.get_data(__package__, "assets/default_samples/sample_rhodes.mp3"), 136 | ) 137 | 138 | return [sample_rhodes, sample_drums, sample_ambience] 139 | 140 | 141 | def render_audio_sample( 142 | model: Union["SampleQueueWrapper", "WaveformToWaveformBase", ScriptModule], 143 | input_sample: AudioSample, 144 | params: Optional[Tensor] = None, 145 | output_sr: int = 44100, 146 | ) -> AudioSample: 147 | """ 148 | params: either [model.MAX_N_PARAMS] 1d tensor of constant parameter values 149 | or [model.MAX_N_PARAMS, input_sample.audio.size(1)] 2d tensor of parameter values for every input audio sample 150 | """ 151 | 152 | with tr.no_grad(): 153 | model.use_debug_mode = True # Turn on debug mode to catch common mistakes when rendering sample audio 154 | 155 | preferred_sr = neutone_sdk.SampleQueueWrapper.select_best_model_sr( 156 | input_sample.sr, model.get_native_sample_rates() 157 | ) 158 | if len(model.get_native_buffer_sizes()) > 0: 159 | buffer_size = model.get_native_buffer_sizes()[0] 160 | else: 161 | buffer_size = 512 162 | 163 | audio = input_sample.audio 164 | if input_sample.sr != preferred_sr: 165 | audio = torchaudio.transforms.Resample(input_sample.sr, preferred_sr)(audio) 166 | 167 | if model.is_input_mono() and not input_sample.is_mono(): 168 | audio = tr.mean(audio, dim=0, keepdim=True) 169 | elif not model.is_input_mono() and input_sample.is_mono(): 170 | audio = audio.repeat(2, 1) 171 | 172 | audio_len = audio.size(1) 173 | padding_amount = math.ceil(audio_len / buffer_size) * buffer_size - audio_len 174 | padded_audio = nn.functional.pad(audio, [0, padding_amount]) 175 | audio_chunks = padded_audio.split(buffer_size, dim=1) 176 | 177 | model.set_daw_sample_rate_and_buffer_size( 178 | preferred_sr, buffer_size, preferred_sr, buffer_size 179 | ) 180 | 181 | # make sure the shape of params is compatible with the model calls. 
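        # For example (hypothetical values), a model with MAX_N_PARAMS == 4 could
        # be rendered with constant knob settings:
        #     params = tr.tensor([0.5, 0.25, 1.0, 0.0])
        # or with per-sample automation curves matching the input length:
        #     params = tr.rand(4, input_sample.audio.size(1))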
182 | if params is not None: 183 | assert params.shape[0] == model.MAX_N_PARAMS 184 | 185 | # if constant values, copy across audio dimension 186 | if params.dim() == 1: 187 | params = params.repeat([audio_len, 1]).T 188 | 189 | # otherwise resample to match audio 190 | else: 191 | assert params.shape == (model.MAX_N_PARAMS, input_sample.audio.size(1)) 192 | params = torchaudio.transforms.Resample(input_sample.sr, preferred_sr)( 193 | params 194 | ) 195 | params = tr.clamp(params, 0, 1) 196 | 197 | # padding and chunking parameters to match audio 198 | padded_params = nn.functional.pad( 199 | params, [0, padding_amount], mode="replicate" 200 | ) 201 | param_chunks = padded_params.split(buffer_size, dim=1) 202 | 203 | out_chunks = [ 204 | model.forward(audio_chunk, param_chunk).clone() 205 | for audio_chunk, param_chunk in tqdm( 206 | zip(audio_chunks, param_chunks), total=len(audio_chunks) 207 | ) 208 | ] 209 | 210 | else: 211 | out_chunks = [ 212 | model.forward(audio_chunk, None).clone() 213 | for audio_chunk in tqdm(audio_chunks) 214 | ] 215 | 216 | audio_out = tr.hstack(out_chunks)[:, :audio_len] 217 | 218 | model.reset() 219 | 220 | if preferred_sr != output_sr: 221 | audio_out = torchaudio.transforms.Resample(preferred_sr, output_sr)( 222 | audio_out 223 | ) 224 | 225 | # Make the output audio consistent with the input audio 226 | if audio_out.size(0) == 1 and not input_sample.is_mono(): 227 | audio_out = audio_out.repeat(2, 1) 228 | elif audio_out.size(0) == 2 and input_sample.is_mono(): 229 | audio_out = tr.mean(audio_out, dim=0, keepdim=True) 230 | 231 | return AudioSample(audio_out, output_sr) 232 | -------------------------------------------------------------------------------- /neutone_sdk/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import timeit 4 | import itertools 5 | from typing import List 6 | import click 7 | import torch 8 | from torch.autograd.profiler import record_function 9 | from neutone_sdk import constants 10 | from neutone_sdk.sqw import SampleQueueWrapper 11 | from neutone_sdk.utils import load_neutone_model, model_to_torchscript 12 | import numpy as np 13 | from tqdm import tqdm 14 | 15 | logging.basicConfig() 16 | log = logging.getLogger(__name__) 17 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 18 | 19 | 20 | @click.group() 21 | def cli(): 22 | """This is needed to make a group command with click.""" 23 | pass 24 | 25 | 26 | @cli.command() 27 | @click.option("--model_file", help="Path to model file") 28 | @click.option( 29 | "--buffer_size", 30 | default=(128, 256, 512, 1024, 2048), 31 | multiple=True, 32 | help="Buffer sizes to benchmark", 33 | ) 34 | @click.option( 35 | "--sample_rate", 36 | default=(48000,), 37 | multiple=True, 38 | help="Sample rates to benchmark", 39 | ) 40 | @click.option("--repeat", default=10, help="How many times to repeat the benchmark") 41 | @click.option( 42 | "--n_iters", 43 | default=30, 44 | help="How many forward passes to run for each repetition", 45 | ) 46 | @click.option( 47 | "--daw_is_mono", 48 | default=False, 49 | help="Whether to assume daw is mono or not during the benchmark", 50 | ) 51 | @click.option("--num_threads", default=1, help="num_threads to use for the benchmark") 52 | @click.option( 53 | "--num_interop_threads", 54 | default=1, 55 | help="num_interop_threads to use for the benchmark", 56 | ) 57 | def benchmark_speed( 58 | model_file: str, 59 | buffer_size: List[int], 60 | sample_rate: List[int], 61 | repeat: 
int, 62 | n_iters: int, 63 | daw_is_mono: bool, 64 | num_threads: int, 65 | num_interop_threads: int, 66 | ) -> None: 67 | return benchmark_speed_( 68 | model_file, 69 | buffer_size, 70 | sample_rate, 71 | repeat, 72 | n_iters, 73 | daw_is_mono, 74 | num_threads, 75 | num_interop_threads, 76 | ) 77 | 78 | 79 | def benchmark_speed_( 80 | model_file: str, 81 | buffer_size: List[int] = (128, 256, 512, 1024, 2048), 82 | sample_rate: List[int] = (48000,), 83 | repeat: int = 10, 84 | n_iters: int = 30, 85 | daw_is_mono: bool = False, 86 | num_threads: int = 1, 87 | num_interop_threads: int = 1, 88 | ) -> None: 89 | daw_n_ch = 1 if daw_is_mono else 2 90 | np.set_printoptions(precision=3) 91 | torch.set_num_threads(num_threads) 92 | torch.set_num_interop_threads(num_interop_threads) 93 | with torch.no_grad(): 94 | m, _ = load_neutone_model(model_file) 95 | log.info( 96 | f"Running benchmark for buffer sizes {buffer_size} and sample rates {sample_rate}. Outliers will be removed from the calculation of mean and std and displayed separately if existing." 97 | ) 98 | for sr, bs in itertools.product(sample_rate, buffer_size): 99 | m.set_daw_sample_rate_and_buffer_size(sr, bs) 100 | for _ in range(n_iters): # Warmup 101 | m.forward(torch.rand((daw_n_ch, bs))) 102 | m.reset() 103 | 104 | # Pregenerate random buffers to more accurately benchmark the model itself 105 | def get_random_buffer_generator(): 106 | buffers = torch.rand(100, daw_n_ch, bs) 107 | i = 0 108 | 109 | def return_next_random_buffer(): 110 | nonlocal i 111 | i = (i + 1) % 100 112 | return buffers[i] 113 | 114 | return return_next_random_buffer 115 | 116 | rbg = get_random_buffer_generator() 117 | 118 | durations = np.array( 119 | timeit.repeat(lambda: m.forward(rbg()), repeat=repeat, number=n_iters) 120 | ) 121 | m.reset() 122 | mean, std = np.mean(durations), np.std(durations) 123 | outlier_mask = np.abs(durations - mean) > 2 * std 124 | outliers = durations[outlier_mask] 125 | # Remove outliers from general benchmark 126 | durations = durations[~outlier_mask] 127 | mean, std = np.mean(durations), np.std(durations) 128 | log.info( 129 | f"Sample rate: {sr: 6} | Buffer size: {bs: 6} | duration: {mean: 6.3f}±{std:.3f} | 1/RTF: {bs/(mean/n_iters*sr): 6.3f} | Outliers: {outliers[:3]}" 130 | ) 131 | 132 | 133 | @cli.command() 134 | @click.option("--model_file", help="Path to model file") 135 | @click.option( 136 | "--buffer_size", 137 | default=(128, 256, 512, 1024, 2048), 138 | multiple=True, 139 | help="Buffer sizes to benchmark", 140 | ) 141 | @click.option( 142 | "--sample_rate", 143 | default=( 144 | 44100, 145 | 48000, 146 | ), 147 | multiple=True, 148 | help="Sample rates to benchmark", 149 | ) 150 | def benchmark_latency( 151 | model_file: str, buffer_size: List[int], sample_rate: List[int] 152 | ) -> None: 153 | return benchmark_latency_(model_file, buffer_size, sample_rate) 154 | 155 | 156 | def benchmark_latency_( 157 | model_file: str, 158 | buffer_size: List[int] = (128, 256, 512, 1024, 2048), 159 | sample_rate: List[int] = (48000,), 160 | ) -> None: 161 | m, _ = load_neutone_model(model_file) 162 | nbs, nsr = m.get_native_buffer_sizes(), m.get_native_sample_rates() 163 | log.info(f"Native buffer sizes: {nbs[:10]}, Native sample rates: {nsr[:10]}") 164 | if len(nbs) > 10 or len(nsr) > 10: 165 | log.info(f"Showing only the first 10 values in case there are more.") 166 | with torch.no_grad(): 167 | delays = [] 168 | for sr, bs in itertools.product(sample_rate, buffer_size): 169 | m.set_daw_sample_rate_and_buffer_size(sr, bs) 170 | 
m.reset() 171 | delays += [ 172 | [ 173 | sr, 174 | bs, 175 | m.calc_buffering_delay_samples(), 176 | m.calc_model_delay_samples(), 177 | ] 178 | ] 179 | delays = sorted(delays, key=lambda x: x[2] + x[3]) 180 | log.info( 181 | f"Model {model_file} has the following delays for each sample rate / buffer size combination (lowest delay first):" 182 | ) 183 | for sr, bs, bds, mds in delays: 184 | log.info( 185 | f"Sample rate: {sr: 6} | Buffer size: {bs: 6} | Total delay: {bds+mds: 6} | (Buffering delay: {bds: 6} | Model delay: {mds: 6})" 186 | ) 187 | log.info( 188 | f"The recommended sample rate / buffer size combination is sample rate {delays[0][0]}, buffer size {delays[0][1]}" 189 | ) 190 | 191 | 192 | def profile_sqw( 193 | sqw: SampleQueueWrapper, 194 | daw_sr: int = 48000, 195 | daw_bs: int = 512, 196 | daw_is_mono: bool = False, 197 | use_params: bool = True, 198 | convert_to_torchscript: bool = False, 199 | n_iters: int = 100, 200 | ) -> None: 201 | daw_n_ch = 1 if daw_is_mono else 2 202 | audio_buffers = [torch.rand((daw_n_ch, daw_bs)) for _ in range(n_iters)] 203 | if use_params: 204 | param_buffers = [ 205 | torch.rand((constants.MAX_N_PARAMS, daw_bs)) for _ in range(n_iters) 206 | ] 207 | else: 208 | param_buffers = [None for _ in range(n_iters)] 209 | 210 | sqw.set_daw_sample_rate_and_buffer_size(daw_sr, daw_bs) 211 | if hasattr(sqw, "prepare_for_inference"): 212 | sqw.prepare_for_inference() 213 | if convert_to_torchscript: 214 | log.info("Converting to TorchScript") 215 | with torch.no_grad(): 216 | sqw = model_to_torchscript(sqw, freeze=False, optimize=False) 217 | 218 | with torch.inference_mode(): 219 | with torch.profiler.profile( 220 | activities=[torch.profiler.ProfilerActivity.CPU], 221 | with_stack=True, 222 | profile_memory=True, 223 | record_shapes=False, 224 | ) as prof: 225 | with record_function("forward"): 226 | for audio_buff, param_buff in tqdm(zip(audio_buffers, param_buffers)): 227 | out_buff = sqw.forward(audio_buff, param_buff) 228 | 229 | log.info("Displaying Total CPU Time") 230 | log.info(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)) 231 | # log.info(prof.key_averages(group_by_stack_n=5).table(sort_by="cpu_time_total", row_limit=10)) 232 | log.info("Displaying CPU Memory Usage") 233 | log.info( 234 | prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10) 235 | ) 236 | log.info("Displaying Grouped CPU Memory Usage") 237 | log.info( 238 | prof.key_averages(group_by_stack_n=5).table( 239 | sort_by="self_cpu_memory_usage", row_limit=5 240 | ) 241 | ) 242 | 243 | 244 | @cli.command() 245 | @click.option("--model_file", help="Path to model file") 246 | @click.option( 247 | "--buffer_size", 248 | default=(128,), 249 | multiple=True, 250 | help="Buffer sizes to benchmark", 251 | ) 252 | @click.option( 253 | "--sample_rate", 254 | default=(48000,), 255 | multiple=True, 256 | help="Sample rates to benchmark", 257 | ) 258 | @click.option( 259 | "--daw_is_mono", 260 | default=False, 261 | help="Whether to assume daw is mono or not during the benchmark", 262 | ) 263 | @click.option( 264 | "--use_params", 265 | default=False, 266 | help="Whether to pass parameters to the model during profiling", 267 | ) 268 | @click.option( 269 | "--n_iters", 270 | default=30, 271 | help="How many forward passes to run while profiling", 272 | ) 273 | @click.option("--num_threads", default=1, help="num_threads to use for the benchmark") 274 | @click.option( 275 | "--num_interop_threads", 276 | default=1, 277 | help="num_interop_threads to use for 
the benchmark", 278 | ) 279 | def profile( 280 | model_file: str, 281 | buffer_size: List[int], 282 | sample_rate: List[int], 283 | daw_is_mono: bool = False, 284 | use_params: bool = True, 285 | n_iters: int = 100, 286 | num_threads: int = 1, 287 | num_interop_threads: int = 1, 288 | ): 289 | torch.set_num_threads(num_threads) 290 | torch.set_num_interop_threads(num_interop_threads) 291 | m, _ = load_neutone_model(model_file) 292 | for sr, bs in itertools.product(sample_rate, buffer_size): 293 | log.info( 294 | f"Profiling model {model_file} at sample rate {sr} and buffer size {bs}" 295 | ) 296 | profile_sqw( 297 | m, 298 | sr, 299 | bs, 300 | daw_is_mono, 301 | use_params, 302 | False, 303 | n_iters, 304 | ) 305 | 306 | 307 | if __name__ == "__main__": 308 | cli() 309 | -------------------------------------------------------------------------------- /neutone_sdk/cached_mel_spec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional, Callable 4 | 5 | import torch as tr 6 | from torch import Tensor 7 | from torch import nn 8 | from torchaudio.transforms import MelSpectrogram 9 | 10 | from neutone_sdk import CircularInplaceTensorQueue 11 | 12 | logging.basicConfig() 13 | log = logging.getLogger(__name__) 14 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 15 | 16 | 17 | class CachedMelSpec(nn.Module): 18 | def __init__( 19 | self, 20 | sr: int, 21 | n_ch: int, 22 | n_fft: int = 2048, 23 | hop_len: int = 512, 24 | f_min: float = 0.0, 25 | f_max: Optional[float] = None, 26 | n_mels: int = 128, 27 | window_fn: Callable[..., Tensor] = tr.hann_window, 28 | power: float = 2.0, 29 | normalized: bool = False, 30 | center: bool = True, 31 | use_debug_mode: bool = True, 32 | ) -> None: 33 | """ 34 | Creates a Mel spectrogram that supports streaming of a centered, non-causal 35 | Mel spectrogram operation that uses zero padding. Using this will result in 36 | audio being delayed by (n_fft / 2) - hop_len samples. When calling forward, 37 | the input audio block length must be a multiple of the hop length. 
38 | 39 | Parameters: 40 | sr (int): Sample rate of the audio 41 | n_ch (int): Number of audio channels 42 | n_fft (int): STFT n_fft (must be even) 43 | hop_len (int): STFT hop length (must divide into n_fft // 2) 44 | f_min (float): Minimum frequency of the Mel filterbank 45 | f_max (float): Maximum frequency of the Mel filterbank 46 | n_mels (int): Number of mel filterbank bins 47 | window_fn (Callable[..., Tensor]): A function to create a window tensor 48 | power (float): Exponent for the magnitude spectrogram (must be > 0) 49 | normalized (bool): Whether to normalize the mel spectrogram or not 50 | center (bool): Whether to center the mel spectrogram (must be True) 51 | use_debug_mode (bool): Whether to use debug mode or not 52 | """ 53 | super().__init__() 54 | assert center, "center must be True, causal mode is not supported yet" 55 | assert n_fft % 2 == 0, "n_fft must be even" 56 | assert (n_fft // 2) % hop_len == 0, "n_fft // 2 must be divisible by hop_len" 57 | self.n_ch = n_ch 58 | self.n_fft = n_fft 59 | self.hop_len = hop_len 60 | self.use_debug_mode = use_debug_mode 61 | self.mel_spec = MelSpectrogram( 62 | sample_rate=sr, 63 | n_fft=n_fft, 64 | hop_length=hop_len, 65 | f_min=f_min, 66 | f_max=f_max, 67 | n_mels=n_mels, 68 | window_fn=window_fn, 69 | power=power, 70 | normalized=normalized, 71 | center=False, # We use a causal STFT since we do the padding ourselves 72 | ) 73 | self.padding_n_samples = self.n_fft - self.hop_len 74 | self.cache = CircularInplaceTensorQueue( 75 | n_ch, self.padding_n_samples, use_debug_mode 76 | ) 77 | self.register_buffer("padding", tr.zeros((n_ch, self.padding_n_samples))) 78 | self.cache.push(self.padding) 79 | 80 | def forward(self, x: Tensor) -> Tensor: 81 | """ 82 | Computes the Mel spectrogram of the input audio tensor. Supports streaming as 83 | long as the input audio tensor is a multiple of the hop length. 84 | """ 85 | if self.use_debug_mode: 86 | assert x.ndim == 2, "input audio must have shape (n_ch, n_samples)" 87 | assert x.size(0) == self.n_ch, "input audio n_ch is incorrect" 88 | assert ( 89 | x.size(1) % self.hop_len == 0 90 | ), "input audio n_samples must be divisible by hop_len" 91 | # Compute the Mel spec 92 | n_samples = x.size(1) 93 | n_frames = n_samples // self.hop_len 94 | padded_x = tr.cat([self.padding, x], dim=1) 95 | padded_spec = self.mel_spec(padded_x) 96 | spec = padded_spec[:, :, -n_frames:] 97 | 98 | # Update the cache and padding 99 | padding_idx = min(n_samples, self.padding_n_samples) 100 | self.cache.push(x[:, -padding_idx:]) 101 | self.cache.fill(self.padding) 102 | return spec 103 | 104 | def prepare_for_inference(self) -> None: 105 | """ 106 | Prepares the cached Mel spectrogram for inference by disabling debug mode. 107 | """ 108 | self.cache.use_debug_mode = False 109 | self.use_debug_mode = False 110 | 111 | @tr.jit.export 112 | def get_delay_samples(self) -> int: 113 | """ 114 | Returns the number of samples of delay of the cached Mel spectrogram. 115 | """ 116 | return (self.n_fft // 2) - self.hop_len 117 | 118 | @tr.jit.export 119 | def get_delay_frames(self) -> int: 120 | """ 121 | Returns the number of frames of delay of the cached Mel spectrogram. 122 | """ 123 | return self.get_delay_samples() // self.hop_len 124 | 125 | @tr.jit.export 126 | def reset(self) -> None: 127 | """ 128 | Resets the cache and padding of the cached Mel spectrogram. 
129 | """ 130 | self.cache.reset() 131 | self.padding.zero_() 132 | self.cache.push(self.padding) 133 | -------------------------------------------------------------------------------- /neutone_sdk/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | SDK_VERSION = "1.4.3" 4 | 5 | MAX_N_PARAMS = 4 6 | MAX_N_CATEGORICAL_VALUES = 20 7 | MAX_N_CATEGORICAL_LABEL_CHARS = 20 8 | MAX_N_AUDIO_SAMPLES = 3 9 | 10 | DEFAULT_DAW_SR = 48000 11 | DEFAULT_DAW_BS = 2048 12 | 13 | NEUTONE_GEN_N_NUMERICAL_PARAMS = 4 14 | NEUTONE_GEN_N_TEXT_PARAMS = 1 15 | NEUTONE_GEN_N_TOKENS_PARAMS = 1 16 | -------------------------------------------------------------------------------- /neutone_sdk/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from abc import ABC, abstractmethod 5 | from typing import Dict, List, Tuple, Union, Any 6 | 7 | import torch as tr 8 | from torch import nn, Tensor 9 | 10 | from neutone_sdk import constants 11 | from neutone_sdk.parameter import NeutoneParameter 12 | 13 | logging.basicConfig() 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 16 | 17 | 18 | class NeutoneModel(ABC, nn.Module): 19 | # TorchScript typing does not support instance attributes, so we need to type them 20 | # as class attributes. This is required for supporting models with no parameters. 21 | # (https://github.com/pytorch/pytorch/issues/51041#issuecomment-767061194) 22 | neutone_parameters_metadata: Dict[ 23 | str, Dict[str, Union[int, float, str, bool, List[str], List[int]]] 24 | ] 25 | neutone_parameter_names: List[str] 26 | 27 | def __init__(self, model: nn.Module, use_debug_mode: bool = True) -> None: 28 | """ 29 | Creates an Neutone model, wrapping a child model (that does the real 30 | work). 31 | """ 32 | super().__init__() 33 | 34 | # Save and prepare model. This should be done at the very beginning of the 35 | # constructor to enable accessing the model in other methods of this class. 36 | model.eval() 37 | self.model = model 38 | 39 | self.MAX_N_PARAMS = self._get_max_n_params() 40 | self.SDK_VERSION = constants.SDK_VERSION 41 | self.CURRENT_TIME = time.time() 42 | self.use_debug_mode = use_debug_mode 43 | self.n_neutone_parameters = len(self.get_neutone_parameters()) 44 | 45 | # Ensure the number of parameters is within the allowed limit 46 | assert self.n_neutone_parameters <= self.MAX_N_PARAMS, ( 47 | f"Number of parameters ({self.n_neutone_parameters}) exceeds the maximum " 48 | f"allowed ({self.MAX_N_PARAMS})." 49 | ) 50 | # Ensure parameter names are unique 51 | assert len(set([p.name for p in self.get_neutone_parameters()])) == len( 52 | self.get_neutone_parameters() 53 | ) 54 | 55 | # Save parameter metadata 56 | self.neutone_parameters_metadata = { 57 | f"p{idx + 1}": p.to_metadata() 58 | for idx, p in enumerate(self.get_neutone_parameters()) 59 | } 60 | 61 | # Allocate default params buffer to prevent dynamic allocations later 62 | default_vals_0to1 = self._get_numerical_params_default_values_0to1() 63 | n_numerical_params = default_vals_0to1.size(0) 64 | assert n_numerical_params <= self.MAX_N_PARAMS, ( 65 | f"Number of default param values ({n_numerical_params}) " 66 | f"exceeds the maximum allowed ({self.MAX_N_PARAMS})." 
67 |         )
68 |         default_vals_0to1 = default_vals_0to1.view(n_numerical_params, 1)
69 |         self.register_buffer("numerical_params_default_values_0to1", default_vals_0to1)
70 | 
71 |         # Save parameter information
72 |         self.neutone_parameter_names = [p.name for p in self.get_neutone_parameters()]
73 | 
74 |     @abstractmethod
75 |     def _get_max_n_params(self) -> int:
76 |         """
77 |         Sets the maximum number of parameters that the model can have.
78 |         This should not be overridden by SDK users.
79 |         """
80 |         pass
81 | 
82 |     @abstractmethod
83 |     def _get_numerical_params_default_values_0to1(
84 |         self,
85 |     ) -> Tensor:
86 |         """
87 |         Returns a float tensor with the default values of the numerical parameters
88 |         in the range [0, 1].
89 |         This should not be overridden by SDK users.
90 |         """
91 |         pass
92 | 
93 |     @abstractmethod
94 |     def get_model_name(self) -> str:
95 |         """
96 |         Used to set the model name. This will be displayed on both the
97 |         website and the plugin.
98 | 
99 |         Maximum length of 30 characters.
100 |         """
101 |         pass
102 | 
103 |     @abstractmethod
104 |     def get_model_authors(self) -> List[str]:
105 |         """
106 |         Used to set the model authors. This will be displayed on both the
107 |         website and the plugin.
108 | 
109 |         Should reflect the name of the people that developed the wrapper
110 |         of the model using the SDK. Can be different from the authors of
111 |         the original model.
112 | 
113 |         Maximum of 5 authors.
114 |         """
115 |         pass
116 | 
117 |     @abstractmethod
118 |     def get_model_short_description(self) -> str:
119 |         """
120 |         Used to set the model short description. This will be displayed on both
121 |         the website and the plugin.
122 | 
123 |         This is meant to be seen by the audio creators and should give a summary
124 |         of what the model does.
125 | 
126 |         Maximum of 150 characters.
127 |         """
128 |         pass
129 | 
130 |     @abstractmethod
131 |     def get_model_long_description(self) -> str:
132 |         """
133 |         Used to set the model long description. This will be displayed only on
134 |         the website.
135 | 
136 |         This is meant to be seen by the audio creators and should give an extensive
137 |         description of what the model does. Could describe interesting uses of the
138 |         model, good combinations of parameters, what types of audio it has been
139 |         tested with, etc.
140 | 
141 |         Maximum of 500 characters.
142 |         """
143 |         pass
144 | 
145 |     @abstractmethod
146 |     def get_technical_description(self) -> str:
147 |         """
148 |         Used to set the model technical description. This will be displayed only on
149 |         the website.
150 | 
151 |         This is meant to be seen by other researchers or people that want to develop
152 |         similar models. It could present a summary of the internals of the model:
153 |         what architecture it is based on, what kind of data it was trained with,
154 |         on what kind of hardware.
155 | 
156 |         If the authors of the plugin are different from the authors of the model(s)
157 |         included, this section, along with the citation and technical links, is the
158 |         place to provide appropriate credits.
159 | 
160 |         Maximum of 500 characters.
161 |         """
162 |         pass
163 | 
164 |     @abstractmethod
165 |     def get_tags(self) -> List[str]:
166 |         """
167 |         Used to provide a list of tags. This will be displayed on the website and will
168 |         be used later on for filtering of similar models.
169 | 
170 |         Maximum of 7 tags of 15 characters each.
171 |         """
172 |         pass
173 | 
174 |     @abstractmethod
175 |     def get_model_version(self) -> str:
176 |         """
177 |         Used to set the model version. This will be displayed on both the website and the plugin.
178 | 
179 |         We suggest people use semantic versioning for their models, but in a lot of cases it can
180 |         be overkill. For now we only support showing the latest version of the model.
181 | 
182 |         Please provide a string like "1", "1.0", "1.0.0", "0.1.0" etc.
183 |         """
184 |         pass
185 | 
186 |     @abstractmethod
187 |     def is_experimental(self) -> bool:
188 |         """
189 |         Used to set the experimental flag. This will be displayed on both the website and the plugin.
190 | 
191 |         If this flag is set, the model will have a special icon next to it, signaling to the
192 |         users of the plugin that this is an experimental release.
193 |         """
194 |         pass
195 | 
196 |     def get_technical_links(self) -> Dict[str, str]:
197 |         """
198 |         Used to set the technical links. These will be displayed only on the website.
199 | 
200 |         Under the technical description field, the following links can be displayed as buttons.
201 |         This can be used to provide links to the implementation, to a scientific paper, to personal websites etc.
202 | 
203 |         While any key-value pair can be provided, we strongly encourage users to provide a dictionary
204 |         with keys such as Paper, Code, Personal, GitHub, Blog, Twitter, Instagram etc.
205 | 
206 |         Maximum of 3 links.
207 |         """
208 |         return {}
209 | 
210 |     def get_citation(self) -> str:
211 |         """
212 |         Used to set the citation. This will be displayed only on the website.
213 | 
214 |         This field is specifically meant to display the citation for a scientific paper that the model
215 |         is based on, if any. Will be displayed under the technical links. Can be left empty.
216 | 
217 |         Maximum of 150 characters.
218 |         """
219 |         return ""
220 | 
221 |     def get_neutone_parameters(self) -> List[NeutoneParameter]:
222 |         return []
223 | 
224 |     def prepare_for_inference(self) -> None:
225 |         """Prepare a model for inference and to be converted to torchscript."""
226 |         self.use_debug_mode = False
227 |         self.model.eval()
228 |         self.eval()
229 | 
230 |     @tr.jit.export
231 |     def get_neutone_parameters_metadata(
232 |         self,
233 |     ) -> Dict[str, Dict[str, Union[int, float, str, bool, List[str], List[int]]]]:
234 |         """
235 |         Returns the metadata of the parameters as a dictionary of ParameterMetadata
236 |         named tuples.
237 |         """
238 |         return self.neutone_parameters_metadata
239 | 
240 |     @tr.jit.export
241 |     def get_numerical_params_default_values_0to1(self) -> Tensor:
242 |         """
243 |         Returns the default parameter values as a tensor of shape
244 |         (n_numerical_params, 1).
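
        For example (hypothetical values), a model whose three continuous
        parameters default to 0.15, 0.15 and 1.0 would return
        tensor([[0.15], [0.15], [1.0]]), i.e. shape (3, 1).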
245 | """ 246 | return self.numerical_params_default_values_0to1 247 | 248 | @tr.jit.export 249 | def get_wet_default_value(self) -> float: 250 | return 1.0 251 | 252 | @tr.jit.export 253 | def get_dry_default_value(self) -> float: 254 | return 0.0 255 | 256 | @tr.jit.export 257 | def get_input_gain_default_value(self) -> float: 258 | """[0.0, 1.0] here maps to [-30.0db, +30.0db]""" 259 | return 0.5 260 | 261 | @tr.jit.export 262 | def get_output_gain_default_value(self) -> float: 263 | """[0.0, 1.0] here maps to [-30.0db, +30.0db]""" 264 | return 0.5 265 | 266 | @tr.jit.export 267 | def get_core_preserved_attributes(self) -> List[str]: 268 | return [ 269 | "model", # nn.Module 270 | "get_neutone_parameters_metadata", 271 | "get_numerical_params_default_values_0to1", 272 | "get_wet_default_value", 273 | "get_dry_default_value", 274 | "get_input_gain_default_value", 275 | "get_output_gain_default_value", 276 | "get_core_preserved_attributes", 277 | "to_core_metadata", 278 | ] 279 | 280 | @tr.jit.export 281 | def to_core_metadata(self) -> Dict[str, Any]: 282 | return { 283 | "model_name": self.get_model_name(), 284 | "model_authors": self.get_model_authors(), 285 | "model_short_description": self.get_model_short_description(), 286 | "model_long_description": self.get_model_long_description(), 287 | "neutone_parameters": self.get_neutone_parameters_metadata(), 288 | "wet_default_value": self.get_wet_default_value(), 289 | "dry_default_value": self.get_dry_default_value(), 290 | "input_gain_default_value": self.get_input_gain_default_value(), 291 | "output_gain_default_value": self.get_output_gain_default_value(), 292 | "technical_description": self.get_technical_description(), 293 | "technical_links": self.get_technical_links(), 294 | "tags": self.get_tags(), 295 | "model_version": self.get_model_version(), 296 | "sdk_version": self.SDK_VERSION, 297 | "pytorch_version": tr.__version__, 298 | "date_created": self.CURRENT_TIME, 299 | "citation": self.get_citation(), 300 | "is_experimental": self.is_experimental(), 301 | } 302 | -------------------------------------------------------------------------------- /neutone_sdk/filters.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List, Optional 3 | from enum import Enum 4 | import warnings 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | """ 11 | Filters for pre-filtering inputs to models such as RAVE. 12 | """ 13 | 14 | 15 | class FilterType(Enum): 16 | LOWPASS = "lowpass" 17 | HIGHPASS = "highpass" 18 | BANDPASS = "bandpass" 19 | BANDSTOP = "bandstop" 20 | 21 | 22 | class FIRFilter(nn.Module): 23 | def __init__( 24 | self, 25 | filt_type: FilterType, 26 | cutoffs: List[float], 27 | filt_size: int = 257, 28 | ): 29 | """Streamable FIR filter for pre-filtering of model inputs, etc. 30 | 31 | Args: 32 | filt_type (FilterType): Type of the filter (FilterType.LOWPASS/HIGHPASS/BANDPASS/BANDSTOP). 33 | cutoffs (List[float]): Cutoff frequencies (in Hz). 2 should be given if bandpass/stop 34 | sample_rate (int): Sampling rate 35 | filt_size (int, optional): Length of the FIR. Defaults to 257. 
36 | """ 37 | super().__init__() 38 | # register buffer only allowed once 39 | self.register_buffer("cache", torch.zeros(2, filt_size - 1)) 40 | self.register_buffer("ir_windowed", torch.empty(1, 1, filt_size)) 41 | # Pass in fake sample rate for filter 42 | # Sample rate should be automatically overwritten by calling 43 | # set_parameters() from w2wbase.set_model_sample_rate_and_buffer_size() 44 | self.set_parameters(filt_type, cutoffs, 48000, filt_size) 45 | 46 | def set_parameters( 47 | self, 48 | filt_type: Optional[FilterType] = None, 49 | cutoffs: Optional[List[float]] = None, 50 | sample_rate: Optional[int] = None, 51 | filt_size: Optional[int] = None, 52 | ): 53 | filt_type = self.filt_type if filt_type is None else filt_type 54 | cutoffs = self.cutoffs if cutoffs is None else cutoffs 55 | sample_rate = self.sample_rate if sample_rate is None else sample_rate 56 | filt_size = self.filt_size if filt_size is None else filt_size 57 | if len(cutoffs) == 2: 58 | if filt_type.value in [FilterType.HIGHPASS.value, FilterType.LOWPASS.value]: 59 | raise ValueError( 60 | f"only 1 cutoff value supported for filter type: {filt_type}" 61 | ) 62 | else: 63 | if filt_type.value in [ 64 | FilterType.BANDPASS.value, 65 | FilterType.BANDSTOP.value, 66 | ]: 67 | raise ValueError( 68 | f"2 cutoff values (low, high) needed for filter type: {filt_type}" 69 | ) 70 | # create frequency response by frequency sampling 71 | freqs = torch.fft.rfftfreq(filt_size, 1 / sample_rate) 72 | 73 | if filt_type == FilterType.HIGHPASS: 74 | freq_resp = torch.where((freqs > cutoffs[0]), 1.0, 0.0).float() 75 | elif filt_type == FilterType.LOWPASS: 76 | freq_resp = torch.where((freqs < cutoffs[0]), 1.0, 0.0).float() 77 | elif filt_type == FilterType.BANDPASS: 78 | freq_resp = torch.where( 79 | torch.logical_and(freqs > cutoffs[0], freqs < cutoffs[1]), 1.0, 0.0 80 | ).float() 81 | elif filt_type == FilterType.BANDSTOP: 82 | freq_resp = torch.where( 83 | torch.logical_or(freqs < cutoffs[0], freqs > cutoffs[1]), 1.0, 0.0 84 | ).float() 85 | else: 86 | raise ValueError(f"Unrecognized filter type: {filt_type.value}") 87 | # create impulse response by windowing 88 | ir = torch.fft.irfft(freq_resp, n=filt_size, dim=-1) 89 | filter_window = torch.kaiser_window(filt_size, dtype=torch.float32).roll( 90 | filt_size // 2, -1 91 | ) 92 | self.ir_windowed = (filter_window * ir)[None, None, :].to( 93 | self.ir_windowed.device 94 | ) 95 | self.filt_type = filt_type 96 | self.cutoffs = cutoffs 97 | self.sample_rate = sample_rate 98 | self.filt_size = filt_size 99 | self.delay = filt_size // 2 # constant group delay 100 | 101 | def forward( 102 | self, 103 | audio: torch.Tensor, 104 | ): 105 | """Process audio with filter 106 | 107 | Args: 108 | audio (torch.Tensor): input audio [n_channels, n_samples] 109 | 110 | Returns: 111 | torch.Tensor: filtered audio 112 | """ 113 | n_channels, orig_len = audio.shape 114 | # standard convolution implementation 115 | # pad input with cache 116 | audio = torch.cat([self.cache[:n_channels], audio], dim=-1) 117 | self.cache = audio[:, -(self.filt_size - 1) :] 118 | filtered = F.conv1d( 119 | audio[:, None, :], 120 | self.ir_windowed, 121 | padding="valid", 122 | ).squeeze(1) 123 | return filtered 124 | 125 | 126 | class IIRFilter(nn.Module): 127 | def __init__( 128 | self, 129 | filt_type: FilterType, 130 | cutoff: float, 131 | resonance: float, 132 | ): 133 | """Time-invariant IIR filter 134 | 135 | Args: 136 | filt_type (FilterType): Type of the filter (FilterType.LOWPASS/HIGHPASS/BANDPASS). 
137 |             cutoff (float): Cutoff frequency in Hz (0 < cutoff < f_nyq)
138 |             resonance (float): Filter resonance, controls bandwidth in case of bandpass
139 |                 (Note: the sample rate is not a constructor argument; it is set via set_parameters().)
140 |         """
141 |         super().__init__()
142 |         # register buffer only allowed once
143 |         self.register_buffer("g", torch.empty(1, 1, 1))
144 |         self.register_buffer("twoR", torch.empty(1, 1, 1) / resonance)
145 |         self.register_buffer("mix", torch.empty(1, 1, 3))
146 |         # Pass in fake sample rate for filter
147 |         # Sample rate should be automatically overwritten by calling
148 |         # set_parameters() from w2wbase.set_model_sample_rate_and_buffer_size()
149 |         self.set_parameters(filt_type, cutoff, resonance, 48000)
150 |         self.svf = _SVFLayer()
151 | 
152 |     def set_parameters(
153 |         self,
154 |         filt_type: Optional[FilterType] = None,
155 |         cutoff: Optional[float] = None,
156 |         resonance: Optional[float] = None,
157 |         sample_rate: Optional[int] = None,
158 |     ):
159 |         filt_type = self.filt_type if filt_type is None else filt_type
160 |         cutoff = self.cutoff if cutoff is None else cutoff
161 |         resonance = self.resonance if resonance is None else resonance
162 |         sample_rate = self.sample_rate if sample_rate is None else sample_rate
163 | 
164 |         cutoff = max(min(cutoff, sample_rate / 2 - 1e-4), 1e-4)
165 |         resonance = max(resonance, 1e-4)
166 |         # frequency warping
167 |         self.g = torch.ones(1, 1, 1, device=self.g.device) * math.tan(
168 |             math.pi / sample_rate * cutoff
169 |         )
170 |         self.twoR = torch.ones(1, 1, 1, device=self.twoR.device) / resonance
171 |         if filt_type == FilterType.LOWPASS:
172 |             self.mix = torch.tensor([[[0.0, 1.0, 0.0]]], device=self.mix.device)
173 |         elif filt_type == FilterType.HIGHPASS:
174 |             self.mix = torch.tensor([[[0.0, 0.0, 1.0]]], device=self.mix.device)
175 |         elif filt_type == FilterType.BANDPASS:
176 |             self.mix = torch.tensor([[[1.0, 0.0, 0.0]]], device=self.mix.device)
177 |         else:
178 |             raise ValueError(f"Unrecognized filter type: {filt_type}")
179 |         self.filt_type = filt_type
180 |         self.cutoff = cutoff
181 |         self.resonance = resonance
182 |         self.sample_rate = sample_rate
183 |         self.delay = 0
184 | 
185 |     def forward(self, audio: torch.Tensor):
186 |         """Pass audio through the filter (lowpass/highpass/bandpass).
187 | 
188 |         Args:
189 |             audio (torch.Tensor): [batch_size (or n_channels), n_samples]
190 |         """
191 |         batch_size, n_samples = audio.shape
192 |         g = self.g.expand(n_samples, batch_size, -1)
193 |         twoR = self.twoR.expand(n_samples, batch_size, -1)
194 |         mix = self.mix.expand(n_samples, batch_size, -1)
195 |         return self.svf(audio.permute(1, 0), g, twoR, mix)
196 | 
197 | 
198 | class IIRSVF(nn.Module):
199 |     def __init__(self):
200 |         """
201 |         Time-varying SVF with IIRs
202 |         """
203 |         super().__init__()
204 |         self.svf = _SVFLayer()
205 |         self.delay = 0
206 | 
207 |     def forward(
208 |         self,
209 |         audio: torch.Tensor,
210 |         cutoff: torch.Tensor,
211 |         resonance: torch.Tensor,
212 |         mix: torch.Tensor,
213 |         sample_rate: int,
214 |     ):
215 |         """Feed into time-varying svf
216 | 
217 |         Args:
218 |             audio (torch.Tensor): Input audio [batch_size (or n_channels), n_samples]
219 |             cutoff (torch.Tensor): Cutoff frequency [batch_size, n_samples, 1]
220 |             resonance (torch.Tensor): Resonance (0 ~ 1), [batch_size, n_samples, 1]
221 |             mix (torch.Tensor): Mix coeff. bp, lp and hp [batch_size, n_samples, 3], e.g.
[[[1.0, 0.0, 0.0]]] = bandpass 222 | 223 | Returns: 224 | audio (torch.Tensor): [n_channels, n_samples] 225 | """ 226 | cutoff = torch.clamp(cutoff, min=1e-4, max=sample_rate / 2 - 1e-4) 227 | resonance = torch.clamp(resonance, min=1e-4) 228 | g = torch.tan(math.pi / sample_rate * cutoff).permute(1, 0, 2) 229 | twoR = 1 / resonance.permute(1, 0, 2) 230 | mix = mix.permute(1, 0, 2) 231 | return self.svf(audio.permute(1, 0), g, twoR, mix) 232 | 233 | 234 | class _SVFLayer(nn.Module): 235 | """ 236 | SVF implementation based on "Time-varying filters for musical applications" [Wishnick, 2014] 237 | NOTE: This SVF is slow for use in training due to recurrent operations 238 | """ 239 | 240 | def __init__(self): 241 | super().__init__() 242 | self.register_buffer("state", torch.zeros(1, 2)) 243 | self.register_buffer("Y", torch.empty(4096, 2, 2)) 244 | 245 | def forward( 246 | self, 247 | audio: torch.Tensor, 248 | g: torch.Tensor, 249 | twoR: torch.Tensor, 250 | mix: torch.Tensor, 251 | ): 252 | """pass audio through SVF 253 | Args: 254 | *** time-first, batch-second *** 255 | audio (torch.Tensor): [n_samples, batch_size] 256 | All filter parameters are [n_samples, batch_size, 1 or 3 (mix)] 257 | g (torch.Tensor): Normalized cutoff parameter 258 | twoR (torch.Tensor): Damping parameter 259 | mix (torch.Tensor): Mixing coefficient of bp, lp and hp 260 | 261 | Returns: 262 | [torch.Tensor]: Filtered audio. Shape [batch, n_samples] 263 | """ 264 | seq_len, batch_size = audio.shape 265 | T = 1.0 / (1.0 + g * (g + twoR)) 266 | H = T.unsqueeze(-1) * torch.cat( 267 | [torch.ones_like(g), -g, g, twoR * g + 1], dim=-1 268 | ).reshape(seq_len, batch_size, 2, 2) 269 | 270 | # Y = gHBx + Hs 271 | gHB = g * T * torch.cat([torch.ones_like(g), g], dim=-1) 272 | # [n_samples, batch_size, 2] 273 | gHBx = gHB * audio.unsqueeze(-1) 274 | if seq_len > self.Y.shape[0]: 275 | self.Y = torch.empty(seq_len, 2, 2, device=self.Y.device) 276 | Y = self.Y[:seq_len, :batch_size, :] 277 | # initialize filter state 278 | state = self.state.expand(batch_size, -1) 279 | for t in range(seq_len): 280 | Y[t] = gHBx[t] + torch.bmm(H[t], state.unsqueeze(-1)).squeeze(-1) 281 | state = 2 * Y[t] - state 282 | self.state = state 283 | 284 | # HP = x - 2R*BP - LP 285 | y_hps = audio - twoR.squeeze(-1) * Y[:, :, 0] - Y[:, :, 1] 286 | 287 | y_mixed = ( 288 | twoR.squeeze(-1) * mix[:, :, 0] * Y[:, :, 0] 289 | + mix[:, :, 1] * Y[:, :, 1] 290 | + mix[:, :, 2] * y_hps 291 | ) 292 | y_mixed = y_mixed.permute(1, 0) 293 | return y_mixed 294 | -------------------------------------------------------------------------------- /neutone_sdk/gcn_1d.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional 4 | 5 | import torch 6 | from torch import Tensor 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | logging.basicConfig() 11 | log = logging.getLogger(__name__) 12 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 13 | 14 | 15 | class TFiLM(nn.Module): 16 | """Temporal Feature-wise Linear Modulation (TFiLM) layer. 17 | 18 | Parameters: 19 | n_channels (int): Number of channels in the input signal. 20 | cond_dim (int): Dimensionality of the conditional input. 21 | tfilm_block_size (int): Size of the temporal blocks. 22 | rnn_type (str, optional): Type of RNN to use for the modulation. 23 | 24 | Returns: 25 | Tensor: The output of the TFiLM layer. 
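
    Example (a minimal sketch; values are hypothetical):
        tfilm = TFiLM(n_channels=16, cond_dim=2, tfilm_block_size=128)
        y = tfilm(torch.randn(1, 16, 1024), cond=torch.randn(1, 2))  # (1, 16, 1024)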
26 | """ 27 | 28 | def __init__( 29 | self, 30 | n_channels: int, 31 | cond_dim: int, 32 | tfilm_block_size: int, 33 | rnn_type: str = "lstm", 34 | ) -> None: 35 | super().__init__() 36 | self.nchannels = n_channels 37 | self.cond_dim = cond_dim 38 | self.tfilm_block_size = tfilm_block_size 39 | self.num_layers = 1 40 | self.first_run = True 41 | self.hidden_state = ( 42 | torch.Tensor(0), 43 | torch.Tensor(0), 44 | ) # (hidden_state, cell_state) 45 | 46 | self.maxpool = torch.nn.MaxPool1d( 47 | kernel_size=tfilm_block_size, 48 | stride=None, 49 | padding=0, 50 | dilation=1, 51 | return_indices=False, 52 | ceil_mode=False, 53 | ) 54 | 55 | rnn_types = {"lstm": torch.nn.LSTM, "gru": torch.nn.GRU} 56 | 57 | try: 58 | RNN = rnn_types[rnn_type.lower()] 59 | self.rnn = RNN( 60 | input_size=n_channels + cond_dim, 61 | hidden_size=n_channels, 62 | num_layers=self.num_layers, 63 | batch_first=True, 64 | bidirectional=False, 65 | ) 66 | except KeyError: 67 | raise ValueError(f"Invalid rnn_type. Use 'lstm' or 'gru'. Got {rnn_type}") 68 | 69 | def forward(self, x: Tensor , cond: Optional[Tensor] = None) -> Tensor: 70 | x_in_shape = x.shape # (batch_size, n_channels, samples) 71 | 72 | # Pad input to be divisible by tfilm_block_size 73 | if (x_in_shape[2] % self.tfilm_block_size) != 0: 74 | padding = torch.zeros( 75 | x_in_shape[0], 76 | x_in_shape[1], 77 | self.tfilm_block_size - (x_in_shape[2] % self.tfilm_block_size), 78 | ) 79 | x = torch.cat((x, padding), dim=-1) 80 | 81 | x_shape = x.shape 82 | n_steps = int(x_shape[-1] / self.tfilm_block_size) 83 | 84 | x_down = self.maxpool(x) # (batch_size, n_channels, n_steps) 85 | 86 | if cond is not None: 87 | cond_up = cond.unsqueeze(-1) 88 | cond_up = cond_up.repeat(1, 1, n_steps) # (batch_size, cond_dim, n_steps) 89 | x_down = torch.cat( 90 | (x_down, cond_up), dim=1 91 | ) # (batch_size, n_channels + cond_dim, n_steps) 92 | 93 | # Put shape to (n_steps, batch_size, n_channels + cond_dim) 94 | x_down = x_down.permute(2, 0, 1) 95 | 96 | # Modulation 97 | if self.first_run: # Reset hidden state 98 | x_norm, self.hidden_state = self.rnn(x_down, None) 99 | self.first_run = False 100 | else: 101 | x_norm, self.hidden_state = self.rnn(x_down, self.hidden_state) 102 | 103 | # Put shape back to (batch_size, n_channels, length) 104 | x_norm = x_norm.permute(1, 2, 0) 105 | 106 | # Reshape input and modulation sequence into blocks 107 | x_in = torch.reshape( 108 | x, shape=(-1, self.nchannels, n_steps, self.tfilm_block_size) 109 | ) 110 | x_norm = torch.reshape(x_norm, shape=(-1, self.nchannels, n_steps, 1)) 111 | 112 | x_out = x_norm * x_in 113 | 114 | # Return to the original padded input shape 115 | x_out = torch.reshape(x_out, shape=(x_shape)) 116 | 117 | x_out = x_out[..., : x_in_shape[2]] # Remove padding 118 | 119 | return x_out 120 | 121 | def reset_state(self) -> None: 122 | self.first_run = True 123 | 124 | 125 | class Conv1dCausal(nn.Module): 126 | """Causal 1D convolutional layer 127 | ensures outputs depend only on current and past inputs. 128 | 129 | Parameters: 130 | in_channels (int): Number of channels in the input signal. 131 | out_channels (int): Number of channels produced by the convolution. 132 | kernel_size (int): Size of the convolving kernel. 133 | stride (int): Stride of the convolution. 134 | dilation (int, optional): Spacing between kernel elements. 135 | bias (bool, optional): If True, adds a learnable bias to the output. 136 | 137 | Returns: 138 | Tensor: The output of the causal 1D convolutional layer. 
139 | """ 140 | 141 | def __init__( 142 | self, 143 | in_channels: int, 144 | out_channels: int, 145 | kernel_size: int, 146 | stride: int, 147 | dilation: int = 1, 148 | bias: bool = True, 149 | ) -> None: 150 | super().__init__() 151 | self.padding = ( 152 | kernel_size - 1 153 | ) * dilation # input_len == output_len when stride=1 154 | self.in_channels = in_channels 155 | self.conv = nn.Conv1d( 156 | in_channels, 157 | out_channels, 158 | (kernel_size,), 159 | (stride,), 160 | padding=0, 161 | dilation=(dilation,), 162 | bias=bias, 163 | ) 164 | 165 | def forward(self, x: Tensor) -> Tensor: 166 | x = F.pad(x, (self.padding, 0)) # standard zero padding 167 | x = self.conv(x) 168 | return x 169 | 170 | 171 | class GatedAF(nn.Module): 172 | """Gated activation function 173 | applies a tanh activation to one half of the input 174 | and a sigmoid activation to the other half, and then multiplies them element-wise. 175 | 176 | Returns: 177 | Tensor: The output of the gated activation function. 178 | """ 179 | 180 | def __init__(self) -> None: 181 | super().__init__() 182 | 183 | def forward(self, x: Tensor) -> Tensor: 184 | x_tanh, x_sigmoid = x.chunk(2, dim=1) # Split the output into two halves 185 | 186 | x_tanh = torch.tanh(x_tanh) # Apply tanh activation 187 | x_sigmoid = torch.sigmoid(x_sigmoid) # Apply sigmoid activation 188 | 189 | # Element-wise multiplication of tanh and sigmoid activations 190 | x = x_tanh * x_sigmoid 191 | return x 192 | 193 | 194 | class GCN1DBlock(nn.Module): 195 | """Single block of a Gated Convolutional Network (GCN) with conditional modulation. 196 | 197 | Parameters: 198 | in_ch (int): Number of input channels. 199 | out_ch (int): Number of output channels. 200 | kernel_size (int, optional): Size of the convolution kernel. 201 | dilation (int, optional): Dilation rate for dilated convolutions. 202 | stride (int, optional): Stride for the convolution. 203 | cond_dim (int, optional): Dimensionality of the conditional input for FiLM. 
204 | """ 205 | 206 | def __init__( 207 | self, 208 | in_ch: int, 209 | out_ch: int, 210 | kernel_size: int = 3, 211 | dilation: int = 1, 212 | stride: int = 1, 213 | cond_dim: int = 0, 214 | rnn_type: str = "lstm", 215 | tfilm_block_size: int = 128, 216 | use_bias_in_conv: bool = False, 217 | ) -> None: 218 | super().__init__() 219 | 220 | self.conv = Conv1dCausal( 221 | in_channels=in_ch, 222 | out_channels=out_ch * 2, # adapt for the Gated Activation Function 223 | kernel_size=kernel_size, 224 | stride=stride, 225 | dilation=dilation, 226 | bias=use_bias_in_conv, 227 | ) 228 | 229 | self.tfilm = None 230 | if cond_dim > 0: 231 | self.tfilm = TFiLM( 232 | n_channels=out_ch * 2, 233 | cond_dim=cond_dim, 234 | tfilm_block_size=tfilm_block_size, 235 | rnn_type=rnn_type, 236 | ) 237 | 238 | self.gated_activation = GatedAF() 239 | 240 | self.res = nn.Conv1d( 241 | in_channels=in_ch, out_channels=out_ch, kernel_size=(1,), bias=False 242 | ) 243 | 244 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 245 | x_in = x 246 | x = self.conv(x) # Apply causal convolution 247 | if ( 248 | cond is not None and self.tfilm is not None 249 | ): # Apply FiLM if conditional input is given 250 | x = self.tfilm(x, cond) 251 | # Apply gated activation function 252 | x = self.gated_activation(x) 253 | # Apply residual convolution and add to output 254 | x_res = self.res(x_in) 255 | x = x + x_res 256 | return x 257 | 258 | 259 | class GCN1D(nn.Module): 260 | """Gated Convolutional Network (GCN) model, re-implemented from the paper: 261 | https://arxiv.org/abs/2211.00497 262 | 263 | Parameters: 264 | in_ch (int, optional): Number of input channels. 265 | out_ch (int, optional): Number of output channels. 266 | n_blocks (int, optional): Number of GCN blocks. 267 | n_channels (int, optional): Number of channels in the GCN blocks. 268 | dilation_growth (int, optional): Growth rate for dilation in the GCN blocks. 269 | kernel_size (int, optional): Size of the convolution kernel. 270 | cond_dim (int, optional): Dimensionality of the conditional input for FiLM. 271 | 272 | Returns: 273 | Tensor: The output of the GCN model. 
274 | """ 275 | 276 | def __init__( 277 | self, 278 | in_ch: int = 1, 279 | out_ch: int = 1, 280 | n_blocks: int = 10, 281 | n_channels: int = 64, 282 | dil_growth: int = 4, 283 | kernel_size: int = 13, 284 | cond_dim: int = 0, 285 | tfilm_block_size: int = 128, 286 | rnn_type: str = "lstm", 287 | use_act: bool = True, 288 | use_bias_in_conv: bool = False, 289 | ) -> None: 290 | super().__init__() 291 | self.kernel_size = kernel_size 292 | self.n_channels = n_channels 293 | self.dil_growth = dil_growth 294 | self.n_blocks = n_blocks 295 | self.cond_dim = cond_dim 296 | self.use_act = use_act 297 | self.use_bias_in_conv = use_bias_in_conv 298 | 299 | # Compute convolution channels and dilations 300 | self.channels = [n_channels] * n_blocks 301 | self.dilations = [dil_growth**idx for idx in range(n_blocks)] 302 | 303 | # Blocks number is given by the number of elements in the channels list 304 | self.n_blocks = len(self.channels) 305 | assert len(self.dilations) == self.n_blocks 306 | 307 | # Create a list of strides 308 | self.strides = [1] * self.n_blocks 309 | 310 | # Create a list of GCN blocks 311 | self.blocks = nn.ModuleList() 312 | block_out_ch = 0 313 | 314 | for idx, (curr_out_ch, dil, stride) in enumerate( 315 | zip(self.channels, self.dilations, self.strides) 316 | ): 317 | block_out_ch = curr_out_ch 318 | if idx == 0: 319 | block_in_ch = in_ch 320 | else: 321 | block_in_ch = block_out_ch 322 | 323 | self.blocks.append( 324 | GCN1DBlock( 325 | block_in_ch, 326 | block_out_ch, 327 | self.kernel_size, 328 | dilation=dil, 329 | stride=stride, 330 | cond_dim=cond_dim, 331 | tfilm_block_size=tfilm_block_size, 332 | rnn_type=rnn_type, 333 | use_bias_in_conv=use_bias_in_conv, 334 | ) 335 | ) 336 | 337 | # Output layer 338 | self.out_net = nn.Conv1d( 339 | self.channels[-1], out_ch, kernel_size=(1,), stride=(1,), bias=False 340 | ) 341 | 342 | # Activation function 343 | self.act = nn.Tanh() 344 | 345 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 346 | assert x.ndim == 3 # (batch_size, in_ch, samples) 347 | if cond is not None: 348 | assert cond.ndim == 2 # (batch_size, cond_dim) 349 | for block in self.blocks: # Apply GCN blocks 350 | x = block(x, cond) 351 | x = self.out_net(x) # Apply output layer 352 | 353 | if self.act is not None: 354 | x = self.act(x) # Apply tanh activation function 355 | return x 356 | 357 | def calc_receptive_field(self) -> int: 358 | """Calculate the receptive field of the model. 359 | The receptive field is the number of input samples that affect the output of a block. 360 | 361 | The receptive field of the model is the sum of the receptive fields of all layers: 362 | RF = 1 + \sum_{i=1}^{n}(kernel\_size_i - 1) \cdot dilation_i 363 | 364 | i is the layer index, n is the number of layers. 365 | 366 | Returns: 367 | int: The receptive field of the model. 
368 | """ 369 | assert all(_ == 1 for _ in self.strides) # TODO(cm): add support for dsTCN 370 | assert self.dilations[0] == 1 # TODO(cm): add support for >1 starting dilation 371 | rf = self.kernel_size 372 | for dil in self.dilations[1:]: 373 | rf = rf + ((self.kernel_size - 1) * dil) 374 | return rf 375 | 376 | -------------------------------------------------------------------------------- /neutone_sdk/metadata.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import requests 5 | from jsonschema import validate, ValidationError 6 | from jsonschema._keywords import anyOf 7 | 8 | from neutone_sdk.audio import AudioSample 9 | 10 | logging.basicConfig() 11 | log = logging.getLogger(__name__) 12 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 13 | 14 | SCHEMA = { 15 | "type": "object", 16 | "properties": { 17 | "model_name": { 18 | "type": "string", 19 | "maxLength": 30, 20 | }, 21 | "model_authors": { 22 | "type": "array", 23 | "maxItems": 5, 24 | "items": {"type": "string"}, 25 | "uniqueItems": True, 26 | }, 27 | "model_version": {"type": "string"}, 28 | "model_short_description": {"type": "string", "maxLength": 150}, 29 | "model_long_description": {"type": "string", "maxLength": 500}, 30 | "technical_description": {"type": "string", "maxLength": 500}, 31 | "technical_links": { 32 | "type": "object", 33 | "additionalProperties": { 34 | "type": "string", 35 | }, 36 | "maxItems": 3, 37 | }, 38 | "tags": { 39 | "type": "array", 40 | "maxItems": 7, 41 | "items": {"type": "string"}, 42 | "uniqueItems": True, 43 | "maxLength": 15, 44 | }, 45 | "citation": { 46 | "type": "string", 47 | "maxLength": 300, 48 | }, 49 | "is_experimental": { 50 | "type": "boolean", 51 | }, 52 | "model_id": {"type": "string"}, 53 | "file_size": {"type": "integer"}, 54 | "sample_sound_files": { 55 | "type": "array", 56 | "items": { 57 | "type": "object", 58 | "required": ["in", "out"], 59 | "properties": { 60 | "in": {"type": "string"}, 61 | "out": {"type": "string"}, 62 | }, 63 | }, 64 | "maxItems": 3, 65 | }, 66 | "neutone_parameters": { 67 | "type": "object", 68 | anyOf: [ 69 | {"required": ["p1"]}, 70 | {"required": ["p1", "p2"]}, 71 | {"required": ["p1", "p2", "p3"]}, 72 | {"required": ["p1", "p2", "p3", "p4"]}, 73 | ], 74 | "properties": { 75 | "p1": {"$ref": "#/definitions/neutoneParameter"}, 76 | "p2": {"$ref": "#/definitions/neutoneParameter"}, 77 | "p3": {"$ref": "#/definitions/neutoneParameter"}, 78 | "p4": {"$ref": "#/definitions/neutoneParameter"}, 79 | }, 80 | }, 81 | "wet_default_value": { 82 | "type": "number", 83 | "minimum": 0.0, 84 | "maximum": 1.0, 85 | }, 86 | "dry_default_value": { 87 | "type": "number", 88 | "minimum": 0.0, 89 | "maximum": 1.0, 90 | }, 91 | "input_gain_default_value": { 92 | "type": "number", 93 | "minimum": 0.0, 94 | "maximum": 1.0, 95 | }, 96 | "output_gain_default_value": { 97 | "type": "number", 98 | "minimum": 0.0, 99 | "maximum": 1.0, 100 | }, 101 | "is_input_mono": { 102 | "type": "boolean", 103 | }, 104 | "is_output_mono": { 105 | "type": "boolean", 106 | }, 107 | "model_type": { 108 | "type": "string", 109 | "enum": ["mono-mono", "mono-stereo", "stereo-mono", "stereo-stereo"], 110 | }, 111 | "native_sample_rates": { 112 | "type": "array", 113 | "items": { 114 | "type": "integer", 115 | "minimum": 0, 116 | "maximum": 384000, 117 | }, 118 | "uniqueItems": True, 119 | }, 120 | "native_buffer_sizes": { 121 | "type": "array", 122 | "items": { 123 | "type": "integer", 124 | "minimum": 1, 125 | 
"maximum": 65536, 126 | }, 127 | "uniqueItems": True, 128 | }, 129 | "look_behind_samples": { 130 | "type": "integer", 131 | "minimum": 0, 132 | }, 133 | "sdk_version": {"type": "string"}, 134 | "pytorch_version": {"type": "string"}, 135 | "date_created": {"type": "number"}, 136 | }, 137 | "definitions": { 138 | "neutoneParameter": { 139 | "type": "object", 140 | "required": ["name", "description", "default_value", "used", "type"], 141 | "properties": { 142 | "name": {"type": "string"}, 143 | "description": {"type": "string"}, 144 | "default_value": {"type": ["integer", "number", "string"]}, 145 | "used": {"type": "boolean"}, 146 | "type": {"type": "string", "enum": ["continuous"]}, 147 | "max_n_chars": {"type": ["null", "integer"], "minimum": -1}, 148 | "n_values": {"type": ["null", "integer"], "minimum": 2}, 149 | "labels": {"type": ["null", "array"], "items": {"type": "string"}}, 150 | }, 151 | } 152 | }, 153 | "required": [ 154 | "model_name", 155 | "model_authors", 156 | "model_version", 157 | "model_short_description", 158 | "model_long_description", 159 | "technical_description", 160 | "technical_links", 161 | "tags", 162 | "citation", 163 | "is_experimental", 164 | "sample_sound_files", 165 | "neutone_parameters", 166 | "wet_default_value", 167 | "dry_default_value", 168 | "input_gain_default_value", 169 | "output_gain_default_value", 170 | "is_input_mono", 171 | "is_output_mono", 172 | "model_type", 173 | "native_sample_rates", 174 | "native_buffer_sizes", 175 | "look_behind_samples", 176 | "sdk_version", 177 | "pytorch_version", 178 | "date_created", 179 | ], 180 | } 181 | 182 | 183 | def validate_metadata(metadata: dict) -> bool: 184 | try: 185 | validate(instance=metadata, schema=SCHEMA) 186 | except ValidationError as err: 187 | log.error(err) 188 | raise err 189 | 190 | # Check links return 200 191 | for link in metadata["technical_links"].values(): 192 | try: 193 | code = requests.head(link, allow_redirects=True).status_code 194 | if code != 200: 195 | log.error(f"Cannot access link {link}") 196 | except requests.exceptions.ConnectionError: 197 | log.error(f"Cannot access link {link}") 198 | 199 | # Check we can extract mp3s from the samples 200 | for audio_sample_pair in metadata["sample_sound_files"]: 201 | AudioSample.from_b64(audio_sample_pair["in"]) 202 | AudioSample.from_b64(audio_sample_pair["out"]) 203 | 204 | return True 205 | -------------------------------------------------------------------------------- /neutone_sdk/parameter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from abc import ABC 4 | from enum import Enum 5 | from typing import Union, Optional, List, Dict 6 | 7 | from torch import Tensor as T, nn 8 | import torch as tr 9 | from neutone_sdk import constants 10 | 11 | logging.basicConfig() 12 | log = logging.getLogger(__name__) 13 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 14 | 15 | 16 | class NeutoneParameterType(Enum): 17 | CONTINUOUS = "continuous" 18 | CATEGORICAL = "categorical" 19 | TEXT = "text" 20 | TOKENS = "tokens" 21 | 22 | 23 | class NeutoneParameter(ABC, nn.Module): 24 | """ 25 | Defines a Neutone Parameter abstract base class. 26 | 27 | The name and the description of the parameter will be shown as a tooltip 28 | within the UI. This parameter has no functionality and is meant to subclassed. 
29 | """ 30 | 31 | def __init__( 32 | self, 33 | name: str, 34 | description: str, 35 | default_value: Union[int, float, str, Optional[List[int]]], 36 | used: bool, 37 | param_type: NeutoneParameterType, 38 | ): 39 | super().__init__() 40 | self.name = name 41 | self.description = description 42 | self.default_value = default_value 43 | self.used = used 44 | self.type = param_type 45 | 46 | def to_metadata( 47 | self, 48 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 49 | return { 50 | "name": self.name, 51 | "description": self.description, 52 | "default_value": self.default_value, 53 | "used": self.used, 54 | "type": self.type.value, 55 | } 56 | 57 | 58 | class ContinuousNeutoneParameter(NeutoneParameter): 59 | """ 60 | Defines a continuous Neutone Parameter that the user can use to control a model. 61 | 62 | The name and the description of the parameter will be shown as a tooltip 63 | within the UI. 64 | `default_value` must be between min_value and max_value and will be used as the 65 | default in the plugin when no presets are available. 66 | """ 67 | 68 | def __init__( 69 | self, 70 | name: str, 71 | description: str, 72 | default_value: float, 73 | min_value: float = 0.0, 74 | max_value: float = 1.0, 75 | used: bool = True, 76 | ): 77 | super().__init__( 78 | name, 79 | description, 80 | default_value, 81 | used, 82 | NeutoneParameterType.CONTINUOUS, 83 | ) 84 | assert ( 85 | min_value < max_value 86 | ), "`min_value` must be less than `max_value` for continuous params" 87 | assert ( 88 | min_value <= default_value <= max_value 89 | ), f"`default_value` for continuous params must be between {min_value} and {max_value}" 90 | self.min_value = min_value 91 | self.max_value = max_value 92 | self.range = max_value - min_value 93 | self.default_value_0to1 = (default_value - min_value) / self.range 94 | 95 | def from_0to1(self, param_val: T) -> T: 96 | """ 97 | Converts a parameter value inplace from [0, 1] to [min_value, max_value]. 98 | """ 99 | tr.mul(param_val, self.range, out=param_val) 100 | tr.add(param_val, self.min_value, out=param_val) 101 | return param_val 102 | 103 | def to_metadata(self) -> Dict[str, Union[int, float, str, bool, List[str]]]: 104 | metadata = super().to_metadata() 105 | metadata["min_value"] = self.min_value 106 | metadata["max_value"] = self.max_value 107 | return metadata 108 | 109 | 110 | class CategoricalNeutoneParameter(NeutoneParameter): 111 | """ 112 | Defines a categorical Neutone Parameter that the user can use to control a model. 113 | 114 | The name and the description of the parameter will be shown as a tooltip 115 | within the UI. 116 | `n_values` must be an int greater than or equal to 2 and less than or equal to 117 | `constants.MAX_N_CATEGORICAL_VALUES`. 118 | `default_value` must be in the range [0, `n_values` - 1]. 119 | `labels` is a list of strings that will be used as the labels for the parameter. 
120 | """ 121 | 122 | def __init__( 123 | self, 124 | name: str, 125 | description: str, 126 | n_values: int, 127 | default_value: int, 128 | labels: Optional[List[str]] = None, 129 | used: bool = True, 130 | ): 131 | super().__init__( 132 | name, description, default_value, used, NeutoneParameterType.CATEGORICAL 133 | ) 134 | assert 2 <= n_values <= constants.MAX_N_CATEGORICAL_VALUES, ( 135 | f"`n_values` for categorical params must between 2 and " 136 | f"{constants.MAX_N_CATEGORICAL_VALUES}" 137 | ) 138 | assert ( 139 | 0 <= default_value <= n_values - 1 140 | ), "`default_value` for categorical params must be between 0 and `n_values`-1" 141 | self.n_values = n_values 142 | if labels is None: 143 | labels = [str(idx) for idx in range(n_values)] 144 | else: 145 | assert len(labels) == self.n_values, "labels must have `n_values` elements" 146 | assert all( 147 | len(label) < constants.MAX_N_CATEGORICAL_LABEL_CHARS for label in labels 148 | ), ( 149 | f"All labels must have length less than " 150 | f"{constants.MAX_N_CATEGORICAL_LABEL_CHARS} characters" 151 | ) 152 | self.labels = labels 153 | self.default_value_0to1 = default_value / (n_values - 1) 154 | 155 | def from_0to1(self, param_val: T) -> T: 156 | """ 157 | Converts a parameter value inplace from [0, 1] to [0, `n_values` - 1]. 158 | """ 159 | tr.mul(param_val, self.n_values - 1, out=param_val) 160 | tr.round(param_val, out=param_val) 161 | return param_val 162 | 163 | def to_metadata( 164 | self, 165 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 166 | metadata = super().to_metadata() 167 | metadata["n_values"] = self.n_values 168 | metadata["labels"] = self.labels 169 | return metadata 170 | 171 | 172 | class TextNeutoneParameter(NeutoneParameter): 173 | """ 174 | Defines a text Neutone Parameter that the user can use to control a model. 175 | 176 | The name and the description of the parameter will be shown as a tooltip 177 | within the UI. 178 | `max_n_chars` specifies the maximum number of characters that the user can input. 179 | If this value is set to -1, there is no limit on the number of characters. 180 | `default_value` is the default value to be automatically populated in the text box. 181 | """ 182 | 183 | def __init__( 184 | self, 185 | name: str, 186 | description: str, 187 | max_n_chars: int = -1, 188 | default_value: str = "", 189 | used: bool = True, 190 | ): 191 | super().__init__( 192 | name, description, default_value, used, NeutoneParameterType.TEXT 193 | ) 194 | assert max_n_chars >= -1, "`max_n_chars` must be greater than or equal to -1" 195 | if max_n_chars != -1: 196 | assert ( 197 | len(default_value) <= max_n_chars 198 | ), "`default_value` must be a string of length less than `max_n_chars`" 199 | self.max_n_chars = max_n_chars 200 | 201 | def to_metadata( 202 | self, 203 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 204 | metadata = super().to_metadata() 205 | metadata["max_n_chars"] = self.max_n_chars 206 | return metadata 207 | 208 | 209 | class DiscreteTokensNeutoneParameter(NeutoneParameter): 210 | """ 211 | Defines a discrete token tensor input to a Neutone model 212 | Should be the output of a tokenizer that processes some text input. 213 | 214 | The name and the description of the parameter will be shown as a tooltip 215 | within the UI. 
216 | """ 217 | 218 | def __init__( 219 | self, 220 | name: str, 221 | description: str, 222 | max_n_tokens: int = -1, 223 | default_value: Optional[List[int]] = None, 224 | used: bool = True, 225 | ): 226 | if default_value is None: 227 | default_value: List[int] = [] 228 | super().__init__( 229 | name, description, default_value, used, NeutoneParameterType.TOKENS 230 | ) 231 | assert max_n_tokens >= -1, "`max_n_tokens` must be greater than or equal to -1" 232 | if max_n_tokens != -1: 233 | assert ( 234 | len(default_value) <= max_n_tokens 235 | ), "`default_value` must be a list of length less than `max_n_tokens`" 236 | self.max_n_tokens = max_n_tokens 237 | 238 | def to_metadata( 239 | self, 240 | ) -> Dict[str, Union[int, float, str, bool, List[str], List[int]]]: 241 | metadata = super().to_metadata() 242 | metadata["max_n_tokens"] = self.max_n_tokens 243 | return metadata 244 | -------------------------------------------------------------------------------- /neutone_sdk/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Neutone/neutone_sdk/aee4cac560209fe850686dad3e21695fa8dde473/neutone_sdk/py.typed -------------------------------------------------------------------------------- /neutone_sdk/queues.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Tuple 4 | 5 | import torch as tr 6 | from torch import Tensor 7 | 8 | logging.basicConfig() 9 | log = logging.getLogger(__name__) 10 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 11 | 12 | 13 | class CircularInplaceTensorQueue: 14 | def __init__(self, n_ch: int, max_size: int, use_debug_mode: bool = True) -> None: 15 | """ 16 | Creates a FIFO queue designed for audio data that does not allocate any memory during normal use and performs 17 | as few memory operations as possible. The queue is also compatible with converting to TorchScript. 18 | """ 19 | self.use_debug_mode = use_debug_mode 20 | self.max_size = max_size 21 | self.queue = tr.zeros((n_ch, max_size)) 22 | self.start_idx = 0 23 | self.end_idx = 0 24 | self.size = 0 25 | 26 | def _calc_push_indices(self, in_n: int) -> Tuple[int, int, int, int]: 27 | """ 28 | Calculates the indices to place new data of length in_n into the queue. Since it's a circular queue this can 29 | mean wrapping around once past the end of the queue depending on the contents of the queue at that moment in 30 | time. As a result, we define two possible index ranges for pushing data: start_1:end_1 and start_2:end_2 31 | if wrapping occurs, otherwise end_1 == start_2 == end_2 32 | 33 | Returns: 34 | Tuple[int, int, int, int]: start_1, end_1, start_2, end_2 35 | """ 36 | if self.use_debug_mode: 37 | assert 0 < in_n < self.max_size 38 | start_1 = self.end_idx 39 | if start_1 == self.max_size: 40 | start_1 = 0 41 | end_2 = start_1 + in_n 42 | if end_2 > self.max_size: 43 | end_2 = end_2 % self.max_size 44 | end_1 = end_2 45 | start_2 = end_2 46 | if end_2 < start_1: 47 | end_1 = self.max_size 48 | start_2 = 0 49 | return start_1, end_1, start_2, end_2 50 | 51 | def push(self, x: Tensor) -> None: 52 | """ 53 | Pushes the contents of x to the end of the queue. If the queue does not have adequate space left, the contents 54 | of the queue will be overwritten, starting at the head of the queue. 
55 | """ 56 | if self.use_debug_mode: 57 | assert x.ndim == self.queue.ndim 58 | assert x.size(0) == self.queue.size(0) 59 | in_n = x.size(1) 60 | if in_n >= self.max_size: 61 | self.queue[:, :] = x[:, -self.max_size :] 62 | self.start_idx = 0 63 | self.end_idx = self.max_size 64 | self.size = self.max_size 65 | return 66 | if in_n < 1: 67 | return 68 | start_1, end_1, start_2, end_2 = self._calc_push_indices(in_n) 69 | n_1 = end_1 - start_1 70 | self.queue[:, start_1:end_1] = x[:, 0:n_1] 71 | if n_1 < in_n: 72 | self.queue[:, start_2:end_2] = x[:, n_1:] 73 | self.end_idx = end_2 74 | self.size = min(self.size + in_n, self.max_size) 75 | if self.size == self.max_size: 76 | self.start_idx = self.end_idx 77 | 78 | def _calc_pop_indices(self, out_n: int) -> Tuple[int, int, int, int]: 79 | """ 80 | Calculates the indices to pop data of length out_n from the queue. Since it's a circular queue this can 81 | mean wrapping around once past the end of the queue depending on the contents of the queue at that moment in 82 | time. As a result, we define two possible index ranges for popping data: start_1:end_1 and start_2:end_2 83 | if wrapping occurs, otherwise end_1 == start_2 == end_2 84 | 85 | Returns: 86 | Tuple[int, int, int, int]: start_1, end_1, start_2, end_2 87 | """ 88 | out_n = min(out_n, self.size) 89 | if self.use_debug_mode: 90 | assert out_n > 0 91 | start_1 = self.start_idx 92 | if start_1 == self.max_size: 93 | start_1 = 0 94 | end_2 = start_1 + out_n 95 | if end_2 > self.max_size: 96 | end_2 = end_2 % self.max_size 97 | end_1 = end_2 98 | start_2 = end_2 99 | if end_2 <= start_1: 100 | end_1 = self.max_size 101 | start_2 = 0 102 | return start_1, end_1, start_2, end_2 103 | 104 | def pop(self, out: Tensor) -> int: 105 | """ 106 | Attempts to fill the out tensor with data popped from the head of the queue. Begins filling the out tensor at 107 | index 0. If the out tensor is bigger than the number of items in the queue, fills the tensor as much as 108 | possible. 109 | 110 | Returns: 111 | int: the number of items successfully popped from the queue. 112 | """ 113 | # TODO(cm): remove duplicate code using fill 114 | if self.use_debug_mode: 115 | assert out.ndim == self.queue.ndim 116 | assert out.size(0) == self.queue.size(0) 117 | if self.is_empty(): 118 | return 0 119 | out_n = out.size(1) 120 | if out_n < 1: 121 | return 0 122 | start_1, end_1, start_2, end_2 = self._calc_pop_indices(out_n) 123 | n_1 = end_1 - start_1 124 | n_2 = end_2 - start_2 125 | removed_n = n_1 + n_2 126 | if self.use_debug_mode: 127 | assert 0 < n_1 <= self.size 128 | assert 0 <= n_2 < self.size 129 | assert removed_n <= self.size 130 | out[:, 0:n_1] = self.queue[:, start_1:end_1] 131 | if n_2 > 0: 132 | out[:, n_1:removed_n] = self.queue[:, start_2:end_2] 133 | self.start_idx = end_2 134 | self.size -= removed_n 135 | if self.use_debug_mode: 136 | if self.size == 0: 137 | assert self.start_idx == self.end_idx 138 | return removed_n 139 | 140 | def fill(self, out: Tensor) -> int: 141 | """ 142 | Attempts to fill the out tensor with data from the head of the queue. Begins filling the out tensor at index 0. 143 | If the out tensor is bigger than the number of items in the queue, fills the tensor as much as possible. Does 144 | not remove any elements from the queue. 145 | 146 | Returns: 147 | int: the number of items successfully filled from the queue. 
148 | """ 149 | if self.use_debug_mode: 150 | assert out.ndim == self.queue.ndim 151 | assert out.size(0) == self.queue.size(0) 152 | if self.is_empty(): 153 | return 0 154 | out_n = out.size(1) 155 | if out_n < 1: 156 | return 0 157 | start_1, end_1, start_2, end_2 = self._calc_pop_indices(out_n) 158 | n_1 = end_1 - start_1 159 | n_2 = end_2 - start_2 160 | filled_n = n_1 + n_2 161 | if self.use_debug_mode: 162 | assert 0 < n_1 <= self.size 163 | assert 0 <= n_2 < self.size 164 | assert filled_n <= self.size 165 | out[:, 0:n_1] = self.queue[:, start_1:end_1] 166 | if n_2 > 0: 167 | out[:, n_1:filled_n] = self.queue[:, start_2:end_2] 168 | return filled_n 169 | 170 | def is_empty(self) -> bool: 171 | return self.size == 0 172 | 173 | def is_full(self) -> bool: 174 | return self.size == self.max_size 175 | 176 | def reset(self) -> None: 177 | self.start_idx = 0 178 | self.end_idx = 0 179 | self.size = 0 180 | -------------------------------------------------------------------------------- /neutone_sdk/realtime_stft.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Optional, List 4 | 5 | import torch as tr 6 | from torch import Tensor 7 | from torch import nn 8 | 9 | logging.basicConfig() 10 | log = logging.getLogger(__name__) 11 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 12 | 13 | 14 | # TODO(cm:) add documentation, for now please see the documentation in `examples/example_spectral_filter.py` 15 | class RealtimeSTFT(nn.Module): 16 | def __init__( 17 | self, 18 | model_io_n_frames: int = 16, 19 | io_n_ch: int = 2, 20 | io_n_samples: int = 512, 21 | n_fft: int = 2048, 22 | hop_len: int = 512, 23 | window: Optional[Tensor] = None, 24 | center: bool = True, 25 | power: Optional[float] = 1.0, 26 | logarithmize: bool = True, 27 | ensure_pos_spec: bool = True, 28 | use_phase_info: bool = True, 29 | fade_n_samples: int = 0, 30 | eps: float = 1e-8, 31 | use_debug_mode: bool = True, 32 | ) -> None: 33 | super().__init__() 34 | self.use_debug_mode = use_debug_mode 35 | if self.use_debug_mode: 36 | assert n_fft % 2 == 0 37 | assert (n_fft // 2) % hop_len == 0 38 | if window is not None: 39 | assert window.shape == (n_fft,) 40 | assert ( 41 | center 42 | ), "Behavior of center=False needs to be debugged, results in artefacts" 43 | # if center: 44 | # log.warning("STFT is not causal when center=True") 45 | assert power is None or power >= 1.0 46 | if power is None and use_phase_info: 47 | log.warning( 48 | "If power=None, `use_phase_info=True` means the imag component is saved, not the angle" 49 | ) 50 | if power is not None and power > 1.0: 51 | log.warning( 52 | "A power greater than 1.0 probably adds unnecessary " 53 | "computational complexity" 54 | ) 55 | assert fade_n_samples < io_n_samples 56 | self.model_io_n_frames = model_io_n_frames 57 | self.io_n_ch = io_n_ch 58 | self.io_n_samples = io_n_samples 59 | self.n_fft = n_fft 60 | self.hop_len = hop_len 61 | self.center = center 62 | self.power = power 63 | self.logarithmize = logarithmize 64 | self.ensure_pos_spec = ensure_pos_spec 65 | self.use_phase_info = use_phase_info 66 | self.fade_n_samples = fade_n_samples 67 | self.eps = eps 68 | 69 | # Derived parameters 70 | self.io_n_frames = None 71 | self.overlap_n_frames = None 72 | self.in_buf_n_frames = None 73 | self.n_bins = None 74 | self.stft_out_shape = None 75 | self.istft_in_n_frames = None 76 | self.istft_length = None 77 | self.model_io_shape = None 78 | self.out_buf_n_samples = None 
79 | 80 | # Internal buffers 81 | self.in_buf = None 82 | self.in_buf_tmp = None 83 | self.stft_mag_buf = None 84 | self.mag_buf = None 85 | self.mag_buf_tmp = None 86 | self.spec_out_buf = None 87 | self.stft_phase_buf = None 88 | self.phase_buf = None 89 | self.phase_buf_tmp = None 90 | self.out_frames_buf = None 91 | self.out_buf = None 92 | 93 | # Sets derived parameters and allocates buffers 94 | self.set_buffer_size(io_n_samples) 95 | 96 | # Internal tensors 97 | if window is None: 98 | window = tr.hann_window(self.n_fft) 99 | if not center: 100 | # Ensures the NOLA constraint is met for the hann_window 101 | # See https://github.com/pytorch/pytorch/issues/62323 102 | # 1e-5 is chosen based on the torchaudio implementation 103 | window = tr.clamp(window, min=1e-5) 104 | self.register_buffer("window", window, persistent=True) 105 | log10_eps = tr.log10(tr.tensor([self.eps])) 106 | self.register_buffer("log10_eps", log10_eps, persistent=False) 107 | fade_up = tr.linspace(0, 1, max(self.fade_n_samples, 1)) 108 | self.register_buffer("fade_up", fade_up, persistent=False) 109 | fade_down = tr.linspace(1, 0, max(self.fade_n_samples, 1)) 110 | self.register_buffer("fade_down", fade_down, persistent=False) 111 | zero_phase = tr.zeros(self.model_io_shape) 112 | self.register_buffer("zero_phase", zero_phase, persistent=False) 113 | self.ten_constant = tr.tensor(10.0) 114 | 115 | def _set_derived_params(self) -> None: 116 | self.io_n_frames = self.io_n_samples // self.hop_len 117 | if self.use_debug_mode: 118 | assert self.io_n_frames <= self.model_io_n_frames 119 | self.overlap_n_frames = self.n_fft // 2 // self.hop_len 120 | self.in_buf_n_frames = (2 * self.overlap_n_frames) + self.io_n_frames - 1 121 | self.n_bins = (self.n_fft // 2) + 1 122 | if self.center: 123 | self.stft_out_shape = ( 124 | self.io_n_ch, 125 | self.n_bins, 126 | (2 * self.overlap_n_frames) + self.io_n_frames, 127 | ) 128 | self.istft_in_n_frames = self.overlap_n_frames + self.io_n_frames 129 | self.istft_length = (self.istft_in_n_frames - 1) * self.hop_len 130 | else: 131 | self.stft_out_shape = (self.io_n_ch, self.n_bins, self.io_n_frames) 132 | self.istft_in_n_frames = self.io_n_frames 133 | self.istft_length = self.in_buf_n_frames * self.hop_len 134 | if self.use_debug_mode: 135 | assert self.istft_in_n_frames <= self.model_io_n_frames 136 | 137 | self.model_io_shape = (self.io_n_ch, self.n_bins, self.model_io_n_frames) 138 | self.out_buf_n_samples = self.io_n_samples + self.fade_n_samples 139 | if self.use_debug_mode: 140 | assert self.out_buf_n_samples <= self.istft_length 141 | 142 | def _allocate_buffers(self) -> None: 143 | self.in_buf = tr.full( 144 | (self.io_n_ch, self.in_buf_n_frames * self.hop_len), 145 | self.eps, 146 | ) 147 | self.in_buf_tmp = tr.clone(self.in_buf) 148 | 149 | self.stft_mag_buf = tr.full(self.stft_out_shape, self.eps) 150 | self.mag_buf = tr.full(self.model_io_shape, self.eps) 151 | self.mag_buf_tmp = tr.clone(self.mag_buf) 152 | # Required to allow inplace operations after the encoder 153 | self.spec_out_buf = tr.clone(self.mag_buf) 154 | 155 | self.stft_phase_buf = tr.zeros(self.stft_out_shape) 156 | self.phase_buf = tr.zeros(self.model_io_shape) 157 | self.phase_buf_tmp = tr.clone(self.phase_buf) 158 | 159 | self.out_frames_buf = tr.full( 160 | (self.io_n_ch, self.n_bins, self.istft_in_n_frames), 161 | self.eps, 162 | dtype=tr.complex64, 163 | ) 164 | self.out_buf = tr.full( 165 | (self.io_n_ch, self.out_buf_n_samples), 166 | self.eps, 167 | ) 168 | 169 | def _logarithmize_spec(self, 
spec: Tensor) -> None: 170 | tr.clamp(spec, min=self.eps, out=spec) 171 | tr.log10(spec, out=spec) 172 | 173 | def _unlogarithmize_spec(self, spec: Tensor) -> None: 174 | tr.pow(self.ten_constant, spec, out=spec) 175 | tr.clamp(spec, min=self.eps, out=spec) 176 | 177 | def _update_mag_or_phase_buffers( 178 | self, stft_out_buf: Tensor, frames_buf: Tensor, frames_buf_tmp: Tensor 179 | ) -> None: 180 | if self.center: 181 | # Remove overlap frames we have computed before 182 | frames = stft_out_buf[:, :, self.overlap_n_frames :] 183 | # Identify frames that are more correct due to missing prev audio 184 | fixed_prev_frames = frames[:, :, : -self.io_n_frames] 185 | if self.use_debug_mode: 186 | assert fixed_prev_frames.size(2) == self.overlap_n_frames 187 | # Identify the new frames for the input audio chunk 188 | new_frames = frames[:, :, -self.io_n_frames :] 189 | # Overwrite previous frames with more correct frames 190 | n_fixed_frames = min(self.model_io_n_frames, self.overlap_n_frames) 191 | frames_buf[:, :, -n_fixed_frames:] = fixed_prev_frames[ 192 | :, :, -n_fixed_frames: 193 | ] 194 | else: 195 | new_frames = stft_out_buf[:, :, -self.io_n_frames :] 196 | 197 | # Shift buffer left and insert new frames (this way because tr.roll allocates memory dynamically) 198 | frames_buf_tmp[:, :, : -self.io_n_frames] = frames_buf[:, :, self.io_n_frames :] 199 | frames_buf[:, :, : -self.io_n_frames] = frames_buf_tmp[ 200 | :, :, : -self.io_n_frames 201 | ] 202 | frames_buf[:, :, -self.io_n_frames :] = new_frames 203 | 204 | @tr.jit.export 205 | def set_buffer_size(self, io_n_samples: int) -> None: 206 | if self.use_debug_mode: 207 | assert io_n_samples >= self.hop_len 208 | assert io_n_samples % self.hop_len == 0 209 | assert self.fade_n_samples <= io_n_samples 210 | self.io_n_samples = io_n_samples 211 | self._set_derived_params() 212 | self._allocate_buffers() 213 | self.reset() 214 | 215 | @tr.jit.export 216 | def calc_model_delay_samples(self) -> int: 217 | return self.fade_n_samples 218 | 219 | @tr.jit.export 220 | def reset(self) -> None: 221 | self.in_buf.fill_(self.eps) 222 | self.stft_mag_buf.fill_(self.eps) 223 | self.mag_buf.fill_(self.eps) 224 | self.spec_out_buf.fill_(self.eps) 225 | self.stft_phase_buf.fill_(0) 226 | self.phase_buf.fill_(0) 227 | self.out_frames_buf.fill_(self.eps) 228 | self.out_buf.fill_(self.eps) 229 | 230 | @tr.jit.export 231 | def calc_min_buffer_size(self) -> int: 232 | return self.hop_len 233 | 234 | @tr.jit.export 235 | def calc_max_buffer_size(self) -> int: 236 | return self.model_io_n_frames * self.hop_len 237 | 238 | @tr.jit.export 239 | def calc_supported_buffer_sizes(self) -> List[int]: 240 | min_buffer_size = self.calc_min_buffer_size() 241 | max_buffer_size = self.calc_max_buffer_size() 242 | buffer_sizes = [ 243 | bs for bs in range(min_buffer_size, max_buffer_size + 1, self.hop_len) 244 | ] 245 | return buffer_sizes 246 | 247 | @tr.jit.ignore 248 | def audio_to_spec_offline(self, audio: Tensor) -> Tensor: 249 | if self.use_debug_mode: 250 | assert audio.size(0) == self.io_n_ch 251 | assert audio.size(1) >= self.n_fft 252 | assert audio.size(1) % self.hop_len == 0 253 | spec = tr.stft( 254 | audio, 255 | n_fft=self.n_fft, 256 | hop_length=self.hop_len, 257 | window=self.window, 258 | center=self.center, 259 | return_complex=True, 260 | ) 261 | if self.power is None: 262 | spec = spec.real 263 | else: 264 | spec = spec.abs() 265 | if self.power != 1.0: 266 | spec = spec.pow(self.power) 267 | 268 | if self.logarithmize: 269 | spec = tr.clamp(spec, 
min=self.eps) 270 | spec = tr.log10(spec) 271 | if self.ensure_pos_spec: 272 | spec -= self.log10_eps 273 | 274 | return spec 275 | 276 | @tr.jit.export 277 | def audio_to_spec(self, audio: Tensor) -> Tensor: 278 | if self.use_debug_mode: 279 | assert audio.shape == (self.io_n_ch, self.io_n_samples) 280 | # Shift buffer left and insert audio chunk (this way because tr.roll allocates memory dynamically) 281 | self.in_buf_tmp[:, : -self.io_n_samples] = self.in_buf[:, self.io_n_samples :] 282 | self.in_buf[:, : -self.io_n_samples] = self.in_buf_tmp[:, : -self.io_n_samples] 283 | self.in_buf[:, -self.io_n_samples :] = audio 284 | 285 | # TODO(cm): allow pad_mode to be selected 286 | complex_frames = tr.stft( 287 | self.in_buf, 288 | n_fft=self.n_fft, 289 | hop_length=self.hop_len, 290 | window=self.window, 291 | center=self.center, 292 | return_complex=True, 293 | ) 294 | if self.power is None: 295 | self.stft_mag_buf = complex_frames.real 296 | else: 297 | tr.abs(complex_frames, out=self.stft_mag_buf) 298 | if self.power != 1.0: 299 | tr.pow(self.stft_mag_buf, self.power, out=self.stft_mag_buf) 300 | if self.logarithmize: 301 | self._logarithmize_spec(self.stft_mag_buf) 302 | if self.ensure_pos_spec: 303 | self.stft_mag_buf -= self.log10_eps 304 | 305 | self._update_mag_or_phase_buffers( 306 | self.stft_mag_buf, self.mag_buf, self.mag_buf_tmp 307 | ) 308 | 309 | if self.use_phase_info: 310 | if self.power is None: 311 | self.stft_phase_buf = complex_frames.imag 312 | else: 313 | tr.angle(complex_frames, out=self.stft_phase_buf) 314 | self._update_mag_or_phase_buffers( 315 | self.stft_phase_buf, self.phase_buf, self.phase_buf_tmp 316 | ) 317 | 318 | # Prevent future inplace operations from mutating self.mag_buf 319 | self.spec_out_buf[:, :] = self.mag_buf 320 | return self.spec_out_buf 321 | 322 | @tr.jit.export 323 | def spec_to_audio(self, spec: Tensor) -> Tensor: 324 | if self.use_debug_mode: 325 | assert spec.shape == self.model_io_shape 326 | spec = spec[:, :, -self.istft_in_n_frames :] 327 | if self.use_phase_info: 328 | phase = self.phase_buf[:, :, -self.istft_in_n_frames :] 329 | else: 330 | phase = self.zero_phase[:, :, -self.istft_in_n_frames :] 331 | 332 | if self.logarithmize: 333 | if self.ensure_pos_spec: 334 | spec += self.log10_eps 335 | self._unlogarithmize_spec(spec) 336 | 337 | if self.power is None: 338 | self.out_frames_buf.real = spec 339 | self.out_frames_buf.imag = phase 340 | else: 341 | if self.power != 1.0: 342 | tr.pow(spec, 1 / self.power, out=spec) 343 | tr.polar(spec, phase, out=self.out_frames_buf) 344 | 345 | # TODO(cm): allow pad_mode to be selected 346 | rec_audio = tr.istft( 347 | self.out_frames_buf, 348 | n_fft=self.n_fft, 349 | hop_length=self.hop_len, 350 | window=self.window, 351 | center=self.center, 352 | length=self.istft_length, 353 | ) 354 | rec_audio = rec_audio[:, -self.out_buf_n_samples :] 355 | if self.fade_n_samples == 0: 356 | return rec_audio 357 | 358 | self.out_buf[:, -self.fade_n_samples :] *= self.fade_down 359 | rec_audio[:, : self.fade_n_samples] *= self.fade_up 360 | rec_audio[:, : self.fade_n_samples] += self.out_buf[:, -self.fade_n_samples :] 361 | audio_out = rec_audio[:, : self.io_n_samples] 362 | self.out_buf = rec_audio 363 | return audio_out 364 | -------------------------------------------------------------------------------- /neutone_sdk/tcn_1d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based off 3 | 
https://github.com/csteinmetz1/steerable-nafx/blob/main/steerable-nafx.ipynb 4 | """ 5 | import logging 6 | import os 7 | from typing import Optional 8 | 9 | import torch as tr 10 | from torch import Tensor 11 | from torch import nn 12 | 13 | logging.basicConfig() 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 16 | 17 | 18 | def causal_crop(x: Tensor, length: int) -> Tensor: 19 | if x.shape[-1] != length: 20 | stop = x.shape[-1] - 1 21 | start = stop - length 22 | x = x[..., start:stop] 23 | return x 24 | 25 | 26 | class FiLM(nn.Module): 27 | def __init__( 28 | self, 29 | cond_dim: int, # dim of conditioning input 30 | num_features: int, # dim of the conv channel 31 | use_bn: bool = True, 32 | ) -> None: 33 | super().__init__() 34 | self.num_features = num_features 35 | self.use_bn = use_bn 36 | if self.use_bn: 37 | self.bn = nn.BatchNorm1d(num_features, affine=False) 38 | self.adaptor = nn.Linear(cond_dim, 2 * num_features) 39 | 40 | def forward(self, x: Tensor, cond: Tensor) -> Tensor: 41 | assert cond.ndim == 2 42 | cond = self.adaptor(cond) 43 | g, b = tr.chunk(cond, 2, dim=-1) 44 | g = g.unsqueeze(-1) 45 | b = b.unsqueeze(-1) 46 | 47 | if self.use_bn: 48 | x = self.bn(x) # Apply batchnorm without affine 49 | x = (x * g) + b # Then apply conditional affine 50 | 51 | return x 52 | 53 | 54 | class TCN1DBlock(nn.Module): 55 | def __init__( 56 | self, 57 | in_ch: int, 58 | out_ch: int, 59 | kernel_size: int, 60 | dilation: int, 61 | padding: Optional[int] = None, 62 | cond_dim: int = 0, 63 | use_bias_in_conv: bool = True, 64 | use_bn: bool = True, 65 | use_act: bool = True, 66 | use_res: bool = True, 67 | act: Optional[nn.Module] = None, 68 | prelu_ch: int = 1, 69 | res_groups: int = 1, 70 | ) -> None: 71 | super().__init__() 72 | self.padding = padding 73 | if self.padding is None: 74 | self.padding = ((kernel_size - 1) // 2) * dilation 75 | if act is None: 76 | act = nn.PReLU(prelu_ch) 77 | 78 | self.act = None 79 | if use_act: 80 | self.act = act 81 | 82 | self.conv = nn.Conv1d( 83 | in_ch, 84 | out_ch, 85 | (kernel_size,), 86 | dilation=(dilation,), 87 | padding=self.padding, 88 | bias=use_bias_in_conv, 89 | ) 90 | 91 | self.film = None 92 | if cond_dim > 0: 93 | self.film = FiLM(cond_dim, out_ch, use_bn=use_bn) 94 | 95 | self.bn = None 96 | if use_bn and self.film is None: 97 | self.bn = nn.BatchNorm1d(out_ch) 98 | 99 | self.res = None 100 | if use_res: 101 | self.res = nn.Conv1d(in_ch, out_ch, (1,), groups=res_groups, bias=False) 102 | 103 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 104 | x_in = x 105 | x = self.conv(x) 106 | if cond is not None and self.film is not None: 107 | x = self.film(x, cond) 108 | elif self.bn is not None: 109 | x = self.bn(x) 110 | 111 | if self.act is not None: 112 | x = self.act(x) 113 | 114 | if self.res is not None: 115 | res = self.res(x_in) 116 | x_res = causal_crop(res, x.shape[-1]) 117 | x += x_res 118 | 119 | return x 120 | 121 | 122 | class TCN1D(nn.Module): 123 | def __init__( 124 | self, 125 | in_ch: int = 1, 126 | out_ch: int = 1, 127 | n_blocks: int = 10, 128 | kernel_size: int = 13, 129 | n_channels: int = 64, 130 | dil_growth: int = 4, 131 | padding: Optional[int] = None, 132 | cond_dim: int = 0, 133 | use_act: bool = True, 134 | use_bn: bool = False, 135 | use_bias_in_conv: bool = True, 136 | ) -> None: 137 | super().__init__() 138 | self.kernel_size = kernel_size 139 | self.n_channels = n_channels 140 | self.dil_growth = dil_growth 141 | self.n_blocks = 
n_blocks 142 | self.stack_size = n_blocks 143 | self.cond_dim = cond_dim 144 | self.use_act = use_act 145 | self.use_bn = use_bn 146 | self.use_bias_in_conv = use_bias_in_conv 147 | 148 | self.blocks = nn.ModuleList() 149 | for n in range(self.n_blocks): 150 | if n == 0: 151 | block_in_ch = in_ch 152 | block_out_ch = self.n_channels 153 | elif n == self.n_blocks - 1: 154 | block_in_ch = self.n_channels 155 | block_out_ch = out_ch 156 | else: 157 | block_in_ch = self.n_channels 158 | block_out_ch = self.n_channels 159 | 160 | dilation = self.dil_growth**n 161 | self.blocks.append( 162 | TCN1DBlock( 163 | block_in_ch, 164 | block_out_ch, 165 | self.kernel_size, 166 | dilation, 167 | padding=padding, 168 | cond_dim=self.cond_dim, 169 | use_act=self.use_act, 170 | use_bn=self.use_bn, 171 | use_bias_in_conv=self.use_bias_in_conv, 172 | ) 173 | ) 174 | 175 | def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor: 176 | assert x.ndim == 3 # (batch_size, in_ch, samples) 177 | if cond is not None: 178 | assert cond.ndim == 2 # (batch_size, cond_dim) 179 | for block in self.blocks: 180 | x = block(x, cond) 181 | return x 182 | 183 | def calc_receptive_field(self) -> int: 184 | """Compute the receptive field in samples.""" 185 | rf = self.kernel_size 186 | for idx in range(1, self.n_blocks): 187 | dilation = self.dil_growth ** (idx % self.stack_size) 188 | rf = rf + ((self.kernel_size - 1) * dilation) 189 | return rf 190 | 191 | 192 | if __name__ == "__main__": 193 | tcn = TCN1D(n_blocks=4, cond_dim=3, use_bn=True) 194 | log.info(tcn.calc_receptive_field()) 195 | audio = tr.rand((1, 1, 65536)) 196 | cond = tr.rand((1, 3)) 197 | # cond = None 198 | out = tcn.forward(audio, cond) 199 | log.info(out.shape) 200 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "neutone_sdk" 3 | version = "1.4.3" 4 | description = "SDK for wrapping deep learning models for usage in the Neutone plugin" 5 | readme = "README.md" 6 | authors = ["Qosmo "] 7 | homepage = "https://github.com/QosmoInc/neutone_sdk.git" 8 | license = "LGPL" 9 | packages = [{include = "neutone_sdk"}] 10 | 11 | 12 | [tool.poetry.dependencies] 13 | click = ">=8.1.7,<9.0.0" 14 | python = ">=3.8,<4.0" 15 | numpy = "^1.21.6" 16 | torch = ">=1.11.0,<2.2.0" 17 | torchaudio = ">=0.11.0,<2.2.0" 18 | soundfile = ">=0.12.1" 19 | jsonschema = "^4.21.0" 20 | requests = "^2.27.1" 21 | tqdm = "^4.63.1" 22 | 23 | [tool.poetry.dev-dependencies] 24 | black = "22.3.0" 25 | pytest = "*" 26 | 27 | [build-system] 28 | requires = ["poetry-core>=1.1.0"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /testing/test_cached_mel_spec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch as tr 5 | from torchaudio.transforms import MelSpectrogram 6 | 7 | from neutone_sdk.cached_mel_spec import CachedMelSpec 8 | 9 | logging.basicConfig() 10 | log = logging.getLogger(__name__) 11 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 12 | 13 | 14 | def test_cached_mel_spec(): 15 | # Setup 16 | tr.set_printoptions(precision=1) 17 | tr.random.manual_seed(42) 18 | 19 | sr = 44100 20 | n_ch = 1 21 | n_fft = 2048 22 | hop_len = 128 23 | n_mels = 16 24 | total_n_samples = 1000 * hop_len 25 | 26 | audio = tr.rand(n_ch, total_n_samples) 27 | # 
log.info(f"audio = {audio}") 28 | mel_spec = MelSpectrogram( 29 | sample_rate=sr, 30 | n_fft=n_fft, 31 | hop_length=hop_len, 32 | n_mels=n_mels, 33 | center=True, 34 | pad_mode="constant", 35 | ) 36 | cached_mel_spec = CachedMelSpec( 37 | sr=sr, n_ch=n_ch, n_fft=n_fft, hop_len=hop_len, n_mels=n_mels 38 | ) 39 | 40 | # Test delay 41 | delay_samples = cached_mel_spec.get_delay_samples() 42 | assert delay_samples == n_fft // 2 - hop_len 43 | 44 | # Test processing all audio at once 45 | spec = mel_spec(audio) 46 | delay_frames = cached_mel_spec.get_delay_frames() 47 | cached_spec = cached_mel_spec(audio) 48 | cached_spec = cached_spec[:, :, delay_frames:] 49 | # log.info(f" spec = {spec}") 50 | # log.info(f"cached_spec = {cached_spec}") 51 | assert tr.allclose(spec[:, :, : cached_spec.size(2)], cached_spec) 52 | cached_mel_spec.reset() 53 | 54 | # Test processing audio in chunks (random chunk size) 55 | chunks = [] 56 | min_chunk_size = 1 57 | max_chunk_size = 100 58 | curr_idx = 0 59 | while curr_idx < total_n_samples - max_chunk_size: 60 | chunk_size = ( 61 | tr.randint(min_chunk_size, max_chunk_size + 1, (1,)).item() * hop_len 62 | ) 63 | chunks.append(audio[:, curr_idx : curr_idx + chunk_size]) 64 | curr_idx += chunk_size 65 | if curr_idx < total_n_samples: 66 | chunks.append(audio[:, curr_idx:]) 67 | chunks.append( 68 | tr.zeros(n_ch, cached_mel_spec.get_delay_samples() + cached_mel_spec.hop_len) 69 | ) 70 | 71 | spec_chunks = [] 72 | for chunk in chunks: 73 | spec_chunk = cached_mel_spec(chunk) 74 | spec_chunks.append(spec_chunk) 75 | chunked_spec = tr.cat(spec_chunks, dim=2) 76 | chunked_spec = chunked_spec[:, :, delay_frames:] 77 | # log.info(f" spec = {spec}") 78 | # log.info(f"chunked_spec = {chunked_spec}") 79 | assert tr.allclose(spec, chunked_spec) 80 | log.info("test_cached_mel_spec passed!") 81 | 82 | 83 | if __name__ == "__main__": 84 | test_cached_mel_spec() 85 | -------------------------------------------------------------------------------- /testing/test_conv.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import os 4 | import random 5 | from typing import Union, Tuple 6 | 7 | import torch as tr 8 | from torch import nn 9 | from tqdm import tqdm 10 | 11 | from neutone_sdk.conv import Conv1dGeneral 12 | 13 | logging.basicConfig() 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 16 | 17 | 18 | def test_dynamic_bs() -> None: 19 | conv_gen = Conv1dGeneral(in_channels=2, 20 | out_channels=16, 21 | kernel_size=5, 22 | padding="same", 23 | dilation=2, 24 | causal=False, 25 | cached=True, 26 | use_dynamic_bs=True) 27 | for bs in range(64): 28 | audio = tr.rand((bs, 2, 128)) 29 | out = conv_gen(audio) 30 | assert out.shape == (bs, 16, 128) 31 | 32 | 33 | def _test_against_conv_torch(in_channels: int, 34 | out_channels: int, 35 | kernel_size: int, 36 | padding: Union[str, int, Tuple[int]], 37 | dilation: int, 38 | causal: bool, 39 | padding_mode: str = "zeros", 40 | batch_size: int = 1, 41 | block_size: int = 128, 42 | n_blocks: int = 32) -> None: 43 | conv_gen = Conv1dGeneral(in_channels, 44 | out_channels, 45 | kernel_size, 46 | padding=padding, 47 | dilation=dilation, 48 | padding_mode=padding_mode, 49 | causal=causal, 50 | cached=False) 51 | padding_torch = padding 52 | if causal and padding == "same": 53 | # torch.nn.Conv1d doesn't support causal convs so we need to add the causal 54 | # padding to both sides and then remove it from the right side 
later 55 | assert conv_gen.padding_r == 0 56 | padding_torch = conv_gen.padding_l 57 | conv_torch = nn.Conv1d(in_channels, 58 | out_channels, 59 | kernel_size, 60 | padding=padding_torch, 61 | dilation=dilation, 62 | padding_mode=padding_mode) 63 | 64 | # Copy weights and biases for testing 65 | conv_torch.weight = nn.Parameter(conv_gen.conv1d.weight.clone()) 66 | if conv_torch.bias is not None: 67 | conv_torch.bias = nn.Parameter(conv_gen.conv1d.bias.clone()) 68 | 69 | audio = tr.rand((batch_size, in_channels, n_blocks * block_size)) 70 | out_torch = conv_torch(audio) 71 | out_gen = conv_gen(audio) 72 | # torch.nn.Conv1d doesn't support causal convs so get rid of the extra right samples 73 | if causal and padding != "valid": 74 | if conv_gen.padding_l > 0: 75 | out_torch = out_torch[..., :-conv_gen.padding_l] 76 | assert out_gen.shape == out_torch.shape 77 | assert tr.allclose(out_gen, out_torch) 78 | 79 | conv_gen.set_cached(True) 80 | out_blocks = [] 81 | for idx in range(n_blocks): 82 | audio_block = audio[..., idx * block_size:(idx + 1) * block_size] 83 | out_block = conv_gen(audio_block) 84 | out_blocks.append(out_block) 85 | assert all(b.size(-1) == block_size for b in out_blocks) 86 | out_cached = tr.cat(out_blocks, dim=-1) 87 | 88 | delay_samples = conv_gen.get_delay_samples() 89 | if delay_samples > 0: 90 | # Remove the delay samples from the beginning of the cached output to align 91 | # it with not cached output 92 | out_cached = out_cached[..., delay_samples:] 93 | # Remove the delay samples from the end of the not cached output since they were 94 | # never computed by the cached convolution 95 | out_torch = out_torch[..., :-delay_samples] 96 | # Different padding modes can result in different output lengths of out_torch, 97 | # so we need to crop the longer one to align it with the shorter one 98 | if out_cached.size(-1) > out_torch.size(-1): 99 | out_cached = Conv1dGeneral.causal_crop(out_cached, out_torch.size(-1)) 100 | else: 101 | out_torch = Conv1dGeneral.causal_crop(out_torch, out_cached.size(-1)) 102 | assert out_cached.shape == out_torch.shape 103 | assert tr.allclose(out_cached, out_torch) 104 | 105 | 106 | def test_conv1d_general(): 107 | causal_flags = [False, True] 108 | in_channels = [1, 2] 109 | out_ch = 1 110 | kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8] 111 | dilations = [1, 2, 3, 4, 5, 6, 7, 8] 112 | max_rand_padding = 32 113 | 114 | for causal, in_ch, kernel_size, dil in tqdm(itertools.product(causal_flags, 115 | in_channels, 116 | kernel_sizes, 117 | dilations)): 118 | rand_pad = random.randint(1, max_rand_padding) 119 | log.info(f"Testing causal={causal}, " 120 | f"in_ch={in_ch}, " 121 | f"kernel_size={kernel_size}, " 122 | f"dil={dil}, " 123 | f"rand_pad={rand_pad}") 124 | _test_against_conv_torch( 125 | in_ch, out_ch, kernel_size, padding="same", dilation=dil, causal=causal) 126 | _test_against_conv_torch( 127 | in_ch, out_ch, kernel_size, padding="valid", dilation=dil, causal=causal) 128 | _test_against_conv_torch( 129 | in_ch, out_ch, kernel_size, padding=0, dilation=dil, causal=causal) 130 | _test_against_conv_torch( 131 | in_ch, out_ch, kernel_size, padding=rand_pad, dilation=dil, causal=causal) 132 | 133 | 134 | def _test_get_delay_samples(in_channels: int, 135 | kernel_size: int, 136 | dilation: int, 137 | causal: bool, 138 | padding_mode: str = "zeros", 139 | batch_size: int = 1, 140 | block_size: int = 128, 141 | n_blocks: int = 32) -> None: 142 | # This needs to be 1 for the asserts to work, but shouldn't affect generalization 143 | out_channels = 1 
    conv_gen = Conv1dGeneral(in_channels,
                             out_channels=out_channels,
                             kernel_size=kernel_size,
                             padding="same",
                             dilation=dilation,
                             padding_mode=padding_mode,
                             bias=False,
                             causal=causal)

    # Create an audio signal consisting of 50% silence and then 50% random noise
    n_samples = n_blocks * block_size
    mid_idx = n_samples // 2
    n_samples_b = n_samples - mid_idx
    audio = tr.zeros((batch_size, in_channels, n_samples))
    audio[..., mid_idx:] = tr.rand((batch_size, in_channels, n_samples_b))

    # Measure the index of the first non-zero sample of the uncached convolution
    out_uncached = conv_gen(audio)
    assert out_uncached.shape == (batch_size, out_channels, n_samples)
    nonzero_idx_uncached = (out_uncached != 0).nonzero()[:, -1][0].item()

    # Measure the index of the first non-zero sample of the cached convolution
    conv_gen.set_cached(True)
    out_blocks = []
    for idx in range(n_blocks):
        audio_block = audio[..., idx * block_size:(idx + 1) * block_size]
        out_block = conv_gen(audio_block)
        out_blocks.append(out_block)
    assert all(b.size(-1) == block_size for b in out_blocks)
    out_cached = tr.cat(out_blocks, dim=-1)
    assert out_cached.shape == (batch_size, out_channels, n_samples)
    nonzero_idx_cached = (out_cached != 0).nonzero()[:, -1][0].item()

    # Compare the reported delay to the measured delay
    delay_samples = conv_gen.get_delay_samples()
    measured_delay_samples = nonzero_idx_cached - nonzero_idx_uncached
    assert measured_delay_samples == delay_samples
    assert (out_uncached[..., nonzero_idx_uncached] ==
            out_cached[..., nonzero_idx_cached])


def test_get_delay_samples() -> None:
    causal_flags = [False, True]
    in_channels = [1, 2]
    kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
    dilations = [1, 2, 3, 4, 5, 6, 7, 8]

    for causal, in_ch, kernel_size, dil in tqdm(itertools.product(causal_flags,
                                                                  in_channels,
                                                                  kernel_sizes,
                                                                  dilations)):
        log.info(f"Testing causal={causal}, "
                 f"in_ch={in_ch}, "
                 f"kernel_size={kernel_size}, "
                 f"dil={dil}")
        _test_get_delay_samples(in_ch, kernel_size, dilation=dil, causal=causal)


if __name__ == "__main__":
    test_dynamic_bs()
    test_conv1d_general()
    test_get_delay_samples()
--------------------------------------------------------------------------------
/testing/test_profiling.py:
--------------------------------------------------------------------------------
import logging
import os
from typing import Dict, List

import torch as tr
import torch.nn as nn
from torch import Tensor

from neutone_sdk import (
    WaveformToWaveformBase,
    NeutoneParameter,
    SampleQueueWrapper,
)
from neutone_sdk.benchmark import profile_sqw

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


class ProfilingModel(nn.Module):
    def forward(
        self, x: Tensor, min_val: Tensor, max_val: Tensor, gain: Tensor
    ) -> Tensor:
        # tr.neg(min_val, out=min_val)
        # tr.mul(gain, min_val, out=min_val)
        # tr.mul(gain, max_val, out=max_val)
        # tr.clip(x, min=min_val, max=max_val, out=x)
        # tr.clip(x, min=gain * -min_val, max=gain * max_val, out=x)
        return x
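

# Note: ProfilingModel.forward() currently passes the audio through unchanged (the
# in-place clipping variants are left commented out above), so the profile below
# presumably measures SampleQueueWrapper and wrapper overhead rather than DSP cost;
# uncommenting one of the variants would profile the clipper itself.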


class ProfilingModelWrapper(WaveformToWaveformBase):
    def get_model_name(self) -> str:
        return "clipper"

    def get_model_authors(self) -> List[str]:
        return ["Andrew Fyfe"]

    def get_model_short_description(self) -> str:
        return "Audio clipper."

    def get_model_long_description(self) -> str:
        return "Clips the input audio between -1 and 1."

    def get_technical_description(self) -> str:
        return "Clips the input audio between -1 and 1."

    def get_technical_links(self) -> Dict[str, str]:
        return {
            "Code": "https://github.com/QosmoInc/neutone_sdk/blob/main/examples/neutone_fx/example_clipper.py"
        }

    def get_tags(self) -> List[str]:
        return ["clipper"]

    def get_model_version(self) -> str:
        return "1.0.0"

    def is_experimental(self) -> bool:
        return False

    def get_neutone_parameters(self) -> List[NeutoneParameter]:
        return [
            NeutoneParameter("min", "min clip threshold", default_value=0.15),
            NeutoneParameter("max", "max clip threshold", default_value=0.15),
            NeutoneParameter("gain", "scale clip threshold", default_value=1.0),
        ]

    @tr.jit.export
    def is_input_mono(self) -> bool:
        return False

    @tr.jit.export
    def is_output_mono(self) -> bool:
        return False

    @tr.jit.export
    def get_native_sample_rates(self) -> List[int]:
        return [48000]

    @tr.jit.export
    def get_native_buffer_sizes(self) -> List[int]:
        return [512]

    def get_look_behind_samples(self) -> int:
        return 0

    # def aggregate_params(self, param: Tensor) -> Tensor:
    #     return param  # We want sample-level control, so no aggregation

    def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
        min_val, max_val, gain = params["min"], params["max"], params["gain"]
        x = self.model.forward(x, min_val, max_val, gain)
        x = x[:, self.get_look_behind_samples() :]
        return x


if __name__ == "__main__":
    model = ProfilingModel()
    wrapper = ProfilingModelWrapper(model)
    sqw = SampleQueueWrapper(wrapper)
    profile_sqw(sqw, daw_sr=48000, n_iters=100, convert_to_torchscript=True)
--------------------------------------------------------------------------------
/testing/test_queues.py:
--------------------------------------------------------------------------------
import logging
import os
import random

import torch as tr
from tqdm import tqdm

from neutone_sdk import CircularInplaceTensorQueue

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


def test_circular_queue() -> None:
    trials = 100
    iters = 100
    max_queue_len = 19
    random.seed(26)
    for _ in tqdm(range(trials)):
        in_list = []
        out_list = []
        queue_len = random.randint(1, max_queue_len)
        queue = CircularInplaceTensorQueue(1, queue_len)
        for idx in range(iters):
            if not queue.is_full():
                block = tr.full((1, random.randint(1, queue_len - queue.size)), idx + 1)
                queue.push(block)
                in_list += block[0, :].tolist()

            if not queue.is_empty():
                block = tr.zeros((1, random.randint(1, queue.size)))
                queue.pop(block)
                out_list += block[0, :].int().tolist()

            assert len(in_list) >= len(out_list)
            assert in_list[: len(out_list)] == out_list
            assert queue.size == len(in_list) - len(out_list)
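

# A minimal usage sketch of the queue API exercised above (illustrative, not part
# of the original test suite). It assumes only behavior visible in these tests:
# the constructor takes (num_channels, max_size) positionally, push() appends
# samples in place, and pop() fills a caller-provided buffer and returns the
# number of samples actually written.
def _circular_queue_usage_sketch() -> None:
    queue = CircularInplaceTensorQueue(1, 8)
    queue.push(tr.full((1, 3), 7))  # queue now holds 3 samples
    assert queue.size == 3
    out = tr.zeros((1, 2))
    n_popped = queue.pop(out)  # pops the 2 oldest samples into `out`
    assert n_popped == 2
    assert queue.size == 1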
--------------------------------------------------------------------------------
/testing/test_sandwiches.py:
--------------------------------------------------------------------------------
import logging
import math
import os
import random

import torch as tr
import torch.nn.functional as F
from tqdm import tqdm

from neutone_sdk.sandwich import (
    LinearResampler,
    InplaceLinearResampler,
    Inplace4pHermiteResampler,
)

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


def test_linear_resamplers(
    n_trials: int = 1000, in_n_ch: int = 2, out_n_ch: int = 2
) -> None:
    random.seed(42)
    tr.manual_seed(42)
    sampling_rates = [16000, 22050, 32000, 44100, 48000, 88200, 96000]
    buffer_sizes = [64, 128, 256, 512, 1024, 2048]

    resampler = LinearResampler(48000, 48000, 512)
    inplace_resampler = InplaceLinearResampler(in_n_ch, out_n_ch, 48000, 48000, 512)

    for _ in tqdm(range(n_trials)):
        sr_a = random.choice(sampling_rates)
        sr_b = random.choice(sampling_rates)
        in_bs = random.choice(buffer_sizes)

        resampler.set_sample_rates(sr_a, sr_b, in_bs)
        inplace_resampler.set_sample_rates(sr_a, sr_b, in_bs)
        # Check inplace linear resampler internal values are correct for matching the ends exactly
        assert inplace_resampler.x_in[0] == 0.0
        assert inplace_resampler.x_in[-1] == 0.0 or inplace_resampler.x_in[-1] == 1.0

        in_audio = tr.rand((in_n_ch, in_bs))
        in_linear = resampler.process_in(in_audio)
        in_linear_inplace = inplace_resampler.process_in(in_audio)
        out_bs = in_linear.size(1)
        assert in_linear.shape == in_linear_inplace.shape

        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(in_linear[:, 1:-1], in_linear_inplace[:, 1:-1], atol=1e-6)
        assert tr.allclose(
            in_linear[:, [0, -1]], in_linear_inplace[:, [0, -1]], atol=1e-3
        )
        in_interpolated = F.interpolate(
            in_audio.unsqueeze(0), out_bs, mode="linear", align_corners=True
        ).squeeze(0)
        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(
            in_linear_inplace[:, 1:-1], in_interpolated[:, 1:-1], atol=1e-6
        )
        assert tr.allclose(
            in_linear_inplace[:, [0, -1]], in_interpolated[:, [0, -1]], atol=1e-3
        )
        # Check that the ends match exactly
        assert tr.equal(in_linear_inplace[:, [0, -1]], in_audio[:, [0, -1]])

        out_audio = tr.rand((out_n_ch, out_bs))
        out_linear = resampler.process_out(out_audio)
        out_linear_inplace = inplace_resampler.process_out(out_audio)
        assert out_linear.shape == out_linear_inplace.shape
        assert out_linear.size(1) == in_bs

        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(out_linear[:, 1:-1], out_linear_inplace[:, 1:-1], atol=1e-6)
        assert tr.allclose(
            out_linear[:, [0, -1]], out_linear_inplace[:, [0, -1]], atol=1e-3
        )
        out_interpolated = F.interpolate(
            out_audio.unsqueeze(0), in_bs, mode="linear", align_corners=True
        ).squeeze(0)
        # PyTorch interpolation does not match the ends exactly, hence two asserts
        assert tr.allclose(
            out_linear_inplace[:, 1:-1], out_interpolated[:, 1:-1], atol=1e-6
        )
        assert tr.allclose(
            out_linear_inplace[:, [0, -1]], out_interpolated[:, [0, -1]], atol=1e-3
        )
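        # (Added clarification.) The inplace resampler pins its interpolation grid
        # to the endpoints (x_in[0] == 0.0 and x_in[-1] in {0.0, 1.0}, asserted at
        # the top of this loop), so unlike F.interpolate the first and last samples
        # should pass through bit-exactly, which the next assert verifies.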
        # Check that the ends match exactly
        assert tr.equal(out_linear_inplace[:, [0, -1]], out_audio[:, [0, -1]])


def _calc_4p_hermite(x: float, y_m1: float, y0: float, y1: float, y2: float) -> float:
    # Slow, sample-by-sample reference implementation of 4-point cubic Hermite
    # interpolation; the fast vectorized version is tested against it below
    c0 = y0
    c1 = 0.5 * (y1 - y_m1)
    c2 = y_m1 - 2.5 * y0 + 2.0 * y1 - 0.5 * y2
    c3 = 0.5 * (y2 - y_m1) + 1.5 * (y0 - y1)
    return ((c3 * x + c2) * x + c1) * x + c0
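

# (Added clarification.) The coefficients above are the standard 4-point,
# 3rd-order Hermite solution: the cubic p(x) = ((c3 * x + c2) * x + c1) * x + c0
# satisfies p(0) = y0 and p(1) = y1 (it passes through the two middle points),
# with slopes p'(0) = (y1 - y_m1) / 2 and p'(1) = (y2 - y0) / 2
# (central differences), which is what makes consecutive interpolation
# segments join smoothly.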


def test_4p_hermite_resampler(
    n_trials: int = 50, in_n_ch: int = 2, out_n_ch: int = 2
) -> None:
    random.seed(42)
    tr.manual_seed(42)
    sampling_rates = [16000, 22050, 32000, 44100, 48000, 88200, 96000]
    buffer_sizes = [64, 128, 256, 512, 1024, 2048]

    resampler = Inplace4pHermiteResampler(in_n_ch, out_n_ch, 48000, 48000, 512)

    for _ in tqdm(range(n_trials)):
        sr_a = random.choice(sampling_rates)
        sr_b = random.choice(sampling_rates)
        in_bs = random.choice(buffer_sizes)

        resampler.set_sample_rates(sr_a, sr_b, in_bs)
        out_bs = resampler.out_bs

        # Check inplace resampler internal values are correct for matching the first sample
        assert resampler.x_in[0] == 0.0
        assert resampler.x_in[-1] == 0.0 or resampler.x_in[-1] == 1.0

        # Check process_in()
        in_audio = tr.rand((in_n_ch, in_bs))
        in_resampled = resampler.process_in(in_audio)
        assert in_resampled.size(0) == in_n_ch
        # Check that the first sample is equal to the input audio
        assert tr.equal(in_resampled[:, 0], in_audio[:, 0])
        # Check that the last sample is reasonably close to the input audio
        assert tr.allclose(in_resampled[:, -1], in_audio[:, -1], atol=1e-3)

        # Check the 4p cubic hermite spline calculation element-wise
        x = resampler.x_in
        y_m1 = tr.index_select(in_audio, dim=1, index=resampler.y_m1_idx_in)
        y0 = tr.index_select(in_audio, dim=1, index=resampler.y0_idx_in)
        y1 = tr.index_select(in_audio, dim=1, index=resampler.y1_idx_in)
        y2 = tr.index_select(in_audio, dim=1, index=resampler.y2_idx_in)

        for ch_idx in range(in_n_ch):
            for x_idx in range(out_bs):
                y_calc = _calc_4p_hermite(
                    x[x_idx],
                    y_m1[ch_idx, x_idx],
                    y0[ch_idx, x_idx],
                    y1[ch_idx, x_idx],
                    y2[ch_idx, x_idx],
                )
                assert math.isclose(y_calc, in_resampled[ch_idx, x_idx], abs_tol=1e-6)

        # TODO(cm): remove duplication
        # Check process_out()
        out_audio = tr.rand((out_n_ch, out_bs))
        out_resampled = resampler.process_out(out_audio)
        assert out_resampled.size(0) == out_n_ch
        # Check that the first sample is equal to the input audio
        assert tr.equal(out_resampled[:, 0], out_audio[:, 0])
        # Check that the last sample is reasonably close to the input audio
        assert tr.allclose(out_resampled[:, -1], out_audio[:, -1], atol=1e-3)

        # Check the 4p cubic hermite spline calculation element-wise
        x = resampler.x_out
        y_m1 = tr.index_select(out_audio, dim=1, index=resampler.y_m1_idx_out)
        y0 = tr.index_select(out_audio, dim=1, index=resampler.y0_idx_out)
        y1 = tr.index_select(out_audio, dim=1, index=resampler.y1_idx_out)
        y2 = tr.index_select(out_audio, dim=1, index=resampler.y2_idx_out)

        for ch_idx in range(out_n_ch):
            for x_idx in range(in_bs):
                y_calc = _calc_4p_hermite(
                    x[x_idx],
                    y_m1[ch_idx, x_idx],
                    y0[ch_idx, x_idx],
                    y1[ch_idx, x_idx],
                    y2[ch_idx, x_idx],
                )
                assert math.isclose(y_calc, out_resampled[ch_idx, x_idx], abs_tol=1e-6)
--------------------------------------------------------------------------------
/testing/test_sqw.py:
--------------------------------------------------------------------------------
import itertools
import logging
import math
import os
import random
from typing import Dict, List, Optional

import torch as tr
import torch.nn as nn
from torch import Tensor
from tqdm import tqdm

from neutone_sdk import WaveformToWaveformBase, SampleQueueWrapper

logging.basicConfig()
log = logging.getLogger(__name__)
log.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))


class TestModel(nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        return x


class TestModelWrapper(WaveformToWaveformBase):
    def __init__(
        self,
        model: nn.Module = TestModel(),
        model_sr: int = 48000,
        model_bs: int = 512,
        use_debug_mode: bool = True,
    ) -> None:
        super().__init__(model, use_debug_mode)
        self.model_sr = model_sr
        self.model_bs = model_bs

    def get_model_name(self) -> str:
        return "test"

    def get_model_authors(self) -> List[str]:
        return ["Christopher Mitcheltree"]

    def get_model_short_description(self) -> str:
        return "Testing."

    def get_model_long_description(self) -> str:
        return "Testing."

    def get_technical_description(self) -> str:
        return "Testing."

    def get_tags(self) -> List[str]:
        return ["test"]

    def get_model_version(self) -> str:
        return "1.0.0"

    def is_experimental(self) -> bool:
        return True

    @tr.jit.export
    def is_input_mono(self) -> bool:
        return False

    @tr.jit.export
    def is_output_mono(self) -> bool:
        return False

    @tr.jit.export
    def get_native_sample_rates(self) -> List[int]:
        return [self.model_sr]

    @tr.jit.export
    def get_native_buffer_sizes(self) -> List[int]:
        return [self.model_bs]

    def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor:
        return self.model.forward(x)


def check_saturation_n(io_bs: int, model_bs: int, saturation_n: int) -> bool:
    size_in = saturation_n
    size_out = 0
    for _ in range(math.lcm(io_bs, model_bs)):
        while size_in >= model_bs:
            size_in -= model_bs
            size_out += model_bs
        if size_out < io_bs:
            return False
        else:
            size_out -= io_bs
        assert size_in >= 0
        assert size_out >= 0
        size_in += io_bs
    return True


def find_saturation_n(io_bs: int, model_bs: int) -> Optional[int]:
    lcm = math.lcm(io_bs, model_bs)
    for n in range(io_bs, lcm + 1, io_bs):
        if check_saturation_n(io_bs, model_bs, n):
            return n
    return None


def check_queue_saturation(io_bs: int, model_bs: int, saturation_n: int) -> bool:
    sr = 48000
    wrapper = TestModelWrapper(model_sr=sr, model_bs=model_bs)
    sqw = SampleQueueWrapper(
        wrapper, daw_sr=sr, daw_bs=io_bs, model_sr=sr, model_bs=model_bs
    )
    in_queue = sqw.in_queue
    out_queue = sqw.out_queue

    io_buffer = tr.zeros((2, io_bs))
    model_buffer = tr.zeros((2, model_bs))

    is_saturated = False
    audio_in = tr.rand((2, (io_bs * model_bs) + (2 * saturation_n)))
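    # (Added clarification.) io_bs * model_bs samples are at least
    # lcm(io_bs, model_bs) samples, i.e. one full phase cycle of the two block
    # sizes, and the 2 * saturation_n term adds headroom around the saturation
    # point being verified.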
    blocks_in = tr.split(audio_in, io_bs, dim=1)

    for block_in in blocks_in:
        if block_in.size(1) != io_bs:
            break

        assert in_queue.max_size - in_queue.size >= io_bs
        in_queue.push(block_in)

        if in_queue.size >= saturation_n:
            is_saturated = True

        while in_queue.size >= model_bs:
            in_popped_n = in_queue.pop(model_buffer)
            assert in_popped_n == model_bs
            assert out_queue.max_size - out_queue.size >= model_bs
            out_queue.push(model_buffer)

        if is_saturated:
            out_popped_n = out_queue.pop(io_buffer)
            if out_popped_n != io_bs:
                return False

    return True


def delay_test(
    wrapper: TestModelWrapper,
    sqw: SampleQueueWrapper,
    daw_sr: int,
    daw_bs: int,
    model_sr: int,
    model_bs: int,
) -> None:
    wrapper.model_sr = model_sr
    wrapper.model_bs = model_bs
    sqw.set_daw_sample_rate_and_buffer_size(daw_sr, daw_bs)
    expected_delay = sqw.calc_buffering_delay_samples()
    assert expected_delay >= 0

    n_samples = expected_delay + (2 * max(daw_bs, model_bs))
    audio_in = tr.rand((2, n_samples))
    blocks_in = tr.split(audio_in, daw_bs, dim=1)
    blocks_out = []

    for block_in in blocks_in:
        if block_in.size(1) != daw_bs:
            break
        block_out = sqw.forward(block_in)
        block_out = tr.clone(block_out)
        blocks_out.append(block_out)

    audio_out = tr.cat(blocks_out, dim=1)

    actual_delay_l = tr.nonzero(audio_out[0, :])[0].item()
    actual_delay_r = tr.nonzero(audio_out[1, :])[0].item()
    assert actual_delay_l == actual_delay_r
    actual_delay = actual_delay_r
    assert expected_delay == actual_delay, (
        f"expected = {expected_delay}, actual_delay = {actual_delay} | "
        f"{daw_sr}, {daw_bs}, {model_sr}, {model_bs}"
    )


def test_calc_saturation_n() -> None:
    # random.seed(42)
    # tr.manual_seed(42)
    # io_buffer_sizes = [random.randrange(32, 2048) for _ in range(16)]
    # model_buffer_sizes = [random.randrange(32, 2048) for _ in range(16)]

    io_buffer_sizes = list(range(2, 256))
    model_buffer_sizes = list(range(2, 256))

    log.info(f"io_buffer_sizes: {io_buffer_sizes}")
    log.info(f"model_buffer_sizes: {model_buffer_sizes}")

    for io_bs, model_bs in tqdm(itertools.product(io_buffer_sizes, model_buffer_sizes)):
        calculated_n = SampleQueueWrapper.calc_saturation_n(io_bs, model_bs)
        found_n = find_saturation_n(io_bs, model_bs)
        assert (
            found_n is not None
        ), f"Could not find a saturation_n. io_bs = {io_bs}, model_bs = {model_bs}"
        assert found_n % io_bs == 0
        assert (
            calculated_n == found_n
        ), f"io_bs = {io_bs}, model_bs = {model_bs}, calculated_n = {calculated_n}, found_n = {found_n}"
        assert check_queue_saturation(io_bs, model_bs, found_n)

    log.info("No saturation inconsistencies found")


def test_calc_buffering_delay_samples() -> None:
    wrapper = TestModelWrapper()
    sqw = SampleQueueWrapper(wrapper)

    sampling_rates = [16000, 22050, 32000, 44100, 48000, 88200, 96000]
    buffer_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096]

    # random.seed(42)
    # tr.manual_seed(42)
    # buffer_sizes = [random.randrange(32, 4096) for _ in range(50)]

    log.info(f"Sampling rates: {sampling_rates}")
    log.info(f"Buffer sizes: {buffer_sizes}")

    for daw_sr, daw_bs, model_sr, model_bs in tqdm(
        itertools.product(sampling_rates, buffer_sizes, sampling_rates, buffer_sizes)
    ):
        delay_test(wrapper, sqw, daw_sr, daw_bs, model_sr, model_bs)

    log.info("No delay inconsistencies found")
--------------------------------------------------------------------------------
/testing/torchscript_test.py:
--------------------------------------------------------------------------------
from typing import Dict, Any

import torch as tr
from torch import Tensor as T, nn


class TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.a = "hi"
        self.b = 1
        self.c = False
        self.d = 3.14

    @tr.jit.export
    def get_metadata(self) -> Dict[str, Any]:
        return {
            "a": self.a,
            "b": self.b,
            "c": self.c,
            "d": self.d,
        }

    def forward(self, x: T) -> T:
        return 2 * x


if __name__ == "__main__":
    audio = tr.randn(1, 1, 5)
    model = TestModel()
    model.eval()
    out = model(audio)
    print(out)
    print(model.get_metadata())
    scripted_model = tr.jit.script(model)
    out2 = scripted_model(audio)
    print(out2)
    print(scripted_model.get_metadata())
--------------------------------------------------------------------------------
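
# A closing note on testing/torchscript_test.py above: tr.jit.script is used rather
# than tr.jit.trace because scripting compiles the module from source, which is what
# preserves the @tr.jit.export'ed get_metadata() method (with its Dict[str, Any]
# return annotation) on the compiled module; tracing would only record the tensor
# operations performed in forward().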