├── .env ├── .gitmodules ├── LICENSE ├── README.md ├── example_sound.wav ├── export.py ├── neutone_models ├── ddspXsynth.nm └── ddspXviolin.nm ├── neutone_wrapper.py ├── sms_utils.py ├── xsynth.py └── xsynth_utils.py /.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH="realtimeDDSP" -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "realtimeDDSP"] 2 | path = realtimeDDSP 3 | url = https://github.com/hyakuchiki/realtimeDDSP.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neutone Model for Cross-Synthesis with DDSP 2 | 3 | [neutone](https://neutone.space/) models that perform a form of [cross-synthesis](https://ccrma.stanford.edu/~jos/sasp/Cross_Synthesis.html) between an input audio signal and the output of a DDSP timbre transfer model. The main use case is to alter the timbre of vocals while preserving the lyrics, but of course any creative "misuse" is encouraged. 4 | 5 | See [here](https://youtu.be/1xAAkqWc4cc) for a video demonstration. 6 | 7 | ## Using the Neutone Cross-Synthesis Models 8 | 9 | 1. Install the [neutone](https://neutone.space/) plugin. 10 | 11 | 2. Download the models from the `neutone_models` folder and load them into the neutone plugin. 12 | 13 | The four custom knobs are: 14 | 15 | **A**. Pitch shift of the input signal. 
This is useful to align the input signal with the timbre model. 0.5 (knob at 12 o'clock) is no shift. 16 | 17 | **B**. Amount of cross-synthesis. 0 only uses the input harmonics, 1 only uses the model harmonics. In between, the harmonics are mixed together. 18 | 19 | **C**. Volume of the filtered noise generated by the DDSP model. At 0.5, the noise is at "default" volume. At 0, the noise is completely muted. 20 | 21 | **D**. Formant shift of the input signal. This is an additional tool to alter the sound. 0.5 (knob at 12 o'clock) is no shift. 22 | 23 | ## Creating Your Own 24 | 25 | The two models provided were trained on violin recordings and the [Helm](https://tytel.org/helm/) synthesizer. If you want a cross-synthesis model trained on your own audio, you can follow these steps: 26 | 27 | 1. Train a DDSP timbre transfer model. Either follow the instructions in the [realtimeDDSP](https://github.com/hyakuchiki/realtimeDDSP) repository, or use the corresponding [Colab Notebook](https://colab.research.google.com/drive/15FuafmtGWEyvTOOQbN1AMIQRhGLy23Pg). 28 | 29 | 2. If you're satisfied with the timbre model, download the checkpoint file. 30 | 31 | 3. Clone this repository and install the dependencies. 32 | ``` 33 | git clone https://github.com/dsuedholt/ddsp_xsynth 34 | cd ddsp_xsynth 35 | pip install -r realtimeDDSP/requirements.txt 36 | ``` 37 | 38 | 4. Export the model checkpoint to a neutone model. 39 | ``` 40 | python export.py PATH_TO_CKPT EXPORT_NAME [--dataset_name="..."
--author_name="..."] 41 | ``` 42 | -------------------------------------------------------------------------------- /example_sound.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsuedholt/ddsp_xsynth/f5d90ae2f39a7fef824803412e54b7c5090c5dd4/example_sound.wav -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | import os, logging, argparse, sys 2 | from pathlib import Path 3 | 4 | import torch 5 | 6 | sys.path.append( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "realtimeDDSP") 8 | ) 9 | 10 | from diffsynth.model import EstimatorSynth 11 | from diffsynth.modules.generators import FilteredNoise, Harmonic 12 | from diffsynth.modules.reverb import IRReverb 13 | from diffsynth.synthesizer import Synthesizer 14 | from diffsynth.stream import StreamIRReverb, StreamFilteredNoise 15 | from diffsynth.processor import Mix 16 | 17 | from realtimeDDSP.diffsynth.stream import replace_modules 18 | from neutone_sdk.audio import ( 19 | AudioSample, 20 | AudioSamplePair, 21 | render_audio_sample, 22 | ) 23 | from neutone_sdk.utils import save_neutone_model 24 | import torchaudio 25 | 26 | from neutone_wrapper import DDSPXSynthWrapper 27 | from xsynth import DDSPXSynth, XSynthStreamHarmonic 28 | 29 | 30 | logging.basicConfig() 31 | log = logging.getLogger(__name__) 32 | log.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) 33 | 34 | 35 | def get_stream_synth(synth): 36 | new_ps = [] 37 | new_cs = [] 38 | conditioned = synth.conditioned_params 39 | for proc, conn in zip(synth.processors, synth.connections): 40 | if isinstance(proc, Harmonic): 41 | # Replace with streamable version of harmonic synthesizer 42 | new_ps.append( 43 | XSynthStreamHarmonic( 44 | proc.sample_rate, 45 | proc.normalize_below_nyquist, 46 | proc.name, 47 | proc.n_harmonics, 48 | proc.freq_range, 49 | ) 
50 | ) 51 | conn_harm = dict(conn) 52 | conn_harm["harm_xsynth"] = "harm_xsynth" 53 | conn_harm["input_distribution"] = "input_distribution" 54 | new_cs.append(conn_harm) 55 | conditioned.extend(["harm_xsynth", "input_distribution"]) 56 | elif isinstance(proc, FilteredNoise): 57 | # Replace with streamable version of noise synthesizer 58 | new_ps.append( 59 | StreamFilteredNoise(proc.filter_size, proc.name, proc.amplitude) 60 | ) 61 | new_cs.append(conn) 62 | elif isinstance(proc, IRReverb): 63 | # remove learned reverb 64 | pass 65 | elif proc.name == "add": 66 | # Replace add module with mix module for adjusting harm/noise 67 | new_ps.append(Mix(proc.name)) 68 | conn_mix = dict(conn) 69 | conn_mix["mix_a"] = "harmmix" 70 | conn_mix["mix_b"] = "noisemix" 71 | new_cs.append(conn_mix) 72 | conditioned.extend(["harmmix", "noisemix"]) 73 | else: 74 | new_ps.append(proc) 75 | new_cs.append(conn) 76 | synth.processors = torch.nn.ModuleList(new_ps) 77 | # make new synth 78 | dag = [(proc, conn) for proc, conn in zip(new_ps, new_cs)] 79 | new_synth = Synthesizer(dag, conditioned=conditioned) 80 | return new_synth 81 | 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument("ckpt", type=str, help="") 86 | parser.add_argument("output", type=str, help="model output name") 87 | parser.add_argument("--folder", default="./exports", help="output folder") 88 | parser.add_argument("--dataset_name", type=str, default="example") 89 | parser.add_argument("--author_name", type=str, default="Author Name") 90 | args = parser.parse_args() 91 | root_dir = Path(args.folder) / args.output 92 | 93 | model = EstimatorSynth.load_from_checkpoint(args.ckpt) 94 | replace_modules(model.estimator) 95 | 96 | # get streamable hpnir synth with mix parameters 97 | model.synth = get_stream_synth(model.synth) 98 | stream_model = DDSPXSynth(model.estimator, model.synth, 48000, hop_size=960) 99 | dummy = torch.zeros(1, 2048) 100 | _ = stream_model( 101 | dummy, 
102 | torch.ones(1), 103 | { 104 | "harm_xsynth": torch.ones(1), 105 | "harmmix": torch.ones(1), 106 | "noisemix": torch.ones(1), 107 | "formant": torch.ones(1), 108 | }, 109 | ) 110 | wrapper = DDSPXSynthWrapper( 111 | stream_model, author_name=args.author_name, dataset_name=args.dataset_name 112 | ) 113 | 114 | wave, sr = torchaudio.load("example_sound.wav") 115 | input_sample = AudioSample(wave, sr) 116 | params = ( 117 | torch.tensor([0.5, 0.75, 0.25, 0.5]).repeat((stream_model.window_size, 1)).T 118 | ) 119 | rendered_sample = render_audio_sample(wrapper, input_sample, params=params) 120 | soundpairs = [AudioSamplePair(input_sample, rendered_sample)] 121 | 122 | save_neutone_model( 123 | wrapper, 124 | root_dir, 125 | freeze=False, 126 | dump_samples=True, 127 | submission=True, 128 | audio_sample_pairs=soundpairs, 129 | ) 130 | -------------------------------------------------------------------------------- /neutone_models/ddspXsynth.nm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsuedholt/ddsp_xsynth/f5d90ae2f39a7fef824803412e54b7c5090c5dd4/neutone_models/ddspXsynth.nm -------------------------------------------------------------------------------- /neutone_models/ddspXviolin.nm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsuedholt/ddsp_xsynth/f5d90ae2f39a7fef824803412e54b7c5090c5dd4/neutone_models/ddspXviolin.nm -------------------------------------------------------------------------------- /neutone_wrapper.py: -------------------------------------------------------------------------------- 1 | from neutone_sdk import WaveformToWaveformBase, NeutoneParameter 2 | from typing import List, Dict 3 | 4 | import torch 5 | from torch import nn 6 | from torch import Tensor 7 | 8 | 9 | class DDSPXSynthWrapper(WaveformToWaveformBase): 10 | def __init__( 11 | self, 12 | model: nn.Module, 13 | use_debug_mode: bool = True, 14 | 
dataset_name: str = "example", 15 | author_name: str = "Author Name", 16 | ) -> None: 17 | super().__init__(model, use_debug_mode) 18 | self.author_name = [author_name] 19 | 20 | self.dataset_name = dataset_name 21 | 22 | def is_input_mono(self) -> bool: 23 | return True 24 | 25 | def is_output_mono(self) -> bool: 26 | return True 27 | 28 | def get_native_sample_rates(self) -> List[int]: 29 | return [self.model.sample_rate] 30 | 31 | def get_native_buffer_sizes(self) -> List[int]: 32 | return [self.model.window_size] 33 | 34 | def get_model_name(self) -> str: 35 | return "ddspX" + self.dataset_name 36 | 37 | def get_model_authors(self) -> List[str]: 38 | return self.author_name 39 | 40 | def get_model_short_description(self) -> str: 41 | return f"DDSP cross-synthesis model trained on {self.dataset_name} data" 42 | 43 | def get_model_long_description(self) -> str: 44 | return f"A DDSP timbre transfer model trained on {self.dataset_name} data that performs cross-synthesis between the input signal and the DDSP output." 45 | 46 | def get_technical_description(self) -> str: 47 | return f"A DDSP timbre transfer model trained on {self.dataset_name} data that performs cross-synthesis between the input signal and the DDSP output." 48 | 49 | def get_tags(self) -> List[str]: 50 | return ["timbre transfer", "DDSP", "cross synthesis", self.dataset_name] 51 | 52 | def get_model_version(self) -> str: 53 | return "0.1.0" 54 | 55 | def is_experimental(self) -> bool: 56 | return True 57 | 58 | def get_citation(self) -> str: 59 | return """ 60 | Engel, J., Hantrakul, L., Gu, C., & Roberts, A. (2020). DDSP: Differentiable Digital Signal Processing. ICLR. 
61 | """ 62 | 63 | def get_neutone_parameters(self) -> List[NeutoneParameter]: 64 | return [ 65 | NeutoneParameter( 66 | name="Pitch Shift", 67 | description="Apply pitch shift (-24 to +24 semitones)", 68 | default_value=0.5, 69 | ), 70 | NeutoneParameter( 71 | name="Cross-Synth", 72 | description="How much the input signal is mixed into harmonics", 73 | default_value=0.5, 74 | ), 75 | NeutoneParameter( 76 | name="Noise", 77 | description="Volume of the filtered noise", 78 | default_value=0.5, 79 | ), 80 | NeutoneParameter( 81 | name="Formant", description="Formant Shift", default_value=0.5 82 | ), 83 | ] 84 | 85 | @torch.no_grad() 86 | def do_forward_pass(self, x: Tensor, params: Dict[str, Tensor]) -> Tensor: 87 | MAX_SHIFT = 24 88 | pshift = (params["Pitch Shift"] - 0.5) * 2 * MAX_SHIFT 89 | semishift = torch.round(pshift) 90 | f0_mult = 2 ** (semishift / 12) 91 | 92 | harm_xsynth = params["Cross-Synth"] 93 | noise_xsynth = params["Noise"] 94 | formant = params["Formant"] 95 | cond_params = { 96 | "harm_xsynth": harm_xsynth, 97 | "harmmix": torch.ones(1) * 0.5, 98 | "noisemix": noise_xsynth, 99 | "formant": formant, 100 | } 101 | 102 | out, data = self.model(x, f0_mult=f0_mult, param=cond_params) 103 | return out 104 | -------------------------------------------------------------------------------- /sms_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # These functions are adapted from the sms-tools library for Spectral Modeling Synthesis: 4 | # https://github.com/MTG/sms-tools/blob/master/software/models/utilFunctions.py 5 | 6 | # Modifications to the original code: 7 | # - replaced numpy calls with torch calls 8 | # - removed phase calculations, not needed for this application 9 | 10 | 11 | def detect_peaks(mX, t=torch.tensor(-95.0, dtype=torch.float32)): 12 | """ 13 | Detect spectral peak locations 14 | mX: magnitude spectrum (dB), t: threshold 15 | returns ploc: peak locations 16 | """ 17 | 18 | 
z = torch.zeros_like(mX[1:-1]) 19 | 20 | thresh = torch.where( 21 | torch.greater(mX[1:-1], t), mX[1:-1], z 22 | ) # locations above threshold 23 | next_minor = torch.where( 24 | mX[1:-1] > mX[2:], mX[1:-1], z 25 | ) # locations higher than the next one 26 | prev_minor = torch.where( 27 | mX[1:-1] > mX[:-2], mX[1:-1], z 28 | ) # locations higher than the previous one 29 | ploc = thresh * next_minor * prev_minor # locations fulfilling the three criteria 30 | ploc = ploc.nonzero()[:, 0] + 1 # add 1 to compensate for previous steps 31 | return ploc 32 | 33 | 34 | def interpolate_peaks(mX, ploc): 35 | """ 36 | Interpolate peak values using parabolic interpolation 37 | mX: magnitude spectrum, ploc: locations of peaks 38 | returns iploc, ipmag: interpolated peak location and magnitude values 39 | """ 40 | 41 | val = mX[ploc] # magnitude of peak bin 42 | lval = mX[ploc - 1] # magnitude of bin at left 43 | rval = mX[ploc + 1] # magnitude of bin at right 44 | iploc = ploc + 0.5 * (lval - rval) / (lval - 2 * val + rval) # center of parabola 45 | ipmag = val - 0.25 * (lval - rval) * (iploc - ploc) # magnitude of peaks 46 | return iploc, ipmag 47 | 48 | 49 | def detect_harmonics( 50 | pfreq: torch.Tensor, 51 | pmag: torch.Tensor, 52 | f0: torch.Tensor, 53 | nH: int, 54 | hfreqp: torch.Tensor, 55 | fs: int, 56 | harmDevSlope=torch.tensor(0.01, dtype=torch.float32), 57 | ): 58 | """ 59 | Detection of the harmonics of a frame from a set of spectral peaks using f0 60 | to the ideal harmonic series built on top of a fundamental frequency 61 | pfreq, pmag: peak frequencies and magnitude 62 | f0: fundamental frequency, nH: number of harmonics, 63 | hfreqp: harmonic frequencies of previous frame, 64 | fs: sampling rate; harmDevSlope: slope of change of the deviation allowed to perfect harmonic 65 | returns hfreq, hmag: harmonic frequencies and magnitudes 66 | """ 67 | 68 | if f0 <= 0: # if no f0 return no harmonics 69 | return torch.zeros(nH), torch.zeros(nH) 70 | hfreq = 
torch.zeros(nH) # initialize harmonic frequencies 71 | hmag = torch.zeros(nH) - 100 # initialize harmonic magnitudes 72 | hf = f0 * torch.arange(1, nH + 1) # initialize harmonic frequencies 73 | hi = 0 # initialize harmonic index 74 | if ( 75 | hfreqp.nonzero().numel() == 0 76 | ): # if no incoming harmonic tracks initialize to harmonic series 77 | hfreqp = hf 78 | while ( 79 | pfreq.numel() > 0 and (f0 > 0) and (hi < nH) and (hf[hi] < fs / 2) 80 | ): # find harmonic peaks 81 | pei = torch.argmin(abs(pfreq - hf[hi])) # closest peak 82 | dev1 = abs(pfreq[pei] - hf[hi]) # deviation from perfect harmonic 83 | dev2 = ( 84 | abs(pfreq[pei] - hfreqp[hi]) 85 | if hfreqp[hi] > 0 86 | else torch.tensor(fs, dtype=torch.float32) 87 | ) # deviation from previous frame 88 | threshold = f0 / 3 + harmDevSlope * pfreq[pei] 89 | if (dev1 < threshold) or ( 90 | dev2 < threshold 91 | ): # accept peak if deviation is small 92 | hfreq[hi] = pfreq[pei] # harmonic frequencies 93 | hmag[hi] = pmag[pei] # harmonic magnitudes 94 | hi += 1 # increase harmonic index 95 | return hfreq, hmag 96 | -------------------------------------------------------------------------------- /xsynth.py: -------------------------------------------------------------------------------- 1 | # The classes in this file are extensions of synthesizer classes implemented 2 | # by Naotake Masuda in realtimeDDSP: https://github.com/hyakuchiki/realtimeDDSP 3 | 4 | # Functionality for extracting input harmonics and interpolation between input 5 | # and model-generated harmonics is added. Because TorchScript currently does not 6 | # support super() calls, some of the original code is copied into the modified classes.
7 | 8 | import torch 9 | import torch.nn.functional as F 10 | import torchaudio 11 | from typing import Dict, Tuple 12 | 13 | from diffsynth.stream import ( 14 | CachedStreamEstimatorFLSynth, 15 | StreamHarmonic, 16 | StreamFilteredNoise, 17 | ) 18 | 19 | from diffsynth.f0 import yin_frame, FMIN, FMAX 20 | from diffsynth.spectral import spec_loudness, spectrogram 21 | 22 | import diffsynth.util as util 23 | import xsynth_utils 24 | 25 | 26 | class XSynthStreamHarmonic(StreamHarmonic): 27 | def __init__( 28 | self, 29 | sample_rate: int = 48000, 30 | normalize_below_nyquist: bool = True, 31 | name: str = "harmonic", 32 | n_harmonics: int = 256, 33 | freq_range: Tuple[float, float] = ..., 34 | batch_size: int = 1, 35 | ): 36 | super().__init__( 37 | sample_rate, 38 | normalize_below_nyquist, 39 | name, 40 | n_harmonics, 41 | freq_range, 42 | batch_size, 43 | ) 44 | self.param_sizes["harm_xsynth"] = 1 45 | self.param_range["harm_xsynth"] = (0.0, 1.0) 46 | self.param_types["harm_xsynth"] = "raw" 47 | self.param_sizes["input_distribution"] = n_harmonics 48 | self.param_range["input_distribution"] = (0.0, 1.0) 49 | self.param_types["input_distribution"] = "raw" 50 | 51 | def forward(self, params: Dict[str, torch.Tensor], n_samples: int): 52 | input_harmonics = params["input_distribution"] 53 | amplitudes = params["amplitudes"] 54 | harmonic_distribution = params["harmonic_distribution"] 55 | f0_hz = params["f0_hz"] 56 | # Bandlimit the harmonic distribution. 
        if self.normalize_below_nyquist:
            harmonic_frequencies = util.get_harmonic_frequencies(
                f0_hz, self.n_harmonics
            )
            harmonic_distribution = util.remove_above_nyquist(
                harmonic_frequencies, harmonic_distribution, self.sample_rate
            )

        # Normalize
        harmonic_distribution /= torch.sum(harmonic_distribution, dim=-1, keepdim=True)
        input_harmonics /= torch.sum(input_harmonics, dim=-1, keepdim=True)

        # crossfade between the model's harmonic distribution and the input's
        xsynth = params["harm_xsynth"]
        harmonic_distribution = (
            harmonic_distribution * (1 - xsynth) + input_harmonics * xsynth
        )

        # copy synth code from StreamHarmonic since torchscript doesn't support super() calls
        harmonic_amplitudes = amplitudes * harmonic_distribution
        # interpolate with previous params
        harmonic_frequencies = util.get_harmonic_frequencies(f0_hz, self.n_harmonics)
        harmonic_freqs = torch.cat([self.prev_freqs, harmonic_frequencies], dim=1)
        frequency_envelopes = util.resample_frames(harmonic_freqs, n_samples)
        harmonic_amps = torch.cat([self.prev_harm, harmonic_amplitudes], dim=1)
        amplitude_envelopes = util.resample_frames(harmonic_amps, n_samples)
        audio, last_phase = util.oscillator_bank_stream(
            frequency_envelopes,
            amplitude_envelopes,
            sample_rate=self.sample_rate,
            init_phase=self.phase,
        )
        self.phase = last_phase
        self.prev_harm = harmonic_amplitudes[:, -1:]
        self.prev_freqs = harmonic_frequencies[:, -1:]
        return audio


class DDSPXSynth(CachedStreamEstimatorFLSynth):
    def __init__(
        self,
        estimator,
        synth,
        sample_rate,
        hop_size=960,
        pitch_min=50.0,
        pitch_max=2000.0,
        n_harmonics=256,
    ):
        super().__init__(estimator, synth, sample_rate, hop_size, pitch_min, pitch_max)
        self.hann_win = torch.hann_window(self.window_size, periodic=False)
        self.prev_harm_freqs = torch.zeros(n_harmonics)
        self.n_harmonics = n_harmonics

    def forward(
        self, audio: torch.Tensor, f0_mult: torch.Tensor, param: Dict[str, torch.Tensor]
    ):
        with torch.no_grad():
            orig_len = audio.shape[-1]
            orig_audio = audio
            # input cache
            audio = torch.cat([self.input_cache.to(audio.device), audio], dim=-1)
            windows = util.slice_windows(
                audio, self.window_size, self.hop_size, pad=False
            )

            self.offset = self.hop_size - ((orig_len - self.offset) % self.hop_size)
            self.input_cache = audio[:, -(self.window_size - self.offset) :]

            f0 = yin_frame(windows, self.sample_rate, self.pitch_min, self.pitch_max)
            # loudness
            comp_spec = torch.fft.rfft(windows, dim=-1)
            loudness = spec_loudness(comp_spec, self.a_weighting)

            if f0[:, 0] == 0:
                # yin returns 0 for unvoiced/noisy frames; reuse the previous f0
                f0[:, 0] = self.prev_f0
                # alternatively, assume the frame is silent:
                # loudness[:, 0] = 0
            for i in range(1, f0.shape[1]):
                if f0[:, i] == 0:
                    f0[:, i] = f0[:, i - 1]
                    # loudness[:, i] = 0

            self.prev_f0 = f0[:, -1]
            # estimator
            f0 = f0_mult * f0
            x = {
                "f0": f0[:, :, None],
                "loud": loudness[:, :, None],
            }  # batch=1, n_frames=windows.shape[1], 1
            x.update(param)
            est_param = self.estimator(x)

            # get spectrogram of input audio
            # for simplicity we compute the harmonics for each input frame, not for each model frame
            input_spec = torch.abs(
                torch.fft.rfft(
                    torch.squeeze(orig_audio) * self.hann_win, norm="forward"
                )
            )

            # convert magnitude spectrum to db
            input_spec_db = 20 * torch.log10(input_spec + 1e-10)

            f0_orig = self.prev_f0[-1]
            f0_synth = f0[:, -1]

            # get frequencies and magnitudes of input harmonics
            self.prev_harm_freqs, harm_mags = xsynth_utils.detect_harmonics(
                input_spec_db,
                f0_orig,
                self.n_harmonics,
                self.prev_harm_freqs,
                self.sample_rate,
            )

            # get harmonic magnitudes for the harmonics of the synthesis f0
            input_dist = 10 ** (
                xsynth_utils.interpolate_harmonics(
                    self.prev_harm_freqs,
                    harm_mags,
                    f0_synth,
                    self.sample_rate,
                    param["formant"],
                )
                / 20
            )
            input_dist[
                torch.arange(1, self.n_harmonics + 1) * f0_synth > self.sample_rate / 2
            ] = 0
            x["input_distribution"] = input_dist

            params_dict = self.synth.fill_params(est_param, x)
            render_length = (
                windows.shape[1] * self.hop_size
            )  # last_of_prev_frame<->0th window<-...->last window

            resyn_audio, outputs = self.synth(params_dict, render_length)
            # output cache (delay)
            resyn_audio = torch.cat(
                [self.output_cache.to(audio.device), resyn_audio], dim=-1
            )
            if resyn_audio.shape[-1] > orig_len:
                self.output_cache = resyn_audio[:, orig_len:]
            resyn_audio = resyn_audio[:, :orig_len]
            return resyn_audio, (loudness, f0)
--------------------------------------------------------------------------------
/xsynth_utils.py:
--------------------------------------------------------------------------------
import sms_utils
import torch


def detect_harmonics(spec_db, f0, n_harmonics: int, prev_harm_freqs, sample_rate: int):
    peak_locs = sms_utils.detect_peaks(spec_db)
    ipeak_locs, ipeak_mags = sms_utils.interpolate_peaks(spec_db, peak_locs)
    nfft = (spec_db.numel() - 1) * 2
    ipeak_freqs = sample_rate * ipeak_locs / nfft

    return sms_utils.detect_harmonics(
        ipeak_freqs,
        ipeak_mags,
        f0,
        n_harmonics,
        prev_harm_freqs,
        sample_rate,
    )


def interpolate_harmonics(harm_freqs, harm_mags, f0_synth, sample_rate: int, formant):
    """
    Sample a given harmonic envelope at harmonics of a new fundamental frequency.
    This allows for timbre-preserving (formant-preserving) pitch shifts.
    Optional formant shift is applied by squashing or stretching the sampling intervals.

    Args:
        harm_freqs: frequencies of the detected harmonics in the input envelope
        harm_mags: magnitudes of the detected harmonics in the input envelope
        f0_synth: new fundamental frequency to sample the envelope at
        sample_rate: sample rate of the audio
        formant: formant shift factor (0.5 = no shift, 0 = full shift down, 1 = full shift up)

    Returns:
        synth_harm_mags: harmonic envelope to be used for additive synthesis
    """

    n_harmonics = harm_freqs.numel()

    present_harmonics = harm_freqs > 0

    if not present_harmonics.any():
        return torch.ones(n_harmonics) * -100

    harm_freqs = harm_freqs[present_harmonics]
    harm_mags = harm_mags[present_harmonics]

    # make sure harm_freqs are sorted, adjust harm_mags accordingly
    sort_idx = torch.argsort(harm_freqs)
    harm_freqs = harm_freqs[sort_idx]
    harm_mags = harm_mags[sort_idx]

    # insert a "pseudo-harmonic" at 0 Hz with magnitude -100 dB (silence) to make interpolation easier
    harm_freqs = torch.cat([torch.zeros(1), harm_freqs])
    harm_mags = torch.cat([torch.ones(1) * -100, harm_mags])

    formant_shift = 2 ** -(formant * 2 - 1)
    synth_harm_mags = torch.ones(n_harmonics) * -100
    synth_harm_freqs = f0_synth * torch.arange(1, n_harmonics + 1) * formant_shift

    # for each frequency in synth_harm_freqs, find the closest (higher) harmonic in harm_freqs
    freq_idxs = torch.searchsorted(harm_freqs, synth_harm_freqs, right=True)

    # discard frequencies that are higher than the highest harmonic in harm_freqs or above Nyquist
    valid_freqs = (freq_idxs < harm_freqs.numel()) & (
        synth_harm_freqs < sample_rate / 2
    )
    valid_idxs = freq_idxs[valid_freqs]

    # sample harm_mags with linear interpolation
    alphas = (synth_harm_freqs[valid_freqs] - harm_freqs[valid_idxs - 1]) / (
        harm_freqs[valid_idxs] - harm_freqs[valid_idxs - 1]
    )
    synth_harm_mags[valid_freqs] = harm_mags[valid_idxs - 1] + alphas * (
        harm_mags[valid_idxs] - harm_mags[valid_idxs - 1]
    )

    return synth_harm_mags
--------------------------------------------------------------------------------
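The envelope-sampling idea behind `interpolate_harmonics` can be shown without torch or the repo's modules. The sketch below is a simplified, dependency-free illustration (not part of the repository): `sample_envelope`, its argument values, and the tiny three-harmonic envelope are hypothetical, but it follows the same scheme as the code above: prepend a 0 Hz / -100 dB anchor, scale the sampling frequencies by a formant factor (`formant=0.5` means no shift), and linearly interpolate the magnitude envelope at each harmonic of the new f0.

```python
import bisect


def sample_envelope(harm_freqs, harm_mags, f0_synth, sample_rate,
                    formant=0.5, n_harmonics=8):
    """Sample a harmonic magnitude envelope (in dB) at harmonics of a new f0.

    harm_freqs must be sorted ascending; -100 dB stands for silence.
    """
    # prepend the 0 Hz "pseudo-harmonic" anchor so interpolation below the
    # first detected harmonic fades toward silence
    freqs = [0.0] + list(harm_freqs)
    mags = [-100.0] + list(harm_mags)

    # formant=0.5 -> factor 1 (no shift); 0 -> 2 (shift down); 1 -> 0.5 (shift up)
    shift = 2.0 ** -(formant * 2 - 1)

    out = []
    for k in range(1, n_harmonics + 1):
        f = f0_synth * k * shift
        # index of the first envelope point strictly above f
        # (stdlib analogue of torch.searchsorted(..., right=True))
        i = bisect.bisect_right(freqs, f)
        if i >= len(freqs) or f >= sample_rate / 2:
            out.append(-100.0)  # beyond the measured envelope or Nyquist: silence
            continue
        # linear interpolation between the bracketing envelope points
        alpha = (f - freqs[i - 1]) / (freqs[i] - freqs[i - 1])
        out.append(mags[i - 1] + alpha * (mags[i] - mags[i - 1]))
    return out


# toy envelope: harmonics at 100/200/300 Hz, magnitudes -10/-20/-30 dB,
# resampled at harmonics of a new f0 of 150 Hz
env = sample_envelope([100.0, 200.0, 300.0], [-10.0, -20.0, -30.0],
                      150.0, 48000, formant=0.5, n_harmonics=2)
print(env)  # -> [-15.0, -100.0]
```

Note that 300 Hz (the second harmonic of the new f0) lands exactly on the highest envelope point and is discarded, mirroring the `right=True` search plus the `freq_idxs < harm_freqs.numel()` check in the real implementation.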