├── .github
    └── workflows
    │   └── python-package.yml
├── LICENSE
├── README.md
├── Syntheon_Demo.ipynb
├── docs
    └── syntheon-logo.png
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
├── syntheon
    ├── __init__.py
    ├── converter
    │   ├── __init__.py
    │   ├── converter.py
    │   ├── dexed
    │   │   ├── __init__.py
    │   │   ├── dexed_constants.py
    │   │   └── dexed_converter.py
    │   └── vital
    │   │   ├── __init__.py
    │   │   ├── vital_constants.py
    │   │   └── vital_converter.py
    ├── inferencer
    │   ├── __init__.py
    │   ├── dexed
    │   │   ├── Dexed_01.syx
    │   │   ├── __init__.py
    │   │   ├── checkpoints
    │   │   │   └── state_best.pth
    │   │   ├── dexed_inferencer.py
    │   │   └── models
    │   │   │   ├── __init__.py
    │   │   │   ├── amp_utils.py
    │   │   │   ├── conf
    │   │   │       ├── __init__.py
    │   │   │       ├── data_config.yaml
    │   │   │       └── recipes
    │   │   │       │   ├── README.md
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── config.yaml
    │   │   │       │   ├── hyperparams
    │   │   │       │       ├── ddx7.yaml
    │   │   │       │       └── hpn.yaml
    │   │   │       │   └── model
    │   │   │       │       ├── __init__.py
    │   │   │       │       ├── hpn_baseline.yaml
    │   │   │       │       ├── tcnres_f0ld_fm1stack2.yaml
    │   │   │       │       ├── tcnres_f0ld_fm1stack4.yaml
    │   │   │       │       ├── tcnres_f0ld_fm2stack2.yaml
    │   │   │       │       ├── tcnres_f0ld_fmablbrass.yaml
    │   │   │       │       ├── tcnres_f0ld_fmablflute.yaml
    │   │   │       │       ├── tcnres_f0ld_fmbrss.yaml
    │   │   │       │       ├── tcnres_f0ld_fmflt.yaml
    │   │   │       │       ├── tcnres_f0ld_fmstr.yaml
    │   │   │       │       └── tcnres_f0ld_fmstr_noreverb.yaml
    │   │   │   ├── ddx7
    │   │   │       ├── __init__.py
    │   │   │       ├── core.py
    │   │   │       ├── data_utils
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── h5_dataset.py
    │   │   │       │   └── preprocessor.py
    │   │   │       ├── loss_functions.py
    │   │   │       ├── models.py
    │   │   │       ├── spectral_ops.py
    │   │   │       └── synth.py
    │   │   │   └── preprocessor.py
    │   ├── inferencer.py
    │   └── vital
    │   │   ├── __init__.py
    │   │   ├── checkpoints
    │   │       ├── __init__.py
    │   │       └── model.pt
    │   │   ├── config.yaml
    │   │   ├── init.vital
    │   │   ├── models
    │   │       ├── adsr_envelope.py
    │   │       ├── core.py
    │   │       ├── model.py
    │   │       ├── preprocessor.py
    │   │       ├── utils.py
    │   │       └── wavetable_synth.py
    │   │   └── vital_inferencer.py
    ├── main.py
    ├── utils
    │   └── pitch_extractor.py
    └── version.py
└── test
    ├── test_audio
        ├── dexed_test_audio_1.wav
        ├── vital_test_pluck_1.wav
        ├── vital_test_pluck_2.wav
        ├── vital_test_synth_1.wav
        ├── vital_test_synth_2.wav
        ├── vital_test_synth_3.wav
        └── vital_test_wonky_bass_1.wav
    └── test_inferencer.py


/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: syntheon
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "main" ]
 9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         python-version: ["3.9"]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v3
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         python -m pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |     - name: Test with pytest
33 |       run: |
34 |         python -m pytest -s
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright 2022 Syntheon
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![syntheon_logo](docs/syntheon-logo.png)
 2 | 
 3 | # Syntheon
 4 | 
 5 | Syntheon - [Pantheon](https://en.wikipedia.org/wiki/Pantheon,_Rome) for music synthesizers. 
 6 | 
 7 | Syntheon provides **parameter inference** for music synthesizers using *deep learning models*. Given an audio sample, Syntheon infers the best parameter preset for a given synthesizer that can recreate the audio sample. 
 8 | 
 9 | **Check out [this presentation](https://docs.google.com/presentation/d/1PA4fom6QvCW_YG8L0MMVumrAluljcymndNlaK2HW5t0/edit?usp=sharing) on the recent advances of synth parameter inference.
10 | 
11 | For now: 
12 | - :heavy_check_mark: [Vital](https://vital.audio/) is supported
13 | - [Dexed](https://asb2m10.github.io/dexed/) is work-in-progress
14 | 
15 | Try it out on [our Colab notebook demo](https://colab.research.google.com/github/gudgud96/syntheon/blob/main/Syntheon_Demo.ipynb). 
16 | 
17 | ## Installation
18 | 
19 | ```
20 | python3 -m pip install syntheon
21 | ```
22 | 
23 | ## Usage
24 | 
25 | ```python
26 | from syntheon import infer_params
27 | 
28 | output_params_file, eval_dict = infer_params(
29 |     "your_audio.wav", 
30 |     "vital", 
31 |     enable_eval=True
32 | )
33 | ```
34 | 
35 | ## Testing
36 | 
37 | ```
38 | python3 -m pytest
39 | ```
40 | 
41 | ## Structure
42 | 
43 | For each synthesizer, we need to define:
44 | 
45 | - **converter** for preset format conversion: 
46 |     - `serializeToDict`: convert preset file to a Python dictionary to be handled by inferencer
47 |     - `parseToPluginFile`: convert Python dictionary back to preset file, to be loaded by the synthesizer
48 | 
49 | - **inferencer** for model inference:
50 |     - `convert`: define the workflow of `load_model` -> `inference` -> `convert_to_preset`
51 | 
52 | ## Contribution
53 | 
54 | Syntheon is actively under development, and contributions are welcomed. Some TODOs we have in mind include:
55 | 
56 | - Replicating state-of-the-art approaches
57 | - Improving current model performance
58 | - Incorporating new synthesizers 
59 | - Code refactoring 😅
60 | 


--------------------------------------------------------------------------------
/docs/syntheon-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/docs/syntheon-logo.png


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | librosa==0.9.1
 2 | torch==1.12.1
 3 | torchvision==0.13.1
 4 | torchaudio==0.12.1
 5 | pyyaml
 6 | mido 
 7 | nnAudio==0.3.1 
 8 | numpy 
 9 | bitstruct 
10 | torchcrepeV2


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = syntheon
 3 | version = attr: syntheon.version.version
 4 | description = Parameter inference of music synthesizers to simplify sound design process.
 5 | long_description = file: README.md
 6 | long_description_content_type = text/markdown; charset=UTF-8
 7 | url = https://github.com/gudgud96/syntheon
 8 | author = Hao Hao Tan
 9 | author_email = helloharry66@gmail.com
10 | 
11 | [options]
12 | packages = find:
13 | include_package_data = True
14 | install_requires =
15 |     librosa==0.9.1
16 |     torch==1.12.1
17 |     torchvision==0.13.1
18 |     torchaudio==0.12.1
19 |     pyyaml
20 |     mido 
21 |     nnAudio==0.3.1 
22 |     numpy 
23 |     bitstruct 
24 |     torchcrepeV2
25 | 
26 | python_requires = >=3.7
27 | 
28 | [options.package_data]
29 | * = inferencer/vital/checkpoints/model.pt, inferencer/vital/config.yaml, inferencer/vital/init.vital, inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml, inferencer/dexed/models/conf/recipes/models/conf/data_config.yaml


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | if __name__ == '__main__':
4 |     setup()


--------------------------------------------------------------------------------
/syntheon/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import infer_params


--------------------------------------------------------------------------------
/syntheon/converter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/converter/__init__.py


--------------------------------------------------------------------------------
/syntheon/converter/converter.py:
--------------------------------------------------------------------------------
 1 | class SynthConverter:
 2 |     
 3 |     def __init__(self):
 4 |         self.dict = None
 5 |         self.keys = []      # keys that need to be filled for this synth
 6 |     
 7 |     def serializeToDict(self, fname):
 8 |         """
 9 |         From plugin file to protobuf.
10 | 
11 |         Args:
12 |         fname - input file name
13 |         """
14 |         return None
15 |     
16 |     def parseToPluginFile(self, fname):
17 |         """
18 |         From protobuf to plugin file.
19 | 
20 |         Args:
21 |         fname - output file name
22 |         """
23 |         return None
24 |     
25 |     def printMessage(self):
26 |         """
27 |         Print synth parameters.
28 |         """
29 |         if self.dict:
30 |             print(self.dict)
31 |         
32 |         else:
33 |             raise ValueError("synth parameters not serialized yet")
34 |     
35 |     def keys(self):
36 |         return self.keys
37 |     
38 |     def verify(self):
39 |         """
40 |         Verify if params are valid. Used in serializeToDict method.
41 |         """
42 |         if self.dict is None:
43 |             raise ValueError("synth parameters not serialized yet")
44 |         
45 |         # value range checks can leave to derived classes
46 |         for key in self.keys:
47 |             if isinstance(self.dict, list):
48 |                 for elem in self.dict:
49 |                     if key not in elem:
50 |                         raise ValueError("specified key not in synth parameters: {}".format(key))
51 |             elif isinstance(self.dict, dict):
52 |                 if key not in elem:
53 |                         raise ValueError("specified key not in synth parameters: {}".format(key))


--------------------------------------------------------------------------------
/syntheon/converter/dexed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/converter/dexed/__init__.py


--------------------------------------------------------------------------------
/syntheon/converter/dexed/dexed_constants.py:
--------------------------------------------------------------------------------
  1 | """
  2 | DX7 syx -> json constants.
  3 | 
  4 | Credits to source: https://github.com/Nintorac/NeuralDX7/blob/master/scratch/dx7_constants.py
  5 | DX7 specification: https://github.com/asb2m10/dexed/blob/master/Documentation/sysex-format.txt
  6 | DX7 patches: DX7 patches: https://yamahablackboxes.com/collection/yamaha-dx7-synthesizer/patches/
  7 | """
  8 | import bitstruct
  9 | 
 10 | N_OSC = 6
 11 | N_VOICES = 32
 12 | 
 13 | GLOBAL_VALID_RANGES = {
 14 |     'PR1':  range(0, 99+1),
 15 |     'PR2':  range(0, 99+1),
 16 |     'PR3':  range(0, 99+1),
 17 |     'PR4':  range(0, 99+1),
 18 |     'PL1':  range(0, 99+1),
 19 |     'PL2':  range(0, 99+1),
 20 |     'PL3':  range(0, 99+1),
 21 |     'PL4':  range(0, 99+1),
 22 |     'ALG':  range(0, 31+1),
 23 |     'OKS':  range(0, 1+1),
 24 |     'FB':   range(0, 7+1),
 25 |     'LFS':  range(0, 99+1),
 26 |     'LFD':  range(0, 99+1),
 27 |     'LPMD':  range(0, 99+1),
 28 |     'LAMD':  range(0, 99+1),
 29 |     'LPMS': range(0, 7+1),
 30 |     'LFW':  range(0, 5+1),
 31 |     'LKS':  range(0, 1+1),
 32 |     'TRNSP':  range(0, 48+1),
 33 |     'NAME CHAR 1': range(128),
 34 |     'NAME CHAR 2': range(128),
 35 |     'NAME CHAR 3': range(128),
 36 |     'NAME CHAR 4': range(128),
 37 |     'NAME CHAR 5': range(128),
 38 |     'NAME CHAR 6': range(128),
 39 |     'NAME CHAR 7': range(128),
 40 |     'NAME CHAR 8': range(128),
 41 |     'NAME CHAR 9': range(128),
 42 |     'NAME CHAR 10': range(128),
 43 |  }
 44 | 
 45 | OSCILLATOR_VALID_RANGES = {
 46 |     'R1':  range(0, 99+1),
 47 |     'R2':  range(0, 99+1),
 48 |     'R3':  range(0, 99+1),
 49 |     'R4':  range(0, 99+1),
 50 |     'L1':  range(0, 99+1),
 51 |     'L2':  range(0, 99+1),
 52 |     'L3':  range(0, 99+1),
 53 |     'L4':  range(0, 99+1),
 54 |     'BP':  range(0, 99+1),
 55 |     'LD':  range(0, 99+1),
 56 |     'RD':  range(0, 99+1),
 57 |     'RC':  range(0, 3+1),
 58 |     'LC':  range(0, 3+1),
 59 |     'DET': range(0, 14+1),
 60 |     'RS':  range(0, 7+1),
 61 |     'KVS': range(0, 7+1),
 62 |     'AMS': range(0, 3+1),
 63 |     'OL':  range(0, 99+1),
 64 |     'FC':  range(0, 31+1),
 65 |     'M':   range(0, 1+1),
 66 |     'FF':  range(0, 99+1),
 67 | }
 68 | 
 69 | VOICE_PARAMETER_RANGES = {f'{i}_{key}': value for key, value in OSCILLATOR_VALID_RANGES.items() for i in range(N_OSC)}
 70 | VOICE_PARAMETER_RANGES.update(GLOBAL_VALID_RANGES)
 71 | 
 72 | 
 73 | HEADER_KEYS = [
 74 |     'ID',
 75 |     'Sub-status',
 76 |     'format number',
 77 |     'byte count',
 78 |     'byte count',
 79 | ]
 80 | 
 81 | GENERAL_KEYS = [
 82 |     'PR1',
 83 |     'PR2',
 84 |     'PR3',
 85 |     'PR4',
 86 |     'PL1',
 87 |     'PL2',
 88 |     'PL3',
 89 |     'PL4',
 90 |     'ALG',
 91 |     'OKS',
 92 |     'FB',
 93 |     'LFS',
 94 |     'LFD',
 95 |     'LPMD',
 96 |     'LAMD',
 97 |     'LPMS',
 98 |     'LFW',
 99 |     'LKS',
100 |     'TRNSP',
101 |     'NAME CHAR 1',
102 |     'NAME CHAR 2',
103 |     'NAME CHAR 3',
104 |     'NAME CHAR 4',
105 |     'NAME CHAR 5',
106 |     'NAME CHAR 6',
107 |     'NAME CHAR 7',
108 |     'NAME CHAR 8',
109 |     'NAME CHAR 9',
110 |     'NAME CHAR 10',
111 | ]
112 | 
113 | OSC_KEYS = [
114 |     'R1',
115 |     'R2',
116 |     'R3',
117 |     'R4',
118 |     'L1',
119 |     'L2',
120 |     'L3',
121 |     'L4',
122 |     'BP',
123 |     'LD',
124 |     'RD',
125 |     'RC',
126 |     'LC',
127 |     'DET',
128 |     'RS',
129 |     'KVS',
130 |     'AMS',
131 |     'OL',
132 |     'FC',
133 |     'M',
134 |     'FF',
135 | ]
136 | 
137 | FOOTER_KEYS = ['checksum']
138 | 
139 | 
140 | VOICE_KEYS = [f'{i}_{key}' for i in range(6) for key in OSC_KEYS] + \
141 |         GENERAL_KEYS 
142 | 
143 | KEYS =  HEADER_KEYS + \
144 |         list(VOICE_KEYS * N_VOICES) + \
145 |         FOOTER_KEYS
146 | 
147 | header_bytes = [
148 |     'p1u7',             # ID # (i=67; Yamaha)
149 |     'p1u7',             # Sub-status (s=0) & channel number (n=0; ch 1)
150 |     'p1u7',             # format number (f=9; 32 voices)
151 |     'p1u7',             # byte count MS byte
152 |     'p1u7',             # byte count LS byte (b=4096; 32 voices)
153 | ]
154 | 
155 | general_parameter_bytes = [ 
156 |     'p1u7',             # PR1
157 |     'p1u7',             # PR2
158 |     'p1u7',             # PR3
159 |     'p1u7',             # PR4
160 |     'p1u7',             # PL1
161 |     'p1u7',             # PL2
162 |     'p1u7',             # PL3
163 |     'p1u7',             # PL4
164 |     'p3u5',             # ALG
165 |     'p4u1u3',           # OKS|    FB
166 |     'p1u7',             # LFS
167 |     'p1u7',             # LFD
168 |     'p1u7',             # LPMD
169 |     'p1u7',             # LAMD
170 |     'p1u3u3u1',         # LPMS |      LFW      |LKS
171 |     'p1u7',             # TRNSP
172 |     'p1u7',             # NAME CHAR 1
173 |     'p1u7',             # NAME CHAR 2
174 |     'p1u7',             # NAME CHAR 3
175 |     'p1u7',             # NAME CHAR 4
176 |     'p1u7',             # NAME CHAR 5
177 |     'p1u7',             # NAME CHAR 6
178 |     'p1u7',             # NAME CHAR 7
179 |     'p1u7',             # NAME CHAR 8
180 |     'p1u7',             # NAME CHAR 9
181 |     'p1u7',             # NAME CHAR 10
182 | ]
183 | 
184 | osc_parameter_bytes = [
185 |     'p1u7',         # R1
186 |     'p1u7',         # R2
187 |     'p1u7',         # R3
188 |     'p1u7',         # R4
189 |     'p1u7',         # L1
190 |     'p1u7',         # L2
191 |     'p1u7',         # L3
192 |     'p1u7',         # L4
193 |     'p1u7',         # BP
194 |     'p1u7',         # LD
195 |     'p1u7',         # RD
196 |     'p4u2u2',       # RC | LC 
197 |     'p1u4u3',       # DET | RS
198 |     'p3u3u2',       # KVS | AMS
199 |     'p1u7',         # OL
200 |     'p2u5u1',       # FC | M
201 |     'p1u7'          # FF
202 | ]
203 | 
204 | voice_bytes = (osc_parameter_bytes * N_OSC) + general_parameter_bytes
205 | 
206 | tail_bytes = [
207 |     'p1u7',         # checksum
208 | ]
209 | 
210 | full_string = ''.join(header_bytes + osc_parameter_bytes * 6 + general_parameter_bytes)
211 | dx7_struct = bitstruct.compile(full_string)
212 | 
213 | voice_struct = bitstruct.compile(''.join(voice_bytes), names=VOICE_KEYS)
214 | header_struct = bitstruct.compile(''.join(header_bytes))


--------------------------------------------------------------------------------
/syntheon/converter/dexed/dexed_converter.py:
--------------------------------------------------------------------------------
  1 | from syntheon.converter.converter import SynthConverter
  2 | import mido
  3 | from pathlib import Path
  4 | from syntheon.converter.dexed.dexed_constants import voice_struct, VOICE_PARAMETER_RANGES, header_struct,\
  5 |     header_bytes, voice_bytes, N_VOICES, N_OSC, KEYS
  6 | 
  7 | 
  8 | def take(take_from, n):
  9 |     for _ in range(n):
 10 |         yield next(take_from)
 11 | 
 12 | 
 13 | def checksum(data):
 14 |     return (128-sum(data)&127)%128
 15 | 
 16 | 
 17 | class DexedConverter(SynthConverter):
 18 |     def __init__(self):
 19 |         SynthConverter.__init__(self)
 20 |         self.keys = KEYS
 21 | 
 22 |     def serializeToDict(self, fname):
 23 |         path = Path(fname).expanduser()
 24 |         try:
 25 |             preset = mido.read_syx_file(path.as_posix())[0]
 26 |         except IndexError as e:
 27 |             return None
 28 |         except ValueError as e:
 29 |             return None
 30 |         if len(preset.data) == 0:
 31 |             return None
 32 | 
 33 |         def get_voice(data):
 34 |             unpacked = voice_struct.unpack(data)
 35 |             # TODO: need to take actions after verify, skip for now
 36 |             # self.verify(unpacked, VOICE_PARAMETER_RANGES)
 37 |             return unpacked
 38 | 
 39 |         get_header = header_struct.unpack
 40 |         sysex_iter = iter(preset.data)
 41 |         lst = []
 42 |         try:
 43 |             header = get_header(bytes(take(sysex_iter, len(header_bytes))))
 44 |             for idx in range(N_VOICES):
 45 |                 x = get_voice(bytes(take(sysex_iter, len(voice_bytes)))) 
 46 |                 lst.append(x)
 47 |             
 48 |             self.dict = lst
 49 |             return lst
 50 |         except RuntimeError:
 51 |             return None
 52 |     
 53 |     def parseToPluginFile(self, fname):
 54 |         def encode_head():
 55 |             header = [  '0x43',
 56 |                         '0x00',
 57 |                         '0x09',
 58 |                         '0x20',
 59 |                         '0x00',]
 60 | 
 61 |             return [int(i, 0) for i in header]
 62 | 
 63 |         def encode_osc(params, n):
 64 |             oscillator_params = []
 65 | 
 66 |             oscillator_params += [params[f'{n}_R1']]
 67 |             oscillator_params += [params[f'{n}_R2']]
 68 |             oscillator_params += [params[f'{n}_R3']]
 69 |             oscillator_params += [params[f'{n}_R4']]
 70 |             oscillator_params += [params[f'{n}_L1']]
 71 |             oscillator_params += [params[f'{n}_L2']]
 72 |             oscillator_params += [params[f'{n}_L3']]
 73 |             oscillator_params += [params[f'{n}_L4']]
 74 |             oscillator_params += [params[f'{n}_BP']]
 75 |             oscillator_params += [params[f'{n}_LD']]
 76 |             oscillator_params += [params[f'{n}_RD']]
 77 | 
 78 |             RC = params[f'{n}_RC'] << 2
 79 |             LC = params[f'{n}_LC']
 80 |             oscillator_params += [RC | LC]
 81 | 
 82 |             DET = params[f'{n}_DET'] << 3
 83 |             RS = params[f'{n}_RS']
 84 |             oscillator_params += [DET | RS]
 85 | 
 86 |             KVS = params[f'{n}_KVS'] << 2
 87 |             AMS = params[f'{n}_AMS'] 
 88 |             oscillator_params += [KVS|AMS]
 89 |             oscillator_params += [params[f'{n}_OL']]
 90 | 
 91 |             FC = params[f'{n}_FC'] << 1
 92 |             M = params[f'{n}_M']
 93 |             oscillator_params += [FC|M]
 94 |             oscillator_params += [params[f'{n}_FF']]
 95 | 
 96 |             return oscillator_params
 97 | 
 98 |         def encode_global(params):
 99 |             global_params = []
100 |             global_params += [params['PR1']]
101 |             global_params += [params['PR2']]
102 |             global_params += [params['PR3']]
103 |             global_params += [params['PR4']]
104 |             global_params += [params['PL1']]
105 |             global_params += [params['PL2']]
106 |             global_params += [params['PL3']]
107 |             global_params += [params['PL4']]
108 | 
109 |             global_params += [params['ALG']]
110 | 
111 |             OKS = params['OKS'] << 3
112 |             FB = params['FB']
113 | 
114 |             global_params += [OKS|FB]
115 |             global_params += [params['LFS']]
116 |             global_params += [params['LFD']]
117 |             global_params += [params['LPMD']]
118 |             global_params += [params['LAMD']]
119 | 
120 |             LPMS = params['LPMS'] << 4
121 |             LFW = params['LFW'] << 1
122 |             LKS = params['LKS']
123 |             global_params += [LPMS | LFW | LKS]
124 |             global_params += [params['TRNSP']]
125 |             global_params += [params[f'NAME CHAR {i + 1}'] for i in range(10)]
126 | 
127 |             return global_params
128 | 
129 |         try:
130 |             head = encode_head()
131 | 
132 |             data = []
133 |             assert len(self.dict) == N_VOICES
134 | 
135 |             # voices
136 |             last_params = None
137 |             for params in self.dict:
138 |                 if len(params.keys()) == 0:
139 |                     params = last_params
140 |                 else:
141 |                     last_params = params
142 |                 for osc in range(N_OSC):
143 |                     data += encode_osc(params, osc)
144 | 
145 |                 data += encode_global(params)
146 | 
147 | 
148 |             this_checksum = checksum(data)
149 |             output = [*head, *data, this_checksum]
150 |             
151 |             message = mido.Message('sysex', data=output)
152 |             mido.write_syx_file(fname, [message])
153 |             return 0
154 |         
155 |         except Exception as e:
156 |             print(str(e))
157 |             return -1
158 |     
159 |     def verify(self, actual, ranges):
160 |         super().verify()
161 |         assert set(actual.keys())==set(ranges.keys()), 'Params dont match'
162 |         for key in actual:
163 |             if not actual[key] in ranges[key]:
164 |                 print("returning false", key, actual[key])
165 |                 return False
166 |         return True
167 | 
168 | 
169 | if __name__ == "__main__":
170 |     dx_converter = DexedConverter()
171 |     dx_converter.serializeToDict("Dexed_01.syx")
172 |     dx_converter.printMessage()
173 |     dx_converter.parseToPluginFile("testing.syx")
174 | 
175 | 
176 | 
177 | 


--------------------------------------------------------------------------------
/syntheon/converter/vital/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/converter/vital/__init__.py


--------------------------------------------------------------------------------
/syntheon/converter/vital/vital_constants.py:
--------------------------------------------------------------------------------
1 | N_WAVETABLES = 1
2 | 
3 | CUSTOM_KEYS = "vital_converter"


--------------------------------------------------------------------------------
/syntheon/converter/vital/vital_converter.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import base64
  3 | import struct
  4 | from syntheon.converter.converter import SynthConverter
  5 | from syntheon.converter.vital.vital_constants import N_WAVETABLES, CUSTOM_KEYS
  6 | import numpy as np
  7 | import math
  8 | 
  9 | 
 10 | class Base64Converter:
 11 |     def __init__(self):
 12 |         pass
 13 | 
 14 |     def encode(self, signal):
 15 |         signal_bytes = struct.pack('{}f'.format(len(signal)), *signal)
 16 |         base64_string = base64.b64encode(signal_bytes)
 17 | 
 18 |         return base64_string.decode('ascii')
 19 |     
 20 |     def decode(self, base64_string, output_length=2048):
 21 |         signal_bytes = base64.decodebytes(base64_string.encode('ascii'))
 22 |         arr = [k for k in struct.iter_unpack('f', signal_bytes)]    # unpack as 2 bytes integer
 23 |         arr = [k[0] for k in arr]                                   # normalize
 24 | 
 25 |         return np.array(arr)
 26 | 
 27 | 
 28 | class VitalConverter(SynthConverter):
 29 |     def __init__(self):
 30 |         SynthConverter.__init__(self)
 31 |         self.keys = []
 32 |         self.base64_converter = Base64Converter()
 33 |     
 34 |     def serializeToDict(self, fname):
 35 |         try:
 36 |             with open(fname) as f:
 37 |                 self.dict = json.load(f)
 38 |             
 39 |             # decode custom part
 40 |             self.dict[CUSTOM_KEYS] = {}
 41 |             self.dict[CUSTOM_KEYS]["wavetables"] = []
 42 |             for idx in range(len(N_WAVETABLES)):
 43 |                 wavetable_str = self.dict["settings"]["wavetables"][idx]["groups"][0]["components"][0]["keyframes"][0]["wave_data"]
 44 |                 wavetable_name = self.dict["settings"]["wavetables"][idx]["name"]
 45 |                 wavetable_osc_level = self.dict["settings"]["osc_{}_level".format(idx + 1)]
 46 |                 wavetable = self.base64_converter.decode(wavetable_str)     # return np.array
 47 |                 cur_dict = {
 48 |                     "name": wavetable_name,
 49 |                     "wavetable": wavetable,
 50 |                     "osc_level": wavetable_osc_level
 51 |                 }
 52 |                 self.dict[CUSTOM_KEYS]["wavetables"].append(cur_dict)
 53 |             
 54 |             # switch off unused wavetables
 55 |             if N_WAVETABLES == 1:
 56 |                 self.dict["settings"]["osc_2_on"] = 0.0
 57 |                 self.dict["settings"]["osc_3_on"] = 0.0
 58 |             elif N_WAVETABLES == 2:
 59 |                 self.dict["settings"]["osc_3_on"] = 0.0
 60 |                     
 61 |         except Exception as e:
 62 |             print(str(e))
 63 |         
 64 |         return self.dict
 65 |     
 66 |     def parseToPluginFile(self, fname):
 67 |         """
 68 |         vital parameters value scale: https://github.com/mtytel/vital/blob/c0694a193777fc97853a598f86378bea625a6d81/src/common/synth_parameters.cpp
 69 |         value scale computation: https://github.com/mtytel/vital/blob/c0694a193777fc97853a598f86378bea625a6d81/src/plugin/value_bridge.h
 70 |         """
 71 |         # encode custom part
 72 |         wavetables = self.dict[CUSTOM_KEYS]["wavetables"]
 73 |         for idx in range(N_WAVETABLES):
 74 |             wavetable = wavetables[idx]["wavetable"]
 75 |             wavetable_name = wavetables[idx]["name"]
 76 |             wavetable_osc_level = wavetables[idx]["osc_level"]
 77 | 
 78 |             wavetable_str = self.base64_converter.encode(wavetable)
 79 |             self.dict["settings"]["wavetables"][idx]["groups"][0]["components"][0]["keyframes"][0]["wave_data"] = wavetable_str
 80 |             self.dict["settings"]["wavetables"][idx]["name"] = wavetable_name
 81 |             self.dict["settings"]["osc_{}_level".format(idx + 1)] = wavetable_osc_level
 82 |         
 83 |         # switch off unused wavetables
 84 |         if N_WAVETABLES == 1:
 85 |             self.dict["settings"]["osc_2_on"] = 0.0
 86 |             self.dict["settings"]["osc_3_on"] = 0.0
 87 |         elif N_WAVETABLES == 2:
 88 |             self.dict["settings"]["osc_3_on"] = 0.0
 89 |         
 90 |         # adsr filter
 91 |         adsrs = self.dict[CUSTOM_KEYS]["adsr"]
 92 |         # attack is kQuartic
 93 |         self.dict["settings"]["env_1_attack"] = math.sqrt(math.sqrt(adsrs["attack"]))
 94 |         # attack power is kLinear
 95 |         self.dict["settings"]["env_1_attack_power"] = adsrs["attack_power"]
 96 |         # decay is kQuartic
 97 |         self.dict["settings"]["env_1_decay"] = math.sqrt(math.sqrt(adsrs["decay"]))
 98 |         # decay power is kLinear
 99 |         self.dict["settings"]["env_1_decay_power"] = adsrs["decay_power"]
100 |         # sustain is kLinear
101 |         self.dict["settings"]["env_1_sustain"] = adsrs["sustain"]
102 | 
103 |         # self.dict["settings"]["env_1_delay"] = adsrs["delay"]
104 |         # self.dict["settings"]["env_1_hold"] = adsrs["hold"]
105 |         # self.dict["settings"]["env_1_release"] = adsrs["release"]
106 |         # self.dict["settings"]["env_1_release_power"] = adsrs["release_power"]
107 |         # y["settings"]["lfos"] = x_init["settings"]["lfos"]
108 | 
109 |         del self.dict[CUSTOM_KEYS]
110 | 
111 |         with open(fname ,"w+") as f:
112 |             json.dump(self.dict, f)


--------------------------------------------------------------------------------
/syntheon/inferencer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/Dexed_01.syx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/Dexed_01.syx


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/checkpoints/state_best.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/checkpoints/state_best.pth


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/dexed_inferencer.py:
--------------------------------------------------------------------------------
  1 | from syntheon.inferencer.inferencer import Inferencer, InferenceInput, InferenceOutput
  2 | from syntheon.inferencer.dexed.models.preprocessor import ProcessData, F0LoudnessRMSPreprocessor
  3 | from syntheon.inferencer.dexed.models.ddx7.models import DDSP_Decoder, TCNFMDecoder
  4 | from syntheon.inferencer.dexed.models.ddx7.synth import FMSynth
  5 | from syntheon.inferencer.dexed.models.amp_utils import *
  6 | from syntheon.converter.dexed.dexed_converter import DexedConverter
  7 | from syntheon.utils.pitch_extractor import extract_pitch
  8 | import yaml
  9 | import torch
 10 | import librosa
 11 | import soundfile as sf
 12 | import pickle
 13 | import os
 14 | import numpy as np
 15 | 
 16 | 
 17 | class DexedInferenceOutput(InferenceOutput):
 18 |     def __init__(self):
 19 |         InferenceOutput.__init__(self)
 20 |         self.synth_audio = None           # TODO: can put default values here
 21 |         self.ol = None
 22 | 
 23 | 
 24 | class DexedInferenceInput(InferenceInput):
 25 |     def __init__(self):
 26 |         self.x = None
 27 | 
 28 | 
 29 | class DexedInferencer(Inferencer):
 30 |     def convert(self, audio_fname, model_pt_fname=None, enable_eval=False):
 31 |         # TODO: convert should be more like framework. preprocess -> load_model -> inference -> post_process
 32 |         if model_pt_fname is None:
 33 |             model_pt_fname = "syntheon/inferencer/dexed/checkpoints/state_best.pth"
 34 |             
 35 |         with open(
 36 |             os.path.join(
 37 |                 os.path.dirname(os.path.realpath(__file__)),
 38 |                 "models/conf/data_config.yaml"
 39 |             ), 'r'
 40 |         ) as f:
 41 |             data_config = yaml.safe_load(f)
 42 |         
 43 |         preprocessor = ProcessData(
 44 |             silence_thresh_dB=data_config["data_processor"]["silence_thresh_dB"], 
 45 |             sr=data_config["data_processor"]["sr"], 
 46 |             device=data_config["data_processor"]["device"], 
 47 |             seq_len=data_config["data_processor"]["seq_len"],
 48 |             crepe_params=data_config["data_processor"]["crepe_params"], 
 49 |             loudness_params=data_config["data_processor"]["loudness_params"],
 50 |             rms_params=data_config["data_processor"]["rms_params"], 
 51 |             hop_size=data_config["data_processor"]["hop_size"], 
 52 |             max_len=data_config["data_processor"]["max_len"], 
 53 |             center=data_config["data_processor"]["center"]
 54 |         )
 55 | 
 56 |         audio, _ = librosa.load(audio_fname, sr=data_config["data_processor"]["sr"])
 57 | 
 58 |         f0 = extract_pitch(audio, data_config["data_processor"]["sr"], block_size=64)
 59 |         f0 = f0.astype(np.float32)
 60 |         loudness = preprocessor.calc_loudness(audio)
 61 |         rms = preprocessor.calc_rms(audio)
 62 | 
 63 |         scaler = F0LoudnessRMSPreprocessor()
 64 |         x = {
 65 |             "audio": torch.tensor(audio).unsqueeze(0).unsqueeze(-1),
 66 |             "f0": torch.tensor(f0).unsqueeze(0).unsqueeze(-1),
 67 |             "loudness": torch.tensor(loudness).unsqueeze(0).unsqueeze(-1),
 68 |             "rms": torch.tensor(rms).unsqueeze(0).unsqueeze(-1)
 69 |         }
 70 |         scaler.run(x)
 71 | 
 72 |         inference_input = DexedInferenceInput()
 73 |         inference_input.x = x
 74 | 
 75 |         model = self.load_model(model_pt_fname, self.device)
 76 |         inference_output = self.inference(model, inference_input, self.device, enable_eval=enable_eval)
 77 |         synth_params_dict = self.convert_to_preset(inference_output)
 78 |         return synth_params_dict, inference_output.eval_dict
 79 | 
 80 |     def load_model(self, model_pt_fname, device="cuda"):
 81 |         with open(
 82 |             os.path.join(
 83 |                 os.path.dirname(os.path.realpath(__file__)),
 84 |                 "models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml"
 85 |             ), 'r'
 86 |         ) as f:
 87 |             config = yaml.safe_load(f)
 88 | 
 89 |         # prepare model
 90 |         decoder = TCNFMDecoder(n_blocks=config["decoder"]["n_blocks"], 
 91 |                                 hidden_channels=config["decoder"]["hidden_channels"], 
 92 |                                 out_channels=config["decoder"]["out_channels"],
 93 |                                 kernel_size=config["decoder"]["kernel_size"],
 94 |                                 dilation_base=config["decoder"]["dilation_base"],
 95 |                                 apply_padding=config["decoder"]["apply_padding"],
 96 |                                 deploy_residual=config["decoder"]["deploy_residual"],
 97 |                                 input_keys=config["decoder"]["input_keys"])
 98 | 
 99 |         synth = FMSynth(sample_rate=config["synth"]["sample_rate"],
100 |                         block_size=config["synth"]["block_size"],
101 |                         fr=config["synth"]["fr"],
102 |                         max_ol=config["synth"]["max_ol"],
103 |                         synth_module=config["synth"]["synth_module"],
104 |                         is_reverb=False)
105 | 
106 |         model = DDSP_Decoder(decoder, synth)
107 |         if device == "cuda":
108 |             model.load_state_dict(torch.load(model_pt_fname))
109 |             model.cuda()
110 |         else:
111 |             model.load_state_dict(torch.load(model_pt_fname, map_location=torch.device('cpu')))
112 |         model.eval()        
113 |         return model
114 |     
115 |     def inference(self, model, inference_input, device="cuda", enable_eval=False):
116 |         if device == "cuda":
117 |             inference_input.audio = inference_input.x["audio"].cuda()
118 |             inference_input.f0 = inference_input.x["f0"].cuda()
119 |             inference_input.loudness = inference_input.x["loudness"].cuda()
120 |             inference_input.rms = inference_input.x["rm"].cuda()
121 |         
122 |         # forward pass
123 |         synth_out = model(inference_input.x)
124 | 
125 |         inference_output = DexedInferenceOutput()
126 |         inference_output.synth_audio = synth_out["synth_audio"]
127 |         inference_output.ol = synth_out["ol"]
128 | 
129 |         return inference_output
130 |     
131 |     def convert_to_preset(self, inference_output):
132 | 
133 |         dx_converter = DexedConverter()
134 |         params_dict = dx_converter.serializeToDict("syntheon/inferencer/dexed/Dexed_01.syx")
135 | 
136 |         lst = []
137 |         for idx in range(6):
138 |             ol = inference_output.ol[0, :, idx]
139 |             ol = ol.cpu().detach().numpy()
140 |             ol = ol.reshape(-1, 5).mean(axis=1)
141 | 
142 |             # TODO: these are all hacky code...
143 |             if (idx == 0 or idx == 2):
144 |                 ol = ol / 0.32
145 |             
146 |             lst.append(np.mean(ol))
147 | 
148 |         lst = [amplitude_to_dexed_ol(k) for k in lst]
149 | 
150 |         params_dict[0]["5_OL"] = lst[0]
151 |         params_dict[0]["4_OL"] = lst[1]
152 |         params_dict[0]["3_OL"] = lst[2]
153 |         params_dict[0]["2_OL"] = lst[3]
154 |         params_dict[0]["1_OL"] = lst[4]
155 |         params_dict[0]["0_OL"] = lst[5]
156 |         params_dict[0]["NAME CHAR 1"] = 83
157 |         params_dict[0]["NAME CHAR 2"] = 89
158 |         params_dict[0]["NAME CHAR 3"] = 78
159 |         params_dict[0]["NAME CHAR 4"] = 84
160 |         params_dict[0]["NAME CHAR 5"] = 72
161 |         params_dict[0]["NAME CHAR 6"] = 69
162 |         params_dict[0]["NAME CHAR 7"] = 79
163 |         params_dict[0]["NAME CHAR 8"] = 78
164 |         params_dict[0]["NAME CHAR 9"] = 32
165 |         params_dict[0]["NAME CHAR 10"] = 32
166 | 
167 |         return params_dict
168 | 
169 | 
170 | if __name__ == "__main__":
171 |     # TODO: move to test folder
172 |     dexed_inferencer = DexedInferencer(device="cpu")
173 |     params = dexed_inferencer.convert("test/test_audio/dexed_test_audio_1.wav")
174 | 
175 |     from syntheon.converter.dexed.dexed_converter import DexedConverter
176 |     dexed_converter = DexedConverter()
177 |     dexed_converter.dict = params
178 |     dexed_converter.parseToPluginFile("dexed_output.syx")


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/amp_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | We find it hard to map Dexed's 0-99 output level to actual amplitude.
 3 | So we conducted an empirical experiment, and manually fit the values using np.polyfit
 4 | RMS is xx. Details to be released.
 5 | """
 6 | import numpy as np
 7 | 
 8 | def dexed_ol_to_amplitude(x):
 9 |     return 4e-4 * np.exp(0.086 * x)
10 | 
11 | def amplitude_to_dexed_ol(x):
12 |     return int((np.log(x) - np.log(4e-4)) / 0.086)


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/conf/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/data_config.yaml:
--------------------------------------------------------------------------------
 1 | # Additional Dataset process options
 2 | testset:
 3 |   source_folder: '/homes/fsc01/proj/wavetest' # Path to a directory containing folders with instrument names
 4 |   instruments: ['violin','flute','trumpet']   # Specify names of the folders in test set
 5 |   contiguous: True                            # Do not 'chop' into instances, generate a single contiguous instance extracted from audio.
 6 |   clip_noise: True                            # Clip loudness back to -80db when f0 is over 1900 (avoids crepe to track noise)
 7 |   input_dir: 'files/test'
 8 |   output_dir: 'data/test'
 9 | 
10 | 
11 | # URMP Dataset process options
12 | urmp:
13 |   source_folder: 'E://URMP//Dataset' # path to urmp dataset finishing in ... "/URMP/Dataset"
14 |   instruments:  #URMP IDs of instruments
15 |     'vn': 'violin'
16 |     'tpt': 'trumpet'
17 |     'fl' : flute
18 |   mono_regex: 'AuSep'
19 |   num_workers: 4
20 |   input_dir: 'files/train'
21 |   output_dir: 'data/train'
22 | 
23 | data_processor:
24 |   _target_: dataset.create_data.ProcessData
25 |   silence_thresh_dB: 40                 # Silence threshold for splitting instances.
26 |   sr: 16000                             # Sample rate
27 |   device: 'cpu'                      # Torch Device ID
28 |   crepe_params:
29 |     model: 'full'                       # use 'full' for dataset generation - 'tiny' also available
30 |     confidence_threshold: 0.80          # used 0.80 for flute, and 0.85 for violin and trumpet
31 |     batch_size: 128
32 |     fmin: 50
33 |     fmax: 2000
34 |   loudness_params:
35 |     nfft: 2048
36 |   rms_params:
37 |     frame_size: 2048
38 |   hop_size: 64                           # hop size in samples for CREPE, RMS, or loudness
39 |   max_len: 4                             # Maximum block len ( in seconds )
40 |   seq_len: 3                             # Minimum block len (in seconds) -> block is padded to fit max_len
41 |   debug: False                           # Verbose
42 |   center: False                          # True: Center loudness and pitch window before computing. False: Pad at the end.
43 | 
44 | hydra:
45 |   run:
46 |     dir: outputs/null
47 | 
48 | process_urmp: True      # Process URMP.
49 | process_testset: False  # Process additional testset.
50 | skip_copy: False        # Skip file copying (if you have already done so)
51 | skip_process: False     # Dry run.


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/README.md:
--------------------------------------------------------------------------------
1 | # Training options
2 | 
3 | Training options are divided into three sections:
4 | 
5 | 1. `config.yaml` contains information about a particular set of experiments or a run.
6 | 1. The `models` directory stores config files used to build the models tested on the paper,
7 | and the config of the differentiable synthesizers.
8 | 1. The `hyperparams` directory contains settings used to train `DDX7` and the `HpN Baseline`.
9 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/conf/recipes/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/config.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - hyperparams: ddx7
 3 |   - model: tcnres_f0ld_fmstr    # Model to train (TCN w/ residual connections and string FM synth)
 4 | 
 5 | instrument: violin
 6 | device: cuda:0
 7 | mode: test
 8 | data_dir: data                  # Processed data dir
 9 | load_additional_testset: False  # Load additional testset from external files.
10 | seed: 1234
11 | train_split: 0.75               # Split factor for URMP train set. Rest is halved in valid and test set.
12 | resume_epoch: 0                 # Resume epoch to keep training or just to test. 0 for no resume
13 | 
14 | 
15 | run_dir: runs                   # Directory where to store runs.
16 |                                 # Each run dir contain experiments. Each experiment contain different runs.
17 | exp_name: exp_test              # Experiment name.
18 | run_name: testrun               # Run name (within experiment).
19 | 
20 | 
21 | hydra:
22 |   output_subdir: .
23 |   run:
24 |     dir: ${run_dir}/${exp_name}/${run_name}


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/hyperparams/ddx7.yaml:
--------------------------------------------------------------------------------
 1 | _target_: trainer.Hyperparams
 2 | steps: 120000
 3 | loss_fn:
 4 |   _target_: ddx7.loss_functions.rec_loss
 5 |   scales: [2048, 1024, 512, 256, 128, 64]
 6 |   overlap: 0.75
 7 | scheduler: ExponentialLR
 8 | opt: Adam
 9 | lr: 3e-4
10 | lr_decay_rate: 0.98
11 | lr_decay_steps: 10000
12 | grad_clip_norm: 2.0
13 | batch_size: 16
14 | n_store_best: 20 # How many checkpoints do we want to keep.


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/hyperparams/hpn.yaml:
--------------------------------------------------------------------------------
 1 | _target_: trainer.Hyperparams
 2 | steps: 120000 # Original is 1000000
 3 | loss_fn:
 4 |   _target_: ddx7.loss_functions.rec_loss
 5 |   scales: [2048, 1024, 512, 256, 128, 64]
 6 |   overlap: 0.75
 7 | scheduler: ExponentialLR
 8 | opt: Adam
 9 | lr: 1e-4
10 | lr_decay_rate: 0.98
11 | lr_decay_steps: 10000
12 | grad_clip_norm: 3.0
13 | batch_size: 16   # original is 32 (and reverb of 4 s, we use 1 s)
14 | n_store_best: 20 # How many checkpoints do we want to keep.


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/conf/recipes/model/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/hpn_baseline.yaml:
--------------------------------------------------------------------------------
 1 | _target_: aesddsp.ddsp.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: aesddsp.ddsp.models.RnnFCDecoder
 4 |   hidden_size: 512
 5 |   sample_rate: 16000
 6 |   input_keys: ['f0_scaled','loudness_scaled']
 7 |   input_sizes: [1,1]
 8 |   output_keys: ['amplitude','harmonic_distribution','noise_bands']
 9 |   output_sizes: [1,60,65]
10 | synth:
11 |   _target_: aesddsp.ddsp.synth.HNSynth
12 |   sample_rate: 16000
13 |   block_size: 64
14 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fm1stack2.yaml:
--------------------------------------------------------------------------------
 1 | _target_:ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 2
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi, reciprocal
17 |   fr: [1,1]
18 |   synth_module: 1stack2


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fm1stack4.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 4
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1,1,3,14]
18 |   synth_module: 1stack4


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fm2stack2.yaml:
--------------------------------------------------------------------------------
 1 | _target_:ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 4
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1,1,1,1]
18 |   synth_module: 2stack2


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmablbrass.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 4
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32     #max_ol is specified in multiples of 2*pi
17 |   fr: [1, 1, 1, 3.2]
18 |   synth_module: fmablbrass #ablated brass patch (for abl brass and flute)


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmablflute.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 4
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1,1,1,2]
18 |   synth_module: fmablbrass #ablated brass patch (for abl flute and brass)


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmbrss.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 6
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1, 1, 1, 1, 3.2, 8.5]
18 |   synth_module: fmbrass


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmflt.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 6
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1, 1, 1, 2, 2, 1.5]
18 |   synth_module: fmflute


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 6
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1,1,1,1,3,14]
18 |   synth_module: fmstrings


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml:
--------------------------------------------------------------------------------
 1 | _target_: ddx7.models.DDSP_Decoder
 2 | decoder:
 3 |   _target_: ddx7.models.TCNFMDecoder
 4 |   n_blocks: 5
 5 |   hidden_channels: 128
 6 |   out_channels: 6
 7 |   kernel_size: 3
 8 |   dilation_base: 2
 9 |   apply_padding: True
10 |   deploy_residual: True
11 |   input_keys: ['f0_scaled','loudness_scaled']
12 | synth:
13 |   _target_: ddx7.synth.FMSynth
14 |   sample_rate: 16000
15 |   block_size: 64
16 |   max_ol: 0.32       #max_ol is specified in multiples of 2*pi
17 |   fr: [1,1,1,1,3,14]
18 |   synth_module: fmstrings
19 |   is_reverb: False


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/ddx7/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/core.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.fft as fft
  4 | import numpy as np
  5 | import librosa as li
  6 | import math
  7 | 
  8 | _DB_RANGE = 80.0 #Min loudness
  9 | _REF_DB = 20.7  # White noise, amplitude=1.0, n_fft=2048
 10 | _F0_RANGE = 127
 11 | 
 12 | def safe_log(x,eps=1e-7):
 13 |     eps = torch.tensor(eps)
 14 |     return torch.log(x + eps)
 15 | 
 16 | def safe_divide(numerator, denominator, eps=1e-7):
 17 |     """Avoid dividing by zero by adding a small epsilon."""
 18 |     eps = torch.tensor(eps)
 19 |     safe_denominator = torch.where(denominator == 0.0, eps, denominator)
 20 |     return numerator / safe_denominator
 21 | 
 22 | def logb(x, base=2.0, eps=1e-5):
 23 |     """Logarithm with base as an argument."""
 24 |     return safe_divide(safe_log(x, eps), safe_log(base, eps), eps)
 25 | 
 26 | def hz_to_midi(frequencies):
 27 |     """Torch-compatible hz_to_midi function."""
 28 |     notes = 12.0 * (logb(frequencies, 2.0) - logb(440.0, 2.0)) + 69.0
 29 |     notes = torch.where(torch.le(frequencies, torch.zeros(1).to(frequencies)),
 30 |                         torch.zeros(1).to(frequencies), notes)
 31 |     return notes
 32 | 
 33 | 
 34 | @torch.no_grad()
 35 | def cumsum_nd(in_tensor,wrap_value=None):
 36 |     '''
 37 |     cumsum_nd() : cummulative sum - non differentiable and with wrap value.
 38 | 
 39 |     The problem with cumsum: when we work with phase tensors that are too large
 40 |     (i.e. more than a few tenths of seconds) cumsum gets to accumulate steps
 41 |     over a very large window, and it seems the float point variable loses precision.
 42 | 
 43 |     This workaround computes the accumulation step by step, resetting the
 44 |     accumulator in order for it to avoid to lose precision.
 45 | 
 46 |     NOTE: This implementation is very slow, and can't be used during training,
 47 |     only for final audio rendering on the test set.
 48 | 
 49 |     Assumes a tensor format used for audio rendering. [batch,len,1]
 50 | 
 51 |     NOTE:  Non integer frequency ratios do not work using current synthesis approach,
 52 |     because we render a common phase (wrapped using cumsum_nd) and then we multiply it
 53 |     by the frequency ratio. This introduces a misalignment if we multiply the wrapped phase
 54 |     by a non-integer frequency ratio.
 55 | 
 56 |     TODO: implement an efficient vectorial cumsum with wrapping we can use to accumulate
 57 |           phases from all oscillators separately
 58 |     '''
 59 |     print("[WARNING] Using non differentiable cumsum. Non-integer frequency ratios wont render well.")
 60 |     input_len = in_tensor.size()[1]
 61 |     nb = in_tensor.size()[0]
 62 |     acc = torch.zeros([nb,1,1])
 63 |     out_tensor = torch.zeros([nb,input_len,1])
 64 |     #print("in size{} - out size{}".format(in_tensor.size(),out_tensor.size()))
 65 |     for i in range(input_len):
 66 |         acc += in_tensor[:,i,0]
 67 |         if(wrap_value is not None):
 68 |             acc = acc - (acc > wrap_value)*wrap_value
 69 |         out_tensor[:,i,0] = acc
 70 |     return out_tensor
 71 | 
 72 | 
 73 | 
 74 | @torch.no_grad()
 75 | def mean_std_loudness(dataset):
 76 |     mean = 0
 77 |     std = 0
 78 |     n = 0
 79 |     for _, _, l in dataset:
 80 |         n += 1
 81 |         mean += (l.mean().item() - mean) / n
 82 |         std += (l.std().item() - std) / n
 83 |     return mean, std
 84 | 
 85 | 
 86 | def multiscale_fft(signal, scales, overlap):
 87 |     stfts = []
 88 |     for s in scales:
 89 |         S = torch.stft(
 90 |             signal,
 91 |             s,
 92 |             int(s * (1 - overlap)),
 93 |             s,
 94 |             torch.hann_window(s).to(signal),
 95 |             True,
 96 |             normalized=True,
 97 |             return_complex=True,
 98 |         ).abs()
 99 |         stfts.append(S)
100 |     return stfts
101 | 
102 | 
103 | def resample(x, factor: int):
104 |     batch, frame, channel = x.shape
105 |     x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame)
106 | 
107 |     window = torch.hann_window(
108 |         factor * 2,
109 |         dtype=x.dtype,
110 |         device=x.device,
111 |     ).reshape(1, 1, -1)
112 |     y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x)
113 |     y[..., ::factor] = x
114 |     y[..., -1:] = x[..., -1:]
115 |     y = torch.nn.functional.pad(y, [factor, factor])
116 |     y = torch.nn.functional.conv1d(y, window)[..., :-1]
117 | 
118 |     y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1)
119 | 
120 |     return y
121 | 
122 | 
123 | def upsample(signal, factor,mode='nearest'):
124 |     signal = signal.permute(0, 2, 1)
125 |     signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor,mode=mode)
126 |     return signal.permute(0, 2, 1)
127 | 
128 | 
129 | def extract_loudness(signal, sampling_rate, block_size, n_fft=2048):
130 |     S = li.stft(
131 |         signal,
132 |         n_fft=n_fft,
133 |         hop_length=block_size,
134 |         win_length=n_fft,
135 |         center=True,
136 |     )
137 |     S = np.log(abs(S) + 1e-7)
138 |     f = li.fft_frequencies(sampling_rate, n_fft)
139 |     a_weight = li.A_weighting(f)
140 | 
141 |     S = S + a_weight.reshape(-1, 1)
142 | 
143 |     S = np.mean(S, 0)[..., :-1]
144 | 
145 |     return S
146 | 
147 | 
148 | 
149 | def get_mlp(in_size, hidden_size, n_layers):
150 |     channels = [in_size] + (n_layers) * [hidden_size]
151 |     net = []
152 |     for i in range(n_layers):
153 |         net.append(nn.Linear(channels[i], channels[i + 1]))
154 |         net.append(nn.LayerNorm(channels[i + 1]))
155 |         net.append(nn.LeakyReLU())
156 |     return nn.Sequential(*net)
157 | 
158 | 
159 | def get_gru(n_input, hidden_size):
160 |     return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True)
161 | 
162 | 
163 | def amp_to_impulse_response(amp, target_size):
164 |     amp = torch.stack([amp, torch.zeros_like(amp)], -1)
165 |     amp = torch.view_as_complex(amp)
166 |     amp = fft.irfft(amp)
167 | 
168 |     filter_size = amp.shape[-1]
169 | 
170 |     amp = torch.roll(amp, filter_size // 2, -1)
171 |     win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device)
172 | 
173 |     amp = amp * win
174 | 
175 |     amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size)))
176 |     amp = torch.roll(amp, -filter_size // 2, -1)
177 | 
178 |     return amp
179 | 
180 | 
181 | def fft_convolve(signal, kernel):
182 |     signal = nn.functional.pad(signal, (0, signal.shape[-1]))
183 |     kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0))
184 | 
185 |     output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel))
186 |     output = output[..., output.shape[-1] // 2:]
187 | 
188 |     return output
189 | 
190 | 
191 | def harmonic_synth(pitch, amplitudes, sampling_rate,use_safe_cumsum=False):
192 | 
193 |     if(use_safe_cumsum==True):
194 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
195 |     else:
196 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
197 | 
198 |     n_harmonic = amplitudes.shape[-1]
199 |     omegas = omega * torch.arange(1, n_harmonic + 1).to(omega)
200 |     signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True)
201 |     return signal
202 | 
203 | OP6=5
204 | OP5=4
205 | OP4=3
206 | OP3=2
207 | OP2=1
208 | OP1=0
209 | 
210 | def fm_2stack2(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
211 | 
212 |     if(use_safe_cumsum==True):
213 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
214 |     else:
215 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
216 | 
217 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
218 | 
219 |     op4_phase =  fr[OP4] * omega
220 |     op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase)
221 | 
222 |     op3_phase =  fr[OP3] * omega + 2 * np.pi * op4_output
223 |     op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase) # output of stack of 2
224 | 
225 |     op2_phase =  fr[OP2] * omega
226 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase)
227 | 
228 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * op2_output
229 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase) # output of stack of 2
230 | 
231 |     return (op3_output + op1_output)/max_ol
232 | 
233 | def fm_1stack2(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
234 | 
235 |     if(use_safe_cumsum==True):
236 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
237 |     else:
238 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
239 | 
240 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
241 | 
242 |     op2_phase =  fr[OP2] * omega
243 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase)
244 | 
245 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * op2_output
246 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase) # output of stack of 2
247 | 
248 |     return op1_output/max_ol
249 | 
250 | 
251 | def fm_1stack4(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
252 | 
253 |     if(use_safe_cumsum==True):
254 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
255 |     else:
256 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
257 | 
258 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
259 | 
260 |     op4_phase =  fr[OP4] * omega
261 |     op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase)
262 | 
263 |     op3_phase =  fr[OP3] * omega + 2 * np.pi * op4_output
264 |     op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase) # output of stack of 4
265 | 
266 |     op2_phase =  fr[OP2] * omega + 2 * np.pi * op3_output
267 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase)
268 | 
269 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * op2_output
270 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase) # output of stack of 2
271 | 
272 |     return op1_output/max_ol
273 | 
274 | 
275 | '''
276 | Ablated Brass FM Synth - with phase wrapping (it does not change behaviour)
277 |      OP4->OP3->|
278 |           OP2->|->OP1->out
279 | 
280 | '''
281 | def fm_ablbrass_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
282 | 
283 |     if(use_safe_cumsum==True):
284 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
285 |     else:
286 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
287 | 
288 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
289 | 
290 |     op4_phase =  fr[OP4] * omega
291 |     op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi))
292 | 
293 |     op3_phase =  fr[OP3] * omega + 2 * np.pi * op4_output
294 |     op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 2
295 | 
296 |     op2_phase =  fr[OP2] * omega
297 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) # output stack of 1
298 | 
299 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * (op2_output + op3_output)
300 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # global carrier
301 | 
302 |     return op1_output/max_ol
303 | 
304 | '''
305 | String FM Synth - with phase wrapping (it does not change behaviour)
306 | PATCH NAME: STRINGS 1
307 | OP6->OP5->OP4->OP3 |
308 |        (R)OP2->OP1 |->out
309 | '''
310 | def fm_string_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
311 | 
312 |     if(use_safe_cumsum==True):
313 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
314 |     else:
315 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
316 | 
317 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
318 |     op6_phase =  fr[OP6] * omega
319 |     op6_output = torch.unsqueeze(ol[:,:,OP6], dim=-1) * torch.sin(op6_phase % (2*np.pi))
320 | 
321 |     op5_phase =  fr[OP5] * omega + 2 * np.pi * op6_output
322 |     op5_output = torch.unsqueeze(ol[:,:,OP5], dim=-1)*torch.sin(op5_phase % (2*np.pi))
323 | 
324 |     op4_phase =  fr[OP4] * omega + 2 * np.pi * op5_output
325 |     op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi))
326 | 
327 |     op3_phase =  fr[OP3] * omega + 2 * np.pi * op4_output
328 |     op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 4
329 | 
330 |     op2_phase =  fr[OP2] * omega
331 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi))
332 | 
333 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * op2_output
334 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # output of stack of 2
335 | 
336 |     return (op3_output + op1_output)/max_ol
337 | 
338 | '''
339 | Flute FM Synth - with phase wrapping (it does not change behaviour)
340 | PATCH NAME: FLUTE 1
341 | (R)OP6->OP5->|
342 |    OP4->OP3->|
343 |         OP2->|->OP1->out
344 | '''
345 | def fm_flute_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
346 | 
347 |     if(use_safe_cumsum==True):
348 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
349 |     else:
350 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
351 | 
352 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
353 |     op6_phase =  fr[OP6] * omega
354 |     op6_output = torch.unsqueeze(ol[:,:,OP6], dim=-1) * torch.sin(op6_phase % (2*np.pi))
355 | 
356 |     op5_phase =  fr[OP5] * omega + 2 * np.pi * op6_output
357 |     op5_output = torch.unsqueeze(ol[:,:,OP5], dim=-1)*torch.sin(op5_phase % (2*np.pi)) # output of stack of 2
358 | 
359 |     op4_phase =  fr[OP4] * omega
360 |     op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi))
361 | 
362 |     op3_phase =  fr[OP3] * omega + 2 * np.pi * op4_output
363 |     op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 2
364 | 
365 |     op2_phase =  fr[OP2] * omega
366 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) # output stack of 1
367 | 
368 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * (op2_output + op3_output + op5_output)
369 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # carrier
370 | 
371 |     return op1_output/max_ol
372 | 
373 | '''
374 | Brass FM Synth - with phase wrapping (it does not change behaviour)
375 | PATCH NAME: BRASS 3
376 | OP6->OP5->OP4->|
377 |        (R)OP3->|
378 |           OP2->|->OP1->out
379 | '''
380 | def fm_brass_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False):
381 | 
382 |     if(use_safe_cumsum==True):
383 |         omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi)
384 |     else:
385 |         omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1)
386 | 
387 |     # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases.
388 |     op6_phase =  fr[OP6] * omega
389 |     op6_output = torch.unsqueeze(ol[:,:,OP6], dim=-1) * torch.sin(op6_phase % (2*np.pi))
390 | 
391 |     op5_phase =  fr[OP5] * omega + 2 * np.pi * op6_output
392 |     op5_output = torch.unsqueeze(ol[:,:,OP5], dim=-1)*torch.sin(op5_phase % (2*np.pi))
393 | 
394 |     op4_phase =  fr[OP4] * omega + 2 * np.pi * op5_output
395 |     op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi)) # output of stack of 3
396 | 
397 |     op3_phase =  fr[OP3] * omega
398 |     op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 1
399 | 
400 |     op2_phase =  fr[OP2] * omega
401 |     op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) # output stack of 1
402 | 
403 |     op1_phase =  fr[OP1] * omega + 2 * np.pi * (op2_output + op3_output + op4_output)
404 |     op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # carrier
405 | 
406 |     return op1_output/max_ol
407 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/data_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/ddx7/data_utils/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/data_utils/h5_dataset.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data import Dataset
 2 | import h5py
 3 | import numpy as np
 4 | import random
 5 | import torch
 6 | import math
 7 | 
 8 | class h5Dataset(Dataset):
 9 |     def __init__(self, sr, data_path,input_keys,max_audio_val=1,device='cpu'):
10 |         self.sr = sr
11 |         self.data_path = data_path
12 |         self.input_data_dicts,self.dataset_len = self.cache_data(self.data_path,len(input_keys))
13 |         self.max_audio_val = max_audio_val
14 |         self.input_keys = input_keys
15 |         self.device = device
16 | 
17 |     def cache_data(self, data_path,nfeatures):
18 |         '''
19 |         Load data to dictionary in RAM
20 |         '''
21 |         h5f = h5py.File(data_path, 'r')
22 |         cache = {}
23 |         keys = h5f.keys()
24 |         nkeys = len(keys)
25 |         ndata = (len(keys)//nfeatures)
26 |         if((nkeys//nfeatures)*nfeatures != nkeys):
27 |             raise Exception("Unexpected dataset len.")
28 | 
29 |         for key in keys:
30 |             cache[key] = np.array(h5f[key])
31 |         h5f.close()
32 | 
33 |         return cache, ndata
34 | 
35 |     def __getitem__(self, idx):
36 |         #print("[DEBUG] __getitem__ fetching: {}".format(idx))
37 | 
38 |         #Generate current item keys to fetch from RAM cache
39 |         item_keys = [f'{idx}_{k}' for k in self.input_keys ]
40 | 
41 |         # Load dictionary
42 |         x = {}
43 |         for v,k in enumerate(self.input_keys):
44 |             x[k] = torch.tensor(self.input_data_dicts[item_keys[v]]).unsqueeze(-1).to(self.device)
45 | 
46 |         #for k in x.keys():
47 |         #    print(f'{k}: {x[k].shape} ',end='')
48 |         #print('')
49 | 
50 |         return x
51 | 
52 |     def __len__(self):
53 |         return self.dataset_len
54 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/data_utils/preprocessor.py:
--------------------------------------------------------------------------------
 1 | import ddx7.core as core
 2 | 
 3 | class F0LoudnessRMSPreprocessor():
 4 |     """Scales 'f0_hz' and 'loudness_db' features."""
 5 |     def __init__(self):
 6 |         return
 7 | 
 8 |     def run(self,x):
 9 |         x['loudness_scaled'] = self.scale_db(x['loudness'])
10 |         x['rms_scaled'] = self.scale_db(x['rms'])
11 |         x['f0_scaled'] = self.scale_f0_hz(x['f0'])
12 |         return x
13 | 
14 |     def scale_db(self,db):
15 |         """Scales [-DB_RANGE, 0] to [0, 1]."""
16 |         return (db / core._DB_RANGE) + 1.0
17 | 
18 |     def scale_f0_hz(self,f0_hz):
19 |         """Scales [0, Nyquist] Hz to [0, 1.0] MIDI-scaled."""
20 |         return core.hz_to_midi(f0_hz) / core._F0_RANGE


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/loss_functions.py:
--------------------------------------------------------------------------------
  1 | import ddx7.core as core
  2 | import torch
  3 | import torch.nn as nn
  4 | from functools import partial
  5 | 
  6 | 
  7 | '''
  8 | Asimetric L1 distance
  9 | '''
 10 | def asim_l1_distance(a,b,alpha=1,beta=1):
 11 |     diff = a-b
 12 |     pos_diff = diff * (diff > 0)
 13 |     neg_diff = diff * (diff < 0)
 14 |     as_diff = alpha * pos_diff + beta * neg_diff
 15 |     as_mse = torch.abs(as_diff).mean()
 16 |     return as_mse
 17 | 
 18 | 
 19 | def asim_msfft_loss(a1,
 20 |                     a2,
 21 |                     scales=[4096, 2048, 1024, 512, 256, 128],
 22 |                     overlap=0.75,
 23 |                     alpha=1,
 24 |                     beta=1):
 25 |     '''
 26 |     DDSP Original MS FFT loss with lin + log spectra analysis
 27 |     '''
 28 |     if(len(a1.size()) == 3):
 29 |         a1 = a1.squeeze(-1)
 30 |     if(len(a2.size()) == 3):
 31 |         a2 = a2.squeeze(-1)
 32 |     ori_stft = core.multiscale_fft(
 33 |         a1,
 34 |         scales,
 35 |         overlap,
 36 |     )
 37 |     rec_stft = core.multiscale_fft(
 38 |         a2,
 39 |         scales,
 40 |         overlap,
 41 |     )
 42 | 
 43 |     loss = 0
 44 |     for s_x, s_y in zip(ori_stft, rec_stft):
 45 |         lin_loss = asim_l1_distance(s_x, s_y,alpha,beta)
 46 |         log_loss = asim_l1_distance(core.safe_log(s_x),core.safe_log(s_y),alpha,beta)
 47 |         loss = loss + lin_loss + log_loss
 48 | 
 49 |     return loss
 50 | 
 51 | 
 52 | 
 53 | def ddsp_msfft_loss(a1,
 54 |                     a2,
 55 |                     scales=[4096, 2048, 1024, 512, 256, 128],
 56 |                     overlap=0.75):
 57 |     '''
 58 |     DDSP Original MS FFT loss with lin + log spectra analysis
 59 |         Some remarks: the stfts have to be normalized otherwise the netowrk weights different excerpts to different importance.
 60 |                       We compute the mean of the L1 difference between normalized magnitude spectrograms
 61 |                       so that the magnitude of the loss do not change with the window size.
 62 |     '''
 63 |     if(len(a1.size()) == 3):
 64 |         a1 = a1.squeeze(-1)
 65 |     if(len(a2.size()) == 3):
 66 |         a2 = a2.squeeze(-1)
 67 |     ori_stft = core.multiscale_fft(
 68 |         a1,
 69 |         scales,
 70 |         overlap,
 71 |     )
 72 |     rec_stft = core.multiscale_fft(
 73 |         a2,
 74 |         scales,
 75 |         overlap,
 76 |     )
 77 | 
 78 |     loss = 0
 79 |     for s_x, s_y in zip(ori_stft, rec_stft):
 80 |         lin_loss = (s_x - s_y).abs().mean()
 81 |         log_loss = (core.safe_log(s_x) - core.safe_log(s_y)).abs().mean()
 82 |         loss = loss + lin_loss + log_loss
 83 | 
 84 |     return loss
 85 | 
 86 | class rec_loss(nn.Module):
 87 |     def __init__(self,scales,overlap,alpha=None,beta=None):
 88 |         super().__init__()
 89 |         self.scales = scales
 90 |         self.overlap = overlap
 91 |         if(alpha is not None and beta is not None):
 92 |             self.loss_fn = partial(asim_msfft_loss,alpha=alpha,beta=beta)
 93 |             print(f'[INFO] rec_loss() - Using asimetrical reconstruction loss. alpha: {alpha} - beta: {beta}')
 94 |         else:
 95 |             self.loss_fn = ddsp_msfft_loss
 96 |     def forward(self,ref,synth):
 97 |         return self.loss_fn(ref,synth,
 98 |                     self.scales,
 99 |                     self.overlap)
100 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/models.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from syntheon.inferencer.dexed.models.ddx7.core import get_gru, get_mlp
  4 | import torch.nn.functional as F
  5 | 
  6 | '''
  7 | Wrapper class for either HpN or DDX7
  8 | '''
  9 | class DDSP_Decoder(nn.Module):
 10 |     def __init__(self,decoder,synth):
 11 |         super().__init__()
 12 |         net = []
 13 |         net.append(decoder)
 14 |         net.append(synth)
 15 |         self.net = nn.Sequential(*net)
 16 | 
 17 |     def forward(self,x):
 18 |         return self.net(x)
 19 | 
 20 |     def get_sr(self):
 21 |         return self.net[1].sample_rate
 22 | 
 23 |     def enable_cumsum_nd(self):
 24 |         self.net[1].use_cumsum_nd=True
 25 | 
 26 |     def get_params(self,param):
 27 |         if(param == 'reverb_decay'):
 28 |             return self.net[1].reverb.decay.item()
 29 |         if(param == 'reverb_wet'):
 30 |             return self.net[1].reverb.wet.item()
 31 | 
 32 | '''
 33 | GRU-Based decoder for HpN Baseline
 34 | '''
 35 | class RnnFCDecoder(nn.Module):
 36 |     def __init__(self, hidden_size=512, sample_rate=16000,
 37 |                  input_keys=None,input_sizes=[1,1,16],
 38 |                  output_keys=['amplitude','harmonic_distribution','noise_bands'],
 39 |                  output_sizes=[1,100,65]):
 40 |         super().__init__()
 41 |         self.input_keys = input_keys
 42 |         self.input_sizes = input_sizes
 43 |         n_keys = len(input_keys)
 44 |         # Generate MLPs of size: in_size: 1 ; n_layers = 3 (with layer normalization and leaky relu)
 45 |         if(n_keys == 2):
 46 |             self.in_mlps = nn.ModuleList([get_mlp(input_sizes[0], hidden_size, 3),
 47 |                                           get_mlp(input_sizes[1], hidden_size, 3)])
 48 |         elif(n_keys == 3):
 49 |             self.in_mlps = nn.ModuleList([get_mlp(input_sizes[0], hidden_size, 3),
 50 |                                           get_mlp(input_sizes[1], hidden_size, 3),
 51 |                                           get_mlp(input_sizes[2], hidden_size, 3)])
 52 |         else:
 53 |             raise ValueError("Expected 2 or 3 input keys. got: {}".format(input_keys))
 54 | 
 55 |         #Generate GRU: input_size = n_keys * hidden_size ; n_layers = 1 (that's the default config)
 56 |         self.gru = get_gru(n_keys, hidden_size)
 57 | 
 58 |         #Generate output MLP: in_size: hidden_size + 2 ; n_layers = 3
 59 |         self.out_mlp = get_mlp(hidden_size + 2, hidden_size, 3)
 60 | 
 61 |         self.proj_matrices = []
 62 |         self.output_keys = output_keys
 63 |         self.output_sizes = output_sizes
 64 |         for v,k in enumerate(output_keys):
 65 |             self.proj_matrices.append(nn.Linear(hidden_size,output_sizes[v]))
 66 | 
 67 |         self.proj_matrices = nn.ModuleList(self.proj_matrices)
 68 |         self.sample_rate = sample_rate
 69 | 
 70 |     def forward(self, x):
 71 |         # Run pitch and loudness and z (if available) inputs through the respectives input MLPs.
 72 |         # Then, concatenate the outputs in a flat vector.
 73 | 
 74 |         # Run through input_keys and load inputs accordingly
 75 |         hidden = torch.cat([self.in_mlps[v](x[k]) for v,k in enumerate(self.input_keys)],-1)
 76 | 
 77 |         # Run the flattened vector through the GRU.
 78 |         # The GRU predicts the embedding.
 79 |         # Then, concatenate the embedding with the disentangled parameters of pitch and loudness (nhid+2 size vector)
 80 |         hidden = torch.cat([self.gru(hidden)[0], x['f0_scaled'], x['loudness_scaled']], -1)
 81 |         # Run the embedding through the output MLP to obtain a 512-sized output vector.
 82 |         hidden = self.out_mlp(hidden)
 83 | 
 84 | 
 85 |         # Run embedding through a projection_matrix to get outputs
 86 |         controls = {}
 87 |         for v,k in enumerate(self.output_keys):
 88 |             controls[k] = self.proj_matrices[v](hidden)
 89 | 
 90 |         controls['f0_hz'] = x['f0']
 91 | 
 92 |         return controls
 93 | 
 94 | '''
 95 | TCN-Based decoder for DDX7
 96 | '''
 97 | class TCNFMDecoder(nn.Module):
 98 |     '''
 99 |     FM Decoder with sigmoid output
100 |     '''
101 |     def __init__(self,n_blocks=2,hidden_channels=64,out_channels=6,
102 |                 kernel_size=3,dilation_base=2,apply_padding=True,
103 |                 deploy_residual=False,
104 |                 input_keys=None,z_size=None,
105 |                 output_complete_controls=True):
106 |         super().__init__()
107 | 
108 |         # Store receptive field
109 |         dilation_factor = (dilation_base**n_blocks-1)/(dilation_base-1)
110 |         self.receptive_field = 1 + 2*(kernel_size-1)*dilation_factor
111 |         print("[INFO] TCNFNDecoder - receptive field is: {}".format(self.receptive_field))
112 | 
113 |         self.input_keys = input_keys
114 |         n_keys = len(input_keys)
115 |         self.output_complete_controls = output_complete_controls
116 | 
117 |         if(n_keys == 2):
118 |             in_channels = 2
119 |         elif(n_keys == 3):
120 |             in_channels = 2 + z_size
121 |         else:
122 |             raise ValueError("Expected 2 or 3 input keys. got: {}".format(input_keys))
123 | 
124 |         base = 0
125 |         net = []
126 | 
127 |         net.append(TCN_block(in_channels,hidden_channels,hidden_channels,kernel_size,
128 |             dilation=dilation_base**base,apply_padding=apply_padding,
129 |             deploy_residual=deploy_residual))
130 |         if(n_blocks>2):
131 |             for i in range(n_blocks-2):
132 |                 base += 1
133 |                 net.append(TCN_block(hidden_channels,hidden_channels,hidden_channels,
134 |                     kernel_size,dilation=dilation_base**base,apply_padding=apply_padding))
135 | 
136 |         base += 1
137 |         net.append(TCN_block(hidden_channels,hidden_channels,out_channels,kernel_size,
138 |             dilation=dilation_base**base,apply_padding=apply_padding,
139 |             deploy_residual=deploy_residual,last_block=True))
140 | 
141 |         self.net = nn.Sequential(*net)
142 | 
143 |     def forward(self,x):
144 |         # Reshape features to follow Conv1d convention (nb,ch,seq_Len)
145 |         conditioning = torch.cat([x[k] for v,k in enumerate(self.input_keys)],-1).permute([0,-1,-2])
146 | 
147 |         ol = self.net(conditioning)
148 |         ol = ol.permute([0,-1,-2])
149 |         if self.output_complete_controls is True:
150 |             synth_params = {
151 |                 'f0_hz': x['f0'], #In Hz
152 |                 'ol': ol
153 |                 }
154 |         else:
155 |             synth_params = ol
156 |         return synth_params
157 | 
158 | class TCN_block(nn.Module):
159 |     '''
160 |     TCN Block
161 |     '''
162 |     def __init__(self,in_channels,hidden_channels,out_channels,
163 |                 kernel_size,stride=1,dilation=1,apply_padding=True,
164 |                 last_block=False,deploy_residual=False):
165 |         super().__init__()
166 |         block = []
167 |         cnv1 = CausalConv1d(in_channels,hidden_channels,kernel_size,
168 |             stride=stride,dilation=dilation,apply_padding=apply_padding)
169 |         block.append(torch.nn.utils.weight_norm( cnv1 ) )
170 |         block.append(nn.ReLU())
171 |         block.append(nn.Dropout())
172 | 
173 |         cnv2 = CausalConv1d(hidden_channels,out_channels,kernel_size,
174 |             stride=stride,dilation=dilation,apply_padding=apply_padding)
175 |         block.append(torch.nn.utils.weight_norm( cnv2 ) )
176 |         if(last_block == False):
177 |             block.append(nn.ReLU())
178 |             block.append(nn.Dropout())
179 | 
180 |         self.block = nn.Sequential(*block)
181 |         self.residual = None
182 |         if(deploy_residual):
183 |             if(apply_padding):
184 |                 self.residual = nn.Conv1d(in_channels,out_channels,1,padding = 0,stride=stride)
185 |             else:
186 |                 raise ValueError("Residual connection is only possible when padding is enabled.")
187 | 
188 |     def forward(self,data):
189 |         block_out = self.block(data)
190 |         if(self.residual is not None):
191 |             residual = self.residual(data)
192 |             block_out = block_out + residual
193 |         return block_out
194 | 
195 | 
196 | class CausalConv1d(torch.nn.Conv1d):
197 |     '''
198 |     Basic layer for implementing a TCN
199 |     '''
200 |     def __init__(self,
201 |                  in_channels,
202 |                  out_channels,
203 |                  kernel_size,
204 |                  stride=1,
205 |                  dilation=1,
206 |                  groups=1,
207 |                  bias=True,
208 |                  apply_padding=True):
209 | 
210 |         super(CausalConv1d, self).__init__(
211 |             in_channels,
212 |             out_channels,
213 |             kernel_size=kernel_size,
214 |             stride=stride,
215 |             padding=0,
216 |             dilation=dilation,
217 |             groups=groups,
218 |             bias=bias)
219 | 
220 |         self.apply_padding = apply_padding
221 |         self.__padding = dilation*(kernel_size - 1)
222 | 
223 |     def forward(self, input):
224 |         # Apply left padding using torch.nn.functional and then compute conv.
225 |         if(self.apply_padding):
226 |             return super(CausalConv1d, self).forward(F.pad(input, (self.__padding, 0)))
227 |         else:
228 |             return super(CausalConv1d, self).forward(input)
229 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/spectral_ops.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn.functional as F
  3 | import torchcrepe
  4 | import torchaudio
  5 | import librosa
  6 | from syntheon.inferencer.dexed.models.ddx7.core import _DB_RANGE,_REF_DB
  7 | import math
  8 | import numpy as np
  9 | from time import time
 10 | 
 11 | _RMS_FRAME = 2048
 12 | _CREPE_WIN_LEN = 1024
 13 | _LD_N_FFT = 2048
 14 | 
 15 | def safe_log(x):
 16 |     return torch.log(x + 1e-7)
 17 | 
 18 | def calc_f0(audio, rate, hop_size,fmin,fmax,model,
 19 |             batch_size,device,center=False):
 20 |     if center is False:
 21 |       # Add padding to the end. Then execute crepe w/o padding.
 22 |       # Crepe pads so that the signal stays in the center.
 23 |       n_samples_initial = int(audio.shape[-1])
 24 |       n_frames = int(np.ceil(n_samples_initial / hop_size))
 25 |       n_samples_final = (n_frames - 1) * hop_size + _CREPE_WIN_LEN
 26 |       pad = n_samples_final - n_samples_initial
 27 |       audio = np.pad(audio, ((0, pad),), "constant")
 28 | 
 29 |     audio = torch.from_numpy(audio).unsqueeze(0).float().to(device)
 30 | 
 31 |     t1 = time()
 32 |     print("predicting...")
 33 |     crepe_tuple = torchcrepe.predict(audio,
 34 |                         rate,
 35 |                         hop_size,
 36 |                         fmin,
 37 |                         fmax,
 38 |                         model,
 39 |                         return_periodicity=True,
 40 |                         batch_size=batch_size,
 41 |                         device=device,
 42 |                         pad=center)
 43 |     print("done...", time() - t1)
 44 | 
 45 |     f0 = crepe_tuple[0]
 46 |     confidence = crepe_tuple[1]
 47 |     if center is True:
 48 |       f0 = f0[:,0:-1] #Discard the last sample
 49 |       confidence = confidence[:,0:-1] #Discard the last sample
 50 | 
 51 |     f0 = f0.squeeze(0).cpu().numpy()
 52 |     confidence = confidence.squeeze(0).cpu().numpy()
 53 |     return f0,confidence
 54 | 
 55 | def calc_loudness(audio, rate, n_fft=_LD_N_FFT, hop_size=64,
 56 |                   range_db=_DB_RANGE,ref_db=_REF_DB,center=False):
 57 |     np.seterr(divide='ignore')
 58 | 
 59 |     """Compute loudness, add to example (ref is white noise, amplitude=1)."""
 60 |     # Copied from magenta/ddsp/spectral_ops.py
 61 |     # Get magnitudes.
 62 |     if center is False:
 63 |         # Add padding to the end
 64 |         n_samples_initial = int(audio.shape[-1])
 65 |         n_frames = int(np.ceil(n_samples_initial / hop_size))
 66 |         n_samples_final = (n_frames - 1) * hop_size + n_fft
 67 |         pad = n_samples_final - n_samples_initial
 68 |         audio = np.pad(audio, ((0, pad),), "constant")
 69 |     spectra = librosa.stft(
 70 |         audio, n_fft=n_fft, hop_length=hop_size, center=center).T
 71 | 
 72 |     # Compute power
 73 |     amplitude = np.abs(spectra)
 74 |     amin = 1e-20  # Avoid log(0) instabilities.
 75 |     power_db = np.log10(np.maximum(amin, amplitude))
 76 |     power_db *= 20.0
 77 | 
 78 |     # Perceptual weighting.
 79 |     frequencies = librosa.fft_frequencies(sr=rate, n_fft=n_fft)
 80 |     a_weighting = librosa.A_weighting(frequencies)[np.newaxis, :]
 81 |     loudness = power_db + a_weighting
 82 | 
 83 |     # Set dynamic range.
 84 |     loudness -= ref_db
 85 |     loudness = np.maximum(loudness, -range_db)
 86 | 
 87 |     # Average over frequency bins. (loudness is taken from the fft dimension!)
 88 |     mean_loudness_db = np.mean(loudness, axis=-1)
 89 |     return mean_loudness_db.astype(np.float32)
 90 | 
 91 | '''
 92 | RMS POWER COMPUTATION.
 93 | '''
 94 | 
 95 | def amplitude_to_db(amplitude):
 96 |   """Converts amplitude to decibels."""
 97 |   amin = 1e-20  # Avoid log(0) instabilities.
 98 |   db = np.log10(np.maximum(amin, amplitude))
 99 |   db *= 20.0
100 |   return db
101 | 
102 | def compute_rms_energy(audio,
103 |                        frame_size=2048,
104 |                        hop_size=64,
105 |                        pad_end=True):
106 |   """Compute root mean squared energy of audio."""
107 |   if pad_end is True:
108 |     # Add padding to the end
109 |     n_samples_initial = int(audio.shape[-1])
110 |     n_frames = int(np.ceil(n_samples_initial / hop_size))
111 |     n_samples_final = (n_frames - 1) * hop_size + frame_size
112 |     pad = n_samples_final - n_samples_initial
113 |     audio = np.pad(audio, ((0, pad),), "constant")
114 | 
115 |   audio = torch.tensor(audio)
116 |   audio_frames = audio.unfold(-1,frame_size,hop_size)
117 |   rms_energy = torch.mean(audio_frames**2.0,dim=-1)**0.5
118 | 
119 |   return rms_energy.cpu().numpy()
120 | 
121 | 
122 | def calc_power(audio,
123 |                 frame_size=_RMS_FRAME,
124 |                 hop_size=64,
125 |                 range_db=_DB_RANGE,
126 |                 ref_db=20.7,
127 |                 pad_end=True):
128 |   """Compute power of audio in dB."""
129 |   rms_energy = compute_rms_energy(audio, frame_size, hop_size,pad_end=pad_end)
130 |   power_db = amplitude_to_db(rms_energy**2)
131 |   #print(power_db)
132 |   # Set dynamic range.
133 |   power_db -= ref_db
134 |   power_db = np.maximum(power_db, -range_db)
135 |   return power_db.astype(np.float32)
136 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/ddx7/synth.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import math
  4 | from syntheon.inferencer.dexed.models.ddx7.core import *
  5 | import soundfile as sf
  6 | import librosa
  7 | 
  8 | def exp_sigmoid(x):
  9 |     return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7
 10 | 
 11 | def remove_above_nyquist(amplitudes, pitch, sampling_rate):
 12 |     n_harm = amplitudes.shape[-1]
 13 |     pitches = pitch * torch.arange(1, n_harm + 1).to(pitch)
 14 |     aa = (pitches < sampling_rate / 2).float() + 1e-4
 15 |     return amplitudes * aa
 16 | 
 17 | 
 18 | class FMSynth(nn.Module):
 19 |     def __init__(self,sample_rate,block_size,fr=[1,1,1,1,3,14],max_ol=2,
 20 |         scale_fn = torch.sigmoid,synth_module='fmstrings',is_reverb=True):
 21 |         super().__init__()
 22 |         self.sample_rate = sample_rate
 23 |         self.block_size = block_size
 24 |         self.reverb = Reverb(length=sample_rate, sample_rate=sample_rate)
 25 |         fr = torch.tensor(fr) # Frequency Ratio
 26 |         self.register_buffer("fr", fr) #Non learnable but sent to GPU if declared as buffers, and stored in model dictionary
 27 |         self.scale_fn = scale_fn
 28 |         self.use_cumsum_nd = False
 29 |         self.max_ol = max_ol
 30 |         self.is_reverb = is_reverb
 31 | 
 32 |         available_synths = {
 33 |             'fmbrass': fm_brass_synth,
 34 |             'fmflute': fm_flute_synth,
 35 |             'fmstrings': fm_string_synth,
 36 |             'fmablbrass': fm_ablbrass_synth,
 37 |             '2stack2': fm_2stack2,
 38 |             '1stack2':fm_1stack2,
 39 |             '1stack4': fm_1stack4}
 40 | 
 41 |         self.synth_module = available_synths[synth_module]
 42 | 
 43 |     def forward(self,controls):
 44 | 
 45 |         ol = self.max_ol*self.scale_fn(controls['ol'])
 46 |         ol_up = upsample(ol, self.block_size,'linear')
 47 |         f0_up = upsample(controls['f0_hz'], self.block_size,'linear')
 48 |         signal = self.synth_module(f0_up,
 49 |                                 ol_up,
 50 |                                 self.fr,
 51 |                                 self.sample_rate,
 52 |                                 self.max_ol,
 53 |                                 self.use_cumsum_nd)
 54 |         #reverb part
 55 |         if self.is_reverb:
 56 |             signal = self.reverb(signal)
 57 | 
 58 |         synth_out = {
 59 |             'synth_audio': signal,
 60 |             'ol': ol,
 61 |             'f0_hz': controls['f0_hz']
 62 |             }
 63 |         return synth_out
 64 | 
 65 | class HNSynth(nn.Module):
 66 |     def __init__(self,sample_rate,block_size,scale_fn = exp_sigmoid):
 67 |         super().__init__()
 68 |         self.sample_rate = sample_rate
 69 |         self.block_size = block_size
 70 |         self.reverb = Reverb(length=sample_rate, sample_rate=sample_rate)
 71 |         self.use_cumsum_nd = False
 72 |         self.scale_fn = scale_fn
 73 | 
 74 |     # expects: harmonic_distr, amplitude, noise_bands
 75 |     def forward(self,controls):
 76 | 
 77 |         harmonics = self.scale_fn(controls['harmonic_distribution'])
 78 |         noise_bands = self.scale_fn(controls['noise_bands'])
 79 |         total_amp = self.scale_fn(controls['amplitude'])
 80 | 
 81 |         harmonics = remove_above_nyquist(
 82 |             harmonics,
 83 |             controls['f0_hz'],
 84 |             self.sample_rate,
 85 |         )
 86 |         harmonics /= harmonics.sum(-1, keepdim=True)
 87 |         harmonics *= total_amp
 88 | 
 89 |         harmonics_up = upsample(harmonics, self.block_size)
 90 |         f0_up = upsample(controls['f0_hz'], self.block_size,'linear')
 91 | 
 92 |         harmonic = harmonic_synth(f0_up, harmonics_up, self.sample_rate, self.use_cumsum_nd)
 93 |         impulse = amp_to_impulse_response(noise_bands, self.block_size)
 94 | 
 95 |         noise = torch.rand(
 96 |             impulse.shape[0],
 97 |             impulse.shape[1],
 98 |             self.block_size,
 99 |             ).to(impulse) * 2 - 1
100 | 
101 |         noise = fft_convolve(noise, impulse).contiguous()
102 |         noise = noise.reshape(noise.shape[0], -1, 1)
103 | 
104 |         signal = harmonic + noise
105 | 
106 |         #reverb part
107 |         signal = self.reverb(signal)
108 |         synth_out = {
109 |             'synth_audio': signal,
110 |             'harmonic_distribution': harmonics,
111 |             'noise_bands': noise_bands,
112 |             'f0_hz': controls['f0_hz']
113 |             }
114 | 
115 |         return synth_out
116 | 
117 | class Reverb(nn.Module):
118 |     def __init__(self, length, sample_rate, initial_wet=0, initial_decay=5):
119 |         super().__init__()
120 |         self.length = length
121 |         self.sample_rate = sample_rate
122 | 
123 |         self.noise = nn.Parameter((torch.rand(length) * 2 - 1).unsqueeze(-1))
124 |         self.decay = nn.Parameter(torch.tensor(float(initial_decay)))
125 |         self.wet = nn.Parameter(torch.tensor(float(initial_wet)))
126 | 
127 |         t = torch.arange(self.length) / self.sample_rate
128 |         t = t.reshape(1, -1, 1)
129 |         self.register_buffer("t", t)
130 | 
131 |     def build_impulse(self):
132 |         t = torch.exp(-nn.functional.softplus(-self.decay) * self.t * 500)
133 |         noise = self.noise * t
134 |         impulse = noise * torch.sigmoid(self.wet)
135 |         impulse[:, 0] = 1
136 |         return impulse
137 | 
138 |     def forward(self, x):
139 |         lenx = x.shape[1]
140 |         impulse = self.build_impulse()
141 |         impulse = nn.functional.pad(impulse, (0, 0, 0, lenx - self.length))
142 | 
143 |         x = fft_convolve(x.squeeze(-1), impulse.squeeze(-1)).unsqueeze(-1)
144 | 
145 |         return x
146 | 
147 | 
148 | if __name__ == "__main__":
149 |     fmsynth_string = FMSynth(is_reverb=False, sample_rate=16000, block_size=64)
150 |     freq = 523.25
151 |     controls = {}
152 |     controls['f0_hz'] = torch.ones(1, 1000, 1) * freq
153 |     controls['ol'] = torch.zeros(1, 1000, 6)
154 | 
155 |     synth_out = fmsynth_string(controls)['synth_audio']
156 | 
157 |     signal = synth_out.squeeze().cpu().detach().numpy()
158 |     # signal_gt, sr = librosa.load("dexed_output_ol50_coarse1.wav", sr=16000)
159 |     # signal_gt = signal_gt / np.amax(signal_gt)
160 |     # # print(signal_gt.shape)
161 | 
162 |     # plt.plot(signal[16000:16400], label="signal")
163 |     # plt.plot(signal_gt[16001:16401], label="signal_gt")
164 |     # plt.legend()
165 |     # plt.show()
166 | 
167 |     # print(np.amax(signal_gt))
168 | 
169 |     # S_dexed = np.abs(librosa.stft(signal_gt))
170 |     # S_test = np.abs(librosa.stft(signal))
171 | 
172 |     # fig, ax = plt.subplots()
173 |     # import librosa.display
174 |     # img = librosa.display.specshow(librosa.amplitude_to_db(S_dexed,
175 |     #                                                     ref=np.max),
176 |     #                             y_axis='log', x_axis='time', ax=ax)
177 |     # ax.set_title('Dexed')
178 |     # fig.colorbar(img, ax=ax, format="%+2.0f dB")
179 |     # plt.show()
180 | 
181 |     # fig, ax = plt.subplots()
182 |     # img = librosa.display.specshow(librosa.amplitude_to_db(S_test,
183 |     #                                                     ref=np.max),
184 |     #                             y_axis='log', x_axis='time', ax=ax)
185 |     # ax.set_title('Test')
186 |     # fig.colorbar(img, ax=ax, format="%+2.0f dB")
187 |     # plt.show()
188 | 
189 |     sf.write("dexed_test_92.wav", signal, 16000)
190 | 
191 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/dexed/models/preprocessor.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import operator
  3 | import functools
  4 | import h5py
  5 | from pathlib import Path
  6 | from tqdm import tqdm
  7 | import numpy as np
  8 | import librosa
  9 | from syntheon.inferencer.dexed.models.ddx7 import spectral_ops
 10 | from syntheon.inferencer.dexed.models.ddx7.core import hz_to_midi, _DB_RANGE, _F0_RANGE
 11 | 
 12 | 
 13 | class dotdict(dict):
 14 |     """dot.notation access to dictionary attributes"""
 15 |     __getattr__ = dict.get
 16 |     __setattr__ = dict.__setitem__
 17 |     __delattr__ = dict.__delitem__
 18 | 
 19 | 
 20 | class ProcessData():
 21 |     def __init__(self, silence_thresh_dB, sr, device, seq_len,
 22 |                 crepe_params, loudness_params,
 23 |                 rms_params, hop_size, max_len, center,
 24 |                 overlap = 0.0,
 25 |                 debug = False,
 26 |                 contiguous = False,
 27 |                 contiguous_clip_noise = False):
 28 |         super().__init__()
 29 |         self.silence_thresh_dB = silence_thresh_dB
 30 |         self.crepe_params = crepe_params
 31 |         self.sr = sr
 32 |         self.device = torch.device(device)
 33 |         self.seq_len = seq_len
 34 |         self.loudness_params = loudness_params
 35 |         self.rms = rms_params
 36 |         self.max_len = max_len
 37 |         self.hop_size = hop_size
 38 |         self.feat_size = self.max_len*self.sr //self.hop_size
 39 |         self.audio_size = self.max_len*self.sr
 40 |         self.center = center
 41 |         self.overlap = overlap
 42 |         self.debug = debug
 43 |         self.contiguous = contiguous
 44 |         self.contiguous_clip_noise = contiguous_clip_noise
 45 | 
 46 |     def set_confidence(self,confidence):
 47 |         self.crepe_params.confidence_threshold = confidence
 48 | 
 49 |     def process_indices(self, indices: list) -> list:
 50 |         # Length in samples.
 51 |         max_len = self.max_len * self.sr
 52 | 
 53 |         def expand_long(indices_tuple: tuple) -> list:
 54 |             if indices_tuple[1] - indices_tuple[0] > max_len:
 55 |                 ret = [(start, start+max_len) for start in np.arange(indices_tuple[0], indices_tuple[1] - max_len, max_len)]
 56 |                 ret.append((ret[-1][-1], min(ret[-1][-1] + max_len, indices_tuple[1])))
 57 |                 return ret
 58 |             else:
 59 |                 return [indices_tuple]
 60 | 
 61 |         new_indices = [*map(expand_long, indices)]
 62 |         new_indices = functools.reduce(operator.concat, new_indices, [])
 63 |         new_indices = [x for x in new_indices if (x[1] - x[0] > self.seq_len * self.sr)]
 64 |         return new_indices
 65 | 
 66 |     def pad_to_expected_size(self,features,expected_size,pad_value):
 67 | 
 68 |         #Pad to next integer division if we are processing a whole file in one go.
 69 |         if(self.contiguous == True):
 70 |             # Pad up to next integer division
 71 |             pad_len = (features.shape[-1] // expected_size + 1)*expected_size - features.shape[-1]
 72 |             #print(f'feat len {features.shape[-1]} expected {expected_size} pad {pad_len}')
 73 |             features = np.pad(features,(0,pad_len),'constant',constant_values=pad_value)
 74 |             return features
 75 |         else:
 76 |             if(self.debug):
 77 |                 print("Feat shape {} - expected size: {}".format(features.shape[-1],expected_size))
 78 |             if(features.shape[-1] < expected_size):
 79 |                 pad_len = expected_size - features.shape[-1]
 80 |                 features = np.pad(features,(0,pad_len),'constant',constant_values=pad_value)
 81 |             if(features.shape[-1] > expected_size):
 82 |                 raise Exception('Expected size is smaller than current value')
 83 |         return features
 84 | 
 85 | 
 86 |     def extract_f0(self, audio):
 87 |         if isinstance(self.crepe_params, dict):
 88 |             self.crepe_params = dotdict(self.crepe_params)
 89 |         (f0,confidence) = spectral_ops.calc_f0(audio,
 90 |                                 rate=self.sr,
 91 |                                 hop_size=self.hop_size,
 92 |                                 fmin=self.crepe_params.fmin,
 93 |                                 fmax=self.crepe_params.fmax,
 94 |                                 model=self.crepe_params.model,
 95 |                                 batch_size=self.crepe_params.batch_size,
 96 |                                 device=self.device,
 97 |                                 center=self.center)
 98 | 
 99 |         if confidence.mean() < self.crepe_params.confidence_threshold:
100 |             #print("Low confidence: {}".format(confidence.mean()))
101 |             raise ValueError('Low f0 confidence')
102 | 
103 |         f0 = self.pad_to_expected_size(f0,
104 |                 expected_size = self.feat_size,
105 |                 pad_value=0)
106 | 
107 |         return f0
108 | 
109 |     def calc_loudness(self,audio):
110 |         if isinstance(self.loudness_params, dict):
111 |             self.loudness_params = dotdict(self.loudness_params)
112 |         loudness = spectral_ops.calc_loudness(audio, rate=self.sr,
113 |                                             n_fft=self.loudness_params.nfft,
114 |                                             hop_size=self.hop_size,
115 |                                             center=self.center,)
116 | 
117 |         loudness = self.pad_to_expected_size(loudness,
118 |                 expected_size = self.feat_size,
119 |                 pad_value=-_DB_RANGE)
120 |         return loudness
121 | 
122 |     # TODO: Add center padding capability here.
123 |     def calc_rms(self,audio):
124 |         if isinstance(self.rms, dict):
125 |             self.rms = dotdict(self.rms)
126 |         rms = spectral_ops.calc_power(audio, frame_size=self.rms.frame_size,
127 |                                         hop_size=self.hop_size,pad_end=True)
128 |         rms = self.pad_to_expected_size(rms,
129 |                 expected_size = self.feat_size,
130 |                 pad_value=-_DB_RANGE)
131 |         return rms
132 | 
133 |     def save_data(self, audio, f0, loudness, rms, h5f, counter):
134 |         h5f.create_dataset(f'{counter}_audio', data=audio)
135 |         h5f.create_dataset(f'{counter}_f0', data=f0)
136 |         h5f.create_dataset(f'{counter}_loudness', data=loudness)
137 |         h5f.create_dataset(f'{counter}_rms', data=rms)
138 |         return counter + 1
139 | 
140 |     def init_h5(self, data_dir):
141 |         return h5py.File(data_dir / f'{self.sr}.h5', 'w')
142 | 
143 |     def close_h5(self, h5f):
144 |         h5f.close()
145 | 
146 |     '''
147 |     Main audio processing function
148 |     '''
149 |     def run_on_files(self, data_dir, input_dir, output_dir):
150 |         audio_files = list((input_dir/data_dir).glob('*.wav'))
151 |         output_dir = output_dir / data_dir
152 |         output_dir.mkdir(exist_ok=True)
153 | 
154 |         # Open container
155 |         h5f = self.init_h5(output_dir)
156 |         counter = 0
157 | 
158 |         for audio_file in tqdm(audio_files):
159 |             if(self.debug): print("Processing: {}".format(audio_file))
160 | 
161 |             # load and split files
162 |             data, sr = librosa.load(audio_file.as_posix(), sr=self.sr)
163 |             data = librosa.util.normalize(data) # Peak-normalize audio
164 |             sounds_indices = []
165 |             if(self.contiguous):
166 |                 sounds_indices.append([0,len(data)])
167 |             else:
168 |                 sounds_indices = librosa.effects.split(data, top_db=self.silence_thresh_dB)
169 |                 #print("[DEBUG] Sound indices {}".format(sounds_indices))
170 |                 sounds_indices = self.process_indices(sounds_indices)
171 |             if len(sounds_indices) == 0:
172 |                 continue
173 | 
174 | 
175 |             for indices in sounds_indices:
176 |                 audio = data[indices[0]:indices[1]]
177 |                 if(self.debug): print("\tIndexes: {} {} - len: {}".format(indices[0],indices[1],indices[1]-indices[0]))
178 | 
179 |                 # Feature retrieval segment
180 | 
181 |                 try: # Only process audio with enough CREPE confidence
182 |                     f0 = self.extract_f0(audio)
183 |                 except ValueError:
184 |                     continue
185 | 
186 |                 # Further downsamples the audio back to the other specified sample rates and returns a dictionary.
187 |                 loudness = self.calc_loudness(audio)
188 |                 rms = self.calc_rms(audio)
189 |                 if(self.contiguous):
190 |                     if(self.contiguous_clip_noise):
191 |                         if(self.debug): print("[DEBUG] clipping noise")
192 |                         clip_pos = (f0 > 1900.0)
193 |                         loudness[clip_pos] = -_DB_RANGE
194 |                     audio = self.pad_to_expected_size(audio,f0.shape[0]*self.hop_size,0)
195 | 
196 |                 else:
197 |                     audio = self.pad_to_expected_size(audio,self.audio_size,0)
198 |                 if(self.debug): print(f'\t Store block {counter}: f0 : {f0.shape} - loudness : {loudness.shape} - rms {rms.shape} - audio : {audio.shape}')
199 |                 counter = self.save_data(audio, f0, loudness, rms, h5f, counter)
200 | 
201 |         # Finished storing f0 and loudness
202 |         self.close_h5(h5f)
203 | 
204 | 
205 |     def run_on_dirs(self, input_dir: Path, output_dir: Path):
206 |         #print("Starting with crepe confidence: {}".format(self.crepe_params.confidence_threshold))
207 |         folders = [x for x in input_dir.glob('./*') if x.is_dir()]
208 |         for folder in tqdm(folders):
209 |             self.run_on_files(folder.name, input_dir, output_dir)
210 | 
211 | 
212 | class F0LoudnessRMSPreprocessor():
213 |     """Scales 'f0_hz' and 'loudness_db' features."""
214 |     def __init__(self):
215 |         return
216 | 
217 |     def run(self,x):
218 |         x['loudness_scaled'] = self.scale_db(x['loudness'])
219 |         x['rms_scaled'] = self.scale_db(x['rms'])
220 |         x['f0_scaled'] = self.scale_f0_hz(x['f0'])
221 |         return x
222 | 
223 |     def scale_db(self,db):
224 |         """Scales [-DB_RANGE, 0] to [0, 1]."""
225 |         return (db / _DB_RANGE) + 1.0
226 | 
227 |     def scale_f0_hz(self,f0_hz):
228 |         """Scales [0, Nyquist] Hz to [0, 1.0] MIDI-scaled."""
229 |         return hz_to_midi(f0_hz) / _F0_RANGE


--------------------------------------------------------------------------------
/syntheon/inferencer/inferencer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Connects model output to synth preset parameter IR.
 3 | """
 4 | class InferenceInput:
 5 |     def __init__(self):
 6 |         return NotImplementedError
 7 | 
 8 | 
 9 | class InferenceOutput:
10 |     def __init__(self):
11 |         # for storing evaluation results
12 |         self.eval_dict = {
13 |             "loss":  -1
14 |         }
15 | 
16 | 
17 | class Inferencer:
18 |     def __init__(self, device="cuda"):
19 |         self.device = device
20 | 
21 |     def convert(self, model_pt_fname, audio_fname):
22 |         model = self.load_model(model_pt_fname, self.device)
23 |         inference_output = self.inference(model, audio_fname, self.device)
24 |         synth_params_dict = self.convert_to_preset(inference_output)
25 |         return synth_params_dict, inference_output.eval_dict
26 |     
27 |     def load_model(self, model_pt_fname, device="cuda"):
28 |         return NotImplementedError
29 |         
30 |     def inference(self, model, audio_fname):
31 |         return NotImplementedError
32 | 
33 |     def convert_to_preset(self, inference_output):
34 |         """
35 |         Output a Python dictionary to be handled by the converter.
36 |         """
37 |         return NotImplementedError


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/vital/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/checkpoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/vital/checkpoints/__init__.py


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/checkpoints/model.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/vital/checkpoints/model.pt


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/config.yaml:
--------------------------------------------------------------------------------
 1 | common:
 2 |   sampling_rate: 16000
 3 |   block_size: 160
 4 |   duration_secs: 4
 5 | 
 6 | train:
 7 |   batch_size: 16
 8 |   scales: [4096, 2048, 1024, 512, 256, 128]
 9 |   overlap: .75
10 |   start_lr: 0.001
11 |   stop_lr: 0.0001
12 |   decay_over: 400000
13 |   hidden_size: 256
14 |   n_harmonic: 100
15 |   n_bands: 65
16 |   n_wavetables: 10
17 |   n_mfcc: 30
18 |   epochs: 100000
19 | 
20 | test:
21 |   batch_size: 2
22 |   scales: [4096, 2048, 1024, 512, 256, 128]
23 |   overlap: .75
24 |   hidden_size: 256
25 |   n_harmonic: 100
26 |   n_bands: 65
27 |   n_wavetables: 10
28 |   n_mfcc: 30
29 | 
30 | crepe:
31 |   model: "large"
32 | 
33 | visualize: false
34 | device: "cpu"
35 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/models/adsr_envelope.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Differentiable ADSR envelope shaper.
  3 | Code largely influenced by https://github.com/hyakuchiki/diffsynth/blob/master/diffsynth/modules/envelope.py.
  4 | """
  5 | import numpy as np
  6 | import torch
  7 | import os
  8 | from torch import nn
  9 | import yaml
 10 | 
 11 | 
 12 | with open(
 13 |     os.path.join(
 14 |         os.path.dirname(os.path.realpath(__file__)),
 15 |         "../config.yaml"
 16 |     ), 'r'
 17 | ) as stream:
 18 |     config = yaml.safe_load(stream)
 19 | device = config["device"]
 20 | 
 21 | 
 22 | def soft_clamp_min(x, min_v, T=100):
 23 |     return torch.sigmoid((min_v-x)*T)*(min_v-x)+x
 24 | 
 25 | 
 26 | class DiffRoundFunc(torch.autograd.Function):
 27 |     @staticmethod
 28 |     def forward(ctx, input):
 29 |         ctx.input = input
 30 |         return torch.round(input * 10 ** 2) / (10 ** 2)     # because 2 decimal point, 0.01 is the minimum ratio
 31 | 
 32 |     @staticmethod
 33 |     def backward(ctx, grad_output):
 34 |         grad_input = grad_output.clone()
 35 |         return grad_input
 36 | 
 37 | 
 38 | class ADSREnvelopeShaper(nn.Module):
 39 |     def __init__(self, is_round_secs=False):
 40 |         super(ADSREnvelopeShaper, self).__init__()
 41 |         self.attack_secs = torch.tensor([0])
 42 |         self.attack_power = torch.tensor([0])
 43 |         self.decay_secs = torch.tensor([0])
 44 |         self.decay_power = torch.tensor([0])
 45 |         self.sustain_level = torch.tensor([0])
 46 |         self.release_secs = torch.tensor([0])
 47 |         self.release_power = torch.tensor([0])
 48 | 
 49 |         self.is_round_secs = is_round_secs
 50 |         self.round_decimal_places = 2   # because block size = 100, so min resolution = 1/ 100
 51 |     
 52 |     def power_function(self, x, pow=2):
 53 |         if pow > 0: # convex
 54 |             # transpose
 55 | 
 56 |             if x.squeeze()[0] > x.squeeze()[-1]:
 57 |                 y_intercept = x.squeeze()[-1]
 58 |                 y = x - x[:, -1, :]
 59 |                 max_val = y.squeeze()[0]
 60 |                 y = y / max_val
 61 |             else:
 62 |                 y_intercept = x.squeeze()[0]
 63 |                 y = x - x[:, 0, :]
 64 |                 max_val = y.squeeze()[-1]
 65 |                 y = y / max_val
 66 |             
 67 |             y = y ** pow
 68 | 
 69 |             # transpose back
 70 |             y = y * max_val + y_intercept
 71 | 
 72 |         else:
 73 |             # transpose
 74 |             if x.squeeze()[0] > x.squeeze()[-1]:
 75 |                 max_val = x.squeeze()[0]
 76 |                 y = x - x[:, 0, :]
 77 |                 y_intercept = y.squeeze()[-1]
 78 |                 y = y / -y_intercept
 79 |             else:
 80 |                 max_val = x.squeeze()[-1]
 81 |                 y = x - x[:, -1, :]
 82 |                 y_intercept = y.squeeze()[0]
 83 |                 y = y / -y_intercept
 84 | 
 85 |             y = -(y ** -pow)
 86 | 
 87 |             # transpose back
 88 |             y = y * -y_intercept + max_val
 89 | 
 90 |         return y
 91 |     
 92 |     def gen_envelope(self, attack, decay, sus_level, release,
 93 |                      floor=None, peak=None, n_frames=250, pow=2):
 94 |         """generate envelopes from parameters
 95 |         Args:
 96 |             floor (torch.Tensor): floor level of the signal 0~1, 0=min_value (batch, 1, channels)
 97 |             peak (torch.Tensor): peak level of the signal 0~1, 1=max_value (batch, 1, channels)
 98 |             attack (torch.Tensor): relative attack point 0~1 (batch, 1, channels)
 99 |             decay (torch.Tensor): actual decay point is attack+decay (batch, 1, channels)
100 |             sus_level (torch.Tensor): sustain level 0~1 (batch, 1, channels)
101 |             release (torch.Tensor): release point is attack+decay+release (batch, 1, channels)
102 |             note_off (float or torch.Tensor, optional): note off position. Defaults to 0.8.
103 |             n_frames (int, optional): number of frames. Defaults to None.
104 |         Returns:
105 |             torch.Tensor: envelope signal (batch_size, n_frames, 1)
106 |         """
107 |         if floor is None:
108 |             floor = torch.tensor([0.]).unsqueeze(0).unsqueeze(-1)
109 |             if device == "cuda":
110 |                 floor = floor.cuda()
111 |         if peak is None:
112 |             peak = torch.tensor([1.]).unsqueeze(0).unsqueeze(-1)
113 |             if device == "cuda":
114 |                 peak = peak.cuda()
115 |         
116 |         attack = torch.clamp(attack, min=0, max=1)
117 |         decay = torch.clamp(decay, min=0, max=1)
118 |         sus_level = torch.clamp(sus_level, min=0.001, max=1)
119 |         release = torch.clamp(release, min=0, max=1)
120 | 
121 |         batch_size = attack.shape[0]
122 |         if n_frames is None:
123 |             n_frames = self.n_frames
124 | 
125 |         x = torch.linspace(0, 1.0, n_frames)[None, :, None].repeat(batch_size, 1, 1)
126 |         x[:, 0, :] = 1e-6       # offset 0 to epsilon value, so when attack = 0, first adsr value is not 0 but 1
127 |         x = x.to(attack.device)
128 | 
129 |         A = x / (attack + 1e-6)
130 |         # A = self.power_function(A, pow=2)
131 |         A = torch.clamp(A, max=1.0)
132 | 
133 |         D = (x - attack) * (sus_level - 1) / (decay+1e-6)
134 |         # D = self.power_function(D, pow=-2.7)
135 |         D = torch.clamp(D, max=0.0)
136 |         D = soft_clamp_min(D, sus_level-1)
137 | 
138 |         S = (x - 1) * (-sus_level / (release+1e-6))
139 |         S = torch.clamp(S, max=0.0)
140 |         S = soft_clamp_min(S, -sus_level)
141 | 
142 |         signal = (A + D + S) * (peak - floor) + floor
143 |         return torch.clamp(signal, min=0., max=1.)
144 |     
145 |     def forward(self, 
146 |                 attack_secs,
147 |                 decay_secs,
148 |                 sustain_level,
149 |                 block_size=100, 
150 |                 sr=44100, 
151 |                 total_secs=8):
152 |         if self.is_round_secs:
153 |             attack_secs = DiffRoundFunc.apply(attack_secs)
154 |             decay_secs = DiffRoundFunc.apply(decay_secs)
155 | 
156 |         self.attack_secs = attack_secs
157 |         self.decay_secs = decay_secs
158 |         self.sustain_level = sustain_level
159 |         
160 |         attack_ratio = attack_secs / total_secs
161 |         decay_ratio = decay_secs / total_secs
162 |         # TODO: parameterize release_ratio
163 |         release_ratio = torch.tensor([0.]).repeat(attack_secs.size(0), 1, 1)
164 |         if device == "cuda":
165 |             release_ratio = release_ratio.cuda()
166 | 
167 |         attack_ratio = attack_ratio.unsqueeze(-1).unsqueeze(-1)
168 |         decay_ratio = decay_ratio.unsqueeze(-1).unsqueeze(-1)
169 |         sus_level = sustain_level.unsqueeze(-1).unsqueeze(-1)
170 | 
171 |         signal = self.gen_envelope(attack_ratio, decay_ratio, sus_level, release_ratio,
172 |                                     floor=None, peak=None, n_frames=int(total_secs * block_size),
173 |                                     pow=2)
174 |         return signal.squeeze()
175 | 
176 | 
177 | def get_amp_shaper(
178 |                 shaper, 
179 |                 onsets, 
180 |                 attack_secs,
181 |                 decay_secs,
182 |                 sustain_level,
183 |                 offsets=None):
184 |     """
185 |     implement case with no offset first. enable batches
186 |     """
187 |     if offsets is None:
188 |         # if offset not specified, take next onset as offset
189 |         offsets = onsets[1:]
190 |         onsets = onsets[:len(onsets) - 1]
191 | 
192 |     start_offset = int(onsets[0] * 100)        # TODO: 100 is block size
193 |     onsets, offsets = torch.tensor(onsets), torch.tensor(offsets)
194 |     if device == "cuda":
195 |         onsets, offsets = onsets.cuda(), offsets.cuda()
196 |     dur_vec = offsets - onsets
197 |     lst = []
198 | 
199 |     # append zeros first before first onset
200 |     if device == "cuda":
201 |         lst.append(torch.zeros(start_offset).cuda())
202 |     else:
203 |         lst.append(torch.zeros(start_offset))
204 | 
205 |     for dur in dur_vec:
206 |         dur = round(dur.item(), 2)
207 |         adsr = shaper(
208 |             attack_secs=attack_secs, 
209 |             decay_secs=decay_secs,
210 |             sustain_level=sustain_level,
211 |             total_secs=dur)
212 |         
213 |         # adsr shape should be (bs, dur * block_size)
214 |         lst.append(adsr)
215 |     
216 |     final_signal = torch.cat(lst, dim=-1)
217 |     return final_signal
218 | 
219 | 
220 | if __name__ == "__main__":
221 |     # TODO: unit test for this class
222 |     import matplotlib.pyplot as plt
223 | 
224 |     shaper = ADSREnvelopeShaper(is_round_secs=False)
225 |     adsrs = []
226 |     for elem in [0.0, 0.001, 0.005, 0.01, 0.02]:
227 |         attack_secs, decay_secs, sustain_level = torch.tensor([0.2]), torch.tensor([elem]), torch.tensor([0.8])
228 |         if device == "cuda": 
229 |             attack_secs = attack_secs.cuda()
230 |             decay_secs = decay_secs.cuda()
231 |             sustain_level = sustain_level.cuda()
232 |         
233 |         x2 = shaper(
234 |             attack_secs=attack_secs, 
235 |             decay_secs=decay_secs,
236 |             sustain_level=sustain_level,
237 |             total_secs=4)
238 | 
239 |         adsrs.append(x2.squeeze().cpu().detach().numpy()[:30])
240 | 
241 |     for idx, elem in enumerate([0.0, 0.001, 0.005, 0.01, 0.02]):
242 |         plt.plot(adsrs[idx], label=str(elem))
243 |         plt.scatter(range(30), adsrs[idx])
244 |     plt.legend()
245 |     plt.show()


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/models/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Core functions. 
  3 | The code mainly comes from https://github.com/acids-ircam/ddsp_pytorch with minor adaptations.
  4 | """
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.fft as fft
  8 | import numpy as np
  9 | import librosa as li
 10 | import math
 11 | import yaml 
 12 | import os
 13 | 
 14 | 
 15 | with open(
 16 |     os.path.join(
 17 |         os.path.dirname(os.path.realpath(__file__)),
 18 |         "../config.yaml"
 19 |     ), 'r'
 20 | ) as stream:
 21 |     config = yaml.safe_load(stream)
 22 | 
 23 | device = config["device"]
 24 | 
 25 | 
 26 | def safe_log(x):
 27 |     return torch.log(x + 1e-7)
 28 | 
 29 | 
 30 | @torch.no_grad()
 31 | def mean_std_loudness(dataset):
 32 |     mean = 0
 33 |     std = 0
 34 |     n = 0
 35 |     for _, _, l in dataset:
 36 |         n += 1
 37 |         mean += (l.mean().item() - mean) / n
 38 |         std += (l.std().item() - std) / n
 39 |     return mean, std
 40 | 
 41 | 
 42 | def multiscale_fft(signal, scales, overlap):
 43 |     stfts = []
 44 |     for s in scales:
 45 |         S = torch.stft(
 46 |             signal,
 47 |             s,
 48 |             int(s * (1 - overlap)),
 49 |             s,
 50 |             torch.hann_window(s).to(signal),
 51 |             True,
 52 |             normalized=True,
 53 |             return_complex=True,
 54 |         ).abs()
 55 |         stfts.append(S)
 56 |     return stfts
 57 | 
 58 | 
 59 | def resample(x, factor: int):
 60 |     batch, frame, channel = x.shape
 61 |     x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame)
 62 | 
 63 |     window = torch.hann_window(
 64 |         factor * 2,
 65 |         dtype=x.dtype,
 66 |         device=x.device,
 67 |     ).reshape(1, 1, -1)
 68 |     y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x)
 69 |     y[..., ::factor] = x
 70 |     y[..., -1:] = x[..., -1:]
 71 |     y = torch.nn.functional.pad(y, [factor, factor])
 72 |     y = torch.nn.functional.conv1d(y, window)[..., :-1]
 73 | 
 74 |     y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1)
 75 | 
 76 |     return y
 77 | 
 78 | 
 79 | def upsample(signal, factor, preferred_size=None, mode="nearest"):
 80 |     signal = signal.permute(0, 2, 1)
 81 |     if preferred_size is not None:
 82 |         signal = nn.functional.interpolate(signal, size=preferred_size, mode=mode)
 83 |     else:
 84 |         signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor, mode=mode)
 85 |     return signal.permute(0, 2, 1)
 86 | 
 87 | 
 88 | def remove_above_nyquist(amplitudes, pitch, sampling_rate):
 89 |     n_harm = amplitudes.shape[-1]
 90 |     pitches = pitch * torch.arange(1, n_harm + 1).to(pitch)
 91 |     aa = (pitches < sampling_rate / 2).float() + 1e-4
 92 |     return amplitudes * aa
 93 | 
 94 | 
 95 | def scale_function(x):
 96 |     return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7
 97 | 
 98 | 
 99 | def amplitude_to_db(amplitude):
100 |     amin = 1e-20  # Avoid log(0) instabilities.
101 |     db = torch.log10(torch.clamp(amplitude, min=amin))
102 |     db *= 20.0
103 |     return db
104 | 
105 | 
106 | def extract_loudness(audio, sampling_rate, block_size=None, n_fft=2048, frame_rate=None):
107 |     assert (block_size is None) != (frame_rate is None), "Specify exactly one of block_size or frame_rate"
108 | 
109 |     if frame_rate is not None:
110 |         block_size = sampling_rate // frame_rate
111 |     else:
112 |         frame_rate = int(sampling_rate / block_size)
113 | 
114 |     if sampling_rate % frame_rate != 0:
115 |         raise ValueError(
116 |             'frame_rate: {} must evenly divide sample_rate: {}.'
117 |             'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz'
118 |             .format(frame_rate, sampling_rate))
119 | 
120 |     if isinstance(audio, np.ndarray):
121 |         audio = torch.tensor(audio)
122 | 
123 |     # Temporarily a batch dimension for single examples.
124 |     is_1d = (len(audio.shape) == 1)
125 |     audio = audio[None, :] if is_1d else audio
126 | 
127 |     # Take STFT.
128 |     overlap = 1 - block_size / n_fft
129 |     amplitude = torch.stft(audio, n_fft=n_fft, hop_length=block_size, center=True, pad_mode='reflect', return_complex=True).abs()
130 |     amplitude = amplitude[:, :, :-1]
131 |     
132 |     # Compute power.
133 |     power_db = amplitude_to_db(amplitude)
134 | 
135 |     # Perceptual weighting.
136 |     frequencies = li.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
137 |     a_weighting = li.A_weighting(frequencies)[None,:,None]
138 |     loudness = power_db + a_weighting
139 | 
140 |     loudness = torch.mean(torch.pow(10, loudness / 10.0), axis=1)
141 |     loudness = 10.0 * torch.log10(torch.clamp(loudness, min=1e-20))
142 | 
143 |     # Remove temporary batch dimension.
144 |     loudness = loudness[0] if is_1d else loudness
145 |     loudness = loudness.numpy()
146 | 
147 |     return loudness
148 | 
149 | 
150 | def mlp(in_size, hidden_size, n_layers):
151 |     channels = [in_size] + (n_layers) * [hidden_size]
152 |     net = []
153 |     for i in range(n_layers):
154 |         net.append(nn.Linear(channels[i], channels[i + 1]))
155 |         net.append(nn.LayerNorm(channels[i + 1]))
156 |         net.append(nn.LeakyReLU())
157 |     return nn.Sequential(*net)
158 | 
159 | 
160 | def gru(n_input, hidden_size):
161 |     return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True)
162 | 
163 | 
164 | def harmonic_synth(pitch, amplitudes, sampling_rate):
165 |     n_harmonic = amplitudes.shape[-1]
166 |     omega = torch.cumsum(2 * math.pi * pitch / sampling_rate, 1)
167 |     omegas = omega * torch.arange(1, n_harmonic + 1).to(omega)
168 | 
169 |     signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True)
170 |     return signal
171 | 
172 | 
173 | def amp_to_impulse_response(amp, target_size):
174 |     amp = torch.stack([amp, torch.zeros_like(amp)], -1)
175 |     amp = torch.view_as_complex(amp)
176 |     amp = fft.irfft(amp)
177 | 
178 |     filter_size = amp.shape[-1]
179 | 
180 |     amp = torch.roll(amp, filter_size // 2, -1)
181 |     win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device)
182 | 
183 |     amp = amp * win
184 | 
185 |     amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size)))
186 |     amp = torch.roll(amp, -filter_size // 2, -1)
187 | 
188 |     return amp
189 | 
190 | 
191 | def fft_convolve(signal, kernel):
192 |     signal = nn.functional.pad(signal, (0, signal.shape[-1]))
193 |     kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0))
194 | 
195 |     output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel))
196 |     output = output[..., output.shape[-1] // 2:]
197 | 
198 |     return output
199 | 
200 | 
201 | def get_scheduler(len_dataset, start_lr, stop_lr, length):
202 |     def schedule(epoch):
203 |         step = epoch * len_dataset
204 |         if step < length:
205 |             t = step / length
206 |             return start_lr * (1 - t) + stop_lr * t
207 |         else:
208 |             return stop_lr
209 | 
210 |     return schedule


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/models/model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Diff-WTS model. Main adapted from https://github.com/acids-ircam/ddsp_pytorch.
  3 | """
  4 | from syntheon.inferencer.vital.models.wavetable_synth import WavetableSynthV2
  5 | import torch
  6 | import torch.nn as nn
  7 | from syntheon.inferencer.vital.models.core import mlp, gru, scale_function, remove_above_nyquist, upsample
  8 | from syntheon.inferencer.vital.models.core import amp_to_impulse_response, fft_convolve
  9 | from syntheon.inferencer.vital.models.adsr_envelope import *
 10 | import numpy as np
 11 | from torchvision.transforms import Resize
 12 | from time import time
 13 | 
 14 | class PrintLayer(nn.Module):
 15 |     def __init__(self, name):
 16 |         super(PrintLayer, self).__init__()
 17 |         self.name = name
 18 |     
 19 |     def forward(self, x):
 20 |         # Do your print / debug stuff here
 21 |         print(self.name, x[0].squeeze().item())
 22 |         x += 1e-2
 23 |         return x
 24 | 
 25 | 
 26 | class Reverb(nn.Module):
 27 |     def __init__(self, length, sampling_rate, initial_wet=0, initial_decay=5):
 28 |         super().__init__()
 29 |         self.length = length
 30 |         self.sampling_rate = sampling_rate
 31 | 
 32 |         self.noise = nn.Parameter((torch.rand(length) * 2 - 1).unsqueeze(-1))
 33 |         self.decay = nn.Parameter(torch.tensor(float(initial_decay)))
 34 |         self.wet = nn.Parameter(torch.tensor(float(initial_wet)))
 35 | 
 36 |         t = torch.arange(self.length) / self.sampling_rate
 37 |         t = t.reshape(1, -1, 1)
 38 |         self.register_buffer("t", t)
 39 | 
 40 |     def build_impulse(self):
 41 |         t = torch.exp(-nn.functional.softplus(-self.decay) * self.t * 500)
 42 |         noise = self.noise * t
 43 |         impulse = noise * torch.sigmoid(self.wet)
 44 |         impulse[:, 0] = 1
 45 |         return impulse
 46 | 
 47 |     def forward(self, x):
 48 |         lenx = x.shape[1]
 49 |         impulse = self.build_impulse()
 50 |         impulse = nn.functional.pad(impulse, (0, 0, 0, lenx - self.length))
 51 | 
 52 |         x = fft_convolve(x.squeeze(-1), impulse.squeeze(-1)).unsqueeze(-1)
 53 | 
 54 |         return x
 55 | 
 56 | 
 57 | def infer_wavetables(y, pitch):
 58 |     """
 59 |     TODO: VERY BUGGY CODE. need to care for edge cases (like silence)
 60 | 
 61 |     y: (64000,)
 62 |     pitch: (400,), 1 second 100 frames
 63 |     """
 64 |     period = 1 / pitch * 16000
 65 |     
 66 |     # find the first continuous pitch
 67 |     # TODO: find the most continuous pitch across the sample, for best results
 68 |     continuous_threshold = 10   # at least 10 steps = 0.1 sec, will be a problem for plucks, but for now do it like this
 69 |     continuous_pitch = -1
 70 |     continuous_pitch_idx = -1
 71 |     cur_pitch = pitch[0]
 72 |     step = 0
 73 |     
 74 |     for idx in range(1, len(pitch)):
 75 |         if abs(pitch[idx] - cur_pitch) < 1e-2: # equal freq tolerance 1e-2
 76 |             step += 1
 77 |             if step > continuous_threshold:
 78 |                 continuous_pitch = cur_pitch
 79 |                 continuous_pitch_idx = idx - step
 80 |                 break
 81 |         else:
 82 |             cur_pitch = pitch[idx]
 83 |             step = 0
 84 |     
 85 |     if continuous_pitch == -1:  # fallback
 86 |         continuous_pitch = pitch[0]
 87 |         continuous_pitch_idx = 0
 88 |     
 89 |     period = int(1 / continuous_pitch * 16000)
 90 |     pitch_offset_idx = continuous_pitch_idx * 160  # 160 = sr / frame_size (100)
 91 |         
 92 |     # find local minimum within a window of 2 periods
 93 |     cur = y[pitch_offset_idx : pitch_offset_idx + 1600]
 94 |     min_idx = torch.argmin(cur).item()
 95 |     
 96 |     # here we take first wavelet, but also can take the average of a few wavelets
 97 |     # TODO: prone to silence right now. need to fix. now HACK search for local minima across 1600 samples to solve
 98 |     wavelet = y[min_idx : min_idx + period]
 99 |     
100 |     # upsample + normalize magnitude
101 |     wavelet_tensor = wavelet.clone().detach().unsqueeze(-1).unsqueeze(0)
102 |     if torch.isinf(wavelet_tensor).any() or torch.isnan(wavelet_tensor).any():
103 |         print('wavelet tensor has inf or nan', torch.isinf(wavelet_tensor).any(), torch.isnan(wavelet_tensor).any())
104 |     wavelet_upsample = upsample(wavelet_tensor, factor=0, preferred_size=512, mode="linear").squeeze()
105 |     if torch.isinf(wavelet_upsample).any() or torch.isnan(wavelet_upsample).any():
106 |         print('wavelet upsample has inf or nan', torch.isinf(wavelet_upsample).any(), torch.isnan(wavelet_upsample).any())
107 |     if wavelet_upsample.max() - wavelet_upsample.min() < 1e-4:
108 |         # don't min-max norm in this case
109 |         pass
110 |     else:
111 |         wavelet_upsample = (wavelet_upsample - wavelet_upsample.min()) / \
112 |                             (wavelet_upsample.max() - wavelet_upsample.min())
113 |         wavelet_upsample = wavelet_upsample * 2 - 1
114 |     if torch.isinf(wavelet_upsample).any() or torch.isnan(wavelet_upsample).any():
115 |         print('wavelet upsample 2 has inf or nan', torch.isinf(wavelet_upsample).any(), torch.isnan(wavelet_upsample).any(),
116 |                 wavelet_upsample.max(), wavelet_upsample.min())
117 | 
118 |     return wavelet_upsample
119 | 
120 | 
121 | class WTSv2(nn.Module):
122 |     def __init__(self, hidden_size, n_harmonic, n_bands, sampling_rate,
123 |                  block_size, mode="wavetable", duration_secs=3, num_wavetables=3,
124 |                  wavetable_smoothing=False, min_smoothing_sigma=0.5, max_smoothing_sigma=50,
125 |                  preload_wt=False, is_round_secs=False, enable_amplitude=True, device='cuda'
126 |                 ):
127 |         super().__init__()
128 |         self.register_buffer("sampling_rate", torch.tensor(sampling_rate))
129 |         self.register_buffer("block_size", torch.tensor(block_size))
130 | 
131 |         # feature extractors
132 |         self.encoder = mlp(30, hidden_size, 3)
133 |         self.layer_norm = nn.LayerNorm(30)
134 |         self.gru_mfcc = nn.GRU(30, 512, batch_first=True)
135 |         self.mlp_mfcc = nn.Linear(512, 16)
136 | 
137 |         self.in_mlps = nn.ModuleList([mlp(1, hidden_size, 3),
138 |                                       mlp(1, hidden_size, 3),
139 |                                       mlp(16, hidden_size, 3)])
140 |         self.gru = gru(3, hidden_size)
141 |         self.out_mlp = mlp(hidden_size * 4, hidden_size, 3)
142 | 
143 |         self.loudness_mlp = nn.Sequential(
144 |             nn.Linear(1, 1),
145 |             nn.Sigmoid()
146 |         )
147 |         self.proj_matrices = nn.ModuleList([
148 |             nn.Linear(hidden_size, n_harmonic + 1),
149 |             nn.Linear(hidden_size, n_bands),
150 |         ])
151 | 
152 |         # for wavetable learning
153 |         self.wt1_conv1d = nn.Sequential(
154 |             nn.Conv1d(1, num_wavetables, 16, stride=16),     # 3 here is num_wavetable
155 |             nn.Tanh(),
156 |             nn.Conv1d(num_wavetables, num_wavetables, 8, stride=8),
157 |             nn.Tanh(),
158 |             nn.Linear(500, 512),                # 512 is wavetable length
159 |             nn.Tanh()
160 |         )
161 |         self.attention_wt1 = nn.Linear(512, 1)
162 |         self.smoothing_linear = nn.Linear(512, 1)
163 |         self.smoothing_sigmoid = nn.Sigmoid()
164 | 
165 |         # for adsr learning
166 |         self.shaper = ADSREnvelopeShaper(is_round_secs)
167 |         self.adsr_conv1d = nn.Conv1d(1, 1, block_size, stride=block_size)
168 | 
169 |         self.attack_gru = nn.GRU(1, 8, batch_first=True, bidirectional=True)
170 |         self.decay_gru = nn.GRU(1, 8, batch_first=True, bidirectional=True)
171 |         self.sustain_gru = nn.GRU(1, 8, batch_first=True, bidirectional=True)
172 | 
173 |         self.attack_sec_head = nn.Sequential(
174 |             nn.Linear(16, 1),
175 |             nn.Sigmoid()
176 |         )
177 |         self.decay_sec_head = nn.Sequential(
178 |             nn.Linear(16, 1),
179 |             nn.Sigmoid()
180 |         )
181 |         self.sustain_level_head = nn.Sequential(
182 |             nn.Linear(16, 1),
183 |             nn.Sigmoid()
184 |         )
185 | 
186 |         # for adsr result storage
187 |         self.attack_sec = nn.Parameter(torch.ones(1,))
188 |         self.decay_sec = nn.Parameter(torch.ones(1,))
189 |         self.sustain_level = nn.Parameter(torch.ones(1,))       
190 | 
191 |         self.max_attack_secs = 2.0
192 |         self.max_decay_secs = 2.0 
193 | 
194 |         # for synthesis
195 |         self.reverb = Reverb(sampling_rate, sampling_rate)
196 |         self.wts = WavetableSynthV2(sr=sampling_rate, 
197 |                                     duration_secs=duration_secs,
198 |                                     block_size=block_size,
199 |                                     enable_amplitude=enable_amplitude)
200 |         self.wavetable_smoothing = wavetable_smoothing
201 |         self.min_smoothing_sigma = min_smoothing_sigma
202 |         self.max_smoothing_sigma = max_smoothing_sigma
203 | 
204 |         self.preload_wt = preload_wt
205 | 
206 |         self.mode = mode
207 |         self.duration_secs = duration_secs
208 |         self.device = device
209 | 
210 |     def forward(self, y, mfcc, pitch, loudness, times, onset_frames):
211 |         batch_size = y.shape[0]
212 | 
213 |         # encode mfcc first
214 |         # use layer norm instead of trainable norm, not much difference found
215 |         mfcc = self.layer_norm(torch.transpose(mfcc, 1, 2))
216 |         mfcc = self.gru_mfcc(mfcc)[0]
217 |         mfcc = self.mlp_mfcc(mfcc)
218 | 
219 |         # use image resize to align dimensions, ddsp also do this...
220 |         mfcc = Resize(size=(self.duration_secs * 100, 16))(mfcc)
221 | 
222 |         hidden = torch.cat([
223 |             self.in_mlps[0](pitch),
224 |             self.in_mlps[1](loudness),
225 |             self.in_mlps[2](mfcc)
226 |         ], -1)
227 |         hidden = torch.cat([self.gru(hidden)[0], hidden], -1)
228 |         hidden = self.out_mlp(hidden)
229 | 
230 |         # harmonic part
231 |         total_amp = self.loudness_mlp(loudness)
232 |         pitch_prev = pitch
233 | 
234 |         # TODO: upsample is very slow
235 |         pitch = upsample(pitch, self.block_size)
236 |         total_amp = upsample(total_amp, self.block_size)    # use this instead for wavetable
237 | 
238 |         # diff-wave-synth synthesizer
239 |         if self.preload_wt:
240 |             # TODO: very slow implementation...
241 |             wavetables = []
242 |             for idx in range(batch_size):
243 |                 wt = infer_wavetables(y[idx].squeeze(), pitch_prev[idx].squeeze())
244 |                 wavetables.append(wt)
245 |             wavetables = torch.stack(wavetables, dim=0).unsqueeze(1)
246 |             if torch.isinf(wavetables).any() or torch.isnan(wavetables).any():
247 |                 print('wavetables has inf or nan', torch.isinf(wavetables).any(), torch.isnan(wavetables).any())
248 |         else:
249 |             wavetables = self.wt1_conv1d(y.unsqueeze(1))
250 | 
251 |         if self.wavetable_smoothing:
252 |             smoothing_coeff = self.smoothing_linear(wavetables)
253 |             smoothing_coeff = smoothing_coeff.squeeze(1)        # HACK: here should assume only 1 wavetable
254 |             smoothing_coeff = self.smoothing_sigmoid(smoothing_coeff)
255 |             wavetables_old = wavetables
256 |             wavetables = self.smoothing(wavetables, smoothing_coeff)
257 |         else:
258 |             wavetables_old = None
259 |             smoothing_coeff = None
260 | 
261 |         attention_output = self.attention_wt1(wavetables).squeeze(-1)
262 |         attention_output = nn.Softmax(dim=-1)(attention_output)
263 | 
264 |         harmonic, attention_output = self.wts(pitch, total_amp, wavetables, attention_output)
265 | 
266 |         # noise part
267 |         param = scale_function(self.proj_matrices[1](hidden) - 5)
268 | 
269 |         impulse = amp_to_impulse_response(param, self.block_size)
270 |         noise = torch.rand(
271 |             impulse.shape[0],
272 |             impulse.shape[1],
273 |             self.block_size,
274 |         ).to(impulse) * 2 - 1
275 | 
276 |         noise = fft_convolve(noise, impulse).contiguous()
277 |         noise = noise.reshape(noise.shape[0], -1, 1)
278 | 
279 |         signal = harmonic + noise
280 | 
281 |         # adsr shaping
282 |         output_attack, hn_attack = self.attack_gru(loudness)
283 |         hn_attack = torch.cat([hn_attack[0], hn_attack[1]], dim=-1)
284 |         output_decay, hn_decay = self.decay_gru(loudness)
285 |         hn_decay = torch.cat([hn_decay[0], hn_decay[1]], dim=-1)
286 |         output_sustain, hn_sustain = self.sustain_gru(loudness)
287 |         hn_sustain = torch.cat([hn_sustain[0], hn_sustain[1]], dim=-1)
288 | 
289 |         # print(hn_decay[:10])
290 |         attack_level = self.attack_sec_head(hn_attack).squeeze()            # 0-1
291 |         decay_level = self.decay_sec_head(hn_decay).squeeze()               # 0-1
292 |         sustain_level = self.sustain_level_head(hn_sustain).squeeze()
293 | 
294 |         attack_secs = attack_level * self.max_attack_secs
295 |         decay_secs = decay_level * self.max_decay_secs
296 | 
297 |         amp_onsets = np.append(times[onset_frames], np.array([times[-1]]))  # TODO: now 1 onset is enough, because all training samples pitch are the same
298 | 
299 |         adsr = get_amp_shaper(self.shaper, amp_onsets, 
300 |                                 attack_secs=attack_secs,
301 |                                 decay_secs=decay_secs,
302 |                                 sustain_level=sustain_level)
303 |         if adsr.shape[1] < pitch_prev.shape[1]:
304 |             # adsr = torch.nn.functional.pad(adsr, (0, pitch_prev.shape[1] - adsr.shape[1]), "constant", adsr[-1].item())
305 |             adsr = torch.cat([adsr, adsr[:, -1].unsqueeze(-1)], dim=-1)
306 |         else:
307 |             adsr = adsr[:pitch_prev.shape[1]]
308 |         
309 |         self.adsr = adsr
310 |         adsr = adsr.unsqueeze(-1)
311 |         adsr = upsample(adsr, self.block_size).squeeze(-1)   
312 | 
313 |         adsr = adsr[:, :signal.shape[1]]
314 | 
315 |         final_signal = signal.squeeze() * adsr
316 | 
317 |         # reverb part
318 |         # signal = self.reverb(signal)
319 | 
320 |         return signal, (attack_secs, decay_secs, sustain_level), final_signal, attention_output, wavetables, wavetables_old, smoothing_coeff
321 |     
322 |     def smoothing(self, wavetables, p):       
323 |         bs, wavetable_length = wavetables.shape[0], wavetables.shape[2]
324 |         smoothed_wavetables = torch.zeros((bs, wavetable_length))
325 |         if self.device == "cuda":
326 |             smoothed_wavetables = smoothed_wavetables.cuda()
327 |         
328 |         sigma = p * (self.max_smoothing_sigma - self.min_smoothing_sigma) + self.min_smoothing_sigma
329 |         sigma = sigma.unsqueeze(-1)                         # size (bs, 1, 1)
330 |         
331 |         kernel = torch.arange(wavetable_length)
332 |         if self.device == "cuda":
333 |             kernel = kernel.cuda()
334 |         kernel = kernel.unsqueeze(0) - kernel.unsqueeze(-1)  # x_position - x_vals, size (wt_len, wt_len)
335 | 
336 |         kernel = torch.exp(-kernel ** 2 / (2 * sigma ** 2))   # size (b, wt_len, wt_len)    
337 |         kernel = kernel / torch.sum(kernel, dim=-1).unsqueeze(-1)          # dim 1 or dim -1?
338 |             
339 |         # wavetables = wavetables.unsqueeze(1)
340 |         smoothed_wavetables = torch.bmm(wavetables, kernel)  # (bs, 1, wt_len) * (bs, wt_len, wt_len)
341 |         return smoothed_wavetables


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/models/preprocessor.py:
--------------------------------------------------------------------------------
  1 | """
  2 | For loading and preprocessing audio
  3 | """
  4 | import numpy as np
  5 | import os
  6 | import torch
  7 | from syntheon.utils.pitch_extractor import extract_pitch
  8 | from syntheon.inferencer.vital.models.core import extract_loudness
  9 | import librosa
 10 | import yaml 
 11 | from nnAudio import Spectrogram
 12 | 
 13 | with open(
 14 |     os.path.join(
 15 |         os.path.dirname(os.path.realpath(__file__)),
 16 |         "../config.yaml"
 17 |     ), 'r'
 18 | ) as stream:
 19 |     config = yaml.safe_load(stream)
 20 | 
 21 | # general parameters
 22 | sr = config["common"]["sampling_rate"]
 23 | n_mfcc = config["train"]["n_mfcc"]
 24 | 
 25 | spec = Spectrogram.MFCC(sr=sr, n_mfcc=n_mfcc)
 26 | 
 27 | 
 28 | def sanitize_onsets(times, onset_frames, onset_strengths):
 29 |     """
 30 |     times: actual timestamp per frame in STFT (by hop length)
 31 |             e.g. [0, 0.3, 0.6, ...]
 32 |     onset_frames: index list, index on `times` to get an onset event
 33 |     onset_strengths: get strength per frame. same shape as times.
 34 |                         so when strength is high will have a onset event, index in `onset_frames`  
 35 |     """
 36 |     # TODO: need to check if we need this always
 37 |     res_frames = []
 38 | 
 39 |     cur_frame = onset_frames[0]
 40 |     cur_time = times[cur_frame]
 41 |     res_frames.append(cur_frame)
 42 | 
 43 |     for frame in onset_frames[1:]:
 44 |         if times[frame] - cur_time > 0.05:  # TODO: parameterize
 45 |             res_frames.append(frame)
 46 |         cur_frame = frame
 47 |         cur_time = times[frame]
 48 | 
 49 |     return np.array(res_frames)
 50 | 
 51 | 
 52 | def aggregate(vals):
 53 |     """
 54 |     aggregate the window of pitch values.
 55 |     rationale: bin pitch values (to reduce fluctuation), get the bin with most values within the window
 56 |     """
 57 |     bins = {}
 58 |     for val in vals:
 59 |         bin = val // 10
 60 |         if bin in bins:
 61 |             bins[bin].append(val)
 62 |         else:
 63 |             bins[bin] = [val]
 64 |     
 65 |     sorted_bins = sorted(bins.keys())
 66 |     max_len_bin = sorted_bins[0]
 67 | 
 68 |     for bin in sorted_bins:
 69 |         if len(bins[bin]) > len(bins[max_len_bin]):
 70 |             max_len_bin = bin
 71 |     
 72 |     return bins[max_len_bin][0]
 73 | 
 74 | 
 75 | 
 76 | def monotonize_pitch(times, onset_frames, pitch):
 77 |     """
 78 |     remove wobbling frequencies in pitch. take the pitch value on the onset frame
 79 |     problem is accuracy issue -- need to align onset and pitch
 80 |     because librosa onset might read wrong pitch from crepe output
 81 |     """
 82 |     res_pitch = np.zeros(pitch.shape)
 83 |     pitch_map_lst = []
 84 | 
 85 |     prev_ts = times[onset_frames[0]]
 86 | 
 87 |     for idx, frame in enumerate(onset_frames):
 88 |         if idx == 0:
 89 |             continue
 90 |         ts = times[frame]
 91 |         pitch_vals = pitch[int(prev_ts * 100) : int(ts * 100)]
 92 | 
 93 |         if len(pitch_vals) > 0:
 94 |             cur_pitch = aggregate(pitch_vals)
 95 |             pitch_map_lst.append((int(prev_ts * 100), cur_pitch))
 96 |             prev_ts = ts
 97 | 
 98 |     # for final frame
 99 |     ts = times[-1]
100 |     pitch_vals = pitch[int(prev_ts * 100) : int(ts * 100)]
101 |     if len(pitch_vals) > 0:
102 |         cur_pitch = aggregate(pitch_vals)
103 |         pitch_map_lst.append((int(prev_ts * 100), cur_pitch))
104 |     
105 |     if pitch_map_lst[0][0] == 0:
106 |         res_pitch[0] = pitch_map_lst[0][1]
107 |         cur_pitch = pitch_map_lst[0][1]
108 |         cur_idx = 1
109 |     else:
110 |         res_pitch[0] = 0
111 |         cur_pitch = 0
112 |         cur_idx = 0
113 |     
114 |     for i in range(1, len(pitch)):
115 |         if i == pitch_map_lst[cur_idx][0]:
116 |             cur_pitch = pitch_map_lst[cur_idx][1]
117 |             res_pitch[i] = cur_pitch
118 |             if cur_idx < len(pitch_map_lst) - 1:
119 |                 cur_idx += 1
120 |         else:
121 |             res_pitch[i] = cur_pitch
122 | 
123 |     return res_pitch
124 |     
125 | 
126 | def preprocess(f, sampling_rate, block_size, signal_length=-1, oneshot=True):
127 |     x, sr = librosa.load(f, sampling_rate)
128 |     if signal_length == -1:     # full length
129 |         signal_length = len(x)
130 |     else:
131 |         if len(x) > signal_length:
132 |             x = x[:signal_length*sampling_rate]
133 |         elif len(x) < signal_length:
134 |             N = (signal_length - len(x) % signal_length) % signal_length
135 |             x = np.pad(x, (0, N))
136 | 
137 |         if oneshot:
138 |             x = x[..., :signal_length]
139 | 
140 |     D = np.abs(librosa.stft(x))
141 |     times = librosa.times_like(D, sr=sr)
142 |     onset_strengths = librosa.onset.onset_strength(y=x, sr=sr, aggregate=np.median)
143 |     onset_frames = librosa.onset.onset_detect(y=x, sr=sr)
144 | 
145 |     onset_frames = sanitize_onsets(times, onset_frames, onset_strengths)
146 | 
147 |     # TODO: HACK for now, onset detector missed. not all samples need this!!
148 |     onset_frames = np.concatenate([np.array([0]), onset_frames])
149 | 
150 |     pitch = extract_pitch(x, sampling_rate, block_size)
151 |     loudness = extract_loudness(x, sampling_rate, block_size)
152 | 
153 |     pitch_monotonize = monotonize_pitch(times, onset_frames, pitch)
154 |     pitch = pitch_monotonize
155 |     x = x.reshape(-1, signal_length)
156 |     pitch = pitch.reshape(x.shape[0], -1).squeeze()
157 |     loudness = loudness.reshape(x.shape[0], -1)
158 | 
159 |     # prepare for inference input
160 |     x = torch.tensor(x)
161 |     pitch = torch.tensor(pitch).unsqueeze(0)
162 |     loudness = torch.tensor(loudness)
163 | 
164 |     x = torch.cat([x, x], dim=0)
165 |     pitch = torch.cat([pitch, pitch], dim=0)
166 |     loudness = torch.cat([loudness, loudness], dim=0)
167 | 
168 |     mean_loudness, std_loudness = -39.74668743704927, 54.19612404969509
169 |     pitch, loudness = pitch.unsqueeze(-1).float(), loudness.unsqueeze(-1).float()
170 |     loudness = (loudness - mean_loudness) / std_loudness
171 | 
172 |     mfcc = spec(x)
173 | 
174 |     return x, pitch, loudness, times, onset_frames, mfcc
175 | 


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/models/utils.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import librosa
 3 | import soundfile as sf
 4 | 
 5 | 
 6 | # helper functions to generate wavetable
 7 | def generate_wavetable(length, f):
 8 |     wavetable = np.zeros((length,), dtype=np.float32)
 9 |     for i in range(length):
10 |         wavetable[i] = f(2 * np.pi * i / length)
11 |     return wavetable
12 | 
13 | 
14 | def sawtooth_waveform(x):
15 |     """Sawtooth with period 2 pi."""
16 |     return (x + np.pi) / np.pi % 2 - 1
17 | 
18 | 
19 | def square_waveform(x):
20 |     """Square waveform with period 2 pi."""
21 |     return np.sign(np.sin(x))
22 | 
23 | 
24 | def trim_audio(in_name, out_name, start_sec, end_sec, sr=44100):
25 |     x, sr = librosa.load(in_name, sr=sr)
26 |     x = x[start_sec * sr: end_sec * sr]
27 |     sf.write(out_name, x, sr, 'PCM_24')
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     trim_audio("test_audio/kygo_pluck.mp3", "kygo_pluck.wav", 75, 85)


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/models/wavetable_synth.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Differentiable wavetable synthesis component.
  3 | """
  4 | import torch
  5 | from torch import nn
  6 | import numpy as np
  7 | from syntheon.inferencer.vital.models.utils import *
  8 | import soundfile as sf
  9 | from syntheon.inferencer.vital.models.core import upsample
 10 | from syntheon.inferencer.vital.models.adsr_envelope import *
 11 | 
 12 | 
 13 | def wavetable_osc(wavetable, freq, sr):
 14 |     """
 15 |     General wavetable synthesis oscilator.
 16 |     wavetable: (wavetable_len,)
 17 |     freq: (batch_size, dur * sr)
 18 |     sr: const
 19 |     """
 20 |     freq = freq.squeeze()
 21 |     increment = freq / sr * wavetable.shape[0]
 22 |     index = torch.cumsum(increment, dim=1) - increment[0]
 23 |     index = index % wavetable.shape[0]
 24 | 
 25 |     # uses linear interpolation implementation
 26 |     index_low = torch.floor(index.clone())
 27 |     index_high = torch.ceil(index.clone())
 28 |     alpha = index - index_low
 29 |     index_low = index_low.long()
 30 |     index_high = index_high.long()
 31 | 
 32 |     output = wavetable[index_low] + alpha * (wavetable[index_high % wavetable.shape[0]] - wavetable[index_low])
 33 |         
 34 |     return output
 35 | 
 36 | 
 37 | def wavetable_osc_v2(wavetable, freq, sr):
 38 |     """
 39 |     General wavetable synthesis oscilator, wavetable per item in batch
 40 |     wavetable: (batch_size, wavetable_len,)
 41 |     freq: (batch_size, dur * sr)
 42 |     sr: const
 43 |     """
 44 |     freq = freq.squeeze()
 45 |     increment = freq / sr * wavetable.shape[1]
 46 |     index = torch.cumsum(increment, dim=1) - increment[1]
 47 |     index = index % wavetable.shape[1]
 48 | 
 49 |     # uses linear interpolation implementation
 50 |     index_low = torch.floor(index.clone())
 51 |     index_high = torch.ceil(index.clone())
 52 |     alpha = index - index_low
 53 |     index_low = index_low.long()
 54 |     index_high = index_high.long()
 55 | 
 56 |     batch_size = wavetable.shape[0]
 57 |     output = []
 58 | 
 59 |     # TODO: do for loop for now, think any ways to parallelize this (einsum?)
 60 |     for bs in range(batch_size):
 61 |         wt, idx_l, idx_h, alp = wavetable[bs], index_low[bs].unsqueeze(0), index_high[bs].unsqueeze(0), alpha[bs].unsqueeze(0)
 62 |         signal = wt[idx_l] + alp * (wt[idx_h % wt.shape[0]] - wt[idx_l])
 63 |         output.append(signal)
 64 |     
 65 |     output = torch.cat(output, dim=0)
 66 |     return output
 67 | 
 68 | 
 69 | def generate_wavetable(length, f, cycle=1, phase=0):
 70 |     """
 71 |     Generate a wavetable of specified length using 
 72 |     function f(x) where x is phase.
 73 |     Period of f is assumed to be 2 pi.
 74 |     """
 75 |     wavetable = np.zeros((length,), dtype=np.float32)
 76 |     for i in range(length):
 77 |         wavetable[i] = f(cycle * 2 * np.pi * i / length + 2 * phase * np.pi)
 78 |     return torch.tensor(wavetable)
 79 | 
 80 | 
 81 | class WavetableSynth(nn.Module):
 82 |     def __init__(self,
 83 |                  wavetables=None,
 84 |                  n_wavetables=64,
 85 |                  wavetable_len=512,
 86 |                  sr=44100,
 87 |                  duration_secs=3,
 88 |                  block_size=160,
 89 |                  is_initial_wt_trainable=True):
 90 |         super(WavetableSynth, self).__init__()
 91 |         if wavetables is None: 
 92 |             self.wavetables = []
 93 |             for _ in range(n_wavetables):
 94 |                 cur = nn.Parameter(torch.empty(wavetable_len).normal_(mean=0, std=0.01))
 95 |                 self.wavetables.append(cur)
 96 | 
 97 |             self.wavetables = nn.ParameterList(self.wavetables)
 98 | 
 99 |             for idx, wt in enumerate(self.wavetables):
100 |                 wt.data = torch.cat([wt[:-1], wt[0].unsqueeze(-1)], dim=-1)
101 |                 wt.requires_grad = is_initial_wt_trainable
102 |         else:
103 |             self.wavetables = wavetables
104 |         
105 |         self.attention = nn.Parameter(torch.ones(n_wavetables,).cuda())
106 |         self.sr = sr
107 |         self.block_size = block_size
108 |         self.attention_softmax = nn.Softmax(dim=0)
109 |         self.duration_secs = duration_secs
110 | 
111 |     def forward(self, pitch, amplitude):
112 |         output_waveform_lst = []
113 |         for wt_idx in range(len(self.wavetables)):
114 |             wt = self.wavetables[wt_idx]
115 |             if wt_idx not in [0, 1, 2, 3]:
116 |                 wt = nn.Tanh()(wt)  # ensure wavetable range is between [-1, 1]
117 |             waveform = wavetable_osc(wt, pitch, self.sr)
118 |             output_waveform_lst.append(waveform)
119 | 
120 |         # apply attention 
121 |         attention = self.attention_softmax(self.attention)
122 |         attention_output = attention
123 |         attention = torch.stack(100 * self.duration_secs * [attention], dim=-1)
124 |         attention_upsample = upsample(attention.unsqueeze(-1), self.block_size).squeeze()
125 | 
126 |         output_waveform = torch.stack(output_waveform_lst, dim=1)
127 |         output_waveform = output_waveform * attention_upsample
128 |         output_waveform_after = torch.sum(output_waveform, dim=1)
129 |       
130 |         output_waveform_after = output_waveform_after.unsqueeze(-1)
131 |         output_waveform_after = output_waveform_after * amplitude
132 |        
133 |         return output_waveform_after, attention_output
134 | 
135 | 
136 | class WavetableSynthV2(nn.Module):
137 |     """
138 |     take wavetable as input, not model parameters
139 |     """
140 |     def __init__(self,
141 |                  sr=44100,
142 |                  duration_secs=4,
143 |                  block_size=160,
144 |                  enable_amplitude=True):
145 |         """
146 |         Turn on smoothing to reduce noise in learnt wavetables.
147 |         Smoothing takes in a 0-1 value, which is window size ratio w.r.t. wavetable length
148 |         Also a max_smooth_window_size is specified
149 |         """
150 |         super(WavetableSynthV2, self).__init__()       
151 |         self.sr = sr
152 |         self.block_size = block_size
153 |         self.duration_secs = duration_secs
154 |         self.enable_amplitude = enable_amplitude
155 | 
156 |     def forward(self, pitch, amplitude, wavetables, attention):
157 |         """
158 |         batch size version
159 |         input:
160 |         wavetables: (bs, n_wavetables, wavetable_len), -1 to 1
161 |         attention: softmax-ed, (bs, n_wavetables,)
162 |         smoothing_coeff: (bs, ), 0 to 1
163 | 
164 |         output:
165 |         (bs, dur * sr)
166 |         """
167 |         output_waveform_lst = []
168 |         for wt_idx in range(wavetables.shape[1]):
169 |             wt = wavetables[:, wt_idx, :]
170 |             waveform = wavetable_osc_v2(wt, pitch, self.sr)
171 | 
172 |             output_waveform_lst.append(waveform)
173 | 
174 |         # apply attention 
175 |         attention_upsample = torch.stack(100 * self.duration_secs * [attention], dim=-1)
176 |         attention_upsample = upsample(torch.permute(attention_upsample, (1, 2, 0)), self.block_size)
177 |         if (attention_upsample.shape[0] != 1):
178 |             attention_upsample = attention_upsample.squeeze()   # TODO: a little hacky code here, need to remove
179 |         attention_upsample = torch.permute(attention_upsample, (2, 0, 1))
180 | 
181 |         output_waveform = torch.stack(output_waveform_lst, dim=1)
182 |         output_waveform = output_waveform * attention_upsample
183 |         output_waveform_after = torch.sum(output_waveform, dim=1)
184 |       
185 |         output_waveform_after = output_waveform_after.unsqueeze(-1)
186 |         if self.enable_amplitude:
187 |             output_waveform_after = output_waveform_after * amplitude
188 |        
189 |         return output_waveform_after, attention
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     # create a sine wavetable and to a simple synthesis test
194 |     wavetable_len = 512
195 |     sr = 16000
196 |     duration = 4
197 |     freq_t_1 = [739.99 for _ in range(sr)] + [523.25 for _ in range(sr)] + [349.23 for _ in range(sr * 2)]
198 |     freq_t_1 = torch.tensor(freq_t_1)
199 |     freq_t_2 = [523.25 for _ in range(sr)] + [349.23 for _ in range(sr)] + [739.99 for _ in range(sr * 2)]
200 |     freq_t_2 = torch.tensor(freq_t_2)
201 |     freq_t_3 = [349.23 for _ in range(sr)] + [739.99 for _ in range(sr)] + [523.25 for _ in range(sr * 2)]
202 |     freq_t_3 = torch.tensor(freq_t_3)
203 | 
204 |     pitch, onset_frames, times = np.load("pitch.npy"), np.load("onset.npy"), np.load("times.npy")
205 |     pitch = torch.tensor(pitch)
206 |     pitch = upsample(pitch.unsqueeze(-1).unsqueeze(0), 160).squeeze()
207 | 
208 |     freq_t = torch.stack([pitch, pitch, pitch], dim=0)
209 |     sine_wavetable = generate_wavetable(wavetable_len, np.sin)
210 |     from utils import sawtooth_waveform
211 |     saw_wavetable = generate_wavetable(wavetable_len, sawtooth_waveform)
212 |     square_wavetable = generate_wavetable(wavetable_len, square_waveform)
213 | 
214 |     wavetable = torch.stack([sine_wavetable, saw_wavetable, square_wavetable], dim=0)
215 | 
216 |     # test batch wavetable_osc
217 |     signal = wavetable_osc_v2(wavetable, freq_t, sr)
218 | 
219 |     # test with adsr
220 |     shaper = ADSREnvelopeShaper()
221 |     adsr = get_amp_shaper(shaper, times[onset_frames], 
222 |                             attack_secs=torch.tensor([0.00]),
223 |                             decay_secs=torch.tensor([0.05]),
224 |                             sustain_level=torch.tensor([0.0]))
225 |     if adsr.shape[0] < 400:
226 |         append_tensor = torch.tensor([adsr[-1]] * (400 - adsr.shape[0]))
227 |         adsr = torch.cat([adsr, append_tensor], dim=-1)
228 |     else:
229 |         adsr = adsr[:400]
230 |     adsr = upsample(adsr.unsqueeze(-1).unsqueeze(0), 160).squeeze()
231 | 
232 |     signal = signal * adsr
233 | 
234 |     sf.write('test_3s_v1.wav', signal.squeeze()[0].detach().numpy(), sr, 'PCM_24')
235 |     sf.write('test_3s_v2.wav', signal.squeeze()[1].detach().numpy(), sr, 'PCM_24')
236 |     sf.write('test_3s_v3.wav', signal.squeeze()[2].detach().numpy(), sr, 'PCM_24')


--------------------------------------------------------------------------------
/syntheon/inferencer/vital/vital_inferencer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Vital inferencer.
  3 | """
  4 | from syntheon.inferencer.inferencer import Inferencer, InferenceInput, InferenceOutput
  5 | from syntheon.inferencer.vital.models.model import WTSv2
  6 | from syntheon.inferencer.vital.models.preprocessor import *
  7 | from syntheon.inferencer.vital.models.core import multiscale_fft
  8 | from syntheon.converter.vital.vital_constants import N_WAVETABLES, CUSTOM_KEYS
  9 | import yaml 
 10 | import torch
 11 | import numpy as np
 12 | import json
 13 | 
 14 | with open(
 15 |         os.path.join(
 16 |             os.path.dirname(os.path.realpath(__file__)),
 17 |             "config.yaml"
 18 |         ), 'r'
 19 |     ) as stream:
 20 |     config = yaml.safe_load(stream)
 21 | 
 22 | # general parameters
 23 | sr = config["common"]["sampling_rate"]
 24 | block_size = config["common"]["block_size"]
 25 | duration_secs = config["common"]["duration_secs"]
 26 | batch_size = config["train"]["batch_size"]
 27 | scales = config["train"]["scales"]
 28 | overlap = config["train"]["overlap"]
 29 | hidden_size = config["train"]["hidden_size"]
 30 | n_harmonic = config["train"]["n_harmonic"]
 31 | n_bands = config["train"]["n_bands"]
 32 | n_wavetables = config["train"]["n_wavetables"]
 33 | n_mfcc = config["train"]["n_mfcc"]
 34 | train_lr = config["train"]["start_lr"]
 35 | visualize = config["visualize"]
 36 | device = config["device"]
 37 | signal_length = sr * 4
 38 | 
 39 | 
 40 | class VitalInferenceOutput(InferenceOutput):
 41 |     def __init__(self):
 42 |         InferenceOutput.__init__(self)
 43 |         self.wt_output = None           # TODO: can put default values here
 44 |         self.attention_output = None
 45 |         self.attack = None
 46 |         self.decay = None
 47 |         self.sustain = None
 48 | 
 49 | 
 50 | class VitalInferenceInput(InferenceInput):
 51 |     def __init__(self):
 52 |         self.y = None
 53 |         self.pitch = None
 54 |         self.loudness = None
 55 |         self.times = None
 56 |         self.onset_frames = None
 57 |         self.mfcc = None
 58 | 
 59 | 
 60 | class VitalInferencer(Inferencer):
 61 |     def convert(self, audio_fname, model_pt_fname=None, enable_eval=False):
 62 |         # TODO: switch to torchhub
 63 |         if model_pt_fname is None:
 64 |             model_pt_fname = os.path.join(
 65 |                 os.path.dirname(os.path.realpath(__file__)),
 66 |                 "checkpoints/model.pt"
 67 |             )
 68 |         
 69 |         y, pitch, loudness, times, onset_frames, mfcc = preprocess(audio_fname, sampling_rate=16000, block_size=160, 
 70 |                                                                    signal_length=signal_length)
 71 |         inference_input = VitalInferenceInput()
 72 |         inference_input.y = y
 73 |         inference_input.pitch = pitch
 74 |         inference_input.loudness = loudness
 75 |         inference_input.times = times
 76 |         inference_input.onset_frames = onset_frames
 77 |         inference_input.mfcc = mfcc
 78 | 
 79 |         model = self.load_model(model_pt_fname, self.device)
 80 |         inference_output = self.inference(model, inference_input, self.device, enable_eval=enable_eval)
 81 |         synth_params_dict = self.convert_to_preset(inference_output)
 82 |         return synth_params_dict, inference_output.eval_dict
 83 | 
 84 |     def load_model(self, model_pt_fname, device="cuda"):
 85 |         model = WTSv2(hidden_size=hidden_size, n_harmonic=n_harmonic, n_bands=n_bands, sampling_rate=sr,
 86 |                 block_size=block_size,  mode="wavetable", 
 87 |                 duration_secs=4, num_wavetables=1, wavetable_smoothing=False, preload_wt=True, enable_amplitude=False,
 88 |                 is_round_secs=False, device=device)
 89 |         if device == "cuda":
 90 |             model.load_state_dict(torch.load(model_pt_fname))
 91 |             model.cuda()
 92 |         else:
 93 |             model.load_state_dict(torch.load(model_pt_fname, map_location=torch.device('cpu')))
 94 |         model.eval()        
 95 |         return model
 96 |     
 97 |     def inference(self, model, inference_input, device="cuda", enable_eval=False):
 98 |         if device == "cuda":
 99 |             inference_input.y = inference_input.y.cuda()
100 |             inference_input.mfcc = inference_input.mfcc.cuda()
101 |             inference_input.pitch = inference_input.pitch.cuda()
102 |             inference_input.loudness = inference_input.loudness.cuda()
103 | 
104 |         # forward pass
105 |         with torch.no_grad():
106 |             _, adsr, output, attention_output, wavetables, _, _ = model(
107 |                 inference_input.y, 
108 |                 inference_input.mfcc, 
109 |                 inference_input.pitch, 
110 |                 inference_input.loudness, 
111 |                 inference_input.times, 
112 |                 inference_input.onset_frames
113 |             )
114 | 
115 |         # write wavetables to numpy file
116 |         wt_output = []
117 | 
118 |         # interp from 512 to 2048
119 |         output_length = 2048
120 |         for i in range(N_WAVETABLES):
121 |             wt = wavetables[i].cpu().detach().numpy().squeeze()
122 |             wt_interp = np.interp(
123 |                 np.linspace(0, 1, output_length, endpoint=False),
124 |                 np.linspace(0, 1, wt.shape[0], endpoint=False),
125 |                 wt,
126 |             )
127 |             wt_output.append(wt_interp)
128 | 
129 |         wt_output = np.stack(wt_output, axis=0)
130 |         attention_output = attention_output.cpu().detach().numpy().squeeze()
131 | 
132 |         inference_output = VitalInferenceOutput()
133 |         inference_output.wt_output = wt_output
134 |         inference_output.attention_output = attention_output
135 |         inference_output.attack = adsr[0][0].cpu().detach().numpy().squeeze().item()
136 |         inference_output.decay = adsr[1][0].cpu().detach().numpy().squeeze().item()
137 |         inference_output.sustain = adsr[2][0].cpu().detach().numpy().squeeze().item()
138 | 
139 |         if enable_eval:
140 |             self.eval(inference_input.y, output, inference_output)
141 | 
142 |         return inference_output
143 |     
144 |     def convert_to_preset(self, inference_output):
145 |         with open(
146 |             os.path.join(
147 |                 os.path.dirname(os.path.realpath(__file__)),
148 |                 "init.vital"
149 |             ), 'r'
150 |         ) as f:
151 |             x = json.load(f)
152 | 
153 |         x[CUSTOM_KEYS] = {}
154 |         x[CUSTOM_KEYS]["wavetables"] = []
155 |         for idx in range(N_WAVETABLES):
156 |             cur_dict = {
157 |                 "name": "Litmus WT {}".format(idx + 1),
158 |                 "wavetable": inference_output.wt_output[idx],
159 |                 "osc_level": inference_output.attention_output[idx].item()
160 |             }
161 |             x[CUSTOM_KEYS]["wavetables"].append(cur_dict)
162 |             x[CUSTOM_KEYS]["adsr"] = {}
163 |             x[CUSTOM_KEYS]["adsr"]["attack"] = inference_output.attack
164 |             x[CUSTOM_KEYS]["adsr"]["attack_power"] = 0.0
165 |             x[CUSTOM_KEYS]["adsr"]["decay"] = inference_output.decay
166 |             x[CUSTOM_KEYS]["adsr"]["decay_power"] = 0.0
167 |             x[CUSTOM_KEYS]["adsr"]["sustain"] = inference_output.sustain
168 | 
169 |         return x
170 |     
171 |     def eval(self, y, output, inference_output):
172 |         ori_stft = multiscale_fft(
173 |                     y[0].squeeze(),
174 |                     scales,
175 |                     overlap,
176 |                 )
177 |         rec_stft = multiscale_fft(
178 |             output[0].squeeze(),
179 |             scales,
180 |             overlap,
181 |         )
182 | 
183 |         loss = 0
184 |         for s_x, s_y in zip(ori_stft, rec_stft): 
185 |             lin_loss = ((s_x - s_y).abs()).mean()
186 |             loss += lin_loss
187 |         
188 |         inference_output.eval_dict["loss"] = loss.item()
189 |         inference_output.eval_dict["output"] = output[0].cpu().detach().numpy().squeeze()
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     vital_inferencer = VitalInferencer(device="cpu")
194 |     params, eval_dict = vital_inferencer.convert("test/test_audio/vital_test_audio_2.wav", enable_eval=True)
195 | 
196 |     from syntheon.converter.vital.vital_converter import VitalConverter
197 |     vital_converter = VitalConverter()
198 |     vital_converter.dict = params
199 |     vital_converter.parseToPluginFile("vital_output.vital")
200 | 
201 |     


--------------------------------------------------------------------------------
/syntheon/main.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Function APIs to be called externally.
 3 | """
 4 | from .converter.vital.vital_converter import VitalConverter
 5 | from .inferencer.vital.vital_inferencer import VitalInferencer
 6 | 
 7 | 
 8 | obj_dict = {
 9 |     "vital": {
10 |         "converter": VitalConverter,
11 |         "inferencer": VitalInferencer,
12 |         "file_ext": "vital"
13 |     }
14 | }
15 | 
16 | def infer_params(input_audio_name, synth_name, enable_eval=False):
17 |     if synth_name not in obj_dict:
18 |         raise ValueError("Synth name {} not available for parameter inference".format(synth_name))
19 |     
20 |     inferencer = obj_dict[synth_name]["inferencer"](device="cpu")
21 |     params, eval_dict = inferencer.convert(input_audio_name, enable_eval=enable_eval)
22 | 
23 |     converter = obj_dict[synth_name]["converter"]()
24 |     converter.dict = params
25 |     output_fname = "{}_output.{}".format(synth_name, obj_dict[synth_name]["file_ext"])
26 |     converter.parseToPluginFile(output_fname)
27 | 
28 |     return output_fname, eval_dict


--------------------------------------------------------------------------------
/syntheon/utils/pitch_extractor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | [WIP] Common class for pitch extraction across all synthesizers.
 3 | """
 4 | 
 5 | import numpy as np
 6 | import os
 7 | from torchcrepeV2 import ONNXTorchCrepePredictor
 8 | import yaml 
 9 | 
10 | with open(
11 |     os.path.join(
12 |         os.path.dirname(os.path.realpath(__file__)),
13 |         "../inferencer/vital/config.yaml"
14 |     ), 'r'
15 | ) as stream:
16 |     config = yaml.safe_load(stream)
17 | 
18 | 
19 | crepe_predictor = ONNXTorchCrepePredictor()
20 | 
21 | 
22 | def extract_pitch(signal, sampling_rate, block_size, model_capacity="full"):
23 |     length = signal.shape[-1] // block_size
24 |     f0 = crepe_predictor.predict(
25 |         audio=signal, 
26 |         sr=sampling_rate, 
27 |         viterbi=True, 
28 |         center=True, 
29 |         step_size=int(1000 * block_size / sampling_rate),
30 |     )
31 | 
32 |     if f0.shape[-1] != length:
33 |         f0 = np.interp(
34 |             np.linspace(0, 1, length, endpoint=False),
35 |             np.linspace(0, 1, f0.shape[-1], endpoint=False),
36 |             f0,
37 |         )
38 | 
39 |     return f0


--------------------------------------------------------------------------------
/syntheon/version.py:
--------------------------------------------------------------------------------
1 | version = "0.1.0"


--------------------------------------------------------------------------------
/test/test_audio/dexed_test_audio_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/dexed_test_audio_1.wav


--------------------------------------------------------------------------------
/test/test_audio/vital_test_pluck_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_pluck_1.wav


--------------------------------------------------------------------------------
/test/test_audio/vital_test_pluck_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_pluck_2.wav


--------------------------------------------------------------------------------
/test/test_audio/vital_test_synth_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_synth_1.wav


--------------------------------------------------------------------------------
/test/test_audio/vital_test_synth_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_synth_2.wav


--------------------------------------------------------------------------------
/test/test_audio/vital_test_synth_3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_synth_3.wav


--------------------------------------------------------------------------------
/test/test_audio/vital_test_wonky_bass_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_wonky_bass_1.wav


--------------------------------------------------------------------------------
/test/test_inferencer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import glob
 3 | from syntheon import infer_params
 4 | 
 5 | 
 6 | # def test_dexed_inferencer():
 7 | #     """
 8 | #     just check if everything runs well for Dexed
 9 | #     """
10 | #     output_params_file, eval_dict = infer_params(
11 | #         "test/test_audio/dexed_test_audio_1.wav", 
12 | #         "dexed", 
13 | #         enable_eval=True
14 | #     )
15 | #     assert os.path.exists(output_params_file)
16 | 
17 | #     os.remove(output_params_file)
18 | 
19 | 
20 | def test_vital_inferencer_1():
21 |     """
22 |     just check if everything runs well for Vital
23 |     """
24 |     loss_lst = [0.11, 0.06, 0.37, 0.42, 0.18, 0.15]
25 |     audios = sorted(glob.glob("test/test_audio/vital_*.wav"))
26 |     for i in range(len(audios)):
27 |         output_params_file, eval_dict = infer_params(
28 |             audios[i],
29 |             "vital", 
30 |             enable_eval=True
31 |         )
32 |         assert os.path.exists(output_params_file)
33 |         assert eval_dict["loss"] < loss_lst[i]
34 |         os.remove(output_params_file)


--------------------------------------------------------------------------------