├── .github └── workflows │ └── python-package.yml ├── LICENSE ├── README.md ├── Syntheon_Demo.ipynb ├── docs └── syntheon-logo.png ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py ├── syntheon ├── __init__.py ├── converter │ ├── __init__.py │ ├── converter.py │ ├── dexed │ │ ├── __init__.py │ │ ├── dexed_constants.py │ │ └── dexed_converter.py │ └── vital │ │ ├── __init__.py │ │ ├── vital_constants.py │ │ └── vital_converter.py ├── inferencer │ ├── __init__.py │ ├── dexed │ │ ├── Dexed_01.syx │ │ ├── __init__.py │ │ ├── checkpoints │ │ │ └── state_best.pth │ │ ├── dexed_inferencer.py │ │ └── models │ │ │ ├── __init__.py │ │ │ ├── amp_utils.py │ │ │ ├── conf │ │ │ ├── __init__.py │ │ │ ├── data_config.yaml │ │ │ └── recipes │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── config.yaml │ │ │ │ ├── hyperparams │ │ │ │ ├── ddx7.yaml │ │ │ │ └── hpn.yaml │ │ │ │ └── model │ │ │ │ ├── __init__.py │ │ │ │ ├── hpn_baseline.yaml │ │ │ │ ├── tcnres_f0ld_fm1stack2.yaml │ │ │ │ ├── tcnres_f0ld_fm1stack4.yaml │ │ │ │ ├── tcnres_f0ld_fm2stack2.yaml │ │ │ │ ├── tcnres_f0ld_fmablbrass.yaml │ │ │ │ ├── tcnres_f0ld_fmablflute.yaml │ │ │ │ ├── tcnres_f0ld_fmbrss.yaml │ │ │ │ ├── tcnres_f0ld_fmflt.yaml │ │ │ │ ├── tcnres_f0ld_fmstr.yaml │ │ │ │ └── tcnres_f0ld_fmstr_noreverb.yaml │ │ │ ├── ddx7 │ │ │ ├── __init__.py │ │ │ ├── core.py │ │ │ ├── data_utils │ │ │ │ ├── __init__.py │ │ │ │ ├── h5_dataset.py │ │ │ │ └── preprocessor.py │ │ │ ├── loss_functions.py │ │ │ ├── models.py │ │ │ ├── spectral_ops.py │ │ │ └── synth.py │ │ │ └── preprocessor.py │ ├── inferencer.py │ └── vital │ │ ├── __init__.py │ │ ├── checkpoints │ │ ├── __init__.py │ │ └── model.pt │ │ ├── config.yaml │ │ ├── init.vital │ │ ├── models │ │ ├── adsr_envelope.py │ │ ├── core.py │ │ ├── model.py │ │ ├── preprocessor.py │ │ ├── utils.py │ │ └── wavetable_synth.py │ │ └── vital_inferencer.py ├── main.py ├── utils │ └── pitch_extractor.py └── version.py └── test ├── test_audio ├── dexed_test_audio_1.wav ├── vital_test_pluck_1.wav ├── vital_test_pluck_2.wav ├── vital_test_synth_1.wav ├── vital_test_synth_2.wav ├── vital_test_synth_3.wav └── vital_test_wonky_bass_1.wav └── test_inferencer.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: syntheon 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Test with pytest 33 | run: | 34 | python -m pytest -s 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2022 Syntheon 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![syntheon_logo](docs/syntheon-logo.png) 2 | 3 | # Syntheon 4 | 5 | Syntheon - [Pantheon](https://en.wikipedia.org/wiki/Pantheon,_Rome) for music synthesizers. 6 | 7 | Syntheon provides **parameter inference** for music synthesizers using *deep learning models*. Given an audio sample, Syntheon infers the best parameter preset for a given synthesizer that can recreate the audio sample. 8 | 9 | **Check out [this presentation](https://docs.google.com/presentation/d/1PA4fom6QvCW_YG8L0MMVumrAluljcymndNlaK2HW5t0/edit?usp=sharing) on the recent advances of synth parameter inference. 10 | 11 | For now: 12 | - :heavy_check_mark: [Vital](https://vital.audio/) is supported 13 | - [Dexed](https://asb2m10.github.io/dexed/) is work-in-progress 14 | 15 | Try it out on [our Colab notebook demo](https://colab.research.google.com/github/gudgud96/syntheon/blob/main/Syntheon_Demo.ipynb). 16 | 17 | ## Installation 18 | 19 | ``` 20 | python3 -m pip install syntheon 21 | ``` 22 | 23 | ## Usage 24 | 25 | ```python 26 | from syntheon import infer_params 27 | 28 | output_params_file, eval_dict = infer_params( 29 | "your_audio.wav", 30 | "vital", 31 | enable_eval=True 32 | ) 33 | ``` 34 | 35 | ## Testing 36 | 37 | ``` 38 | python3 -m pytest 39 | ``` 40 | 41 | ## Structure 42 | 43 | For each synthesizer, we need to define: 44 | 45 | - **converter** for preset format conversion: 46 | - `serializeToDict`: convert preset file to a Python dictionary to be handled by inferencer 47 | - `parseToPluginFile`: convert Python dictionary back to preset file, to be loaded by the synthesizer 48 | 49 | - **inferencer** for model inference: 50 | - `convert`: define the workflow of `load_model` -> `inference` -> `convert_to_preset` 51 | 52 | ## Contribution 53 | 54 | Syntheon is actively under development, and contributions are welcomed. 
Some TODOs we have in mind include: 55 | 56 | - Replicating state-of-the-art approaches 57 | - Improving current model performance 58 | - Incorporating new synthesizers 59 | - Code refactoring 😅 60 | -------------------------------------------------------------------------------- /docs/syntheon-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/docs/syntheon-logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa==0.9.1 2 | torch==1.12.1 3 | torchvision==0.13.1 4 | torchaudio==0.12.1 5 | pyyaml 6 | mido 7 | nnAudio==0.3.1 8 | numpy 9 | bitstruct 10 | torchcrepeV2 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = syntheon 3 | version = attr: syntheon.version.version 4 | description = Parameter inference of music synthesizers to simplify sound design process. 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown; charset=UTF-8 7 | url = https://github.com/gudgud96/syntheon 8 | author = Hao Hao Tan 9 | author_email = helloharry66@gmail.com 10 | 11 | [options] 12 | packages = find: 13 | include_package_data = True 14 | install_requires = 15 | librosa==0.9.1 16 | torch==1.12.1 17 | torchvision==0.13.1 18 | torchaudio==0.12.1 19 | pyyaml 20 | mido 21 | nnAudio==0.3.1 22 | numpy 23 | bitstruct 24 | torchcrepeV2 25 | 26 | python_requires = >=3.7 27 | 28 | [options.package_data] 29 | * = inferencer/vital/checkpoints/model.pt, inferencer/vital/config.yaml, inferencer/vital/init.vital, inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml, inferencer/dexed/models/conf/recipes/models/conf/data_config.yaml -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | if __name__ == '__main__': 4 | setup() -------------------------------------------------------------------------------- /syntheon/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import infer_params -------------------------------------------------------------------------------- /syntheon/converter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/converter/__init__.py -------------------------------------------------------------------------------- /syntheon/converter/converter.py: -------------------------------------------------------------------------------- 1 | class SynthConverter: 2 | 3 | def __init__(self): 4 | self.dict = None 5 | self.keys = [] # keys that need to be filled for this synth 6 | 7 | def serializeToDict(self, fname): 8 | """ 9 | From plugin file to protobuf. 
10 | 11 | Args: 12 | fname - input file name 13 | """ 14 | return None 15 | 16 | def parseToPluginFile(self, fname): 17 | """ 18 | From protobuf to plugin file. 19 | 20 | Args: 21 | fname - output file name 22 | """ 23 | return None 24 | 25 | def printMessage(self): 26 | """ 27 | Print synth parameters. 28 | """ 29 | if self.dict: 30 | print(self.dict) 31 | 32 | else: 33 | raise ValueError("synth parameters not serialized yet") 34 | 35 | def keys(self): 36 | return self.keys 37 | 38 | def verify(self): 39 | """ 40 | Verify if params are valid. Used in serializeToDict method. 41 | """ 42 | if self.dict is None: 43 | raise ValueError("synth parameters not serialized yet") 44 | 45 | # value range checks can leave to derived classes 46 | for key in self.keys: 47 | if isinstance(self.dict, list): 48 | for elem in self.dict: 49 | if key not in elem: 50 | raise ValueError("specified key not in synth parameters: {}".format(key)) 51 | elif isinstance(self.dict, dict): 52 | if key not in elem: 53 | raise ValueError("specified key not in synth parameters: {}".format(key)) -------------------------------------------------------------------------------- /syntheon/converter/dexed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/converter/dexed/__init__.py -------------------------------------------------------------------------------- /syntheon/converter/dexed/dexed_constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | DX7 syx -> json constants. 3 | 4 | Credits to source: https://github.com/Nintorac/NeuralDX7/blob/master/scratch/dx7_constants.py 5 | DX7 specification: https://github.com/asb2m10/dexed/blob/master/Documentation/sysex-format.txt 6 | DX7 patches: DX7 patches: https://yamahablackboxes.com/collection/yamaha-dx7-synthesizer/patches/ 7 | """ 8 | import bitstruct 9 | 10 | N_OSC = 6 11 | N_VOICES = 32 12 | 13 | GLOBAL_VALID_RANGES = { 14 | 'PR1': range(0, 99+1), 15 | 'PR2': range(0, 99+1), 16 | 'PR3': range(0, 99+1), 17 | 'PR4': range(0, 99+1), 18 | 'PL1': range(0, 99+1), 19 | 'PL2': range(0, 99+1), 20 | 'PL3': range(0, 99+1), 21 | 'PL4': range(0, 99+1), 22 | 'ALG': range(0, 31+1), 23 | 'OKS': range(0, 1+1), 24 | 'FB': range(0, 7+1), 25 | 'LFS': range(0, 99+1), 26 | 'LFD': range(0, 99+1), 27 | 'LPMD': range(0, 99+1), 28 | 'LAMD': range(0, 99+1), 29 | 'LPMS': range(0, 7+1), 30 | 'LFW': range(0, 5+1), 31 | 'LKS': range(0, 1+1), 32 | 'TRNSP': range(0, 48+1), 33 | 'NAME CHAR 1': range(128), 34 | 'NAME CHAR 2': range(128), 35 | 'NAME CHAR 3': range(128), 36 | 'NAME CHAR 4': range(128), 37 | 'NAME CHAR 5': range(128), 38 | 'NAME CHAR 6': range(128), 39 | 'NAME CHAR 7': range(128), 40 | 'NAME CHAR 8': range(128), 41 | 'NAME CHAR 9': range(128), 42 | 'NAME CHAR 10': range(128), 43 | } 44 | 45 | OSCILLATOR_VALID_RANGES = { 46 | 'R1': range(0, 99+1), 47 | 'R2': range(0, 99+1), 48 | 'R3': range(0, 99+1), 49 | 'R4': range(0, 99+1), 50 | 'L1': range(0, 99+1), 51 | 'L2': range(0, 99+1), 52 | 'L3': range(0, 99+1), 53 | 'L4': range(0, 99+1), 54 | 'BP': range(0, 99+1), 55 | 'LD': range(0, 99+1), 56 | 'RD': range(0, 99+1), 57 | 'RC': range(0, 3+1), 58 | 'LC': range(0, 3+1), 59 | 'DET': range(0, 14+1), 60 | 'RS': range(0, 7+1), 61 | 'KVS': range(0, 7+1), 62 | 'AMS': range(0, 3+1), 63 | 'OL': range(0, 99+1), 64 | 'FC': range(0, 31+1), 65 | 'M': range(0, 1+1), 66 | 'FF': range(0, 99+1), 67 | } 68 | 69 | 
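# Illustrative sketch (hypothetical helper, not part of the original constants file):
# the two tables above give the valid integer range of every global and per-operator
# DX7 parameter, so a decoded parameter dictionary can be sanity-checked against them.
# `params` is assumed to map parameter names (e.g. 'R1', 'ALG') to integers.
def _example_out_of_range(params, ranges=OSCILLATOR_VALID_RANGES):
    """Return the names of parameters whose values fall outside their valid range."""
    return [key for key, valid in ranges.items()
            if key in params and params[key] not in valid]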
VOICE_PARAMETER_RANGES = {f'{i}_{key}': value for key, value in OSCILLATOR_VALID_RANGES.items() for i in range(N_OSC)} 70 | VOICE_PARAMETER_RANGES.update(GLOBAL_VALID_RANGES) 71 | 72 | 73 | HEADER_KEYS = [ 74 | 'ID', 75 | 'Sub-status', 76 | 'format number', 77 | 'byte count', 78 | 'byte count', 79 | ] 80 | 81 | GENERAL_KEYS = [ 82 | 'PR1', 83 | 'PR2', 84 | 'PR3', 85 | 'PR4', 86 | 'PL1', 87 | 'PL2', 88 | 'PL3', 89 | 'PL4', 90 | 'ALG', 91 | 'OKS', 92 | 'FB', 93 | 'LFS', 94 | 'LFD', 95 | 'LPMD', 96 | 'LAMD', 97 | 'LPMS', 98 | 'LFW', 99 | 'LKS', 100 | 'TRNSP', 101 | 'NAME CHAR 1', 102 | 'NAME CHAR 2', 103 | 'NAME CHAR 3', 104 | 'NAME CHAR 4', 105 | 'NAME CHAR 5', 106 | 'NAME CHAR 6', 107 | 'NAME CHAR 7', 108 | 'NAME CHAR 8', 109 | 'NAME CHAR 9', 110 | 'NAME CHAR 10', 111 | ] 112 | 113 | OSC_KEYS = [ 114 | 'R1', 115 | 'R2', 116 | 'R3', 117 | 'R4', 118 | 'L1', 119 | 'L2', 120 | 'L3', 121 | 'L4', 122 | 'BP', 123 | 'LD', 124 | 'RD', 125 | 'RC', 126 | 'LC', 127 | 'DET', 128 | 'RS', 129 | 'KVS', 130 | 'AMS', 131 | 'OL', 132 | 'FC', 133 | 'M', 134 | 'FF', 135 | ] 136 | 137 | FOOTER_KEYS = ['checksum'] 138 | 139 | 140 | VOICE_KEYS = [f'{i}_{key}' for i in range(6) for key in OSC_KEYS] + \ 141 | GENERAL_KEYS 142 | 143 | KEYS = HEADER_KEYS + \ 144 | list(VOICE_KEYS * N_VOICES) + \ 145 | FOOTER_KEYS 146 | 147 | header_bytes = [ 148 | 'p1u7', # ID # (i=67; Yamaha) 149 | 'p1u7', # Sub-status (s=0) & channel number (n=0; ch 1) 150 | 'p1u7', # format number (f=9; 32 voices) 151 | 'p1u7', # byte count MS byte 152 | 'p1u7', # byte count LS byte (b=4096; 32 voices) 153 | ] 154 | 155 | general_parameter_bytes = [ 156 | 'p1u7', # PR1 157 | 'p1u7', # PR2 158 | 'p1u7', # PR3 159 | 'p1u7', # PR4 160 | 'p1u7', # PL1 161 | 'p1u7', # PL2 162 | 'p1u7', # PL3 163 | 'p1u7', # PL4 164 | 'p3u5', # ALG 165 | 'p4u1u3', # OKS| FB 166 | 'p1u7', # LFS 167 | 'p1u7', # LFD 168 | 'p1u7', # LPMD 169 | 'p1u7', # LAMD 170 | 'p1u3u3u1', # LPMS | LFW |LKS 171 | 'p1u7', # TRNSP 172 | 'p1u7', # NAME CHAR 1 173 | 'p1u7', # NAME CHAR 2 174 | 'p1u7', # NAME CHAR 3 175 | 'p1u7', # NAME CHAR 4 176 | 'p1u7', # NAME CHAR 5 177 | 'p1u7', # NAME CHAR 6 178 | 'p1u7', # NAME CHAR 7 179 | 'p1u7', # NAME CHAR 8 180 | 'p1u7', # NAME CHAR 9 181 | 'p1u7', # NAME CHAR 10 182 | ] 183 | 184 | osc_parameter_bytes = [ 185 | 'p1u7', # R1 186 | 'p1u7', # R2 187 | 'p1u7', # R3 188 | 'p1u7', # R4 189 | 'p1u7', # L1 190 | 'p1u7', # L2 191 | 'p1u7', # L3 192 | 'p1u7', # L4 193 | 'p1u7', # BP 194 | 'p1u7', # LD 195 | 'p1u7', # RD 196 | 'p4u2u2', # RC | LC 197 | 'p1u4u3', # DET | RS 198 | 'p3u3u2', # KVS | AMS 199 | 'p1u7', # OL 200 | 'p2u5u1', # FC | M 201 | 'p1u7' # FF 202 | ] 203 | 204 | voice_bytes = (osc_parameter_bytes * N_OSC) + general_parameter_bytes 205 | 206 | tail_bytes = [ 207 | 'p1u7', # checksum 208 | ] 209 | 210 | full_string = ''.join(header_bytes + osc_parameter_bytes * 6 + general_parameter_bytes) 211 | dx7_struct = bitstruct.compile(full_string) 212 | 213 | voice_struct = bitstruct.compile(''.join(voice_bytes), names=VOICE_KEYS) 214 | header_struct = bitstruct.compile(''.join(header_bytes)) -------------------------------------------------------------------------------- /syntheon/converter/dexed/dexed_converter.py: -------------------------------------------------------------------------------- 1 | from syntheon.converter.converter import SynthConverter 2 | import mido 3 | from pathlib import Path 4 | from syntheon.converter.dexed.dexed_constants import voice_struct, VOICE_PARAMETER_RANGES, header_struct,\ 5 | header_bytes, voice_bytes, N_VOICES, 
N_OSC, KEYS 6 | 7 | 8 | def take(take_from, n): 9 | for _ in range(n): 10 | yield next(take_from) 11 | 12 | 13 | def checksum(data): 14 | return (128-sum(data)&127)%128 15 | 16 | 17 | class DexedConverter(SynthConverter): 18 | def __init__(self): 19 | SynthConverter.__init__(self) 20 | self.keys = KEYS 21 | 22 | def serializeToDict(self, fname): 23 | path = Path(fname).expanduser() 24 | try: 25 | preset = mido.read_syx_file(path.as_posix())[0] 26 | except IndexError as e: 27 | return None 28 | except ValueError as e: 29 | return None 30 | if len(preset.data) == 0: 31 | return None 32 | 33 | def get_voice(data): 34 | unpacked = voice_struct.unpack(data) 35 | # TODO: need to take actions after verify, skip for now 36 | # self.verify(unpacked, VOICE_PARAMETER_RANGES) 37 | return unpacked 38 | 39 | get_header = header_struct.unpack 40 | sysex_iter = iter(preset.data) 41 | lst = [] 42 | try: 43 | header = get_header(bytes(take(sysex_iter, len(header_bytes)))) 44 | for idx in range(N_VOICES): 45 | x = get_voice(bytes(take(sysex_iter, len(voice_bytes)))) 46 | lst.append(x) 47 | 48 | self.dict = lst 49 | return lst 50 | except RuntimeError: 51 | return None 52 | 53 | def parseToPluginFile(self, fname): 54 | def encode_head(): 55 | header = [ '0x43', 56 | '0x00', 57 | '0x09', 58 | '0x20', 59 | '0x00',] 60 | 61 | return [int(i, 0) for i in header] 62 | 63 | def encode_osc(params, n): 64 | oscillator_params = [] 65 | 66 | oscillator_params += [params[f'{n}_R1']] 67 | oscillator_params += [params[f'{n}_R2']] 68 | oscillator_params += [params[f'{n}_R3']] 69 | oscillator_params += [params[f'{n}_R4']] 70 | oscillator_params += [params[f'{n}_L1']] 71 | oscillator_params += [params[f'{n}_L2']] 72 | oscillator_params += [params[f'{n}_L3']] 73 | oscillator_params += [params[f'{n}_L4']] 74 | oscillator_params += [params[f'{n}_BP']] 75 | oscillator_params += [params[f'{n}_LD']] 76 | oscillator_params += [params[f'{n}_RD']] 77 | 78 | RC = params[f'{n}_RC'] << 2 79 | LC = params[f'{n}_LC'] 80 | oscillator_params += [RC | LC] 81 | 82 | DET = params[f'{n}_DET'] << 3 83 | RS = params[f'{n}_RS'] 84 | oscillator_params += [DET | RS] 85 | 86 | KVS = params[f'{n}_KVS'] << 2 87 | AMS = params[f'{n}_AMS'] 88 | oscillator_params += [KVS|AMS] 89 | oscillator_params += [params[f'{n}_OL']] 90 | 91 | FC = params[f'{n}_FC'] << 1 92 | M = params[f'{n}_M'] 93 | oscillator_params += [FC|M] 94 | oscillator_params += [params[f'{n}_FF']] 95 | 96 | return oscillator_params 97 | 98 | def encode_global(params): 99 | global_params = [] 100 | global_params += [params['PR1']] 101 | global_params += [params['PR2']] 102 | global_params += [params['PR3']] 103 | global_params += [params['PR4']] 104 | global_params += [params['PL1']] 105 | global_params += [params['PL2']] 106 | global_params += [params['PL3']] 107 | global_params += [params['PL4']] 108 | 109 | global_params += [params['ALG']] 110 | 111 | OKS = params['OKS'] << 3 112 | FB = params['FB'] 113 | 114 | global_params += [OKS|FB] 115 | global_params += [params['LFS']] 116 | global_params += [params['LFD']] 117 | global_params += [params['LPMD']] 118 | global_params += [params['LAMD']] 119 | 120 | LPMS = params['LPMS'] << 4 121 | LFW = params['LFW'] << 1 122 | LKS = params['LKS'] 123 | global_params += [LPMS | LFW | LKS] 124 | global_params += [params['TRNSP']] 125 | global_params += [params[f'NAME CHAR {i + 1}'] for i in range(10)] 126 | 127 | return global_params 128 | 129 | try: 130 | head = encode_head() 131 | 132 | data = [] 133 | assert len(self.dict) == N_VOICES 134 | 135 | # 
voices 136 | last_params = None 137 | for params in self.dict: 138 | if len(params.keys()) == 0: 139 | params = last_params 140 | else: 141 | last_params = params 142 | for osc in range(N_OSC): 143 | data += encode_osc(params, osc) 144 | 145 | data += encode_global(params) 146 | 147 | 148 | this_checksum = checksum(data) 149 | output = [*head, *data, this_checksum] 150 | 151 | message = mido.Message('sysex', data=output) 152 | mido.write_syx_file(fname, [message]) 153 | return 0 154 | 155 | except Exception as e: 156 | print(str(e)) 157 | return -1 158 | 159 | def verify(self, actual, ranges): 160 | super().verify() 161 | assert set(actual.keys())==set(ranges.keys()), "Params don't match" 162 | for key in actual: 163 | if not actual[key] in ranges[key]: 164 | print("returning false", key, actual[key]) 165 | return False 166 | return True 167 | 168 | 169 | if __name__ == "__main__": 170 | dx_converter = DexedConverter() 171 | dx_converter.serializeToDict("Dexed_01.syx") 172 | dx_converter.printMessage() 173 | dx_converter.parseToPluginFile("testing.syx") 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /syntheon/converter/vital/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/converter/vital/__init__.py -------------------------------------------------------------------------------- /syntheon/converter/vital/vital_constants.py: -------------------------------------------------------------------------------- 1 | N_WAVETABLES = 1 2 | 3 | CUSTOM_KEYS = "vital_converter" -------------------------------------------------------------------------------- /syntheon/converter/vital/vital_converter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import base64 3 | import struct 4 | from syntheon.converter.converter import SynthConverter 5 | from syntheon.converter.vital.vital_constants import N_WAVETABLES, CUSTOM_KEYS 6 | import numpy as np 7 | import math 8 | 9 | 10 | class Base64Converter: 11 | def __init__(self): 12 | pass 13 | 14 | def encode(self, signal): 15 | signal_bytes = struct.pack('{}f'.format(len(signal)), *signal) 16 | base64_string = base64.b64encode(signal_bytes) 17 | 18 | return base64_string.decode('ascii') 19 | 20 | def decode(self, base64_string, output_length=2048): 21 | signal_bytes = base64.decodebytes(base64_string.encode('ascii')) 22 | arr = [k for k in struct.iter_unpack('f', signal_bytes)] # unpack as 4-byte floats 23 | arr = [k[0] for k in arr] # take the value out of each 1-element tuple 24 | 25 | return np.array(arr) 26 | 27 | 28 | class VitalConverter(SynthConverter): 29 | def __init__(self): 30 | SynthConverter.__init__(self) 31 | self.keys = [] 32 | self.base64_converter = Base64Converter() 33 | 34 | def serializeToDict(self, fname): 35 | try: 36 | with open(fname) as f: 37 | self.dict = json.load(f) 38 | 39 | # decode custom part 40 | self.dict[CUSTOM_KEYS] = {} 41 | self.dict[CUSTOM_KEYS]["wavetables"] = [] 42 | for idx in range(N_WAVETABLES): 43 | wavetable_str = self.dict["settings"]["wavetables"][idx]["groups"][0]["components"][0]["keyframes"][0]["wave_data"] 44 | wavetable_name = self.dict["settings"]["wavetables"][idx]["name"] 45 | wavetable_osc_level = self.dict["settings"]["osc_{}_level".format(idx + 1)] 46 | wavetable = self.base64_converter.decode(wavetable_str) # returns np.array 47 | cur_dict = { 48 | "name": wavetable_name,
"wavetable": wavetable, 50 | "osc_level": wavetable_osc_level 51 | } 52 | self.dict[CUSTOM_KEYS]["wavetables"].append(cur_dict) 53 | 54 | # switch off unused wavetables 55 | if N_WAVETABLES == 1: 56 | self.dict["settings"]["osc_2_on"] = 0.0 57 | self.dict["settings"]["osc_3_on"] = 0.0 58 | elif N_WAVETABLES == 2: 59 | self.dict["settings"]["osc_3_on"] = 0.0 60 | 61 | except Exception as e: 62 | print(str(e)) 63 | 64 | return self.dict 65 | 66 | def parseToPluginFile(self, fname): 67 | """ 68 | vital parameters value scale: https://github.com/mtytel/vital/blob/c0694a193777fc97853a598f86378bea625a6d81/src/common/synth_parameters.cpp 69 | value scale computation: https://github.com/mtytel/vital/blob/c0694a193777fc97853a598f86378bea625a6d81/src/plugin/value_bridge.h 70 | """ 71 | # encode custom part 72 | wavetables = self.dict[CUSTOM_KEYS]["wavetables"] 73 | for idx in range(N_WAVETABLES): 74 | wavetable = wavetables[idx]["wavetable"] 75 | wavetable_name = wavetables[idx]["name"] 76 | wavetable_osc_level = wavetables[idx]["osc_level"] 77 | 78 | wavetable_str = self.base64_converter.encode(wavetable) 79 | self.dict["settings"]["wavetables"][idx]["groups"][0]["components"][0]["keyframes"][0]["wave_data"] = wavetable_str 80 | self.dict["settings"]["wavetables"][idx]["name"] = wavetable_name 81 | self.dict["settings"]["osc_{}_level".format(idx + 1)] = wavetable_osc_level 82 | 83 | # switch off unused wavetables 84 | if N_WAVETABLES == 1: 85 | self.dict["settings"]["osc_2_on"] = 0.0 86 | self.dict["settings"]["osc_3_on"] = 0.0 87 | elif N_WAVETABLES == 2: 88 | self.dict["settings"]["osc_3_on"] = 0.0 89 | 90 | # adsr filter 91 | adsrs = self.dict[CUSTOM_KEYS]["adsr"] 92 | # attack is kQuartic 93 | self.dict["settings"]["env_1_attack"] = math.sqrt(math.sqrt(adsrs["attack"])) 94 | # attack power is kLinear 95 | self.dict["settings"]["env_1_attack_power"] = adsrs["attack_power"] 96 | # decay is kQuartic 97 | self.dict["settings"]["env_1_decay"] = math.sqrt(math.sqrt(adsrs["decay"])) 98 | # decay power is kLinear 99 | self.dict["settings"]["env_1_decay_power"] = adsrs["decay_power"] 100 | # sustain is kLinear 101 | self.dict["settings"]["env_1_sustain"] = adsrs["sustain"] 102 | 103 | # self.dict["settings"]["env_1_delay"] = adsrs["delay"] 104 | # self.dict["settings"]["env_1_hold"] = adsrs["hold"] 105 | # self.dict["settings"]["env_1_release"] = adsrs["release"] 106 | # self.dict["settings"]["env_1_release_power"] = adsrs["release_power"] 107 | # y["settings"]["lfos"] = x_init["settings"]["lfos"] 108 | 109 | del self.dict[CUSTOM_KEYS] 110 | 111 | with open(fname ,"w+") as f: 112 | json.dump(self.dict, f) -------------------------------------------------------------------------------- /syntheon/inferencer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/Dexed_01.syx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/Dexed_01.syx -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/checkpoints/state_best.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/checkpoints/state_best.pth -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/dexed_inferencer.py: -------------------------------------------------------------------------------- 1 | from syntheon.inferencer.inferencer import Inferencer, InferenceInput, InferenceOutput 2 | from syntheon.inferencer.dexed.models.preprocessor import ProcessData, F0LoudnessRMSPreprocessor 3 | from syntheon.inferencer.dexed.models.ddx7.models import DDSP_Decoder, TCNFMDecoder 4 | from syntheon.inferencer.dexed.models.ddx7.synth import FMSynth 5 | from syntheon.inferencer.dexed.models.amp_utils import * 6 | from syntheon.converter.dexed.dexed_converter import DexedConverter 7 | from syntheon.utils.pitch_extractor import extract_pitch 8 | import yaml 9 | import torch 10 | import librosa 11 | import soundfile as sf 12 | import pickle 13 | import os 14 | import numpy as np 15 | 16 | 17 | class DexedInferenceOutput(InferenceOutput): 18 | def __init__(self): 19 | InferenceOutput.__init__(self) 20 | self.synth_audio = None # TODO: can put default values here 21 | self.ol = None 22 | 23 | 24 | class DexedInferenceInput(InferenceInput): 25 | def __init__(self): 26 | self.x = None 27 | 28 | 29 | class DexedInferencer(Inferencer): 30 | def convert(self, audio_fname, model_pt_fname=None, enable_eval=False): 31 | # TODO: convert should be more like framework. 
preprocess -> load_model -> inference -> post_process 32 | if model_pt_fname is None: 33 | model_pt_fname = "syntheon/inferencer/dexed/checkpoints/state_best.pth" 34 | 35 | with open( 36 | os.path.join( 37 | os.path.dirname(os.path.realpath(__file__)), 38 | "models/conf/data_config.yaml" 39 | ), 'r' 40 | ) as f: 41 | data_config = yaml.safe_load(f) 42 | 43 | preprocessor = ProcessData( 44 | silence_thresh_dB=data_config["data_processor"]["silence_thresh_dB"], 45 | sr=data_config["data_processor"]["sr"], 46 | device=data_config["data_processor"]["device"], 47 | seq_len=data_config["data_processor"]["seq_len"], 48 | crepe_params=data_config["data_processor"]["crepe_params"], 49 | loudness_params=data_config["data_processor"]["loudness_params"], 50 | rms_params=data_config["data_processor"]["rms_params"], 51 | hop_size=data_config["data_processor"]["hop_size"], 52 | max_len=data_config["data_processor"]["max_len"], 53 | center=data_config["data_processor"]["center"] 54 | ) 55 | 56 | audio, _ = librosa.load(audio_fname, sr=data_config["data_processor"]["sr"]) 57 | 58 | f0 = extract_pitch(audio, data_config["data_processor"]["sr"], block_size=64) 59 | f0 = f0.astype(np.float32) 60 | loudness = preprocessor.calc_loudness(audio) 61 | rms = preprocessor.calc_rms(audio) 62 | 63 | scaler = F0LoudnessRMSPreprocessor() 64 | x = { 65 | "audio": torch.tensor(audio).unsqueeze(0).unsqueeze(-1), 66 | "f0": torch.tensor(f0).unsqueeze(0).unsqueeze(-1), 67 | "loudness": torch.tensor(loudness).unsqueeze(0).unsqueeze(-1), 68 | "rms": torch.tensor(rms).unsqueeze(0).unsqueeze(-1) 69 | } 70 | scaler.run(x) 71 | 72 | inference_input = DexedInferenceInput() 73 | inference_input.x = x 74 | 75 | model = self.load_model(model_pt_fname, self.device) 76 | inference_output = self.inference(model, inference_input, self.device, enable_eval=enable_eval) 77 | synth_params_dict = self.convert_to_preset(inference_output) 78 | return synth_params_dict, inference_output.eval_dict 79 | 80 | def load_model(self, model_pt_fname, device="cuda"): 81 | with open( 82 | os.path.join( 83 | os.path.dirname(os.path.realpath(__file__)), 84 | "models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml" 85 | ), 'r' 86 | ) as f: 87 | config = yaml.safe_load(f) 88 | 89 | # prepare model 90 | decoder = TCNFMDecoder(n_blocks=config["decoder"]["n_blocks"], 91 | hidden_channels=config["decoder"]["hidden_channels"], 92 | out_channels=config["decoder"]["out_channels"], 93 | kernel_size=config["decoder"]["kernel_size"], 94 | dilation_base=config["decoder"]["dilation_base"], 95 | apply_padding=config["decoder"]["apply_padding"], 96 | deploy_residual=config["decoder"]["deploy_residual"], 97 | input_keys=config["decoder"]["input_keys"]) 98 | 99 | synth = FMSynth(sample_rate=config["synth"]["sample_rate"], 100 | block_size=config["synth"]["block_size"], 101 | fr=config["synth"]["fr"], 102 | max_ol=config["synth"]["max_ol"], 103 | synth_module=config["synth"]["synth_module"], 104 | is_reverb=False) 105 | 106 | model = DDSP_Decoder(decoder, synth) 107 | if device == "cuda": 108 | model.load_state_dict(torch.load(model_pt_fname)) 109 | model.cuda() 110 | else: 111 | model.load_state_dict(torch.load(model_pt_fname, map_location=torch.device('cpu'))) 112 | model.eval() 113 | return model 114 | 115 | def inference(self, model, inference_input, device="cuda", enable_eval=False): 116 | if device == "cuda": 117 | inference_input.audio = inference_input.x["audio"].cuda() 118 | inference_input.f0 = inference_input.x["f0"].cuda() 119 | inference_input.loudness = 
inference_input.x["loudness"].cuda() 120 | inference_input.rms = inference_input.x["rms"].cuda() 121 | 122 | # forward pass 123 | synth_out = model(inference_input.x) 124 | 125 | inference_output = DexedInferenceOutput() 126 | inference_output.synth_audio = synth_out["synth_audio"] 127 | inference_output.ol = synth_out["ol"] 128 | 129 | return inference_output 130 | 131 | def convert_to_preset(self, inference_output): 132 | 133 | dx_converter = DexedConverter() 134 | params_dict = dx_converter.serializeToDict("syntheon/inferencer/dexed/Dexed_01.syx") 135 | 136 | lst = [] 137 | for idx in range(6): 138 | ol = inference_output.ol[0, :, idx] 139 | ol = ol.cpu().detach().numpy() 140 | ol = ol.reshape(-1, 5).mean(axis=1) 141 | 142 | # TODO: these are all hacky code... 143 | if (idx == 0 or idx == 2): 144 | ol = ol / 0.32 145 | 146 | lst.append(np.mean(ol)) 147 | 148 | lst = [amplitude_to_dexed_ol(k) for k in lst] 149 | 150 | params_dict[0]["5_OL"] = lst[0] 151 | params_dict[0]["4_OL"] = lst[1] 152 | params_dict[0]["3_OL"] = lst[2] 153 | params_dict[0]["2_OL"] = lst[3] 154 | params_dict[0]["1_OL"] = lst[4] 155 | params_dict[0]["0_OL"] = lst[5] 156 | params_dict[0]["NAME CHAR 1"] = 83 157 | params_dict[0]["NAME CHAR 2"] = 89 158 | params_dict[0]["NAME CHAR 3"] = 78 159 | params_dict[0]["NAME CHAR 4"] = 84 160 | params_dict[0]["NAME CHAR 5"] = 72 161 | params_dict[0]["NAME CHAR 6"] = 69 162 | params_dict[0]["NAME CHAR 7"] = 79 163 | params_dict[0]["NAME CHAR 8"] = 78 164 | params_dict[0]["NAME CHAR 9"] = 32 165 | params_dict[0]["NAME CHAR 10"] = 32 166 | 167 | return params_dict 168 | 169 | 170 | if __name__ == "__main__": 171 | # TODO: move to test folder 172 | dexed_inferencer = DexedInferencer(device="cpu") 173 | params, _ = dexed_inferencer.convert("test/test_audio/dexed_test_audio_1.wav") 174 | 175 | from syntheon.converter.dexed.dexed_converter import DexedConverter 176 | dexed_converter = DexedConverter() 177 | dexed_converter.dict = params 178 | dexed_converter.parseToPluginFile("dexed_output.syx") -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/amp_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | We find it hard to map Dexed's 0-99 output level to actual amplitude. 3 | So we conducted an empirical experiment, and manually fit the values using np.polyfit. 4 | RMS is xx. Details to be released.
5 | """ 6 | import numpy as np 7 | 8 | def dexed_ol_to_amplitude(x): 9 | return 4e-4 * np.exp(0.086 * x) 10 | 11 | def amplitude_to_dexed_ol(x): 12 | return int((np.log(x) - np.log(4e-4)) / 0.086) -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/conf/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/data_config.yaml: -------------------------------------------------------------------------------- 1 | # Additional Dataset process options 2 | testset: 3 | source_folder: '/homes/fsc01/proj/wavetest' # Path to a directory containing folders with instrument names 4 | instruments: ['violin','flute','trumpet'] # Specify names of the folders in test set 5 | contiguous: True # Do not 'chop' into instances, generate a single contiguous instance extracted from audio. 6 | clip_noise: True # Clip loudness back to -80db when f0 is over 1900 (avoids crepe to track noise) 7 | input_dir: 'files/test' 8 | output_dir: 'data/test' 9 | 10 | 11 | # URMP Dataset process options 12 | urmp: 13 | source_folder: 'E://URMP//Dataset' # path to urmp dataset finishing in ... "/URMP/Dataset" 14 | instruments: #URMP IDs of instruments 15 | 'vn': 'violin' 16 | 'tpt': 'trumpet' 17 | 'fl' : flute 18 | mono_regex: 'AuSep' 19 | num_workers: 4 20 | input_dir: 'files/train' 21 | output_dir: 'data/train' 22 | 23 | data_processor: 24 | _target_: dataset.create_data.ProcessData 25 | silence_thresh_dB: 40 # Silence threshold for splitting instances. 26 | sr: 16000 # Sample rate 27 | device: 'cpu' # Torch Device ID 28 | crepe_params: 29 | model: 'full' # use 'full' for dataset generation - 'tiny' also available 30 | confidence_threshold: 0.80 # used 0.80 for flute, and 0.85 for violin and trumpet 31 | batch_size: 128 32 | fmin: 50 33 | fmax: 2000 34 | loudness_params: 35 | nfft: 2048 36 | rms_params: 37 | frame_size: 2048 38 | hop_size: 64 # hop size in samples for CREPE, RMS, or loudness 39 | max_len: 4 # Maximum block len ( in seconds ) 40 | seq_len: 3 # Minimum block len (in seconds) -> block is padded to fit max_len 41 | debug: False # Verbose 42 | center: False # True: Center loudness and pitch window before computing. False: Pad at the end. 43 | 44 | hydra: 45 | run: 46 | dir: outputs/null 47 | 48 | process_urmp: True # Process URMP. 49 | process_testset: False # Process additional testset. 50 | skip_copy: False # Skip file copying (if you have already done so) 51 | skip_process: False # Dry run. -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/README.md: -------------------------------------------------------------------------------- 1 | # Training options 2 | 3 | Training options are divided into three sections: 4 | 5 | 1. `config.yaml` contains information about a particular set of experiments or a run. 6 | 1. The `models` directory stores config files used to build the models tested on the paper, 7 | and the config of the differentiable synthesizers. 8 | 1. The `hyperparams` directory contains settings used to train `DDX7` and the `HpN Baseline`. 
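As an illustration (a minimal sketch, not the project's training entry point — the `_target_` keys in these recipes suggest the original code instantiates them through Hydra), a model recipe can also be loaded with plain `yaml.safe_load` and built by hand, mirroring what `dexed_inferencer.py` does:

```python
import yaml
from syntheon.inferencer.dexed.models.ddx7.models import DDSP_Decoder, TCNFMDecoder
from syntheon.inferencer.dexed.models.ddx7.synth import FMSynth

# Load one of the model recipes and build the decoder + synth it describes.
with open("syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml") as f:
    cfg = yaml.safe_load(f)

decoder = TCNFMDecoder(**{k: v for k, v in cfg["decoder"].items() if k != "_target_"})
synth = FMSynth(**{k: v for k, v in cfg["synth"].items() if k != "_target_"})
model = DDSP_Decoder(decoder, synth)
```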
9 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/conf/recipes/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - hyperparams: ddx7 3 | - model: tcnres_f0ld_fmstr # Model to train (TCN w/ residual connections and string FM synth) 4 | 5 | instrument: violin 6 | device: cuda:0 7 | mode: test 8 | data_dir: data # Processed data dir 9 | load_additional_testset: False # Load additional testset from external files. 10 | seed: 1234 11 | train_split: 0.75 # Split factor for URMP train set. Rest is halved in valid and test set. 12 | resume_epoch: 0 # Resume epoch to keep training or just to test. 0 for no resume 13 | 14 | 15 | run_dir: runs # Directory where to store runs. 16 | # Each run dir contain experiments. Each experiment contain different runs. 17 | exp_name: exp_test # Experiment name. 18 | run_name: testrun # Run name (within experiment). 19 | 20 | 21 | hydra: 22 | output_subdir: . 23 | run: 24 | dir: ${run_dir}/${exp_name}/${run_name} -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/hyperparams/ddx7.yaml: -------------------------------------------------------------------------------- 1 | _target_: trainer.Hyperparams 2 | steps: 120000 3 | loss_fn: 4 | _target_: ddx7.loss_functions.rec_loss 5 | scales: [2048, 1024, 512, 256, 128, 64] 6 | overlap: 0.75 7 | scheduler: ExponentialLR 8 | opt: Adam 9 | lr: 3e-4 10 | lr_decay_rate: 0.98 11 | lr_decay_steps: 10000 12 | grad_clip_norm: 2.0 13 | batch_size: 16 14 | n_store_best: 20 # How many checkpoints do we want to keep. -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/hyperparams/hpn.yaml: -------------------------------------------------------------------------------- 1 | _target_: trainer.Hyperparams 2 | steps: 120000 # Original is 1000000 3 | loss_fn: 4 | _target_: ddx7.loss_functions.rec_loss 5 | scales: [2048, 1024, 512, 256, 128, 64] 6 | overlap: 0.75 7 | scheduler: ExponentialLR 8 | opt: Adam 9 | lr: 1e-4 10 | lr_decay_rate: 0.98 11 | lr_decay_steps: 10000 12 | grad_clip_norm: 3.0 13 | batch_size: 16 # original is 32 (and reverb of 4 s, we use 1 s) 14 | n_store_best: 20 # How many checkpoints do we want to keep. 
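Both hyperparameter recipes point `loss_fn` at `ddx7.loss_functions.rec_loss` with a set of STFT `scales` and an `overlap` factor, i.e. a multi-scale spectral reconstruction loss. The sketch below is a hypothetical stand-in for that loss (the real `rec_loss` is not shown in this listing and may differ in detail), only to make concrete what the `scales` and `overlap` fields control:

```python
import torch

def multiscale_spectral_loss(x, y, scales=(2048, 1024, 512, 256, 128, 64), overlap=0.75):
    """L1 distance between STFT magnitudes of x and y at several resolutions.

    x, y: audio tensors of shape (batch, samples). `scales` and `overlap`
    mirror the fields configured in ddx7.yaml / hpn.yaml.
    """
    loss = 0.0
    for n_fft in scales:
        window = torch.hann_window(n_fft, device=x.device)
        hop = int(n_fft * (1 - overlap))
        X = torch.stft(x, n_fft, hop, n_fft, window, normalized=True, return_complex=True).abs()
        Y = torch.stft(y, n_fft, hop, n_fft, window, normalized=True, return_complex=True).abs()
        # Compare both linear and log magnitudes, summed over all scales.
        loss = loss + (X - Y).abs().mean() + (torch.log(X + 1e-7) - torch.log(Y + 1e-7)).abs().mean()
    return loss
```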
-------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/conf/recipes/model/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/hpn_baseline.yaml: -------------------------------------------------------------------------------- 1 | _target_: aesddsp.ddsp.models.DDSP_Decoder 2 | decoder: 3 | _target_: aesddsp.ddsp.models.RnnFCDecoder 4 | hidden_size: 512 5 | sample_rate: 16000 6 | input_keys: ['f0_scaled','loudness_scaled'] 7 | input_sizes: [1,1] 8 | output_keys: ['amplitude','harmonic_distribution','noise_bands'] 9 | output_sizes: [1,60,65] 10 | synth: 11 | _target_: aesddsp.ddsp.synth.HNSynth 12 | sample_rate: 16000 13 | block_size: 64 14 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fm1stack2.yaml: -------------------------------------------------------------------------------- 1 | _target_:ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 2 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi, reciprocal 17 | fr: [1,1] 18 | synth_module: 1stack2 -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fm1stack4.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 4 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1,1,3,14] 18 | synth_module: 1stack4 -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fm2stack2.yaml: -------------------------------------------------------------------------------- 1 | _target_:ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 4 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1,1,1,1] 18 | synth_module: 2stack2 -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmablbrass.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: 
ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 4 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1, 1, 1, 3.2] 18 | synth_module: fmablbrass #ablated brass patch (for abl brass and flute) -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmablflute.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 4 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1,1,1,2] 18 | synth_module: fmablbrass #ablated brass patch (for abl flute and brass) -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmbrss.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 6 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1, 1, 1, 1, 3.2, 8.5] 18 | synth_module: fmbrass -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmflt.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 6 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1, 1, 1, 2, 2, 1.5] 18 | synth_module: fmflute -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 6 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1,1,1,1,3,14] 18 | synth_module: fmstrings -------------------------------------------------------------------------------- 
/syntheon/inferencer/dexed/models/conf/recipes/model/tcnres_f0ld_fmstr_noreverb.yaml: -------------------------------------------------------------------------------- 1 | _target_: ddx7.models.DDSP_Decoder 2 | decoder: 3 | _target_: ddx7.models.TCNFMDecoder 4 | n_blocks: 5 5 | hidden_channels: 128 6 | out_channels: 6 7 | kernel_size: 3 8 | dilation_base: 2 9 | apply_padding: True 10 | deploy_residual: True 11 | input_keys: ['f0_scaled','loudness_scaled'] 12 | synth: 13 | _target_: ddx7.synth.FMSynth 14 | sample_rate: 16000 15 | block_size: 64 16 | max_ol: 0.32 #max_ol is specified in multiples of 2*pi 17 | fr: [1,1,1,1,3,14] 18 | synth_module: fmstrings 19 | is_reverb: False -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/ddx7/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.fft as fft 4 | import numpy as np 5 | import librosa as li 6 | import math 7 | 8 | _DB_RANGE = 80.0 #Min loudness 9 | _REF_DB = 20.7 # White noise, amplitude=1.0, n_fft=2048 10 | _F0_RANGE = 127 11 | 12 | def safe_log(x,eps=1e-7): 13 | eps = torch.tensor(eps) 14 | return torch.log(x + eps) 15 | 16 | def safe_divide(numerator, denominator, eps=1e-7): 17 | """Avoid dividing by zero by adding a small epsilon.""" 18 | eps = torch.tensor(eps) 19 | safe_denominator = torch.where(denominator == 0.0, eps, denominator) 20 | return numerator / safe_denominator 21 | 22 | def logb(x, base=2.0, eps=1e-5): 23 | """Logarithm with base as an argument.""" 24 | return safe_divide(safe_log(x, eps), safe_log(base, eps), eps) 25 | 26 | def hz_to_midi(frequencies): 27 | """Torch-compatible hz_to_midi function.""" 28 | notes = 12.0 * (logb(frequencies, 2.0) - logb(440.0, 2.0)) + 69.0 29 | notes = torch.where(torch.le(frequencies, torch.zeros(1).to(frequencies)), 30 | torch.zeros(1).to(frequencies), notes) 31 | return notes 32 | 33 | 34 | @torch.no_grad() 35 | def cumsum_nd(in_tensor,wrap_value=None): 36 | ''' 37 | cumsum_nd() : cummulative sum - non differentiable and with wrap value. 38 | 39 | The problem with cumsum: when we work with phase tensors that are too large 40 | (i.e. more than a few tenths of seconds) cumsum gets to accumulate steps 41 | over a very large window, and it seems the float point variable loses precision. 42 | 43 | This workaround computes the accumulation step by step, resetting the 44 | accumulator in order for it to avoid to lose precision. 45 | 46 | NOTE: This implementation is very slow, and can't be used during training, 47 | only for final audio rendering on the test set. 48 | 49 | Assumes a tensor format used for audio rendering. [batch,len,1] 50 | 51 | NOTE: Non integer frequency ratios do not work using current synthesis approach, 52 | because we render a common phase (wrapped using cumsum_nd) and then we multiply it 53 | by the frequency ratio. This introduces a misalignment if we multiply the wrapped phase 54 | by a non-integer frequency ratio. 
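    In short: the loop below walks the tensor one sample at a time, adds each step to a
    running accumulator, and subtracts wrap_value whenever the accumulator exceeds it, so
    the stored phase stays close to [0, wrap_value) and single-precision floats never have
    to represent a large running total. This is how harmonic_synth() and the fm_* routines
    invoke it when use_safe_cumsum is enabled: cumsum_nd(2 * np.pi * pitch / sampling_rate, 2 * np.pi).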
55 | 56 | TODO: implement an efficient vectorial cumsum with wrapping we can use to accumulate 57 | phases from all oscillators separately 58 | ''' 59 | print("[WARNING] Using non differentiable cumsum. Non-integer frequency ratios wont render well.") 60 | input_len = in_tensor.size()[1] 61 | nb = in_tensor.size()[0] 62 | acc = torch.zeros([nb,1,1]) 63 | out_tensor = torch.zeros([nb,input_len,1]) 64 | #print("in size{} - out size{}".format(in_tensor.size(),out_tensor.size())) 65 | for i in range(input_len): 66 | acc += in_tensor[:,i,0] 67 | if(wrap_value is not None): 68 | acc = acc - (acc > wrap_value)*wrap_value 69 | out_tensor[:,i,0] = acc 70 | return out_tensor 71 | 72 | 73 | 74 | @torch.no_grad() 75 | def mean_std_loudness(dataset): 76 | mean = 0 77 | std = 0 78 | n = 0 79 | for _, _, l in dataset: 80 | n += 1 81 | mean += (l.mean().item() - mean) / n 82 | std += (l.std().item() - std) / n 83 | return mean, std 84 | 85 | 86 | def multiscale_fft(signal, scales, overlap): 87 | stfts = [] 88 | for s in scales: 89 | S = torch.stft( 90 | signal, 91 | s, 92 | int(s * (1 - overlap)), 93 | s, 94 | torch.hann_window(s).to(signal), 95 | True, 96 | normalized=True, 97 | return_complex=True, 98 | ).abs() 99 | stfts.append(S) 100 | return stfts 101 | 102 | 103 | def resample(x, factor: int): 104 | batch, frame, channel = x.shape 105 | x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame) 106 | 107 | window = torch.hann_window( 108 | factor * 2, 109 | dtype=x.dtype, 110 | device=x.device, 111 | ).reshape(1, 1, -1) 112 | y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x) 113 | y[..., ::factor] = x 114 | y[..., -1:] = x[..., -1:] 115 | y = torch.nn.functional.pad(y, [factor, factor]) 116 | y = torch.nn.functional.conv1d(y, window)[..., :-1] 117 | 118 | y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1) 119 | 120 | return y 121 | 122 | 123 | def upsample(signal, factor,mode='nearest'): 124 | signal = signal.permute(0, 2, 1) 125 | signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor,mode=mode) 126 | return signal.permute(0, 2, 1) 127 | 128 | 129 | def extract_loudness(signal, sampling_rate, block_size, n_fft=2048): 130 | S = li.stft( 131 | signal, 132 | n_fft=n_fft, 133 | hop_length=block_size, 134 | win_length=n_fft, 135 | center=True, 136 | ) 137 | S = np.log(abs(S) + 1e-7) 138 | f = li.fft_frequencies(sampling_rate, n_fft) 139 | a_weight = li.A_weighting(f) 140 | 141 | S = S + a_weight.reshape(-1, 1) 142 | 143 | S = np.mean(S, 0)[..., :-1] 144 | 145 | return S 146 | 147 | 148 | 149 | def get_mlp(in_size, hidden_size, n_layers): 150 | channels = [in_size] + (n_layers) * [hidden_size] 151 | net = [] 152 | for i in range(n_layers): 153 | net.append(nn.Linear(channels[i], channels[i + 1])) 154 | net.append(nn.LayerNorm(channels[i + 1])) 155 | net.append(nn.LeakyReLU()) 156 | return nn.Sequential(*net) 157 | 158 | 159 | def get_gru(n_input, hidden_size): 160 | return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True) 161 | 162 | 163 | def amp_to_impulse_response(amp, target_size): 164 | amp = torch.stack([amp, torch.zeros_like(amp)], -1) 165 | amp = torch.view_as_complex(amp) 166 | amp = fft.irfft(amp) 167 | 168 | filter_size = amp.shape[-1] 169 | 170 | amp = torch.roll(amp, filter_size // 2, -1) 171 | win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device) 172 | 173 | amp = amp * win 174 | 175 | amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size))) 176 | amp = torch.roll(amp, -filter_size // 2, -1) 177 | 178 
| return amp 179 | 180 | 181 | def fft_convolve(signal, kernel): 182 | signal = nn.functional.pad(signal, (0, signal.shape[-1])) 183 | kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0)) 184 | 185 | output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel)) 186 | output = output[..., output.shape[-1] // 2:] 187 | 188 | return output 189 | 190 | 191 | def harmonic_synth(pitch, amplitudes, sampling_rate,use_safe_cumsum=False): 192 | 193 | if(use_safe_cumsum==True): 194 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 195 | else: 196 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 197 | 198 | n_harmonic = amplitudes.shape[-1] 199 | omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) 200 | signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True) 201 | return signal 202 | 203 | OP6=5 204 | OP5=4 205 | OP4=3 206 | OP3=2 207 | OP2=1 208 | OP1=0 209 | 210 | def fm_2stack2(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 211 | 212 | if(use_safe_cumsum==True): 213 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 214 | else: 215 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 216 | 217 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 218 | 219 | op4_phase = fr[OP4] * omega 220 | op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase) 221 | 222 | op3_phase = fr[OP3] * omega + 2 * np.pi * op4_output 223 | op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase) # output of stack of 2 224 | 225 | op2_phase = fr[OP2] * omega 226 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase) 227 | 228 | op1_phase = fr[OP1] * omega + 2 * np.pi * op2_output 229 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase) # output of stack of 2 230 | 231 | return (op3_output + op1_output)/max_ol 232 | 233 | def fm_1stack2(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 234 | 235 | if(use_safe_cumsum==True): 236 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 237 | else: 238 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 239 | 240 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 241 | 242 | op2_phase = fr[OP2] * omega 243 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase) 244 | 245 | op1_phase = fr[OP1] * omega + 2 * np.pi * op2_output 246 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase) # output of stack of 2 247 | 248 | return op1_output/max_ol 249 | 250 | 251 | def fm_1stack4(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 252 | 253 | if(use_safe_cumsum==True): 254 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 255 | else: 256 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 257 | 258 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 
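    # A note on the shared convention in these fm_* renderers (a reading of the code below,
    # not new behaviour): `ol` carries per-frame output levels for every operator, already
    # scaled to at most `max_ol` (expressed in multiples of 2*pi) by the FMSynth wrapper in
    # synth.py; `fr` holds the fixed frequency ratios; and each modulator adds 2*pi times its
    # output to the phase of the operator it feeds. Dividing the carrier output by `max_ol`
    # keeps the rendered signal within [-1, 1].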
259 | 260 | op4_phase = fr[OP4] * omega 261 | op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase) 262 | 263 | op3_phase = fr[OP3] * omega + 2 * np.pi * op4_output 264 | op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase) # output of stack of 4 265 | 266 | op2_phase = fr[OP2] * omega + 2 * np.pi * op3_output 267 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase) 268 | 269 | op1_phase = fr[OP1] * omega + 2 * np.pi * op2_output 270 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase) # output of stack of 2 271 | 272 | return op1_output/max_ol 273 | 274 | 275 | ''' 276 | Ablated Brass FM Synth - with phase wrapping (it does not change behaviour) 277 | OP4->OP3->| 278 | OP2->|->OP1->out 279 | 280 | ''' 281 | def fm_ablbrass_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 282 | 283 | if(use_safe_cumsum==True): 284 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 285 | else: 286 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 287 | 288 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 289 | 290 | op4_phase = fr[OP4] * omega 291 | op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi)) 292 | 293 | op3_phase = fr[OP3] * omega + 2 * np.pi * op4_output 294 | op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 2 295 | 296 | op2_phase = fr[OP2] * omega 297 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) # output stack of 1 298 | 299 | op1_phase = fr[OP1] * omega + 2 * np.pi * (op2_output + op3_output) 300 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # global carrier 301 | 302 | return op1_output/max_ol 303 | 304 | ''' 305 | String FM Synth - with phase wrapping (it does not change behaviour) 306 | PATCH NAME: STRINGS 1 307 | OP6->OP5->OP4->OP3 | 308 | (R)OP2->OP1 |->out 309 | ''' 310 | def fm_string_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 311 | 312 | if(use_safe_cumsum==True): 313 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 314 | else: 315 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 316 | 317 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 
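    # STRINGS 1 sums two carriers: OP3 (fed by the OP6->OP5->OP4 cascade) and OP1 (fed by
    # OP2); both are added and normalised by max_ol below. The explicit `% (2*np.pi)` wrap
    # inside sin() is mathematically a no-op (as the docstring notes, it does not change
    # behaviour) and is presumably kept for symmetry with the wrapped cumsum_nd phase path.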
318 | op6_phase = fr[OP6] * omega 319 | op6_output = torch.unsqueeze(ol[:,:,OP6], dim=-1) * torch.sin(op6_phase % (2*np.pi)) 320 | 321 | op5_phase = fr[OP5] * omega + 2 * np.pi * op6_output 322 | op5_output = torch.unsqueeze(ol[:,:,OP5], dim=-1)*torch.sin(op5_phase % (2*np.pi)) 323 | 324 | op4_phase = fr[OP4] * omega + 2 * np.pi * op5_output 325 | op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi)) 326 | 327 | op3_phase = fr[OP3] * omega + 2 * np.pi * op4_output 328 | op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 4 329 | 330 | op2_phase = fr[OP2] * omega 331 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) 332 | 333 | op1_phase = fr[OP1] * omega + 2 * np.pi * op2_output 334 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # output of stack of 2 335 | 336 | return (op3_output + op1_output)/max_ol 337 | 338 | ''' 339 | Flute FM Synth - with phase wrapping (it does not change behaviour) 340 | PATCH NAME: FLUTE 1 341 | (R)OP6->OP5->| 342 | OP4->OP3->| 343 | OP2->|->OP1->out 344 | ''' 345 | def fm_flute_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 346 | 347 | if(use_safe_cumsum==True): 348 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 349 | else: 350 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 351 | 352 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 353 | op6_phase = fr[OP6] * omega 354 | op6_output = torch.unsqueeze(ol[:,:,OP6], dim=-1) * torch.sin(op6_phase % (2*np.pi)) 355 | 356 | op5_phase = fr[OP5] * omega + 2 * np.pi * op6_output 357 | op5_output = torch.unsqueeze(ol[:,:,OP5], dim=-1)*torch.sin(op5_phase % (2*np.pi)) # output of stack of 2 358 | 359 | op4_phase = fr[OP4] * omega 360 | op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi)) 361 | 362 | op3_phase = fr[OP3] * omega + 2 * np.pi * op4_output 363 | op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 2 364 | 365 | op2_phase = fr[OP2] * omega 366 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) # output stack of 1 367 | 368 | op1_phase = fr[OP1] * omega + 2 * np.pi * (op2_output + op3_output + op5_output) 369 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # carrier 370 | 371 | return op1_output/max_ol 372 | 373 | ''' 374 | Brass FM Synth - with phase wrapping (it does not change behaviour) 375 | PATCH NAME: BRASS 3 376 | OP6->OP5->OP4->| 377 | (R)OP3->| 378 | OP2->|->OP1->out 379 | ''' 380 | def fm_brass_synth(pitch, ol, fr, sampling_rate,max_ol,use_safe_cumsum=False): 381 | 382 | if(use_safe_cumsum==True): 383 | omega = cumsum_nd(2 * np.pi * pitch / sampling_rate, 2*np.pi) 384 | else: 385 | omega = torch.cumsum(2 * np.pi * pitch / sampling_rate, 1) 386 | 387 | # Torch unsqueeze with dim -1 adds a new dimension at the end of ol to match phases. 
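    # BRASS 3 routing, as implemented below: a three-deep cascade OP6->OP5->OP4 plus the
    # single modulators OP3 and OP2 all feed the phase of the carrier OP1, and only OP1
    # (divided by max_ol) reaches the output.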
388 | op6_phase = fr[OP6] * omega 389 | op6_output = torch.unsqueeze(ol[:,:,OP6], dim=-1) * torch.sin(op6_phase % (2*np.pi)) 390 | 391 | op5_phase = fr[OP5] * omega + 2 * np.pi * op6_output 392 | op5_output = torch.unsqueeze(ol[:,:,OP5], dim=-1)*torch.sin(op5_phase % (2*np.pi)) 393 | 394 | op4_phase = fr[OP4] * omega + 2 * np.pi * op5_output 395 | op4_output = torch.unsqueeze(ol[:,:,OP4], dim=-1) * torch.sin(op4_phase % (2*np.pi)) # output of stack of 3 396 | 397 | op3_phase = fr[OP3] * omega 398 | op3_output = torch.unsqueeze(ol[:,:,OP3], dim=-1) * torch.sin(op3_phase % (2*np.pi)) # output of stack of 1 399 | 400 | op2_phase = fr[OP2] * omega 401 | op2_output = torch.unsqueeze(ol[:,:,OP2], dim=-1) * torch.sin(op2_phase % (2*np.pi)) # output stack of 1 402 | 403 | op1_phase = fr[OP1] * omega + 2 * np.pi * (op2_output + op3_output + op4_output) 404 | op1_output = torch.unsqueeze(ol[:,:,OP1], dim=-1) * torch.sin(op1_phase % (2*np.pi)) # carrier 405 | 406 | return op1_output/max_ol 407 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/dexed/models/ddx7/data_utils/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/data_utils/h5_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import h5py 3 | import numpy as np 4 | import random 5 | import torch 6 | import math 7 | 8 | class h5Dataset(Dataset): 9 | def __init__(self, sr, data_path,input_keys,max_audio_val=1,device='cpu'): 10 | self.sr = sr 11 | self.data_path = data_path 12 | self.input_data_dicts,self.dataset_len = self.cache_data(self.data_path,len(input_keys)) 13 | self.max_audio_val = max_audio_val 14 | self.input_keys = input_keys 15 | self.device = device 16 | 17 | def cache_data(self, data_path,nfeatures): 18 | ''' 19 | Load data to dictionary in RAM 20 | ''' 21 | h5f = h5py.File(data_path, 'r') 22 | cache = {} 23 | keys = h5f.keys() 24 | nkeys = len(keys) 25 | ndata = (len(keys)//nfeatures) 26 | if((nkeys//nfeatures)*nfeatures != nkeys): 27 | raise Exception("Unexpected dataset len.") 28 | 29 | for key in keys: 30 | cache[key] = np.array(h5f[key]) 31 | h5f.close() 32 | 33 | return cache, ndata 34 | 35 | def __getitem__(self, idx): 36 | #print("[DEBUG] __getitem__ fetching: {}".format(idx)) 37 | 38 | #Generate current item keys to fetch from RAM cache 39 | item_keys = [f'{idx}_{k}' for k in self.input_keys ] 40 | 41 | # Load dictionary 42 | x = {} 43 | for v,k in enumerate(self.input_keys): 44 | x[k] = torch.tensor(self.input_data_dicts[item_keys[v]]).unsqueeze(-1).to(self.device) 45 | 46 | #for k in x.keys(): 47 | # print(f'{k}: {x[k].shape} ',end='') 48 | #print('') 49 | 50 | return x 51 | 52 | def __len__(self): 53 | return self.dataset_len 54 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/data_utils/preprocessor.py: -------------------------------------------------------------------------------- 1 | import ddx7.core as core 2 | 3 | class F0LoudnessRMSPreprocessor(): 4 | """Scales 'f0_hz' and 'loudness_db' features.""" 5 | def __init__(self): 6 | return 7 | 8 | def run(self,x): 9 | x['loudness_scaled'] = 
self.scale_db(x['loudness']) 10 | x['rms_scaled'] = self.scale_db(x['rms']) 11 | x['f0_scaled'] = self.scale_f0_hz(x['f0']) 12 | return x 13 | 14 | def scale_db(self,db): 15 | """Scales [-DB_RANGE, 0] to [0, 1].""" 16 | return (db / core._DB_RANGE) + 1.0 17 | 18 | def scale_f0_hz(self,f0_hz): 19 | """Scales [0, Nyquist] Hz to [0, 1.0] MIDI-scaled.""" 20 | return core.hz_to_midi(f0_hz) / core._F0_RANGE -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/loss_functions.py: -------------------------------------------------------------------------------- 1 | import ddx7.core as core 2 | import torch 3 | import torch.nn as nn 4 | from functools import partial 5 | 6 | 7 | ''' 8 | Asimetric L1 distance 9 | ''' 10 | def asim_l1_distance(a,b,alpha=1,beta=1): 11 | diff = a-b 12 | pos_diff = diff * (diff > 0) 13 | neg_diff = diff * (diff < 0) 14 | as_diff = alpha * pos_diff + beta * neg_diff 15 | as_mse = torch.abs(as_diff).mean() 16 | return as_mse 17 | 18 | 19 | def asim_msfft_loss(a1, 20 | a2, 21 | scales=[4096, 2048, 1024, 512, 256, 128], 22 | overlap=0.75, 23 | alpha=1, 24 | beta=1): 25 | ''' 26 | DDSP Original MS FFT loss with lin + log spectra analysis 27 | ''' 28 | if(len(a1.size()) == 3): 29 | a1 = a1.squeeze(-1) 30 | if(len(a2.size()) == 3): 31 | a2 = a2.squeeze(-1) 32 | ori_stft = core.multiscale_fft( 33 | a1, 34 | scales, 35 | overlap, 36 | ) 37 | rec_stft = core.multiscale_fft( 38 | a2, 39 | scales, 40 | overlap, 41 | ) 42 | 43 | loss = 0 44 | for s_x, s_y in zip(ori_stft, rec_stft): 45 | lin_loss = asim_l1_distance(s_x, s_y,alpha,beta) 46 | log_loss = asim_l1_distance(core.safe_log(s_x),core.safe_log(s_y),alpha,beta) 47 | loss = loss + lin_loss + log_loss 48 | 49 | return loss 50 | 51 | 52 | 53 | def ddsp_msfft_loss(a1, 54 | a2, 55 | scales=[4096, 2048, 1024, 512, 256, 128], 56 | overlap=0.75): 57 | ''' 58 | DDSP Original MS FFT loss with lin + log spectra analysis 59 | Some remarks: the stfts have to be normalized otherwise the netowrk weights different excerpts to different importance. 60 | We compute the mean of the L1 difference between normalized magnitude spectrograms 61 | so that the magnitude of the loss do not change with the window size. 62 | ''' 63 | if(len(a1.size()) == 3): 64 | a1 = a1.squeeze(-1) 65 | if(len(a2.size()) == 3): 66 | a2 = a2.squeeze(-1) 67 | ori_stft = core.multiscale_fft( 68 | a1, 69 | scales, 70 | overlap, 71 | ) 72 | rec_stft = core.multiscale_fft( 73 | a2, 74 | scales, 75 | overlap, 76 | ) 77 | 78 | loss = 0 79 | for s_x, s_y in zip(ori_stft, rec_stft): 80 | lin_loss = (s_x - s_y).abs().mean() 81 | log_loss = (core.safe_log(s_x) - core.safe_log(s_y)).abs().mean() 82 | loss = loss + lin_loss + log_loss 83 | 84 | return loss 85 | 86 | class rec_loss(nn.Module): 87 | def __init__(self,scales,overlap,alpha=None,beta=None): 88 | super().__init__() 89 | self.scales = scales 90 | self.overlap = overlap 91 | if(alpha is not None and beta is not None): 92 | self.loss_fn = partial(asim_msfft_loss,alpha=alpha,beta=beta) 93 | print(f'[INFO] rec_loss() - Using asimetrical reconstruction loss. 
alpha: {alpha} - beta: {beta}') 94 | else: 95 | self.loss_fn = ddsp_msfft_loss 96 | def forward(self,ref,synth): 97 | return self.loss_fn(ref,synth, 98 | self.scales, 99 | self.overlap) 100 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from syntheon.inferencer.dexed.models.ddx7.core import get_gru, get_mlp 4 | import torch.nn.functional as F 5 | 6 | ''' 7 | Wrapper class for either HpN or DDX7 8 | ''' 9 | class DDSP_Decoder(nn.Module): 10 | def __init__(self,decoder,synth): 11 | super().__init__() 12 | net = [] 13 | net.append(decoder) 14 | net.append(synth) 15 | self.net = nn.Sequential(*net) 16 | 17 | def forward(self,x): 18 | return self.net(x) 19 | 20 | def get_sr(self): 21 | return self.net[1].sample_rate 22 | 23 | def enable_cumsum_nd(self): 24 | self.net[1].use_cumsum_nd=True 25 | 26 | def get_params(self,param): 27 | if(param == 'reverb_decay'): 28 | return self.net[1].reverb.decay.item() 29 | if(param == 'reverb_wet'): 30 | return self.net[1].reverb.wet.item() 31 | 32 | ''' 33 | GRU-Based decoder for HpN Baseline 34 | ''' 35 | class RnnFCDecoder(nn.Module): 36 | def __init__(self, hidden_size=512, sample_rate=16000, 37 | input_keys=None,input_sizes=[1,1,16], 38 | output_keys=['amplitude','harmonic_distribution','noise_bands'], 39 | output_sizes=[1,100,65]): 40 | super().__init__() 41 | self.input_keys = input_keys 42 | self.input_sizes = input_sizes 43 | n_keys = len(input_keys) 44 | # Generate MLPs of size: in_size: 1 ; n_layers = 3 (with layer normalization and leaky relu) 45 | if(n_keys == 2): 46 | self.in_mlps = nn.ModuleList([get_mlp(input_sizes[0], hidden_size, 3), 47 | get_mlp(input_sizes[1], hidden_size, 3)]) 48 | elif(n_keys == 3): 49 | self.in_mlps = nn.ModuleList([get_mlp(input_sizes[0], hidden_size, 3), 50 | get_mlp(input_sizes[1], hidden_size, 3), 51 | get_mlp(input_sizes[2], hidden_size, 3)]) 52 | else: 53 | raise ValueError("Expected 2 or 3 input keys. got: {}".format(input_keys)) 54 | 55 | #Generate GRU: input_size = n_keys * hidden_size ; n_layers = 1 (that's the default config) 56 | self.gru = get_gru(n_keys, hidden_size) 57 | 58 | #Generate output MLP: in_size: hidden_size + 2 ; n_layers = 3 59 | self.out_mlp = get_mlp(hidden_size + 2, hidden_size, 3) 60 | 61 | self.proj_matrices = [] 62 | self.output_keys = output_keys 63 | self.output_sizes = output_sizes 64 | for v,k in enumerate(output_keys): 65 | self.proj_matrices.append(nn.Linear(hidden_size,output_sizes[v])) 66 | 67 | self.proj_matrices = nn.ModuleList(self.proj_matrices) 68 | self.sample_rate = sample_rate 69 | 70 | def forward(self, x): 71 | # Run pitch and loudness and z (if available) inputs through the respectives input MLPs. 72 | # Then, concatenate the outputs in a flat vector. 73 | 74 | # Run through input_keys and load inputs accordingly 75 | hidden = torch.cat([self.in_mlps[v](x[k]) for v,k in enumerate(self.input_keys)],-1) 76 | 77 | # Run the flattened vector through the GRU. 78 | # The GRU predicts the embedding. 79 | # Then, concatenate the embedding with the disentangled parameters of pitch and loudness (nhid+2 size vector) 80 | hidden = torch.cat([self.gru(hidden)[0], x['f0_scaled'], x['loudness_scaled']], -1) 81 | # Run the embedding through the output MLP to obtain a 512-sized output vector. 
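        # Shape sketch under the hpn_baseline.yaml recipe above (hidden_size=512,
        # output_sizes=[1, 60, 65]): after the GRU output is concatenated with f0_scaled and
        # loudness_scaled the tensor is (batch, frames, 514); out_mlp maps it back to
        # (batch, frames, 512), and each projection head then emits one control stream,
        # e.g. 60 harmonic amplitudes and 65 noise bands.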
82 | hidden = self.out_mlp(hidden) 83 | 84 | 85 | # Run embedding through a projection_matrix to get outputs 86 | controls = {} 87 | for v,k in enumerate(self.output_keys): 88 | controls[k] = self.proj_matrices[v](hidden) 89 | 90 | controls['f0_hz'] = x['f0'] 91 | 92 | return controls 93 | 94 | ''' 95 | TCN-Based decoder for DDX7 96 | ''' 97 | class TCNFMDecoder(nn.Module): 98 | ''' 99 | FM Decoder with sigmoid output 100 | ''' 101 | def __init__(self,n_blocks=2,hidden_channels=64,out_channels=6, 102 | kernel_size=3,dilation_base=2,apply_padding=True, 103 | deploy_residual=False, 104 | input_keys=None,z_size=None, 105 | output_complete_controls=True): 106 | super().__init__() 107 | 108 | # Store receptive field 109 | dilation_factor = (dilation_base**n_blocks-1)/(dilation_base-1) 110 | self.receptive_field = 1 + 2*(kernel_size-1)*dilation_factor 111 | print("[INFO] TCNFNDecoder - receptive field is: {}".format(self.receptive_field)) 112 | 113 | self.input_keys = input_keys 114 | n_keys = len(input_keys) 115 | self.output_complete_controls = output_complete_controls 116 | 117 | if(n_keys == 2): 118 | in_channels = 2 119 | elif(n_keys == 3): 120 | in_channels = 2 + z_size 121 | else: 122 | raise ValueError("Expected 2 or 3 input keys. got: {}".format(input_keys)) 123 | 124 | base = 0 125 | net = [] 126 | 127 | net.append(TCN_block(in_channels,hidden_channels,hidden_channels,kernel_size, 128 | dilation=dilation_base**base,apply_padding=apply_padding, 129 | deploy_residual=deploy_residual)) 130 | if(n_blocks>2): 131 | for i in range(n_blocks-2): 132 | base += 1 133 | net.append(TCN_block(hidden_channels,hidden_channels,hidden_channels, 134 | kernel_size,dilation=dilation_base**base,apply_padding=apply_padding)) 135 | 136 | base += 1 137 | net.append(TCN_block(hidden_channels,hidden_channels,out_channels,kernel_size, 138 | dilation=dilation_base**base,apply_padding=apply_padding, 139 | deploy_residual=deploy_residual,last_block=True)) 140 | 141 | self.net = nn.Sequential(*net) 142 | 143 | def forward(self,x): 144 | # Reshape features to follow Conv1d convention (nb,ch,seq_Len) 145 | conditioning = torch.cat([x[k] for v,k in enumerate(self.input_keys)],-1).permute([0,-1,-2]) 146 | 147 | ol = self.net(conditioning) 148 | ol = ol.permute([0,-1,-2]) 149 | if self.output_complete_controls is True: 150 | synth_params = { 151 | 'f0_hz': x['f0'], #In Hz 152 | 'ol': ol 153 | } 154 | else: 155 | synth_params = ol 156 | return synth_params 157 | 158 | class TCN_block(nn.Module): 159 | ''' 160 | TCN Block 161 | ''' 162 | def __init__(self,in_channels,hidden_channels,out_channels, 163 | kernel_size,stride=1,dilation=1,apply_padding=True, 164 | last_block=False,deploy_residual=False): 165 | super().__init__() 166 | block = [] 167 | cnv1 = CausalConv1d(in_channels,hidden_channels,kernel_size, 168 | stride=stride,dilation=dilation,apply_padding=apply_padding) 169 | block.append(torch.nn.utils.weight_norm( cnv1 ) ) 170 | block.append(nn.ReLU()) 171 | block.append(nn.Dropout()) 172 | 173 | cnv2 = CausalConv1d(hidden_channels,out_channels,kernel_size, 174 | stride=stride,dilation=dilation,apply_padding=apply_padding) 175 | block.append(torch.nn.utils.weight_norm( cnv2 ) ) 176 | if(last_block == False): 177 | block.append(nn.ReLU()) 178 | block.append(nn.Dropout()) 179 | 180 | self.block = nn.Sequential(*block) 181 | self.residual = None 182 | if(deploy_residual): 183 | if(apply_padding): 184 | self.residual = nn.Conv1d(in_channels,out_channels,1,padding = 0,stride=stride) 185 | else: 186 | raise 
ValueError("Residual connection is only possible when padding is enabled.") 187 | 188 | def forward(self,data): 189 | block_out = self.block(data) 190 | if(self.residual is not None): 191 | residual = self.residual(data) 192 | block_out = block_out + residual 193 | return block_out 194 | 195 | 196 | class CausalConv1d(torch.nn.Conv1d): 197 | ''' 198 | Basic layer for implementing a TCN 199 | ''' 200 | def __init__(self, 201 | in_channels, 202 | out_channels, 203 | kernel_size, 204 | stride=1, 205 | dilation=1, 206 | groups=1, 207 | bias=True, 208 | apply_padding=True): 209 | 210 | super(CausalConv1d, self).__init__( 211 | in_channels, 212 | out_channels, 213 | kernel_size=kernel_size, 214 | stride=stride, 215 | padding=0, 216 | dilation=dilation, 217 | groups=groups, 218 | bias=bias) 219 | 220 | self.apply_padding = apply_padding 221 | self.__padding = dilation*(kernel_size - 1) 222 | 223 | def forward(self, input): 224 | # Apply left padding using torch.nn.functional and then compute conv. 225 | if(self.apply_padding): 226 | return super(CausalConv1d, self).forward(F.pad(input, (self.__padding, 0))) 227 | else: 228 | return super(CausalConv1d, self).forward(input) 229 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/spectral_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torchcrepe 4 | import torchaudio 5 | import librosa 6 | from syntheon.inferencer.dexed.models.ddx7.core import _DB_RANGE,_REF_DB 7 | import math 8 | import numpy as np 9 | from time import time 10 | 11 | _RMS_FRAME = 2048 12 | _CREPE_WIN_LEN = 1024 13 | _LD_N_FFT = 2048 14 | 15 | def safe_log(x): 16 | return torch.log(x + 1e-7) 17 | 18 | def calc_f0(audio, rate, hop_size,fmin,fmax,model, 19 | batch_size,device,center=False): 20 | if center is False: 21 | # Add padding to the end. Then execute crepe w/o padding. 22 | # Crepe pads so that the signal stays in the center. 23 | n_samples_initial = int(audio.shape[-1]) 24 | n_frames = int(np.ceil(n_samples_initial / hop_size)) 25 | n_samples_final = (n_frames - 1) * hop_size + _CREPE_WIN_LEN 26 | pad = n_samples_final - n_samples_initial 27 | audio = np.pad(audio, ((0, pad),), "constant") 28 | 29 | audio = torch.from_numpy(audio).unsqueeze(0).float().to(device) 30 | 31 | t1 = time() 32 | print("predicting...") 33 | crepe_tuple = torchcrepe.predict(audio, 34 | rate, 35 | hop_size, 36 | fmin, 37 | fmax, 38 | model, 39 | return_periodicity=True, 40 | batch_size=batch_size, 41 | device=device, 42 | pad=center) 43 | print("done...", time() - t1) 44 | 45 | f0 = crepe_tuple[0] 46 | confidence = crepe_tuple[1] 47 | if center is True: 48 | f0 = f0[:,0:-1] #Discard the last sample 49 | confidence = confidence[:,0:-1] #Discard the last sample 50 | 51 | f0 = f0.squeeze(0).cpu().numpy() 52 | confidence = confidence.squeeze(0).cpu().numpy() 53 | return f0,confidence 54 | 55 | def calc_loudness(audio, rate, n_fft=_LD_N_FFT, hop_size=64, 56 | range_db=_DB_RANGE,ref_db=_REF_DB,center=False): 57 | np.seterr(divide='ignore') 58 | 59 | """Compute loudness, add to example (ref is white noise, amplitude=1).""" 60 | # Copied from magenta/ddsp/spectral_ops.py 61 | # Get magnitudes. 
62 | if center is False: 63 | # Add padding to the end 64 | n_samples_initial = int(audio.shape[-1]) 65 | n_frames = int(np.ceil(n_samples_initial / hop_size)) 66 | n_samples_final = (n_frames - 1) * hop_size + n_fft 67 | pad = n_samples_final - n_samples_initial 68 | audio = np.pad(audio, ((0, pad),), "constant") 69 | spectra = librosa.stft( 70 | audio, n_fft=n_fft, hop_length=hop_size, center=center).T 71 | 72 | # Compute power 73 | amplitude = np.abs(spectra) 74 | amin = 1e-20 # Avoid log(0) instabilities. 75 | power_db = np.log10(np.maximum(amin, amplitude)) 76 | power_db *= 20.0 77 | 78 | # Perceptual weighting. 79 | frequencies = librosa.fft_frequencies(sr=rate, n_fft=n_fft) 80 | a_weighting = librosa.A_weighting(frequencies)[np.newaxis, :] 81 | loudness = power_db + a_weighting 82 | 83 | # Set dynamic range. 84 | loudness -= ref_db 85 | loudness = np.maximum(loudness, -range_db) 86 | 87 | # Average over frequency bins. (loudness is taken from the fft dimension!) 88 | mean_loudness_db = np.mean(loudness, axis=-1) 89 | return mean_loudness_db.astype(np.float32) 90 | 91 | ''' 92 | RMS POWER COMPUTATION. 93 | ''' 94 | 95 | def amplitude_to_db(amplitude): 96 | """Converts amplitude to decibels.""" 97 | amin = 1e-20 # Avoid log(0) instabilities. 98 | db = np.log10(np.maximum(amin, amplitude)) 99 | db *= 20.0 100 | return db 101 | 102 | def compute_rms_energy(audio, 103 | frame_size=2048, 104 | hop_size=64, 105 | pad_end=True): 106 | """Compute root mean squared energy of audio.""" 107 | if pad_end is True: 108 | # Add padding to the end 109 | n_samples_initial = int(audio.shape[-1]) 110 | n_frames = int(np.ceil(n_samples_initial / hop_size)) 111 | n_samples_final = (n_frames - 1) * hop_size + frame_size 112 | pad = n_samples_final - n_samples_initial 113 | audio = np.pad(audio, ((0, pad),), "constant") 114 | 115 | audio = torch.tensor(audio) 116 | audio_frames = audio.unfold(-1,frame_size,hop_size) 117 | rms_energy = torch.mean(audio_frames**2.0,dim=-1)**0.5 118 | 119 | return rms_energy.cpu().numpy() 120 | 121 | 122 | def calc_power(audio, 123 | frame_size=_RMS_FRAME, 124 | hop_size=64, 125 | range_db=_DB_RANGE, 126 | ref_db=20.7, 127 | pad_end=True): 128 | """Compute power of audio in dB.""" 129 | rms_energy = compute_rms_energy(audio, frame_size, hop_size,pad_end=pad_end) 130 | power_db = amplitude_to_db(rms_energy**2) 131 | #print(power_db) 132 | # Set dynamic range. 
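    # e.g. an RMS of 0.1 gives a power of 0.01, i.e. -40 dB before the reference shift and
    # clamping applied below.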
133 | power_db -= ref_db 134 | power_db = np.maximum(power_db, -range_db) 135 | return power_db.astype(np.float32) 136 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/ddx7/synth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | from syntheon.inferencer.dexed.models.ddx7.core import * 5 | import soundfile as sf 6 | import librosa 7 | 8 | def exp_sigmoid(x): 9 | return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7 10 | 11 | def remove_above_nyquist(amplitudes, pitch, sampling_rate): 12 | n_harm = amplitudes.shape[-1] 13 | pitches = pitch * torch.arange(1, n_harm + 1).to(pitch) 14 | aa = (pitches < sampling_rate / 2).float() + 1e-4 15 | return amplitudes * aa 16 | 17 | 18 | class FMSynth(nn.Module): 19 | def __init__(self,sample_rate,block_size,fr=[1,1,1,1,3,14],max_ol=2, 20 | scale_fn = torch.sigmoid,synth_module='fmstrings',is_reverb=True): 21 | super().__init__() 22 | self.sample_rate = sample_rate 23 | self.block_size = block_size 24 | self.reverb = Reverb(length=sample_rate, sample_rate=sample_rate) 25 | fr = torch.tensor(fr) # Frequency Ratio 26 | self.register_buffer("fr", fr) #Non learnable but sent to GPU if declared as buffers, and stored in model dictionary 27 | self.scale_fn = scale_fn 28 | self.use_cumsum_nd = False 29 | self.max_ol = max_ol 30 | self.is_reverb = is_reverb 31 | 32 | available_synths = { 33 | 'fmbrass': fm_brass_synth, 34 | 'fmflute': fm_flute_synth, 35 | 'fmstrings': fm_string_synth, 36 | 'fmablbrass': fm_ablbrass_synth, 37 | '2stack2': fm_2stack2, 38 | '1stack2':fm_1stack2, 39 | '1stack4': fm_1stack4} 40 | 41 | self.synth_module = available_synths[synth_module] 42 | 43 | def forward(self,controls): 44 | 45 | ol = self.max_ol*self.scale_fn(controls['ol']) 46 | ol_up = upsample(ol, self.block_size,'linear') 47 | f0_up = upsample(controls['f0_hz'], self.block_size,'linear') 48 | signal = self.synth_module(f0_up, 49 | ol_up, 50 | self.fr, 51 | self.sample_rate, 52 | self.max_ol, 53 | self.use_cumsum_nd) 54 | #reverb part 55 | if self.is_reverb: 56 | signal = self.reverb(signal) 57 | 58 | synth_out = { 59 | 'synth_audio': signal, 60 | 'ol': ol, 61 | 'f0_hz': controls['f0_hz'] 62 | } 63 | return synth_out 64 | 65 | class HNSynth(nn.Module): 66 | def __init__(self,sample_rate,block_size,scale_fn = exp_sigmoid): 67 | super().__init__() 68 | self.sample_rate = sample_rate 69 | self.block_size = block_size 70 | self.reverb = Reverb(length=sample_rate, sample_rate=sample_rate) 71 | self.use_cumsum_nd = False 72 | self.scale_fn = scale_fn 73 | 74 | # expects: harmonic_distr, amplitude, noise_bands 75 | def forward(self,controls): 76 | 77 | harmonics = self.scale_fn(controls['harmonic_distribution']) 78 | noise_bands = self.scale_fn(controls['noise_bands']) 79 | total_amp = self.scale_fn(controls['amplitude']) 80 | 81 | harmonics = remove_above_nyquist( 82 | harmonics, 83 | controls['f0_hz'], 84 | self.sample_rate, 85 | ) 86 | harmonics /= harmonics.sum(-1, keepdim=True) 87 | harmonics *= total_amp 88 | 89 | harmonics_up = upsample(harmonics, self.block_size) 90 | f0_up = upsample(controls['f0_hz'], self.block_size,'linear') 91 | 92 | harmonic = harmonic_synth(f0_up, harmonics_up, self.sample_rate, self.use_cumsum_nd) 93 | impulse = amp_to_impulse_response(noise_bands, self.block_size) 94 | 95 | noise = torch.rand( 96 | impulse.shape[0], 97 | impulse.shape[1], 98 | self.block_size, 99 | ).to(impulse) * 2 - 1 100 
| 101 | noise = fft_convolve(noise, impulse).contiguous() 102 | noise = noise.reshape(noise.shape[0], -1, 1) 103 | 104 | signal = harmonic + noise 105 | 106 | #reverb part 107 | signal = self.reverb(signal) 108 | synth_out = { 109 | 'synth_audio': signal, 110 | 'harmonic_distribution': harmonics, 111 | 'noise_bands': noise_bands, 112 | 'f0_hz': controls['f0_hz'] 113 | } 114 | 115 | return synth_out 116 | 117 | class Reverb(nn.Module): 118 | def __init__(self, length, sample_rate, initial_wet=0, initial_decay=5): 119 | super().__init__() 120 | self.length = length 121 | self.sample_rate = sample_rate 122 | 123 | self.noise = nn.Parameter((torch.rand(length) * 2 - 1).unsqueeze(-1)) 124 | self.decay = nn.Parameter(torch.tensor(float(initial_decay))) 125 | self.wet = nn.Parameter(torch.tensor(float(initial_wet))) 126 | 127 | t = torch.arange(self.length) / self.sample_rate 128 | t = t.reshape(1, -1, 1) 129 | self.register_buffer("t", t) 130 | 131 | def build_impulse(self): 132 | t = torch.exp(-nn.functional.softplus(-self.decay) * self.t * 500) 133 | noise = self.noise * t 134 | impulse = noise * torch.sigmoid(self.wet) 135 | impulse[:, 0] = 1 136 | return impulse 137 | 138 | def forward(self, x): 139 | lenx = x.shape[1] 140 | impulse = self.build_impulse() 141 | impulse = nn.functional.pad(impulse, (0, 0, 0, lenx - self.length)) 142 | 143 | x = fft_convolve(x.squeeze(-1), impulse.squeeze(-1)).unsqueeze(-1) 144 | 145 | return x 146 | 147 | 148 | if __name__ == "__main__": 149 | fmsynth_string = FMSynth(is_reverb=False, sample_rate=16000, block_size=64) 150 | freq = 523.25 151 | controls = {} 152 | controls['f0_hz'] = torch.ones(1, 1000, 1) * freq 153 | controls['ol'] = torch.zeros(1, 1000, 6) 154 | 155 | synth_out = fmsynth_string(controls)['synth_audio'] 156 | 157 | signal = synth_out.squeeze().cpu().detach().numpy() 158 | # signal_gt, sr = librosa.load("dexed_output_ol50_coarse1.wav", sr=16000) 159 | # signal_gt = signal_gt / np.amax(signal_gt) 160 | # # print(signal_gt.shape) 161 | 162 | # plt.plot(signal[16000:16400], label="signal") 163 | # plt.plot(signal_gt[16001:16401], label="signal_gt") 164 | # plt.legend() 165 | # plt.show() 166 | 167 | # print(np.amax(signal_gt)) 168 | 169 | # S_dexed = np.abs(librosa.stft(signal_gt)) 170 | # S_test = np.abs(librosa.stft(signal)) 171 | 172 | # fig, ax = plt.subplots() 173 | # import librosa.display 174 | # img = librosa.display.specshow(librosa.amplitude_to_db(S_dexed, 175 | # ref=np.max), 176 | # y_axis='log', x_axis='time', ax=ax) 177 | # ax.set_title('Dexed') 178 | # fig.colorbar(img, ax=ax, format="%+2.0f dB") 179 | # plt.show() 180 | 181 | # fig, ax = plt.subplots() 182 | # img = librosa.display.specshow(librosa.amplitude_to_db(S_test, 183 | # ref=np.max), 184 | # y_axis='log', x_axis='time', ax=ax) 185 | # ax.set_title('Test') 186 | # fig.colorbar(img, ax=ax, format="%+2.0f dB") 187 | # plt.show() 188 | 189 | sf.write("dexed_test_92.wav", signal, 16000) 190 | 191 | -------------------------------------------------------------------------------- /syntheon/inferencer/dexed/models/preprocessor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import operator 3 | import functools 4 | import h5py 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | import numpy as np 8 | import librosa 9 | from syntheon.inferencer.dexed.models.ddx7 import spectral_ops 10 | from syntheon.inferencer.dexed.models.ddx7.core import hz_to_midi, _DB_RANGE, _F0_RANGE 11 | 12 | 13 | class dotdict(dict): 14 | 
"""dot.notation access to dictionary attributes""" 15 | __getattr__ = dict.get 16 | __setattr__ = dict.__setitem__ 17 | __delattr__ = dict.__delitem__ 18 | 19 | 20 | class ProcessData(): 21 | def __init__(self, silence_thresh_dB, sr, device, seq_len, 22 | crepe_params, loudness_params, 23 | rms_params, hop_size, max_len, center, 24 | overlap = 0.0, 25 | debug = False, 26 | contiguous = False, 27 | contiguous_clip_noise = False): 28 | super().__init__() 29 | self.silence_thresh_dB = silence_thresh_dB 30 | self.crepe_params = crepe_params 31 | self.sr = sr 32 | self.device = torch.device(device) 33 | self.seq_len = seq_len 34 | self.loudness_params = loudness_params 35 | self.rms = rms_params 36 | self.max_len = max_len 37 | self.hop_size = hop_size 38 | self.feat_size = self.max_len*self.sr //self.hop_size 39 | self.audio_size = self.max_len*self.sr 40 | self.center = center 41 | self.overlap = overlap 42 | self.debug = debug 43 | self.contiguous = contiguous 44 | self.contiguous_clip_noise = contiguous_clip_noise 45 | 46 | def set_confidence(self,confidence): 47 | self.crepe_params.confidence_threshold = confidence 48 | 49 | def process_indices(self, indices: list) -> list: 50 | # Length in samples. 51 | max_len = self.max_len * self.sr 52 | 53 | def expand_long(indices_tuple: tuple) -> list: 54 | if indices_tuple[1] - indices_tuple[0] > max_len: 55 | ret = [(start, start+max_len) for start in np.arange(indices_tuple[0], indices_tuple[1] - max_len, max_len)] 56 | ret.append((ret[-1][-1], min(ret[-1][-1] + max_len, indices_tuple[1]))) 57 | return ret 58 | else: 59 | return [indices_tuple] 60 | 61 | new_indices = [*map(expand_long, indices)] 62 | new_indices = functools.reduce(operator.concat, new_indices, []) 63 | new_indices = [x for x in new_indices if (x[1] - x[0] > self.seq_len * self.sr)] 64 | return new_indices 65 | 66 | def pad_to_expected_size(self,features,expected_size,pad_value): 67 | 68 | #Pad to next integer division if we are processing a whole file in one go. 
69 | if(self.contiguous == True): 70 | # Pad up to next integer division 71 | pad_len = (features.shape[-1] // expected_size + 1)*expected_size - features.shape[-1] 72 | #print(f'feat len {features.shape[-1]} expected {expected_size} pad {pad_len}') 73 | features = np.pad(features,(0,pad_len),'constant',constant_values=pad_value) 74 | return features 75 | else: 76 | if(self.debug): 77 | print("Feat shape {} - expected size: {}".format(features.shape[-1],expected_size)) 78 | if(features.shape[-1] < expected_size): 79 | pad_len = expected_size - features.shape[-1] 80 | features = np.pad(features,(0,pad_len),'constant',constant_values=pad_value) 81 | if(features.shape[-1] > expected_size): 82 | raise Exception('Expected size is smaller than current value') 83 | return features 84 | 85 | 86 | def extract_f0(self, audio): 87 | if isinstance(self.crepe_params, dict): 88 | self.crepe_params = dotdict(self.crepe_params) 89 | (f0,confidence) = spectral_ops.calc_f0(audio, 90 | rate=self.sr, 91 | hop_size=self.hop_size, 92 | fmin=self.crepe_params.fmin, 93 | fmax=self.crepe_params.fmax, 94 | model=self.crepe_params.model, 95 | batch_size=self.crepe_params.batch_size, 96 | device=self.device, 97 | center=self.center) 98 | 99 | if confidence.mean() < self.crepe_params.confidence_threshold: 100 | #print("Low confidence: {}".format(confidence.mean())) 101 | raise ValueError('Low f0 confidence') 102 | 103 | f0 = self.pad_to_expected_size(f0, 104 | expected_size = self.feat_size, 105 | pad_value=0) 106 | 107 | return f0 108 | 109 | def calc_loudness(self,audio): 110 | if isinstance(self.loudness_params, dict): 111 | self.loudness_params = dotdict(self.loudness_params) 112 | loudness = spectral_ops.calc_loudness(audio, rate=self.sr, 113 | n_fft=self.loudness_params.nfft, 114 | hop_size=self.hop_size, 115 | center=self.center,) 116 | 117 | loudness = self.pad_to_expected_size(loudness, 118 | expected_size = self.feat_size, 119 | pad_value=-_DB_RANGE) 120 | return loudness 121 | 122 | # TODO: Add center padding capability here. 
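    # All frame-level features (f0, loudness, rms) share the same hop grid, so each is padded
    # to feat_size = max_len * sr // hop_size frames. With the illustrative values sr=16000,
    # hop_size=64 and max_len=4 s (hop_size=64 is the default used by spectral_ops; max_len
    # comes from the data config) that is 1000 frames against 64000 audio samples.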
123 | def calc_rms(self,audio): 124 | if isinstance(self.rms, dict): 125 | self.rms = dotdict(self.rms) 126 | rms = spectral_ops.calc_power(audio, frame_size=self.rms.frame_size, 127 | hop_size=self.hop_size,pad_end=True) 128 | rms = self.pad_to_expected_size(rms, 129 | expected_size = self.feat_size, 130 | pad_value=-_DB_RANGE) 131 | return rms 132 | 133 | def save_data(self, audio, f0, loudness, rms, h5f, counter): 134 | h5f.create_dataset(f'{counter}_audio', data=audio) 135 | h5f.create_dataset(f'{counter}_f0', data=f0) 136 | h5f.create_dataset(f'{counter}_loudness', data=loudness) 137 | h5f.create_dataset(f'{counter}_rms', data=rms) 138 | return counter + 1 139 | 140 | def init_h5(self, data_dir): 141 | return h5py.File(data_dir / f'{self.sr}.h5', 'w') 142 | 143 | def close_h5(self, h5f): 144 | h5f.close() 145 | 146 | ''' 147 | Main audio processing function 148 | ''' 149 | def run_on_files(self, data_dir, input_dir, output_dir): 150 | audio_files = list((input_dir/data_dir).glob('*.wav')) 151 | output_dir = output_dir / data_dir 152 | output_dir.mkdir(exist_ok=True) 153 | 154 | # Open container 155 | h5f = self.init_h5(output_dir) 156 | counter = 0 157 | 158 | for audio_file in tqdm(audio_files): 159 | if(self.debug): print("Processing: {}".format(audio_file)) 160 | 161 | # load and split files 162 | data, sr = librosa.load(audio_file.as_posix(), sr=self.sr) 163 | data = librosa.util.normalize(data) # Peak-normalize audio 164 | sounds_indices = [] 165 | if(self.contiguous): 166 | sounds_indices.append([0,len(data)]) 167 | else: 168 | sounds_indices = librosa.effects.split(data, top_db=self.silence_thresh_dB) 169 | #print("[DEBUG] Sound indices {}".format(sounds_indices)) 170 | sounds_indices = self.process_indices(sounds_indices) 171 | if len(sounds_indices) == 0: 172 | continue 173 | 174 | 175 | for indices in sounds_indices: 176 | audio = data[indices[0]:indices[1]] 177 | if(self.debug): print("\tIndexes: {} {} - len: {}".format(indices[0],indices[1],indices[1]-indices[0])) 178 | 179 | # Feature retrieval segment 180 | 181 | try: # Only process audio with enough CREPE confidence 182 | f0 = self.extract_f0(audio) 183 | except ValueError: 184 | continue 185 | 186 | # Further downsamples the audio back to the other specified sample rates and returns a dictionary. 
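            # What follows for each retained segment: compute loudness and RMS on the same
            # hop grid as f0; in contiguous mode with contiguous_clip_noise enabled, frames
            # whose f0 exceeds 1900 Hz get their loudness floored to -_DB_RANGE; the audio is
            # then padded to match the feature length and everything is written to the HDF5
            # container via save_data().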
187 | loudness = self.calc_loudness(audio) 188 | rms = self.calc_rms(audio) 189 | if(self.contiguous): 190 | if(self.contiguous_clip_noise): 191 | if(self.debug): print("[DEBUG] clipping noise") 192 | clip_pos = (f0 > 1900.0) 193 | loudness[clip_pos] = -_DB_RANGE 194 | audio = self.pad_to_expected_size(audio,f0.shape[0]*self.hop_size,0) 195 | 196 | else: 197 | audio = self.pad_to_expected_size(audio,self.audio_size,0) 198 | if(self.debug): print(f'\t Store block {counter}: f0 : {f0.shape} - loudness : {loudness.shape} - rms {rms.shape} - audio : {audio.shape}') 199 | counter = self.save_data(audio, f0, loudness, rms, h5f, counter) 200 | 201 | # Finished storing f0 and loudness 202 | self.close_h5(h5f) 203 | 204 | 205 | def run_on_dirs(self, input_dir: Path, output_dir: Path): 206 | #print("Starting with crepe confidence: {}".format(self.crepe_params.confidence_threshold)) 207 | folders = [x for x in input_dir.glob('./*') if x.is_dir()] 208 | for folder in tqdm(folders): 209 | self.run_on_files(folder.name, input_dir, output_dir) 210 | 211 | 212 | class F0LoudnessRMSPreprocessor(): 213 | """Scales 'f0_hz' and 'loudness_db' features.""" 214 | def __init__(self): 215 | return 216 | 217 | def run(self,x): 218 | x['loudness_scaled'] = self.scale_db(x['loudness']) 219 | x['rms_scaled'] = self.scale_db(x['rms']) 220 | x['f0_scaled'] = self.scale_f0_hz(x['f0']) 221 | return x 222 | 223 | def scale_db(self,db): 224 | """Scales [-DB_RANGE, 0] to [0, 1].""" 225 | return (db / _DB_RANGE) + 1.0 226 | 227 | def scale_f0_hz(self,f0_hz): 228 | """Scales [0, Nyquist] Hz to [0, 1.0] MIDI-scaled.""" 229 | return hz_to_midi(f0_hz) / _F0_RANGE -------------------------------------------------------------------------------- /syntheon/inferencer/inferencer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Connects model output to synth preset parameter IR. 3 | """ 4 | class InferenceInput: 5 | def __init__(self): 6 | return NotImplementedError 7 | 8 | 9 | class InferenceOutput: 10 | def __init__(self): 11 | # for storing evaluation results 12 | self.eval_dict = { 13 | "loss": -1 14 | } 15 | 16 | 17 | class Inferencer: 18 | def __init__(self, device="cuda"): 19 | self.device = device 20 | 21 | def convert(self, model_pt_fname, audio_fname): 22 | model = self.load_model(model_pt_fname, self.device) 23 | inference_output = self.inference(model, audio_fname, self.device) 24 | synth_params_dict = self.convert_to_preset(inference_output) 25 | return synth_params_dict, inference_output.eval_dict 26 | 27 | def load_model(self, model_pt_fname, device="cuda"): 28 | return NotImplementedError 29 | 30 | def inference(self, model, audio_fname): 31 | return NotImplementedError 32 | 33 | def convert_to_preset(self, inference_output): 34 | """ 35 | Output a Python dictionary to be handled by the converter. 
36 | """ 37 | return NotImplementedError -------------------------------------------------------------------------------- /syntheon/inferencer/vital/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/vital/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/vital/checkpoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/vital/checkpoints/__init__.py -------------------------------------------------------------------------------- /syntheon/inferencer/vital/checkpoints/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/syntheon/inferencer/vital/checkpoints/model.pt -------------------------------------------------------------------------------- /syntheon/inferencer/vital/config.yaml: -------------------------------------------------------------------------------- 1 | common: 2 | sampling_rate: 16000 3 | block_size: 160 4 | duration_secs: 4 5 | 6 | train: 7 | batch_size: 16 8 | scales: [4096, 2048, 1024, 512, 256, 128] 9 | overlap: .75 10 | start_lr: 0.001 11 | stop_lr: 0.0001 12 | decay_over: 400000 13 | hidden_size: 256 14 | n_harmonic: 100 15 | n_bands: 65 16 | n_wavetables: 10 17 | n_mfcc: 30 18 | epochs: 100000 19 | 20 | test: 21 | batch_size: 2 22 | scales: [4096, 2048, 1024, 512, 256, 128] 23 | overlap: .75 24 | hidden_size: 256 25 | n_harmonic: 100 26 | n_bands: 65 27 | n_wavetables: 10 28 | n_mfcc: 30 29 | 30 | crepe: 31 | model: "large" 32 | 33 | visualize: false 34 | device: "cpu" 35 | -------------------------------------------------------------------------------- /syntheon/inferencer/vital/models/adsr_envelope.py: -------------------------------------------------------------------------------- 1 | """ 2 | Differentiable ADSR envelope shaper. 3 | Code largely influenced by https://github.com/hyakuchiki/diffsynth/blob/master/diffsynth/modules/envelope.py. 
4 | """ 5 | import numpy as np 6 | import torch 7 | import os 8 | from torch import nn 9 | import yaml 10 | 11 | 12 | with open( 13 | os.path.join( 14 | os.path.dirname(os.path.realpath(__file__)), 15 | "../config.yaml" 16 | ), 'r' 17 | ) as stream: 18 | config = yaml.safe_load(stream) 19 | device = config["device"] 20 | 21 | 22 | def soft_clamp_min(x, min_v, T=100): 23 | return torch.sigmoid((min_v-x)*T)*(min_v-x)+x 24 | 25 | 26 | class DiffRoundFunc(torch.autograd.Function): 27 | @staticmethod 28 | def forward(ctx, input): 29 | ctx.input = input 30 | return torch.round(input * 10 ** 2) / (10 ** 2) # because 2 decimal point, 0.01 is the minimum ratio 31 | 32 | @staticmethod 33 | def backward(ctx, grad_output): 34 | grad_input = grad_output.clone() 35 | return grad_input 36 | 37 | 38 | class ADSREnvelopeShaper(nn.Module): 39 | def __init__(self, is_round_secs=False): 40 | super(ADSREnvelopeShaper, self).__init__() 41 | self.attack_secs = torch.tensor([0]) 42 | self.attack_power = torch.tensor([0]) 43 | self.decay_secs = torch.tensor([0]) 44 | self.decay_power = torch.tensor([0]) 45 | self.sustain_level = torch.tensor([0]) 46 | self.release_secs = torch.tensor([0]) 47 | self.release_power = torch.tensor([0]) 48 | 49 | self.is_round_secs = is_round_secs 50 | self.round_decimal_places = 2 # because block size = 100, so min resolution = 1/ 100 51 | 52 | def power_function(self, x, pow=2): 53 | if pow > 0: # convex 54 | # transpose 55 | 56 | if x.squeeze()[0] > x.squeeze()[-1]: 57 | y_intercept = x.squeeze()[-1] 58 | y = x - x[:, -1, :] 59 | max_val = y.squeeze()[0] 60 | y = y / max_val 61 | else: 62 | y_intercept = x.squeeze()[0] 63 | y = x - x[:, 0, :] 64 | max_val = y.squeeze()[-1] 65 | y = y / max_val 66 | 67 | y = y ** pow 68 | 69 | # transpose back 70 | y = y * max_val + y_intercept 71 | 72 | else: 73 | # transpose 74 | if x.squeeze()[0] > x.squeeze()[-1]: 75 | max_val = x.squeeze()[0] 76 | y = x - x[:, 0, :] 77 | y_intercept = y.squeeze()[-1] 78 | y = y / -y_intercept 79 | else: 80 | max_val = x.squeeze()[-1] 81 | y = x - x[:, -1, :] 82 | y_intercept = y.squeeze()[0] 83 | y = y / -y_intercept 84 | 85 | y = -(y ** -pow) 86 | 87 | # transpose back 88 | y = y * -y_intercept + max_val 89 | 90 | return y 91 | 92 | def gen_envelope(self, attack, decay, sus_level, release, 93 | floor=None, peak=None, n_frames=250, pow=2): 94 | """generate envelopes from parameters 95 | Args: 96 | floor (torch.Tensor): floor level of the signal 0~1, 0=min_value (batch, 1, channels) 97 | peak (torch.Tensor): peak level of the signal 0~1, 1=max_value (batch, 1, channels) 98 | attack (torch.Tensor): relative attack point 0~1 (batch, 1, channels) 99 | decay (torch.Tensor): actual decay point is attack+decay (batch, 1, channels) 100 | sus_level (torch.Tensor): sustain level 0~1 (batch, 1, channels) 101 | release (torch.Tensor): release point is attack+decay+release (batch, 1, channels) 102 | note_off (float or torch.Tensor, optional): note off position. Defaults to 0.8. 103 | n_frames (int, optional): number of frames. Defaults to None. 
104 | Returns: 105 | torch.Tensor: envelope signal (batch_size, n_frames, 1) 106 | """ 107 | if floor is None: 108 | floor = torch.tensor([0.]).unsqueeze(0).unsqueeze(-1) 109 | if device == "cuda": 110 | floor = floor.cuda() 111 | if peak is None: 112 | peak = torch.tensor([1.]).unsqueeze(0).unsqueeze(-1) 113 | if device == "cuda": 114 | peak = peak.cuda() 115 | 116 | attack = torch.clamp(attack, min=0, max=1) 117 | decay = torch.clamp(decay, min=0, max=1) 118 | sus_level = torch.clamp(sus_level, min=0.001, max=1) 119 | release = torch.clamp(release, min=0, max=1) 120 | 121 | batch_size = attack.shape[0] 122 | if n_frames is None: 123 | n_frames = self.n_frames 124 | 125 | x = torch.linspace(0, 1.0, n_frames)[None, :, None].repeat(batch_size, 1, 1) 126 | x[:, 0, :] = 1e-6 # offset 0 to epsilon value, so when attack = 0, first adsr value is not 0 but 1 127 | x = x.to(attack.device) 128 | 129 | A = x / (attack + 1e-6) 130 | # A = self.power_function(A, pow=2) 131 | A = torch.clamp(A, max=1.0) 132 | 133 | D = (x - attack) * (sus_level - 1) / (decay+1e-6) 134 | # D = self.power_function(D, pow=-2.7) 135 | D = torch.clamp(D, max=0.0) 136 | D = soft_clamp_min(D, sus_level-1) 137 | 138 | S = (x - 1) * (-sus_level / (release+1e-6)) 139 | S = torch.clamp(S, max=0.0) 140 | S = soft_clamp_min(S, -sus_level) 141 | 142 | signal = (A + D + S) * (peak - floor) + floor 143 | return torch.clamp(signal, min=0., max=1.) 144 | 145 | def forward(self, 146 | attack_secs, 147 | decay_secs, 148 | sustain_level, 149 | block_size=100, 150 | sr=44100, 151 | total_secs=8): 152 | if self.is_round_secs: 153 | attack_secs = DiffRoundFunc.apply(attack_secs) 154 | decay_secs = DiffRoundFunc.apply(decay_secs) 155 | 156 | self.attack_secs = attack_secs 157 | self.decay_secs = decay_secs 158 | self.sustain_level = sustain_level 159 | 160 | attack_ratio = attack_secs / total_secs 161 | decay_ratio = decay_secs / total_secs 162 | # TODO: parameterize release_ratio 163 | release_ratio = torch.tensor([0.]).repeat(attack_secs.size(0), 1, 1) 164 | if device == "cuda": 165 | release_ratio = release_ratio.cuda() 166 | 167 | attack_ratio = attack_ratio.unsqueeze(-1).unsqueeze(-1) 168 | decay_ratio = decay_ratio.unsqueeze(-1).unsqueeze(-1) 169 | sus_level = sustain_level.unsqueeze(-1).unsqueeze(-1) 170 | 171 | signal = self.gen_envelope(attack_ratio, decay_ratio, sus_level, release_ratio, 172 | floor=None, peak=None, n_frames=int(total_secs * block_size), 173 | pow=2) 174 | return signal.squeeze() 175 | 176 | 177 | def get_amp_shaper( 178 | shaper, 179 | onsets, 180 | attack_secs, 181 | decay_secs, 182 | sustain_level, 183 | offsets=None): 184 | """ 185 | implement case with no offset first. 
enable batches 186 | """ 187 | if offsets is None: 188 | # if offset not specified, take next onset as offset 189 | offsets = onsets[1:] 190 | onsets = onsets[:len(onsets) - 1] 191 | 192 | start_offset = int(onsets[0] * 100) # TODO: 100 is block size 193 | onsets, offsets = torch.tensor(onsets), torch.tensor(offsets) 194 | if device == "cuda": 195 | onsets, offsets = onsets.cuda(), offsets.cuda() 196 | dur_vec = offsets - onsets 197 | lst = [] 198 | 199 | # append zeros first before first onset 200 | if device == "cuda": 201 | lst.append(torch.zeros(start_offset).cuda()) 202 | else: 203 | lst.append(torch.zeros(start_offset)) 204 | 205 | for dur in dur_vec: 206 | dur = round(dur.item(), 2) 207 | adsr = shaper( 208 | attack_secs=attack_secs, 209 | decay_secs=decay_secs, 210 | sustain_level=sustain_level, 211 | total_secs=dur) 212 | 213 | # adsr shape should be (bs, dur * block_size) 214 | lst.append(adsr) 215 | 216 | final_signal = torch.cat(lst, dim=-1) 217 | return final_signal 218 | 219 | 220 | if __name__ == "__main__": 221 | # TODO: unit test for this class 222 | import matplotlib.pyplot as plt 223 | 224 | shaper = ADSREnvelopeShaper(is_round_secs=False) 225 | adsrs = [] 226 | for elem in [0.0, 0.001, 0.005, 0.01, 0.02]: 227 | attack_secs, decay_secs, sustain_level = torch.tensor([0.2]), torch.tensor([elem]), torch.tensor([0.8]) 228 | if device == "cuda": 229 | attack_secs = attack_secs.cuda() 230 | decay_secs = decay_secs.cuda() 231 | sustain_level = sustain_level.cuda() 232 | 233 | x2 = shaper( 234 | attack_secs=attack_secs, 235 | decay_secs=decay_secs, 236 | sustain_level=sustain_level, 237 | total_secs=4) 238 | 239 | adsrs.append(x2.squeeze().cpu().detach().numpy()[:30]) 240 | 241 | for idx, elem in enumerate([0.0, 0.001, 0.005, 0.01, 0.02]): 242 | plt.plot(adsrs[idx], label=str(elem)) 243 | plt.scatter(range(30), adsrs[idx]) 244 | plt.legend() 245 | plt.show() -------------------------------------------------------------------------------- /syntheon/inferencer/vital/models/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core functions. 3 | The code mainly comes from https://github.com/acids-ircam/ddsp_pytorch with minor adaptations. 
4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.fft as fft 8 | import numpy as np 9 | import librosa as li 10 | import math 11 | import yaml 12 | import os 13 | 14 | 15 | with open( 16 | os.path.join( 17 | os.path.dirname(os.path.realpath(__file__)), 18 | "../config.yaml" 19 | ), 'r' 20 | ) as stream: 21 | config = yaml.safe_load(stream) 22 | 23 | device = config["device"] 24 | 25 | 26 | def safe_log(x): 27 | return torch.log(x + 1e-7) 28 | 29 | 30 | @torch.no_grad() 31 | def mean_std_loudness(dataset): 32 | mean = 0 33 | std = 0 34 | n = 0 35 | for _, _, l in dataset: 36 | n += 1 37 | mean += (l.mean().item() - mean) / n 38 | std += (l.std().item() - std) / n 39 | return mean, std 40 | 41 | 42 | def multiscale_fft(signal, scales, overlap): 43 | stfts = [] 44 | for s in scales: 45 | S = torch.stft( 46 | signal, 47 | s, 48 | int(s * (1 - overlap)), 49 | s, 50 | torch.hann_window(s).to(signal), 51 | True, 52 | normalized=True, 53 | return_complex=True, 54 | ).abs() 55 | stfts.append(S) 56 | return stfts 57 | 58 | 59 | def resample(x, factor: int): 60 | batch, frame, channel = x.shape 61 | x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame) 62 | 63 | window = torch.hann_window( 64 | factor * 2, 65 | dtype=x.dtype, 66 | device=x.device, 67 | ).reshape(1, 1, -1) 68 | y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x) 69 | y[..., ::factor] = x 70 | y[..., -1:] = x[..., -1:] 71 | y = torch.nn.functional.pad(y, [factor, factor]) 72 | y = torch.nn.functional.conv1d(y, window)[..., :-1] 73 | 74 | y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1) 75 | 76 | return y 77 | 78 | 79 | def upsample(signal, factor, preferred_size=None, mode="nearest"): 80 | signal = signal.permute(0, 2, 1) 81 | if preferred_size is not None: 82 | signal = nn.functional.interpolate(signal, size=preferred_size, mode=mode) 83 | else: 84 | signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor, mode=mode) 85 | return signal.permute(0, 2, 1) 86 | 87 | 88 | def remove_above_nyquist(amplitudes, pitch, sampling_rate): 89 | n_harm = amplitudes.shape[-1] 90 | pitches = pitch * torch.arange(1, n_harm + 1).to(pitch) 91 | aa = (pitches < sampling_rate / 2).float() + 1e-4 92 | return amplitudes * aa 93 | 94 | 95 | def scale_function(x): 96 | return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7 97 | 98 | 99 | def amplitude_to_db(amplitude): 100 | amin = 1e-20 # Avoid log(0) instabilities. 101 | db = torch.log10(torch.clamp(amplitude, min=amin)) 102 | db *= 20.0 103 | return db 104 | 105 | 106 | def extract_loudness(audio, sampling_rate, block_size=None, n_fft=2048, frame_rate=None): 107 | assert (block_size is None) != (frame_rate is None), "Specify exactly one of block_size or frame_rate" 108 | 109 | if frame_rate is not None: 110 | block_size = sampling_rate // frame_rate 111 | else: 112 | frame_rate = int(sampling_rate / block_size) 113 | 114 | if sampling_rate % frame_rate != 0: 115 | raise ValueError( 116 | 'frame_rate: {} must evenly divide sample_rate: {}.' 117 | 'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz' 118 | .format(frame_rate, sampling_rate)) 119 | 120 | if isinstance(audio, np.ndarray): 121 | audio = torch.tensor(audio) 122 | 123 | # Temporarily a batch dimension for single examples. 124 | is_1d = (len(audio.shape) == 1) 125 | audio = audio[None, :] if is_1d else audio 126 | 127 | # Take STFT. 
128 | overlap = 1 - block_size / n_fft 129 | amplitude = torch.stft(audio, n_fft=n_fft, hop_length=block_size, center=True, pad_mode='reflect', return_complex=True).abs() 130 | amplitude = amplitude[:, :, :-1] 131 | 132 | # Compute power. 133 | power_db = amplitude_to_db(amplitude) 134 | 135 | # Perceptual weighting. 136 | frequencies = li.fft_frequencies(sr=sampling_rate, n_fft=n_fft) 137 | a_weighting = li.A_weighting(frequencies)[None,:,None] 138 | loudness = power_db + a_weighting 139 | 140 | loudness = torch.mean(torch.pow(10, loudness / 10.0), axis=1) 141 | loudness = 10.0 * torch.log10(torch.clamp(loudness, min=1e-20)) 142 | 143 | # Remove temporary batch dimension. 144 | loudness = loudness[0] if is_1d else loudness 145 | loudness = loudness.numpy() 146 | 147 | return loudness 148 | 149 | 150 | def mlp(in_size, hidden_size, n_layers): 151 | channels = [in_size] + (n_layers) * [hidden_size] 152 | net = [] 153 | for i in range(n_layers): 154 | net.append(nn.Linear(channels[i], channels[i + 1])) 155 | net.append(nn.LayerNorm(channels[i + 1])) 156 | net.append(nn.LeakyReLU()) 157 | return nn.Sequential(*net) 158 | 159 | 160 | def gru(n_input, hidden_size): 161 | return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True) 162 | 163 | 164 | def harmonic_synth(pitch, amplitudes, sampling_rate): 165 | n_harmonic = amplitudes.shape[-1] 166 | omega = torch.cumsum(2 * math.pi * pitch / sampling_rate, 1) 167 | omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) 168 | 169 | signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True) 170 | return signal 171 | 172 | 173 | def amp_to_impulse_response(amp, target_size): 174 | amp = torch.stack([amp, torch.zeros_like(amp)], -1) 175 | amp = torch.view_as_complex(amp) 176 | amp = fft.irfft(amp) 177 | 178 | filter_size = amp.shape[-1] 179 | 180 | amp = torch.roll(amp, filter_size // 2, -1) 181 | win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device) 182 | 183 | amp = amp * win 184 | 185 | amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size))) 186 | amp = torch.roll(amp, -filter_size // 2, -1) 187 | 188 | return amp 189 | 190 | 191 | def fft_convolve(signal, kernel): 192 | signal = nn.functional.pad(signal, (0, signal.shape[-1])) 193 | kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0)) 194 | 195 | output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel)) 196 | output = output[..., output.shape[-1] // 2:] 197 | 198 | return output 199 | 200 | 201 | def get_scheduler(len_dataset, start_lr, stop_lr, length): 202 | def schedule(epoch): 203 | step = epoch * len_dataset 204 | if step < length: 205 | t = step / length 206 | return start_lr * (1 - t) + stop_lr * t 207 | else: 208 | return stop_lr 209 | 210 | return schedule -------------------------------------------------------------------------------- /syntheon/inferencer/vital/models/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Diff-WTS model. Main adapted from https://github.com/acids-ircam/ddsp_pytorch. 
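The WTSv2 model predicts wavetables, an ADSR amplitude envelope and a filtered-noise component
from pitch, loudness and MFCC features of the input audio.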
3 | """ 4 | from syntheon.inferencer.vital.models.wavetable_synth import WavetableSynthV2 5 | import torch 6 | import torch.nn as nn 7 | from syntheon.inferencer.vital.models.core import mlp, gru, scale_function, remove_above_nyquist, upsample 8 | from syntheon.inferencer.vital.models.core import amp_to_impulse_response, fft_convolve 9 | from syntheon.inferencer.vital.models.adsr_envelope import * 10 | import numpy as np 11 | from torchvision.transforms import Resize 12 | from time import time 13 | 14 | class PrintLayer(nn.Module): 15 | def __init__(self, name): 16 | super(PrintLayer, self).__init__() 17 | self.name = name 18 | 19 | def forward(self, x): 20 | # Do your print / debug stuff here 21 | print(self.name, x[0].squeeze().item()) 22 | x += 1e-2 23 | return x 24 | 25 | 26 | class Reverb(nn.Module): 27 | def __init__(self, length, sampling_rate, initial_wet=0, initial_decay=5): 28 | super().__init__() 29 | self.length = length 30 | self.sampling_rate = sampling_rate 31 | 32 | self.noise = nn.Parameter((torch.rand(length) * 2 - 1).unsqueeze(-1)) 33 | self.decay = nn.Parameter(torch.tensor(float(initial_decay))) 34 | self.wet = nn.Parameter(torch.tensor(float(initial_wet))) 35 | 36 | t = torch.arange(self.length) / self.sampling_rate 37 | t = t.reshape(1, -1, 1) 38 | self.register_buffer("t", t) 39 | 40 | def build_impulse(self): 41 | t = torch.exp(-nn.functional.softplus(-self.decay) * self.t * 500) 42 | noise = self.noise * t 43 | impulse = noise * torch.sigmoid(self.wet) 44 | impulse[:, 0] = 1 45 | return impulse 46 | 47 | def forward(self, x): 48 | lenx = x.shape[1] 49 | impulse = self.build_impulse() 50 | impulse = nn.functional.pad(impulse, (0, 0, 0, lenx - self.length)) 51 | 52 | x = fft_convolve(x.squeeze(-1), impulse.squeeze(-1)).unsqueeze(-1) 53 | 54 | return x 55 | 56 | 57 | def infer_wavetables(y, pitch): 58 | """ 59 | TODO: VERY BUGGY CODE. need to care for edge cases (like silence) 60 | 61 | y: (64000,) 62 | pitch: (400,), 1 second 100 frames 63 | """ 64 | period = 1 / pitch * 16000 65 | 66 | # find the first continuous pitch 67 | # TODO: find the most continuous pitch across the sample, for best results 68 | continuous_threshold = 10 # at least 10 steps = 0.1 sec, will be a problem for plucks, but for now do it like this 69 | continuous_pitch = -1 70 | continuous_pitch_idx = -1 71 | cur_pitch = pitch[0] 72 | step = 0 73 | 74 | for idx in range(1, len(pitch)): 75 | if abs(pitch[idx] - cur_pitch) < 1e-2: # equal freq tolerance 1e-2 76 | step += 1 77 | if step > continuous_threshold: 78 | continuous_pitch = cur_pitch 79 | continuous_pitch_idx = idx - step 80 | break 81 | else: 82 | cur_pitch = pitch[idx] 83 | step = 0 84 | 85 | if continuous_pitch == -1: # fallback 86 | continuous_pitch = pitch[0] 87 | continuous_pitch_idx = 0 88 | 89 | period = int(1 / continuous_pitch * 16000) 90 | pitch_offset_idx = continuous_pitch_idx * 160 # 160 = sr / frame_size (100) 91 | 92 | # find local minimum within a window of 2 periods 93 | cur = y[pitch_offset_idx : pitch_offset_idx + 1600] 94 | min_idx = torch.argmin(cur).item() 95 | 96 | # here we take first wavelet, but also can take the average of a few wavelets 97 | # TODO: prone to silence right now. need to fix. 
now HACK search for local minima across 1600 samples to solve 98 | wavelet = y[min_idx : min_idx + period] 99 | 100 | # upsample + normalize magnitude 101 | wavelet_tensor = wavelet.clone().detach().unsqueeze(-1).unsqueeze(0) 102 | if torch.isinf(wavelet_tensor).any() or torch.isnan(wavelet_tensor).any(): 103 | print('wavelet tensor has inf or nan', torch.isinf(wavelet_tensor).any(), torch.isnan(wavelet_tensor).any()) 104 | wavelet_upsample = upsample(wavelet_tensor, factor=0, preferred_size=512, mode="linear").squeeze() 105 | if torch.isinf(wavelet_upsample).any() or torch.isnan(wavelet_upsample).any(): 106 | print('wavelet upsample has inf or nan', torch.isinf(wavelet_upsample).any(), torch.isnan(wavelet_upsample).any()) 107 | if wavelet_upsample.max() - wavelet_upsample.min() < 1e-4: 108 | # don't min-max norm in this case 109 | pass 110 | else: 111 | wavelet_upsample = (wavelet_upsample - wavelet_upsample.min()) / \ 112 | (wavelet_upsample.max() - wavelet_upsample.min()) 113 | wavelet_upsample = wavelet_upsample * 2 - 1 114 | if torch.isinf(wavelet_upsample).any() or torch.isnan(wavelet_upsample).any(): 115 | print('wavelet upsample 2 has inf or nan', torch.isinf(wavelet_upsample).any(), torch.isnan(wavelet_upsample).any(), 116 | wavelet_upsample.max(), wavelet_upsample.min()) 117 | 118 | return wavelet_upsample 119 | 120 | 121 | class WTSv2(nn.Module): 122 | def __init__(self, hidden_size, n_harmonic, n_bands, sampling_rate, 123 | block_size, mode="wavetable", duration_secs=3, num_wavetables=3, 124 | wavetable_smoothing=False, min_smoothing_sigma=0.5, max_smoothing_sigma=50, 125 | preload_wt=False, is_round_secs=False, enable_amplitude=True, device='cuda' 126 | ): 127 | super().__init__() 128 | self.register_buffer("sampling_rate", torch.tensor(sampling_rate)) 129 | self.register_buffer("block_size", torch.tensor(block_size)) 130 | 131 | # feature extractors 132 | self.encoder = mlp(30, hidden_size, 3) 133 | self.layer_norm = nn.LayerNorm(30) 134 | self.gru_mfcc = nn.GRU(30, 512, batch_first=True) 135 | self.mlp_mfcc = nn.Linear(512, 16) 136 | 137 | self.in_mlps = nn.ModuleList([mlp(1, hidden_size, 3), 138 | mlp(1, hidden_size, 3), 139 | mlp(16, hidden_size, 3)]) 140 | self.gru = gru(3, hidden_size) 141 | self.out_mlp = mlp(hidden_size * 4, hidden_size, 3) 142 | 143 | self.loudness_mlp = nn.Sequential( 144 | nn.Linear(1, 1), 145 | nn.Sigmoid() 146 | ) 147 | self.proj_matrices = nn.ModuleList([ 148 | nn.Linear(hidden_size, n_harmonic + 1), 149 | nn.Linear(hidden_size, n_bands), 150 | ]) 151 | 152 | # for wavetable learning 153 | self.wt1_conv1d = nn.Sequential( 154 | nn.Conv1d(1, num_wavetables, 16, stride=16), # 3 here is num_wavetable 155 | nn.Tanh(), 156 | nn.Conv1d(num_wavetables, num_wavetables, 8, stride=8), 157 | nn.Tanh(), 158 | nn.Linear(500, 512), # 512 is wavetable length 159 | nn.Tanh() 160 | ) 161 | self.attention_wt1 = nn.Linear(512, 1) 162 | self.smoothing_linear = nn.Linear(512, 1) 163 | self.smoothing_sigmoid = nn.Sigmoid() 164 | 165 | # for adsr learning 166 | self.shaper = ADSREnvelopeShaper(is_round_secs) 167 | self.adsr_conv1d = nn.Conv1d(1, 1, block_size, stride=block_size) 168 | 169 | self.attack_gru = nn.GRU(1, 8, batch_first=True, bidirectional=True) 170 | self.decay_gru = nn.GRU(1, 8, batch_first=True, bidirectional=True) 171 | self.sustain_gru = nn.GRU(1, 8, batch_first=True, bidirectional=True) 172 | 173 | self.attack_sec_head = nn.Sequential( 174 | nn.Linear(16, 1), 175 | nn.Sigmoid() 176 | ) 177 | self.decay_sec_head = nn.Sequential( 178 | nn.Linear(16, 
1), 179 | nn.Sigmoid() 180 | ) 181 | self.sustain_level_head = nn.Sequential( 182 | nn.Linear(16, 1), 183 | nn.Sigmoid() 184 | ) 185 | 186 | # for adsr result storage 187 | self.attack_sec = nn.Parameter(torch.ones(1,)) 188 | self.decay_sec = nn.Parameter(torch.ones(1,)) 189 | self.sustain_level = nn.Parameter(torch.ones(1,)) 190 | 191 | self.max_attack_secs = 2.0 192 | self.max_decay_secs = 2.0 193 | 194 | # for synthesis 195 | self.reverb = Reverb(sampling_rate, sampling_rate) 196 | self.wts = WavetableSynthV2(sr=sampling_rate, 197 | duration_secs=duration_secs, 198 | block_size=block_size, 199 | enable_amplitude=enable_amplitude) 200 | self.wavetable_smoothing = wavetable_smoothing 201 | self.min_smoothing_sigma = min_smoothing_sigma 202 | self.max_smoothing_sigma = max_smoothing_sigma 203 | 204 | self.preload_wt = preload_wt 205 | 206 | self.mode = mode 207 | self.duration_secs = duration_secs 208 | self.device = device 209 | 210 | def forward(self, y, mfcc, pitch, loudness, times, onset_frames): 211 | batch_size = y.shape[0] 212 | 213 | # encode mfcc first 214 | # use layer norm instead of trainable norm, not much difference found 215 | mfcc = self.layer_norm(torch.transpose(mfcc, 1, 2)) 216 | mfcc = self.gru_mfcc(mfcc)[0] 217 | mfcc = self.mlp_mfcc(mfcc) 218 | 219 | # use image resize to align dimensions, ddsp also do this... 220 | mfcc = Resize(size=(self.duration_secs * 100, 16))(mfcc) 221 | 222 | hidden = torch.cat([ 223 | self.in_mlps[0](pitch), 224 | self.in_mlps[1](loudness), 225 | self.in_mlps[2](mfcc) 226 | ], -1) 227 | hidden = torch.cat([self.gru(hidden)[0], hidden], -1) 228 | hidden = self.out_mlp(hidden) 229 | 230 | # harmonic part 231 | total_amp = self.loudness_mlp(loudness) 232 | pitch_prev = pitch 233 | 234 | # TODO: upsample is very slow 235 | pitch = upsample(pitch, self.block_size) 236 | total_amp = upsample(total_amp, self.block_size) # use this instead for wavetable 237 | 238 | # diff-wave-synth synthesizer 239 | if self.preload_wt: 240 | # TODO: very slow implementation... 
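            # When preload_wt is enabled, the wavetables are not predicted by the
            # wt1_conv1d stack in the else-branch below; infer_wavetables() lifts one
            # wavetable per batch item directly from the input audio by locating a
            # stretch of stable pitch, cutting roughly one period of the waveform,
            # resampling it to 512 points and rescaling it to [-1, 1].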
241 | wavetables = [] 242 | for idx in range(batch_size): 243 | wt = infer_wavetables(y[idx].squeeze(), pitch_prev[idx].squeeze()) 244 | wavetables.append(wt) 245 | wavetables = torch.stack(wavetables, dim=0).unsqueeze(1) 246 | if torch.isinf(wavetables).any() or torch.isnan(wavetables).any(): 247 | print('wavetables has inf or nan', torch.isinf(wavetables).any(), torch.isnan(wavetables).any()) 248 | else: 249 | wavetables = self.wt1_conv1d(y.unsqueeze(1)) 250 | 251 | if self.wavetable_smoothing: 252 | smoothing_coeff = self.smoothing_linear(wavetables) 253 | smoothing_coeff = smoothing_coeff.squeeze(1) # HACK: here should assume only 1 wavetable 254 | smoothing_coeff = self.smoothing_sigmoid(smoothing_coeff) 255 | wavetables_old = wavetables 256 | wavetables = self.smoothing(wavetables, smoothing_coeff) 257 | else: 258 | wavetables_old = None 259 | smoothing_coeff = None 260 | 261 | attention_output = self.attention_wt1(wavetables).squeeze(-1) 262 | attention_output = nn.Softmax(dim=-1)(attention_output) 263 | 264 | harmonic, attention_output = self.wts(pitch, total_amp, wavetables, attention_output) 265 | 266 | # noise part 267 | param = scale_function(self.proj_matrices[1](hidden) - 5) 268 | 269 | impulse = amp_to_impulse_response(param, self.block_size) 270 | noise = torch.rand( 271 | impulse.shape[0], 272 | impulse.shape[1], 273 | self.block_size, 274 | ).to(impulse) * 2 - 1 275 | 276 | noise = fft_convolve(noise, impulse).contiguous() 277 | noise = noise.reshape(noise.shape[0], -1, 1) 278 | 279 | signal = harmonic + noise 280 | 281 | # adsr shaping 282 | output_attack, hn_attack = self.attack_gru(loudness) 283 | hn_attack = torch.cat([hn_attack[0], hn_attack[1]], dim=-1) 284 | output_decay, hn_decay = self.decay_gru(loudness) 285 | hn_decay = torch.cat([hn_decay[0], hn_decay[1]], dim=-1) 286 | output_sustain, hn_sustain = self.sustain_gru(loudness) 287 | hn_sustain = torch.cat([hn_sustain[0], hn_sustain[1]], dim=-1) 288 | 289 | # print(hn_decay[:10]) 290 | attack_level = self.attack_sec_head(hn_attack).squeeze() # 0-1 291 | decay_level = self.decay_sec_head(hn_decay).squeeze() # 0-1 292 | sustain_level = self.sustain_level_head(hn_sustain).squeeze() 293 | 294 | attack_secs = attack_level * self.max_attack_secs 295 | decay_secs = decay_level * self.max_decay_secs 296 | 297 | amp_onsets = np.append(times[onset_frames], np.array([times[-1]])) # TODO: now 1 onset is enough, because all training samples pitch are the same 298 | 299 | adsr = get_amp_shaper(self.shaper, amp_onsets, 300 | attack_secs=attack_secs, 301 | decay_secs=decay_secs, 302 | sustain_level=sustain_level) 303 | if adsr.shape[1] < pitch_prev.shape[1]: 304 | # adsr = torch.nn.functional.pad(adsr, (0, pitch_prev.shape[1] - adsr.shape[1]), "constant", adsr[-1].item()) 305 | adsr = torch.cat([adsr, adsr[:, -1].unsqueeze(-1)], dim=-1) 306 | else: 307 | adsr = adsr[:pitch_prev.shape[1]] 308 | 309 | self.adsr = adsr 310 | adsr = adsr.unsqueeze(-1) 311 | adsr = upsample(adsr, self.block_size).squeeze(-1) 312 | 313 | adsr = adsr[:, :signal.shape[1]] 314 | 315 | final_signal = signal.squeeze() * adsr 316 | 317 | # reverb part 318 | # signal = self.reverb(signal) 319 | 320 | return signal, (attack_secs, decay_secs, sustain_level), final_signal, attention_output, wavetables, wavetables_old, smoothing_coeff 321 | 322 | def smoothing(self, wavetables, p): 323 | bs, wavetable_length = wavetables.shape[0], wavetables.shape[2] 324 | smoothed_wavetables = torch.zeros((bs, wavetable_length)) 325 | if self.device == "cuda": 326 | 
smoothed_wavetables = smoothed_wavetables.cuda() 327 | 328 | sigma = p * (self.max_smoothing_sigma - self.min_smoothing_sigma) + self.min_smoothing_sigma 329 | sigma = sigma.unsqueeze(-1) # size (bs, 1, 1) 330 | 331 | kernel = torch.arange(wavetable_length) 332 | if self.device == "cuda": 333 | kernel = kernel.cuda() 334 | kernel = kernel.unsqueeze(0) - kernel.unsqueeze(-1) # x_position - x_vals, size (wt_len, wt_len) 335 | 336 | kernel = torch.exp(-kernel ** 2 / (2 * sigma ** 2)) # size (b, wt_len, wt_len) 337 | kernel = kernel / torch.sum(kernel, dim=-1).unsqueeze(-1) # dim 1 or dim -1? 338 | 339 | # wavetables = wavetables.unsqueeze(1) 340 | smoothed_wavetables = torch.bmm(wavetables, kernel) # (bs, 1, wt_len) * (bs, wt_len, wt_len) 341 | return smoothed_wavetables -------------------------------------------------------------------------------- /syntheon/inferencer/vital/models/preprocessor.py: -------------------------------------------------------------------------------- 1 | """ 2 | For loading and preprocessing audio 3 | """ 4 | import numpy as np 5 | import os 6 | import torch 7 | from syntheon.utils.pitch_extractor import extract_pitch 8 | from syntheon.inferencer.vital.models.core import extract_loudness 9 | import librosa 10 | import yaml 11 | from nnAudio import Spectrogram 12 | 13 | with open( 14 | os.path.join( 15 | os.path.dirname(os.path.realpath(__file__)), 16 | "../config.yaml" 17 | ), 'r' 18 | ) as stream: 19 | config = yaml.safe_load(stream) 20 | 21 | # general parameters 22 | sr = config["common"]["sampling_rate"] 23 | n_mfcc = config["train"]["n_mfcc"] 24 | 25 | spec = Spectrogram.MFCC(sr=sr, n_mfcc=n_mfcc) 26 | 27 | 28 | def sanitize_onsets(times, onset_frames, onset_strengths): 29 | """ 30 | times: actual timestamp per frame in STFT (by hop length) 31 | e.g. [0, 0.3, 0.6, ...] 32 | onset_frames: index list, index on `times` to get an onset event 33 | onset_strengths: get strength per frame. same shape as times. 34 | so when strength is high will have a onset event, index in `onset_frames` 35 | """ 36 | # TODO: need to check if we need this always 37 | res_frames = [] 38 | 39 | cur_frame = onset_frames[0] 40 | cur_time = times[cur_frame] 41 | res_frames.append(cur_frame) 42 | 43 | for frame in onset_frames[1:]: 44 | if times[frame] - cur_time > 0.05: # TODO: parameterize 45 | res_frames.append(frame) 46 | cur_frame = frame 47 | cur_time = times[frame] 48 | 49 | return np.array(res_frames) 50 | 51 | 52 | def aggregate(vals): 53 | """ 54 | aggregate the window of pitch values. 55 | rationale: bin pitch values (to reduce fluctuation), get the bin with most values within the window 56 | """ 57 | bins = {} 58 | for val in vals: 59 | bin = val // 10 60 | if bin in bins: 61 | bins[bin].append(val) 62 | else: 63 | bins[bin] = [val] 64 | 65 | sorted_bins = sorted(bins.keys()) 66 | max_len_bin = sorted_bins[0] 67 | 68 | for bin in sorted_bins: 69 | if len(bins[bin]) > len(bins[max_len_bin]): 70 | max_len_bin = bin 71 | 72 | return bins[max_len_bin][0] 73 | 74 | 75 | 76 | def monotonize_pitch(times, onset_frames, pitch): 77 | """ 78 | remove wobbling frequencies in pitch. 
take the pitch value on the onset frame 79 | problem is accuracy issue -- need to align onset and pitch 80 | because librosa onset might read wrong pitch from crepe output 81 | """ 82 | res_pitch = np.zeros(pitch.shape) 83 | pitch_map_lst = [] 84 | 85 | prev_ts = times[onset_frames[0]] 86 | 87 | for idx, frame in enumerate(onset_frames): 88 | if idx == 0: 89 | continue 90 | ts = times[frame] 91 | pitch_vals = pitch[int(prev_ts * 100) : int(ts * 100)] 92 | 93 | if len(pitch_vals) > 0: 94 | cur_pitch = aggregate(pitch_vals) 95 | pitch_map_lst.append((int(prev_ts * 100), cur_pitch)) 96 | prev_ts = ts 97 | 98 | # for final frame 99 | ts = times[-1] 100 | pitch_vals = pitch[int(prev_ts * 100) : int(ts * 100)] 101 | if len(pitch_vals) > 0: 102 | cur_pitch = aggregate(pitch_vals) 103 | pitch_map_lst.append((int(prev_ts * 100), cur_pitch)) 104 | 105 | if pitch_map_lst[0][0] == 0: 106 | res_pitch[0] = pitch_map_lst[0][1] 107 | cur_pitch = pitch_map_lst[0][1] 108 | cur_idx = 1 109 | else: 110 | res_pitch[0] = 0 111 | cur_pitch = 0 112 | cur_idx = 0 113 | 114 | for i in range(1, len(pitch)): 115 | if i == pitch_map_lst[cur_idx][0]: 116 | cur_pitch = pitch_map_lst[cur_idx][1] 117 | res_pitch[i] = cur_pitch 118 | if cur_idx < len(pitch_map_lst) - 1: 119 | cur_idx += 1 120 | else: 121 | res_pitch[i] = cur_pitch 122 | 123 | return res_pitch 124 | 125 | 126 | def preprocess(f, sampling_rate, block_size, signal_length=-1, oneshot=True): 127 | x, sr = librosa.load(f, sampling_rate) 128 | if signal_length == -1: # full length 129 | signal_length = len(x) 130 | else: 131 | if len(x) > signal_length: 132 | x = x[:signal_length*sampling_rate] 133 | elif len(x) < signal_length: 134 | N = (signal_length - len(x) % signal_length) % signal_length 135 | x = np.pad(x, (0, N)) 136 | 137 | if oneshot: 138 | x = x[..., :signal_length] 139 | 140 | D = np.abs(librosa.stft(x)) 141 | times = librosa.times_like(D, sr=sr) 142 | onset_strengths = librosa.onset.onset_strength(y=x, sr=sr, aggregate=np.median) 143 | onset_frames = librosa.onset.onset_detect(y=x, sr=sr) 144 | 145 | onset_frames = sanitize_onsets(times, onset_frames, onset_strengths) 146 | 147 | # TODO: HACK for now, onset detector missed. not all samples need this!! 
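    # Prepend a synthetic onset at frame 0 so that monotonize_pitch() and get_amp_shaper()
    # always see an onset covering the very start of the clip, even when librosa's onset
    # detector misses the initial attack.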
148 | onset_frames = np.concatenate([np.array([0]), onset_frames]) 149 | 150 | pitch = extract_pitch(x, sampling_rate, block_size) 151 | loudness = extract_loudness(x, sampling_rate, block_size) 152 | 153 | pitch_monotonize = monotonize_pitch(times, onset_frames, pitch) 154 | pitch = pitch_monotonize 155 | x = x.reshape(-1, signal_length) 156 | pitch = pitch.reshape(x.shape[0], -1).squeeze() 157 | loudness = loudness.reshape(x.shape[0], -1) 158 | 159 | # prepare for inference input 160 | x = torch.tensor(x) 161 | pitch = torch.tensor(pitch).unsqueeze(0) 162 | loudness = torch.tensor(loudness) 163 | 164 | x = torch.cat([x, x], dim=0) 165 | pitch = torch.cat([pitch, pitch], dim=0) 166 | loudness = torch.cat([loudness, loudness], dim=0) 167 | 168 | mean_loudness, std_loudness = -39.74668743704927, 54.19612404969509 169 | pitch, loudness = pitch.unsqueeze(-1).float(), loudness.unsqueeze(-1).float() 170 | loudness = (loudness - mean_loudness) / std_loudness 171 | 172 | mfcc = spec(x) 173 | 174 | return x, pitch, loudness, times, onset_frames, mfcc 175 | -------------------------------------------------------------------------------- /syntheon/inferencer/vital/models/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import soundfile as sf 4 | 5 | 6 | # helper functions to generate wavetable 7 | def generate_wavetable(length, f): 8 | wavetable = np.zeros((length,), dtype=np.float32) 9 | for i in range(length): 10 | wavetable[i] = f(2 * np.pi * i / length) 11 | return wavetable 12 | 13 | 14 | def sawtooth_waveform(x): 15 | """Sawtooth with period 2 pi.""" 16 | return (x + np.pi) / np.pi % 2 - 1 17 | 18 | 19 | def square_waveform(x): 20 | """Square waveform with period 2 pi.""" 21 | return np.sign(np.sin(x)) 22 | 23 | 24 | def trim_audio(in_name, out_name, start_sec, end_sec, sr=44100): 25 | x, sr = librosa.load(in_name, sr=sr) 26 | x = x[start_sec * sr: end_sec * sr] 27 | sf.write(out_name, x, sr, 'PCM_24') 28 | 29 | 30 | if __name__ == "__main__": 31 | trim_audio("test_audio/kygo_pluck.mp3", "kygo_pluck.wav", 75, 85) -------------------------------------------------------------------------------- /syntheon/inferencer/vital/models/wavetable_synth.py: -------------------------------------------------------------------------------- 1 | """ 2 | Differentiable wavetable synthesis component. 3 | """ 4 | import torch 5 | from torch import nn 6 | import numpy as np 7 | from syntheon.inferencer.vital.models.utils import * 8 | import soundfile as sf 9 | from syntheon.inferencer.vital.models.core import upsample 10 | from syntheon.inferencer.vital.models.adsr_envelope import * 11 | 12 | 13 | def wavetable_osc(wavetable, freq, sr): 14 | """ 15 | General wavetable synthesis oscilator. 
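    Accumulates phase from the instantaneous frequency and reads the wavetable at that
    (wrapped) index, with linear interpolation between neighbouring samples.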
16 | wavetable: (wavetable_len,) 17 | freq: (batch_size, dur * sr) 18 | sr: const 19 | """ 20 | freq = freq.squeeze() 21 | increment = freq / sr * wavetable.shape[0] 22 | index = torch.cumsum(increment, dim=1) - increment[0] 23 | index = index % wavetable.shape[0] 24 | 25 | # uses linear interpolation implementation 26 | index_low = torch.floor(index.clone()) 27 | index_high = torch.ceil(index.clone()) 28 | alpha = index - index_low 29 | index_low = index_low.long() 30 | index_high = index_high.long() 31 | 32 | output = wavetable[index_low] + alpha * (wavetable[index_high % wavetable.shape[0]] - wavetable[index_low]) 33 | 34 | return output 35 | 36 | 37 | def wavetable_osc_v2(wavetable, freq, sr): 38 | """ 39 | General wavetable synthesis oscilator, wavetable per item in batch 40 | wavetable: (batch_size, wavetable_len,) 41 | freq: (batch_size, dur * sr) 42 | sr: const 43 | """ 44 | freq = freq.squeeze() 45 | increment = freq / sr * wavetable.shape[1] 46 | index = torch.cumsum(increment, dim=1) - increment[1] 47 | index = index % wavetable.shape[1] 48 | 49 | # uses linear interpolation implementation 50 | index_low = torch.floor(index.clone()) 51 | index_high = torch.ceil(index.clone()) 52 | alpha = index - index_low 53 | index_low = index_low.long() 54 | index_high = index_high.long() 55 | 56 | batch_size = wavetable.shape[0] 57 | output = [] 58 | 59 | # TODO: do for loop for now, think any ways to parallelize this (einsum?) 60 | for bs in range(batch_size): 61 | wt, idx_l, idx_h, alp = wavetable[bs], index_low[bs].unsqueeze(0), index_high[bs].unsqueeze(0), alpha[bs].unsqueeze(0) 62 | signal = wt[idx_l] + alp * (wt[idx_h % wt.shape[0]] - wt[idx_l]) 63 | output.append(signal) 64 | 65 | output = torch.cat(output, dim=0) 66 | return output 67 | 68 | 69 | def generate_wavetable(length, f, cycle=1, phase=0): 70 | """ 71 | Generate a wavetable of specified length using 72 | function f(x) where x is phase. 73 | Period of f is assumed to be 2 pi. 
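    `cycle` sets how many periods of f are written across the table; `phase` (as a
    fraction of one period) shifts the starting point.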
74 | """ 75 | wavetable = np.zeros((length,), dtype=np.float32) 76 | for i in range(length): 77 | wavetable[i] = f(cycle * 2 * np.pi * i / length + 2 * phase * np.pi) 78 | return torch.tensor(wavetable) 79 | 80 | 81 | class WavetableSynth(nn.Module): 82 | def __init__(self, 83 | wavetables=None, 84 | n_wavetables=64, 85 | wavetable_len=512, 86 | sr=44100, 87 | duration_secs=3, 88 | block_size=160, 89 | is_initial_wt_trainable=True): 90 | super(WavetableSynth, self).__init__() 91 | if wavetables is None: 92 | self.wavetables = [] 93 | for _ in range(n_wavetables): 94 | cur = nn.Parameter(torch.empty(wavetable_len).normal_(mean=0, std=0.01)) 95 | self.wavetables.append(cur) 96 | 97 | self.wavetables = nn.ParameterList(self.wavetables) 98 | 99 | for idx, wt in enumerate(self.wavetables): 100 | wt.data = torch.cat([wt[:-1], wt[0].unsqueeze(-1)], dim=-1) 101 | wt.requires_grad = is_initial_wt_trainable 102 | else: 103 | self.wavetables = wavetables 104 | 105 | self.attention = nn.Parameter(torch.ones(n_wavetables,).cuda()) 106 | self.sr = sr 107 | self.block_size = block_size 108 | self.attention_softmax = nn.Softmax(dim=0) 109 | self.duration_secs = duration_secs 110 | 111 | def forward(self, pitch, amplitude): 112 | output_waveform_lst = [] 113 | for wt_idx in range(len(self.wavetables)): 114 | wt = self.wavetables[wt_idx] 115 | if wt_idx not in [0, 1, 2, 3]: 116 | wt = nn.Tanh()(wt) # ensure wavetable range is between [-1, 1] 117 | waveform = wavetable_osc(wt, pitch, self.sr) 118 | output_waveform_lst.append(waveform) 119 | 120 | # apply attention 121 | attention = self.attention_softmax(self.attention) 122 | attention_output = attention 123 | attention = torch.stack(100 * self.duration_secs * [attention], dim=-1) 124 | attention_upsample = upsample(attention.unsqueeze(-1), self.block_size).squeeze() 125 | 126 | output_waveform = torch.stack(output_waveform_lst, dim=1) 127 | output_waveform = output_waveform * attention_upsample 128 | output_waveform_after = torch.sum(output_waveform, dim=1) 129 | 130 | output_waveform_after = output_waveform_after.unsqueeze(-1) 131 | output_waveform_after = output_waveform_after * amplitude 132 | 133 | return output_waveform_after, attention_output 134 | 135 | 136 | class WavetableSynthV2(nn.Module): 137 | """ 138 | take wavetable as input, not model parameters 139 | """ 140 | def __init__(self, 141 | sr=44100, 142 | duration_secs=4, 143 | block_size=160, 144 | enable_amplitude=True): 145 | """ 146 | Turn on smoothing to reduce noise in learnt wavetables. 147 | Smoothing takes in a 0-1 value, which is window size ratio w.r.t. 
wavetable length 148 | Also a max_smooth_window_size is specified 149 | """ 150 | super(WavetableSynthV2, self).__init__() 151 | self.sr = sr 152 | self.block_size = block_size 153 | self.duration_secs = duration_secs 154 | self.enable_amplitude = enable_amplitude 155 | 156 | def forward(self, pitch, amplitude, wavetables, attention): 157 | """ 158 | batch size version 159 | input: 160 | wavetables: (bs, n_wavetables, wavetable_len), -1 to 1 161 | attention: softmax-ed, (bs, n_wavetables,) 162 | smoothing_coeff: (bs, ), 0 to 1 163 | 164 | output: 165 | (bs, dur * sr) 166 | """ 167 | output_waveform_lst = [] 168 | for wt_idx in range(wavetables.shape[1]): 169 | wt = wavetables[:, wt_idx, :] 170 | waveform = wavetable_osc_v2(wt, pitch, self.sr) 171 | 172 | output_waveform_lst.append(waveform) 173 | 174 | # apply attention 175 | attention_upsample = torch.stack(100 * self.duration_secs * [attention], dim=-1) 176 | attention_upsample = upsample(torch.permute(attention_upsample, (1, 2, 0)), self.block_size) 177 | if (attention_upsample.shape[0] != 1): 178 | attention_upsample = attention_upsample.squeeze() # TODO: a little hacky code here, need to remove 179 | attention_upsample = torch.permute(attention_upsample, (2, 0, 1)) 180 | 181 | output_waveform = torch.stack(output_waveform_lst, dim=1) 182 | output_waveform = output_waveform * attention_upsample 183 | output_waveform_after = torch.sum(output_waveform, dim=1) 184 | 185 | output_waveform_after = output_waveform_after.unsqueeze(-1) 186 | if self.enable_amplitude: 187 | output_waveform_after = output_waveform_after * amplitude 188 | 189 | return output_waveform_after, attention 190 | 191 | 192 | if __name__ == "__main__": 193 | # create a sine wavetable and to a simple synthesis test 194 | wavetable_len = 512 195 | sr = 16000 196 | duration = 4 197 | freq_t_1 = [739.99 for _ in range(sr)] + [523.25 for _ in range(sr)] + [349.23 for _ in range(sr * 2)] 198 | freq_t_1 = torch.tensor(freq_t_1) 199 | freq_t_2 = [523.25 for _ in range(sr)] + [349.23 for _ in range(sr)] + [739.99 for _ in range(sr * 2)] 200 | freq_t_2 = torch.tensor(freq_t_2) 201 | freq_t_3 = [349.23 for _ in range(sr)] + [739.99 for _ in range(sr)] + [523.25 for _ in range(sr * 2)] 202 | freq_t_3 = torch.tensor(freq_t_3) 203 | 204 | pitch, onset_frames, times = np.load("pitch.npy"), np.load("onset.npy"), np.load("times.npy") 205 | pitch = torch.tensor(pitch) 206 | pitch = upsample(pitch.unsqueeze(-1).unsqueeze(0), 160).squeeze() 207 | 208 | freq_t = torch.stack([pitch, pitch, pitch], dim=0) 209 | sine_wavetable = generate_wavetable(wavetable_len, np.sin) 210 | from utils import sawtooth_waveform 211 | saw_wavetable = generate_wavetable(wavetable_len, sawtooth_waveform) 212 | square_wavetable = generate_wavetable(wavetable_len, square_waveform) 213 | 214 | wavetable = torch.stack([sine_wavetable, saw_wavetable, square_wavetable], dim=0) 215 | 216 | # test batch wavetable_osc 217 | signal = wavetable_osc_v2(wavetable, freq_t, sr) 218 | 219 | # test with adsr 220 | shaper = ADSREnvelopeShaper() 221 | adsr = get_amp_shaper(shaper, times[onset_frames], 222 | attack_secs=torch.tensor([0.00]), 223 | decay_secs=torch.tensor([0.05]), 224 | sustain_level=torch.tensor([0.0])) 225 | if adsr.shape[0] < 400: 226 | append_tensor = torch.tensor([adsr[-1]] * (400 - adsr.shape[0])) 227 | adsr = torch.cat([adsr, append_tensor], dim=-1) 228 | else: 229 | adsr = adsr[:400] 230 | adsr = upsample(adsr.unsqueeze(-1).unsqueeze(0), 160).squeeze() 231 | 232 | signal = signal * adsr 233 | 234 | 
sf.write('test_3s_v1.wav', signal.squeeze()[0].detach().numpy(), sr, 'PCM_24') 235 | sf.write('test_3s_v2.wav', signal.squeeze()[1].detach().numpy(), sr, 'PCM_24') 236 | sf.write('test_3s_v3.wav', signal.squeeze()[2].detach().numpy(), sr, 'PCM_24') -------------------------------------------------------------------------------- /syntheon/inferencer/vital/vital_inferencer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vital inferencer. 3 | """ 4 | from syntheon.inferencer.inferencer import Inferencer, InferenceInput, InferenceOutput 5 | from syntheon.inferencer.vital.models.model import WTSv2 6 | from syntheon.inferencer.vital.models.preprocessor import * 7 | from syntheon.inferencer.vital.models.core import multiscale_fft 8 | from syntheon.converter.vital.vital_constants import N_WAVETABLES, CUSTOM_KEYS 9 | import yaml 10 | import torch 11 | import numpy as np 12 | import json 13 | 14 | with open( 15 | os.path.join( 16 | os.path.dirname(os.path.realpath(__file__)), 17 | "config.yaml" 18 | ), 'r' 19 | ) as stream: 20 | config = yaml.safe_load(stream) 21 | 22 | # general parameters 23 | sr = config["common"]["sampling_rate"] 24 | block_size = config["common"]["block_size"] 25 | duration_secs = config["common"]["duration_secs"] 26 | batch_size = config["train"]["batch_size"] 27 | scales = config["train"]["scales"] 28 | overlap = config["train"]["overlap"] 29 | hidden_size = config["train"]["hidden_size"] 30 | n_harmonic = config["train"]["n_harmonic"] 31 | n_bands = config["train"]["n_bands"] 32 | n_wavetables = config["train"]["n_wavetables"] 33 | n_mfcc = config["train"]["n_mfcc"] 34 | train_lr = config["train"]["start_lr"] 35 | visualize = config["visualize"] 36 | device = config["device"] 37 | signal_length = sr * 4 38 | 39 | 40 | class VitalInferenceOutput(InferenceOutput): 41 | def __init__(self): 42 | InferenceOutput.__init__(self) 43 | self.wt_output = None # TODO: can put default values here 44 | self.attention_output = None 45 | self.attack = None 46 | self.decay = None 47 | self.sustain = None 48 | 49 | 50 | class VitalInferenceInput(InferenceInput): 51 | def __init__(self): 52 | self.y = None 53 | self.pitch = None 54 | self.loudness = None 55 | self.times = None 56 | self.onset_frames = None 57 | self.mfcc = None 58 | 59 | 60 | class VitalInferencer(Inferencer): 61 | def convert(self, audio_fname, model_pt_fname=None, enable_eval=False): 62 | # TODO: switch to torchhub 63 | if model_pt_fname is None: 64 | model_pt_fname = os.path.join( 65 | os.path.dirname(os.path.realpath(__file__)), 66 | "checkpoints/model.pt" 67 | ) 68 | 69 | y, pitch, loudness, times, onset_frames, mfcc = preprocess(audio_fname, sampling_rate=16000, block_size=160, 70 | signal_length=signal_length) 71 | inference_input = VitalInferenceInput() 72 | inference_input.y = y 73 | inference_input.pitch = pitch 74 | inference_input.loudness = loudness 75 | inference_input.times = times 76 | inference_input.onset_frames = onset_frames 77 | inference_input.mfcc = mfcc 78 | 79 | model = self.load_model(model_pt_fname, self.device) 80 | inference_output = self.inference(model, inference_input, self.device, enable_eval=enable_eval) 81 | synth_params_dict = self.convert_to_preset(inference_output) 82 | return synth_params_dict, inference_output.eval_dict 83 | 84 | def load_model(self, model_pt_fname, device="cuda"): 85 | model = WTSv2(hidden_size=hidden_size, n_harmonic=n_harmonic, n_bands=n_bands, sampling_rate=sr, 86 | block_size=block_size, mode="wavetable", 87 | 
duration_secs=4, num_wavetables=1, wavetable_smoothing=False, preload_wt=True, enable_amplitude=False, 88 | is_round_secs=False, device=device) 89 | if device == "cuda": 90 | model.load_state_dict(torch.load(model_pt_fname)) 91 | model.cuda() 92 | else: 93 | model.load_state_dict(torch.load(model_pt_fname, map_location=torch.device('cpu'))) 94 | model.eval() 95 | return model 96 | 97 | def inference(self, model, inference_input, device="cuda", enable_eval=False): 98 | if device == "cuda": 99 | inference_input.y = inference_input.y.cuda() 100 | inference_input.mfcc = inference_input.mfcc.cuda() 101 | inference_input.pitch = inference_input.pitch.cuda() 102 | inference_input.loudness = inference_input.loudness.cuda() 103 | 104 | # forward pass 105 | with torch.no_grad(): 106 | _, adsr, output, attention_output, wavetables, _, _ = model( 107 | inference_input.y, 108 | inference_input.mfcc, 109 | inference_input.pitch, 110 | inference_input.loudness, 111 | inference_input.times, 112 | inference_input.onset_frames 113 | ) 114 | 115 | # write wavetables to numpy file 116 | wt_output = [] 117 | 118 | # interp from 512 to 2048 119 | output_length = 2048 120 | for i in range(N_WAVETABLES): 121 | wt = wavetables[i].cpu().detach().numpy().squeeze() 122 | wt_interp = np.interp( 123 | np.linspace(0, 1, output_length, endpoint=False), 124 | np.linspace(0, 1, wt.shape[0], endpoint=False), 125 | wt, 126 | ) 127 | wt_output.append(wt_interp) 128 | 129 | wt_output = np.stack(wt_output, axis=0) 130 | attention_output = attention_output.cpu().detach().numpy().squeeze() 131 | 132 | inference_output = VitalInferenceOutput() 133 | inference_output.wt_output = wt_output 134 | inference_output.attention_output = attention_output 135 | inference_output.attack = adsr[0][0].cpu().detach().numpy().squeeze().item() 136 | inference_output.decay = adsr[1][0].cpu().detach().numpy().squeeze().item() 137 | inference_output.sustain = adsr[2][0].cpu().detach().numpy().squeeze().item() 138 | 139 | if enable_eval: 140 | self.eval(inference_input.y, output, inference_output) 141 | 142 | return inference_output 143 | 144 | def convert_to_preset(self, inference_output): 145 | with open( 146 | os.path.join( 147 | os.path.dirname(os.path.realpath(__file__)), 148 | "init.vital" 149 | ), 'r' 150 | ) as f: 151 | x = json.load(f) 152 | 153 | x[CUSTOM_KEYS] = {} 154 | x[CUSTOM_KEYS]["wavetables"] = [] 155 | for idx in range(N_WAVETABLES): 156 | cur_dict = { 157 | "name": "Litmus WT {}".format(idx + 1), 158 | "wavetable": inference_output.wt_output[idx], 159 | "osc_level": inference_output.attention_output[idx].item() 160 | } 161 | x[CUSTOM_KEYS]["wavetables"].append(cur_dict) 162 | x[CUSTOM_KEYS]["adsr"] = {} 163 | x[CUSTOM_KEYS]["adsr"]["attack"] = inference_output.attack 164 | x[CUSTOM_KEYS]["adsr"]["attack_power"] = 0.0 165 | x[CUSTOM_KEYS]["adsr"]["decay"] = inference_output.decay 166 | x[CUSTOM_KEYS]["adsr"]["decay_power"] = 0.0 167 | x[CUSTOM_KEYS]["adsr"]["sustain"] = inference_output.sustain 168 | 169 | return x 170 | 171 | def eval(self, y, output, inference_output): 172 | ori_stft = multiscale_fft( 173 | y[0].squeeze(), 174 | scales, 175 | overlap, 176 | ) 177 | rec_stft = multiscale_fft( 178 | output[0].squeeze(), 179 | scales, 180 | overlap, 181 | ) 182 | 183 | loss = 0 184 | for s_x, s_y in zip(ori_stft, rec_stft): 185 | lin_loss = ((s_x - s_y).abs()).mean() 186 | loss += lin_loss 187 | 188 | inference_output.eval_dict["loss"] = loss.item() 189 | inference_output.eval_dict["output"] = 
output[0].cpu().detach().numpy().squeeze() 190 | 191 | 192 | if __name__ == "__main__": 193 | vital_inferencer = VitalInferencer(device="cpu") 194 | params, eval_dict = vital_inferencer.convert("test/test_audio/vital_test_audio_2.wav", enable_eval=True) 195 | 196 | from syntheon.converter.vital.vital_converter import VitalConverter 197 | vital_converter = VitalConverter() 198 | vital_converter.dict = params 199 | vital_converter.parseToPluginFile("vital_output.vital") 200 | 201 | -------------------------------------------------------------------------------- /syntheon/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Function APIs to be called externally. 3 | """ 4 | from .converter.vital.vital_converter import VitalConverter 5 | from .inferencer.vital.vital_inferencer import VitalInferencer 6 | 7 | 8 | obj_dict = { 9 | "vital": { 10 | "converter": VitalConverter, 11 | "inferencer": VitalInferencer, 12 | "file_ext": "vital" 13 | } 14 | } 15 | 16 | def infer_params(input_audio_name, synth_name, enable_eval=False): 17 | if synth_name not in obj_dict: 18 | raise ValueError("Synth name {} not available for parameter inference".format(synth_name)) 19 | 20 | inferencer = obj_dict[synth_name]["inferencer"](device="cpu") 21 | params, eval_dict = inferencer.convert(input_audio_name, enable_eval=enable_eval) 22 | 23 | converter = obj_dict[synth_name]["converter"]() 24 | converter.dict = params 25 | output_fname = "{}_output.{}".format(synth_name, obj_dict[synth_name]["file_ext"]) 26 | converter.parseToPluginFile(output_fname) 27 | 28 | return output_fname, eval_dict -------------------------------------------------------------------------------- /syntheon/utils/pitch_extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | [WIP] Common class for pitch extraction across all synthesizers. 
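Wraps the ONNX torchcrepe predictor (with viterbi decoding) and interpolates the resulting
f0 track to one value per analysis block.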
3 | """ 4 | 5 | import numpy as np 6 | import os 7 | from torchcrepeV2 import ONNXTorchCrepePredictor 8 | import yaml 9 | 10 | with open( 11 | os.path.join( 12 | os.path.dirname(os.path.realpath(__file__)), 13 | "../inferencer/vital/config.yaml" 14 | ), 'r' 15 | ) as stream: 16 | config = yaml.safe_load(stream) 17 | 18 | 19 | crepe_predictor = ONNXTorchCrepePredictor() 20 | 21 | 22 | def extract_pitch(signal, sampling_rate, block_size, model_capacity="full"): 23 | length = signal.shape[-1] // block_size 24 | f0 = crepe_predictor.predict( 25 | audio=signal, 26 | sr=sampling_rate, 27 | viterbi=True, 28 | center=True, 29 | step_size=int(1000 * block_size / sampling_rate), 30 | ) 31 | 32 | if f0.shape[-1] != length: 33 | f0 = np.interp( 34 | np.linspace(0, 1, length, endpoint=False), 35 | np.linspace(0, 1, f0.shape[-1], endpoint=False), 36 | f0, 37 | ) 38 | 39 | return f0 -------------------------------------------------------------------------------- /syntheon/version.py: -------------------------------------------------------------------------------- 1 | version = "0.1.0" -------------------------------------------------------------------------------- /test/test_audio/dexed_test_audio_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/dexed_test_audio_1.wav -------------------------------------------------------------------------------- /test/test_audio/vital_test_pluck_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_pluck_1.wav -------------------------------------------------------------------------------- /test/test_audio/vital_test_pluck_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_pluck_2.wav -------------------------------------------------------------------------------- /test/test_audio/vital_test_synth_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_synth_1.wav -------------------------------------------------------------------------------- /test/test_audio/vital_test_synth_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_synth_2.wav -------------------------------------------------------------------------------- /test/test_audio/vital_test_synth_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_synth_3.wav -------------------------------------------------------------------------------- /test/test_audio/vital_test_wonky_bass_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudgud96/syntheon/d5838baa624c9f8ccaee0cf8b75baf87453d1da9/test/test_audio/vital_test_wonky_bass_1.wav -------------------------------------------------------------------------------- /test/test_inferencer.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from syntheon import infer_params 4 | 5 | 6 | # def test_dexed_inferencer(): 7 | # """ 8 | # just check if everything runs well for Dexed 9 | # """ 10 | # output_params_file, eval_dict = infer_params( 11 | # "test/test_audio/dexed_test_audio_1.wav", 12 | # "dexed", 13 | # enable_eval=True 14 | # ) 15 | # assert os.path.exists(output_params_file) 16 | 17 | # os.remove(output_params_file) 18 | 19 | 20 | def test_vital_inferencer_1(): 21 | """ 22 | just check if everything runs well for Vital 23 | """ 24 | loss_lst = [0.11, 0.06, 0.37, 0.42, 0.18, 0.15] 25 | audios = sorted(glob.glob("test/test_audio/vital_*.wav")) 26 | for i in range(len(audios)): 27 | output_params_file, eval_dict = infer_params( 28 | audios[i], 29 | "vital", 30 | enable_eval=True 31 | ) 32 | assert os.path.exists(output_params_file) 33 | assert eval_dict["loss"] < loss_lst[i] 34 | os.remove(output_params_file) --------------------------------------------------------------------------------
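# A minimal usage sketch of the public API exercised by the tests above; it assumes the
# repository root as the working directory and uses one of the bundled Vital test clips.
import os

from syntheon import infer_params

output_preset, eval_dict = infer_params(
    "test/test_audio/vital_test_synth_1.wav",  # bundled test clip
    "vital",                                   # the only synth registered in main.py's obj_dict
    enable_eval=True,
)
print(output_preset)            # "vital_output.vital", loadable as a Vital preset
print(eval_dict["loss"])        # multi-scale STFT reconstruction loss, as asserted in the tests
os.remove(output_preset)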