├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE.md ├── Makefile ├── README.md ├── __init__.py ├── analysis.py ├── synthesis.py └── test ├── Makefile └── test_smoke.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "sigproc"] 2 | path = sigproc 3 | url = 
https://gitlab.com/gillesdegottex/sigproc.git 4 | [submodule "external/REAPER"] 5 | path = external/REAPER 6 | url = https://github.com/google/REAPER.git 7 | [submodule "external/pyworld"] 8 | path = external/pyworld 9 | url = https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder.git 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | cache: 2 | directories: 3 | - $HOME/.cache/pip 4 | 5 | 6 | language: python 7 | 8 | os: 9 | - linux 10 | 11 | branches: 12 | only: 13 | - master 14 | 15 | before_install: 16 | - echo "before_install..." 17 | - git submodule update --init --recursive 18 | - git fetch --tags 19 | - sleep 1 20 | - git checkout $TRAVIS_BRANCH 21 | - export PROJECTGITVERSION=`git describe --tags --always |sed 's/^v//'` 22 | - echo $PROJECTGITVERSION 23 | - echo $TRAVIS_OS_NAME 24 | - echo $TRAVIS_COMMIT 25 | - echo $TRAVIS_TAG 26 | - export TRAVIS_TAG=`git describe --tags --exact-match HEAD 2>/dev/null` 27 | - echo $TRAVIS_TAG 28 | 29 | - sudo apt-get -qq update 30 | - sudo apt-get install -y cython 31 | - sudo apt-get install -y python-numpy 32 | - sudo apt-get install -y python-scipy 33 | - sudo apt-get install -y sox 34 | 35 | install: 36 | - echo "install..." 37 | - pip install codecov 38 | - pip install cython 39 | - pip install numpy 40 | - pip install scipy 41 | 42 | script: 43 | - echo "script..." 
44 | 45 | - export PACKAGENAME=pulsemodel.zip 46 | - echo "Packaging sources of " $PACKAGENAME 47 | - pwd 48 | - ls -l 49 | - mkdir ../pulsemodel_source_build 50 | - cp -r * ../pulsemodel_source_build/ 51 | - echo $PROJECTGITVERSION > ../pulsemodel_source_build/Version 52 | - rm -fr ../pulsemodel_source_build/.git* 53 | - rm -fr ../pulsemodel_source_build/sigproc/.git* 54 | - rm -fr ../pulsemodel_source_build/external/pyworld/.git* 55 | - rm -fr ../pulsemodel_source_build/external/pyworld/lib/World/.git* 56 | - cp -fr ../pulsemodel_source_build pulsemodel 57 | 58 | - zip -r $PACKAGENAME pulsemodel 59 | - ls 60 | - ls -l $PACKAGENAME 61 | 62 | - echo "Testing ..." 63 | - make 64 | - cd test 65 | - make samples.scp 66 | - cd .. 67 | - pwd 68 | - coverage run --source=. --omit="test/*","external/*" test/test_smoke.py 69 | 70 | 71 | after_success: 72 | - codecov 73 | 74 | 75 | deploy: 76 | skip_cleanup: true 77 | provider: releases 78 | api_key: 79 | secure: BlKzFevP0b1d0vB3yfQU4D91agOQ3JA/2mX9gY+rrho0uNlVp+kZIUgPQ4+4EQROwttSdSLMzFLZdAhDmPnS8RP3mBmLdacZ4Z2B27FvhGh+Comfwobk2kT7R2WFg7jAAef9AK3/dJyRYe1bLOEdAGeNlEpa5ucjpLJ389zxKZXkm2T2UMzH+6n+YGND34utB9PKcQ9DluHA9H17FMmxFjQcYCLPiy5ytdOHNUusF9eBoYmgqJnfQ393vWpQbl88HJloPOtdCaCw1XUt6p6AwEsq9F9RP0tDL3hRIXOTk3Uwe1q94wYDR/LFt1MSL/IxQ1PptaszT/CgrHf8y2K+QUKUU2iEIeKywXjYM227+dC8E0VgruMNNWIdOAc6o/e2zJ0ORt8Exh+TYyMWIXIUe4smneLZWjZZVJSO371GWeCgQW4jLh18Ae6CN/FmFiMuhPQDzncoMCr+cJbeQCx3LdYLgaczk5euU5mL6MCJEwYa5Nk2zNKrMrDjx5BQu4cS2zqaGJcKYQ0WPSamDnlgqWFJUW91uscHifKvlkai8kDB8IpsUIRo1aWor9UAJuXPMyJiTT6D2gJxN07ZFmuOBHmGP5FuRSnDwMFN2mHxxbgnnThDu7dmG0bFMFsuCZoSAwezz3yvT15NCm+5a1l3ikZ198Ek6DKzfOLTDL+jbHQ= 80 | file: $PACKAGENAME 81 | on: 82 | repo: gillesdegottex/pulsemodel 83 | tags: true 84 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | 
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 Engineering Department, University of Cambridge, UK. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build submodule_init build_sigproc build_reaper build_pyworld 2 | 3 | 4 | all: build 5 | 6 | submodule_init: 7 | git submodule update --init --recursive 8 | 9 | 10 | build: submodule_init build_sigproc build_reaper build_pyworld 11 | 12 | build_sigproc: submodule_init 13 | cd sigproc; $(MAKE) 14 | 15 | build_reaper: submodule_init 16 | cd external/REAPER; mkdir build; cd build; cmake ..; make 17 | 18 | build_pyworld: submodule_init 19 | cd external/pyworld; python setup.py build_ext --inplace 20 | 21 | 22 | test: build 23 | cd test; $(MAKE) 24 | python test/test_smoke.py 25 | 26 | distclean: 27 | rm -f *.pyc 28 | cd test; $(MAKE) distclean 29 | cd sigproc; $(MAKE) distclean 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build 
Status](https://travis-ci.org/gillesdegottex/pulsemodel.svg?branch=master)](https://travis-ci.org/gillesdegottex/pulsemodel) 2 | [![codecov](https://codecov.io/gh/gillesdegottex/pulsemodel/branch/master/graph/badge.svg)](https://codecov.io/gh/gillesdegottex/pulsemodel) 3 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/c9fbe9dc053046349c7cca95b8ce6404)](https://www.codacy.com/app/gillesdegottex/pulsemodel?utm_source=github.com&utm_medium=referral&utm_content=gillesdegottex/pulsemodel&utm_campaign=Badge_Grade) 4 | 5 | 6 | ## Pulse model analysis and synthesis 7 | 8 | It is basically the vocoder described in: 9 | > G. Degottex, P. Lanchantin and M. Gales, "A Log Domain Pulse Model for Parametric 10 | > Speech Synthesis", IEEE Transactions on Audio, Speech, and Language Processing, 11 | > 26(1):57-70, 2018. 12 | 13 | ### Documentation 14 | Please see the headers of the analysis.py and synthesis.py files as well as the 15 | function docstrings for more details. 16 | 17 | ### Testing/HowTo 18 | In the root directory, simply run: 19 | ```make 20 | $ make test 21 | ``` 22 | 23 | You can also have a look at the file test/test_smoke.py to see how the PML scripts can be used. 24 | 25 | ### Legal 26 | 27 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK. 28 | 29 | The code in this repository is released under the Apache License, Version 2.0. 30 | Please see LICENSE.md for more details. 31 | 32 | Author: Gilles Degottex 33 | 34 | ### External tools 35 | PML primarily aims at extracting a noise measure and synthesizing a waveform, assuming that the F0 curve and the amplitude spectral envelopes are already given. 36 | 37 | In order to make it a standalone vocoder, it was thus necessary to import an F0 estimator and a spectral envelope estimator. 
38 | 39 | #### For F0 40 | For F0, REAPER is used: 41 | > https://github.com/gillesdegottex/REAPER 42 | 43 | #### For the amplitude spectral envelope 44 | For the amplitude spectral envelope, the estimator CheapTrick is used: 45 | 46 | > Masanori Morise, CheapTrick, a spectral envelope estimator for high-quality speech synthesis, Speech Communication, Volume 67, 2015, Pages 1-7, ISSN 0167-6393, http://dx.doi.org/10.1016/j.specom.2014.09.003. 47 | 48 | The python wrapper of the original implementation is used (without any modification) and can be found at: 49 | > https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder 50 | 51 | Note that all the published results about PML have been done using the spectral envelope of the STRAIGHT vocoder, NOT using WORLD. 52 | Because of legal reason it is not possible to release any of STRAIGHT vocoder analysis. Thus, the use of CheapTrick instead in this repository. 53 | It also means that, even though STRAIGHT's envelope and CheapTrick are quite similar, you might observe small differences in results between the two. 54 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK. 3 | 4 | License 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | 17 | Author 18 | Gilles Degottex 19 | ''' 20 | 21 | from analysis import * 22 | from synthesis import * 23 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | 4 | References 5 | [1] G. Degottex, P. Lanchantin, and M. Gales, "A Pulse Model in Log-domain 6 | for a Uniform Synthesizer," in Proc. 9th Speech Synthesis Workshop 7 | (SSW9), 2016. 8 | [2] G. Degottex and D. Erro, "A uniform phase representation for the 9 | harmonic model in speech synthesis applications," EURASIP, Journal on 10 | Audio, Speech, and Music Processing - Special Issue: Models of Speech - 11 | In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014. 12 | [3] G. Degottex, P. Lanchantin and M. Gales, "A Log Domain Pulse Model for 13 | Parametric Speech Synthesis", IEEE Transactions on Audio, Speech, and 14 | Language Processing, 26(1):57-70, 2018. 15 | 16 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK. 17 | 18 | License 19 | Licensed under the Apache License, Version 2.0 (the "License"); 20 | you may not use this file except in compliance with the License. 21 | You may obtain a copy of the License at 22 | 23 | http://www.apache.org/licenses/LICENSE-2.0 24 | 25 | Unless required by applicable law or agreed to in writing, software 26 | distributed under the License is distributed on an "AS IS" BASIS, 27 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 28 | See the License for the specific language governing permissions and 29 | limitations under the License. 30 | 31 | Author 32 | Gilles Degottex 33 | ''' 34 | 35 | import argparse 36 | import sys 37 | import os 38 | import warnings 39 | 40 | import numpy as np 41 | np.random.seed(123) # Generate always the same "random" numbers, for debugging. 
def analysis_f0postproc(wav, fs, f0s=None, f0_min=60, f0_max=600,
             shift=0.005,        # Usually 5ms
             f0estimator='REAPER',
             verbose=1):
    '''
    Post process the F0 estimate.
    If f0s is None, an F0 estimate is extracted using REAPER.

    Parameters
        wav, fs        : Waveform and its sampling frequency. They are only
                         used when the F0 has to be estimated (f0s is None).
        f0s            : Either a vector of F0 values [Hz], in which case the
                         times are generated as shift*[0, 1, 2, ...], or a
                         two-column matrix [time[s], value[Hz]].
                         Values <=0 are replaced by interpolating the values >0.
                         The given array is left untouched (a copy is edited).
        f0_min, f0_max : The F0 curve is clipped to this interval [Hz].
                         f0_min is also the constant used when no value >0 exists.
        shift          : Time step [s] used for the REAPER call, for generating
                         the times of a value-only f0s and for the resampling.
        f0estimator    : Currently ignored (see the TODO below).
        verbose        : Currently unused in this function.

    Returns
        A two-column matrix [time[s], value[Hz]].
    '''
    if f0s is None:
        # TODO Switch f0 estimator using `f0estimator`
        f0s = sigproc.interfaces.reaper(wav, fs, shift, f0_min, f0_max)

    # Work on a private copy: the curve is edited in place below and the
    # caller's array should not be modified behind its back.
    f0s = np.array(f0s)
    # np.interp results and np.finfo below need a floating-point type
    if not np.issubdtype(f0s.dtype, np.floating):
        f0s = f0s.astype(np.float64)

    # If only values are given, make two column matrix [time[s], value[Hz]] (ljuvela)
    if len(f0s.shape)==1:
        ts = (shift)*np.arange(len(f0s))
        f0s = np.vstack((ts, f0s)).T

    if not (f0s[:,1]>0).any():
        warnings.warn('''\n\nWARNING: No F0 value can be estimated in this signal.
         It will be replaced by the constant f0_min value ({}Hz).
        '''.format(f0_min), RuntimeWarning)
        f0s[:,1] = f0_min

    # Build the continuous f0: values <=0 are interpolated from the values >0
    hasvalue = f0s[:,1]>0
    f0s[:,1] = np.interp(f0s[:,0], f0s[hasvalue,0], f0s[hasvalue,1])
    # Avoid erratic values outside of the given interval
    f0s[:,1] = np.clip(f0s[:,1], f0_min, f0_max)
    # Removes steps in the f0 curve (see sigproc.resampling.f0s_rmsteps(.) )
    f0s = sp.f0s_rmsteps(f0s)
    # Resample the given f0 to regular intervals
    timesteps = np.diff(f0s[:,0])
    if np.std(timesteps)>2*np.finfo(f0s[0,0]).resolution:
        # The reported statistics are those of the time steps, which is
        # what the uniformity test above is based on.
        warnings.warn('''\n\nWARNING: F0 curve seems to be sampled non-uniformly (mean(time step)={}s, std(time step)={}s).
         It will be resampled at {}s intervals.
        '''.format(np.mean(timesteps), np.std(timesteps), shift), RuntimeWarning)
        f0s = sp.f0s_resample_cst(f0s, shift)

    return f0s
125 | # Estimate the sinusoidal parameters at regular intervals in order 126 | # to build the amplitude spectral envelope 127 | sinsreg, _ = sp.sinusoidal.estimate_sinusoidal_params(wav, fs, f0s, nbper=3, quadraticfit=True, verbose=verbose-1) 128 | 129 | warnings.warn('''\n\nWARNING: Neither straight_mcep nor WORLD's cheaptrick spectral envelope estimators are available. 130 | Thus, a SIMPLISTIC Linear interpolation will be used for the spectral envelope. 131 | Do _NOT_ use this envelope for speech synthesis! 132 | Please use a better one (e.g. STRAIGHT's or WORLD's). 133 | If you use this simplistic envelope, the TTS quality will 134 | be lower than that in the results reported. 135 | ''', RuntimeWarning) 136 | 137 | SPEC = sp.multi_linear(sinsreg, fs, dftlen) 138 | SPEC = np.exp(SPEC)*np.sqrt(float(dftlen)) 139 | 140 | return SPEC 141 | 142 | def analysis_pdd(wav, fs, f0s, 143 | dftlen=4096, # You can adapt this one according to your pipeline 144 | pdd_sin_nbperperiod=4, # 4 analysis instants per period [2] 145 | pdd_sin_winnbper=2.5, # 2.5 is enough for phase measure 146 | # (it overestimates the amplitude but we 147 | # don't use it anyway) 148 | verbose=1): 149 | ''' 150 | Estimate the Phase Distortion Deviation (PDD). 151 | ''' 152 | 153 | # Extract the Phase Distortion Deviation (PDD) feature 154 | # Will need a pitch sync analysis, so resample the f0 accordingly 155 | f0sps = sp.f0s_resample_pitchsync(f0s, nbperperiod=pdd_sin_nbperperiod) 156 | 157 | # Estimate the sinusoidal parameters 158 | sinsps, f0sps = sp.sinusoidal.estimate_sinusoidal_params(wav, fs, f0sps, nbper=pdd_sin_winnbper, quadraticfit=True, verbose=verbose-1) 159 | 160 | # Compute PDD from the sinusoidal parameters 161 | # We don't provide an envelope estimate so the VTF's phase will stay in the computation 162 | # However, the VTF's phase is ~constant wrt time, thus disapear in the variance measure. 
def analysis_nm(wav, fs,
        f0s,                # Has to be continuous (should use analysis_f0postproc)
        PDD,                # Phase Distortion Deviation [2]
                            # Its length should match f0s'
        pdd_threshold=0.75, # 0.75 as in [2]
        nm_clean=True,      # Use morphological opening and closure to
                            # clean the mask and avoid learning rubish.
        verbose=1):
    '''
    Estimate the Noise Mask (NM) from the Phase Distortion Deviation (PDD).

    Parameters
        wav           : Unused here.
        fs            : Sampling frequency [Hz], used to convert frequencies
                        into DFT bin indices.
        f0s           : Two-column matrix [time[s], value[Hz]], one row per
                        frame. The mean time step is used as frame shift.
        PDD           : Matrix with one row per frame (same number of rows as
                        f0s); the DFT length is deduced as 2*(nb columns-1).
                        It is not modified.
        pdd_threshold : Values of PDD above this threshold are set to 1 in the
                        mask, the others to 0.
        nm_clean      : If True, clean the mask using a morphological opening
                        followed by a closing (this needs at least two frames
                        for measuring the frame shift).
        verbose       : Unused here.

    Returns
        The noise mask, with the same shape as PDD, made of 0 and 1 values.

    Raises
        ValueError if f0s and PDD do not have the same number of frames.
    '''

    if f0s.shape[0]!=PDD.shape[0]:
        raise ValueError('f0s size and PDD size do not match!') # pragma: no cover

    shift = np.mean(np.diff(f0s[:,0])) # Get the time shift from the F0 times
    dftlen = (PDD.shape[1]-1)*2  # and the DFT len from the PDD feature

    # The Noise Mask is just a thresholded version of PDD
    NM = PDD.copy()
    NM[PDD<=pdd_threshold] = 0
    NM[PDD>pdd_threshold] = 1

    if nm_clean:
        # Clean the PDD mask to avoid learning rubish details
        import scipy.ndimage
        frq = 70.0 # [Hz]
        # Size of the structuring element: one period of `frq` along time
        # and `frq` Hz along frequency. Each dimension is kept >=1 because
        # scipy refuses an empty structuring element, which used to happen
        # for a short dftlen (relative to fs) or a long frame shift.
        nbframes = max(1, int(np.round((1.0/frq)/shift)))
        nbbins = max(1, int(np.round(frq*dftlen/float(fs))))
        morphstruct = np.ones((nbframes, nbbins))
        # The morphological operations are applied on the complement of the
        # noise mask, which is then flipped back.
        notnoise = 1.0-NM
        notnoise = scipy.ndimage.binary_opening(notnoise, structure=morphstruct)
        notnoise = scipy.ndimage.binary_closing(notnoise, structure=morphstruct)
        NM = 1.0-notnoise

    # Avoid noise in low-freqs: no noise below 1.5*f0 in each frame
    for n, f0 in enumerate(f0s[:,1]):
        NM[n,:int(np.round(1.5*f0*dftlen/float(fs)))] = 0.0

    return NM
def plot_features(wav=None, fs=None, f0s=None, SPEC=None, PDD=None, NM=None): # pragma: no cover
    '''
    Plot the given features in vertically stacked panels sharing the time
    axis, then break into an interactive debugger.

    Only the features that are not None are drawn, in this order: waveform,
    F0 curve, amplitude spectral envelope (in dB), PDD and noise mask.
    Nothing is done if no feature is given.

    Parameters
        wav  : Waveform samples.
        fs   : Sampling frequency [Hz]. Needed for the waveform time axis and
               for the frequency axis (0 to fs/2) of SPEC, PDD and NM.
        f0s  : Two-column matrix [time[s], value[Hz]]. When given, its first
               and last times define the time extent of the SPEC, PDD and NM
               images (otherwise 0s to 1s is used).
        SPEC, PDD, NM : Matrices drawn as images after being transposed.
    '''
    # TODO Could test this by writting in a picture
    nbview = sum(feat is not None for feat in (wav, f0s, SPEC, PDD, NM))
    if nbview==0:
        # Nothing to draw, and a figure cannot be built with zero panels
        return

    # Default time extent of the images, replaced by the F0 time span if known
    tstart = 0.0
    tend = 1.0

    import matplotlib.pyplot as plt
    plt.ion()
    _, axs = plt.subplots(nbview, 1, sharex=True, sharey=False)
    # A single panel is returned as a bare Axes: always index an array
    if not isinstance(axs, np.ndarray): axs = np.array([axs])
    view = 0
    if wav is not None:
        times = np.arange(len(wav))/float(fs)
        axs[view].plot(times, wav, 'k')
        axs[view].set_ylabel('Waveform\nAmplitude')
        axs[view].grid()
        axs[view].set_xlim((0.0, times[-1]))
        view += 1
    if f0s is not None:
        tstart = f0s[0,0]
        tend = f0s[-1,0]
        axs[view].plot(f0s[:,0], f0s[:,1], 'k')
        axs[view].set_ylabel('F0\nFrequency [Hz]')
        axs[view].grid()
        view += 1
    if SPEC is not None:
        axs[view].imshow(sp.mag2db(SPEC).T, origin='lower', aspect='auto', interpolation='none', extent=(tstart, tend, 0, 0.5*fs), cmap='jet')
        axs[view].set_ylabel('Amp. Envelope\nFrequency [Hz]')
        view += 1
    if PDD is not None:
        axs[view].imshow(PDD.T, origin='lower', aspect='auto', interpolation='none', extent=(tstart, tend, 0, 0.5*fs), cmap='jet', vmin=0.0, vmax=2.0)
        axs[view].set_ylabel('PDD\nFrequency [Hz]')
        view += 1
    if NM is not None:
        axs[view].imshow(NM.T, origin='lower', aspect='auto', interpolation='none', extent=(tstart, tend, 0, 0.5*fs), cmap='Greys', vmin=0.0, vmax=1.0)
        axs[view].set_ylabel('Noise Mask \nFrequency [Hz]')
        view += 1
    axs[-1].set_xlabel('Time [s]')

    # Break into a debugger (presumably to keep the interactive figure open
    # for inspection). Prefer IPython's debugger, as before, but fall back to
    # the standard one so that IPython is not a hard requirement.
    try:
        from IPython.core.debugger import Pdb
    except ImportError:
        from pdb import Pdb
    Pdb().set_trace()
20Hz) 290 | verbose=1): 291 | 292 | wav, fs, _ = sp.wavread(fwav) 293 | 294 | if len(wav)==0: raise ValueError('The waveform in {} is empty.'.format(fwav)) 295 | 296 | if verbose>0: print('PML Analysis (dur={:.3f}s, fs={}Hz, f0 in [{},{}]Hz, shift={}s, dftlen={})'.format(len(wav)/float(fs), fs, f0_min, f0_max, shift, dftlen)) 297 | 298 | if (not preproc_fs is None) and (preproc_fs!=fs): 299 | if verbose>0: print(' Resampling the waveform (new fs={}Hz)'.format(preproc_fs)) 300 | wav = sp.resample(wav, fs, preproc_fs, method=2, deterministic=True) 301 | fs = preproc_fs 302 | 303 | if not preproc_hp is None: 304 | if verbose>0: print(' High-pass filter the waveform (cutt-off={}Hz)'.format(preproc_hp)) 305 | b, a = sig.butter(4, preproc_hp/(fs/0.5), btype='high') 306 | wav = sig.filtfilt(b, a, wav) 307 | 308 | f0s = None 309 | if finf0txt: 310 | f0s = np.loadtxt(finf0txt) 311 | 312 | # read input f0 file in float32 (ljuvela) 313 | if finf0bin: 314 | f0s = np.fromfile(finf0bin, dtype=np.float32) 315 | 316 | f0s = analysis_f0postproc(wav, fs, f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, f0estimator=f0estimator, verbose=verbose) 317 | if verbose>2: f0sori=f0s.copy() 318 | 319 | if ff0: 320 | f0_values = f0s[:,1] 321 | if verbose>0: print(' Output F0 {} in: {}'.format(f0_values.shape, ff0)) 322 | if f0_log: f0_values = np.log(f0_values) 323 | if os.path.dirname(ff0)!='' and (not os.path.isdir(os.path.dirname(ff0))): os.mkdir(os.path.dirname(ff0)) 324 | f0_values.astype(np.float32).tofile(ff0) 325 | 326 | SPEC = None 327 | if fspec: 328 | SPEC = analysis_spec(wav, fs, f0s, shift=shift, dftlen=dftlen, verbose=verbose) 329 | if verbose>2: SPECori=SPEC.copy() 330 | if not spec_mceporder is None: # pragma: no cover 331 | # Cannot test this because it needs SPTK 332 | SPEC = sp.spec2mcep(SPEC, sp.bark_alpha(fs), order=spec_mceporder) 333 | if not spec_fwceporder is None: 334 | SPEC = sp.loghspec2fwcep(np.log(abs(SPEC)), fs, order=spec_fwceporder) 335 | if not spec_nbfwbnds is 
None: 336 | SPEC = sp.linbnd2fwbnd(np.log(abs(SPEC)), fs, dftlen, spec_nbfwbnds) 337 | if not spec_nblinlogbnds is None: 338 | SPEC = np.log(abs(SPEC)) 339 | if verbose>0: print(' Output Spectrogram size={} in: {}'.format(SPEC.shape, fspec)) 340 | if os.path.dirname(fspec)!='' and (not os.path.isdir(os.path.dirname(fspec))): os.mkdir(os.path.dirname(fspec)) 341 | SPEC.astype(np.float32).tofile(fspec) 342 | 343 | PDD = None 344 | if fpdd or fnm: 345 | PDD = analysis_pdd(wav, fs, f0s, dftlen=dftlen, verbose=verbose) 346 | if verbose>2: PDDori=PDD.copy() 347 | 348 | if fpdd: 349 | if not pdd_mceporder is None: # pragma: no cover 350 | # Cannot test this because it needs SPTK 351 | # If asked, compress PDD 352 | PDD[PDD<0.001] = 0.001 # From COVAREP 353 | PDD = sp.spec2mcep(PDD, sp.bark_alpha(fs), pdd_mceporder) 354 | if verbose>0: print(' Output PDD size={} in: {}'.format(PDD.shape, fpdd)) 355 | if os.path.dirname(fpdd)!='' and (not os.path.isdir(os.path.dirname(fpdd))): os.mkdir(os.path.dirname(fpdd)) 356 | PDD.astype(np.float32).tofile(fpdd) 357 | 358 | NM = None 359 | if verbose>2: NMori=None 360 | if fnm: 361 | NM = analysis_nm(wav, fs, f0s, PDD, verbose=verbose) 362 | if verbose>2: NMori=NM.copy() 363 | # If asked, compress NM 364 | if nm_nbfwbnds: 365 | # If asked, compress the noise mask using a number of mel bands 366 | NM = sp.linbnd2fwbnd(NM, fs, dftlen, nm_nbfwbnds) 367 | if verbose>0: print(' Output Noise Mask size={} in: {}'.format(NM.shape, fnm)) 368 | if os.path.dirname(fnm)!='' and (not os.path.isdir(os.path.dirname(fnm))): os.mkdir(os.path.dirname(fnm)) 369 | NM.astype(np.float32).tofile(fnm) 370 | 371 | if verbose>2: 372 | plot_features(wav=wav, fs=fs, f0s=f0sori, SPEC=SPECori, PDD=PDDori, NM=NMori) # pragma: no cover 373 | 374 | def main(argv): 375 | argpar = argparse.ArgumentParser() 376 | argpar.add_argument("wavfile", help="Input wav file") 377 | argpar.add_argument("--shift", default=0.005, type=float, help="time step[s] between the input frames 
(def. 0.005s)") 378 | argpar.add_argument("--dftlen", default=4096, type=int, help="Number of bins in the DFT (def. 4096)") 379 | argpar.add_argument("--inf0txt", default=None, help="Given f0 file") 380 | argpar.add_argument("--inf0bin", default=None, help="Given f0 file (single precision float binary)") 381 | argpar.add_argument("--f0_min", default=60, type=float, help="Minimal possible f0[Hz] value (def. 60Hz)") 382 | argpar.add_argument("--f0_max", default=600, type=float, help="Maximal possible f0[Hz] value (def. 600Hz)") 383 | argpar.add_argument("--f0", default=None, help="Output f0 file") 384 | argpar.add_argument("--f0_log", action='store_true', help="Output f0 file with log Hertz values instead of linear Hertz (def. False)") 385 | argpar.add_argument("--spec", default=None, help="Output spectrum-related file") 386 | argpar.add_argument("--spec_mceporder", default=None, type=int, help="Mel-cepstral order for the spectrogram (None:uncompressed; typically 59)") 387 | argpar.add_argument("--spec_fwceporder", default=None, type=int, help="Frequency warped cepstral order (very similar to above, just faster and less precise) (typically 59)") 388 | argpar.add_argument("--spec_nbfwbnds", default=None, type=int, help="Number of mel-bands in the compressed half log spectrogram (None:uncompressed; typically 129 (should be odd size as long as full spectrum size if power of 2 (even size)") 389 | argpar.add_argument("--spec_nblinlogbnds", default=None, type=int, help="Number of frequency bands in the compressed half log spectrogram (None:uncompressed; typically 129 (should be odd size as long as full spectrum size if power of 2 (even size)") 390 | argpar.add_argument("--pdd", default=None, help="Output Phase Distortion Deviation (PDD) file") 391 | argpar.add_argument("--pdd_mceporder", default=None, type=int, help="Cepstral order for PDD (None:uncompressed; typically 59)") 392 | argpar.add_argument("--nm", default=None, help="Output noise mask") 393 | 
argpar.add_argument("--nm_nbfwbnds", default=None, type=int, help="Number of mel-bands in the compressed noise mask (None:uncompressed; typically 33)") 394 | argpar.add_argument("--preproc_fs", default=None, type=float, help="[Hz] Resample the waveform before analysis.") 395 | argpar.add_argument("--preproc_hp", default=None, type=float, help="[Hz] High-pass the waveform before analysis.") 396 | argpar.add_argument("--verbose", default=1, type=int, help="Output some information") 397 | args = argpar.parse_args(argv) 398 | 399 | analysisf(args.wavfile, 400 | shift=args.shift, 401 | dftlen=args.dftlen, 402 | finf0txt=args.inf0txt, f0_min=args.f0_min, f0_max=args.f0_max, ff0=args.f0, f0_log=args.f0_log, 403 | finf0bin=args.inf0bin, 404 | fspec=args.spec, spec_mceporder=args.spec_mceporder, spec_fwceporder=args.spec_fwceporder, spec_nbfwbnds=args.spec_nbfwbnds, spec_nblinlogbnds=args.spec_nblinlogbnds, 405 | fpdd=args.pdd, pdd_mceporder=args.pdd_mceporder, 406 | fnm=args.nm, nm_nbfwbnds=args.nm_nbfwbnds, 407 | preproc_fs=args.preproc_fs, preproc_hp=args.preproc_hp, 408 | verbose=args.verbose) 409 | 410 | if __name__ == "__main__" : # pragma: no cover 411 | main(sys.argv[1:]) 412 | -------------------------------------------------------------------------------- /synthesis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | 4 | Description 5 | 6 | If using files, (call by command line or from python): 7 | all the inputs are raw float32 vectors files that are reshaped by the number 8 | of f0 values in ff0. 9 | 10 | There are three safe patches that were not described in the publication[1]: 11 | (These are not critical, they might remove a few artifacts here and there). 12 | * The noise mask is slightly low-passed (smoothed) across frequency 13 | (def. 9 bins freq. window), in order to avoid cliffs in frequency domain 14 | that end up creating Gibbs phenomenon in the time domain. 
15 | * High-pass filtering (def. 0.5*f0 cut-off) 16 | This centers each synthesized segment around zero, to avoid cutting 17 | any DC residual component (e.g. coming from the spectral envelope). 18 | * Short half-window (def. 1ms (yes, one ms)) on the left of the pulse, 19 | in order to avoid any pre-echos. 20 | 21 | Reference 22 | [1] G. Degottex, P. Lanchantin, and M. Gales, "A Pulse Model in Log-domain 23 | for a Uniform Synthesizer," in Proc. 9th Speech Synthesis Workshop 24 | (SSW9), 2016. 25 | [2] G. Degottex, P. Lanchantin and M. Gales, "A Log Domain Pulse Model for 26 | Parametric Speech Synthesis", IEEE Transactions on Audio, Speech, and 27 | Language Processing, 26(1):57-70, 2018. 28 | 29 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK. 30 | 31 | License 32 | Licensed under the Apache License, Version 2.0 (the "License"); 33 | you may not use this file except in compliance with the License. 34 | You may obtain a copy of the License at 35 | 36 | http://www.apache.org/licenses/LICENSE-2.0 37 | 38 | Unless required by applicable law or agreed to in writing, software 39 | distributed under the License is distributed on an "AS IS" BASIS, 40 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 41 | See the License for the specific language governing permissions and 42 | limitations under the License. 43 | 44 | Author 45 | Gilles Degottex 46 | ''' 47 | 48 | import argparse 49 | import sys 50 | import warnings 51 | 52 | import numpy as np 53 | np.random.seed(123) # Generate always the same "random" numbers, for debugging.
54 | import scipy 55 | 56 | import sigproc as sp 57 | 58 | def getwinlen(f0, fs, nbper): 59 | return int(np.max((0.050*fs, nbper*fs/f0))/2)*2+1 # Has to be odd 60 | 61 | def synthesize(fs, f0s, SPEC, NM=None, wavlen=None 62 | , ener_multT0=False 63 | , nm_cont=False # If False, force binary state of the noise mask (by thresholding at 0.5) 64 | , nm_lowpasswinlen=9 65 | , hp_f0coef=0.5 # factor of f0 for the cut-off of the high-pass filter (def. 0.5*f0) 66 | , antipreechohwindur=0.001 # [s] Use to damp the signal at the beginning of the signal AND at the end of it 67 | # Following options are for post-processing the features, after the generation/transformation and thus before waveform synthesis 68 | , pp_f0_rmsteps=False # Removes steps in the f0 curve 69 | # (see sigproc.resampling.f0s_rmsteps(.) ) 70 | , pp_f0_smooth=None # Smooth the f0 curve using median and FIR filters of given window duration [s] 71 | , pp_atten1stharminsilences=None # Typical value is -25 72 | , verbose=1): 73 | 74 | winnbper = 4 # Number of periods in a synthesis windows. It still contains only one single pulse, but leaves space for the VTF to decay without being cut abruptly. 
75 | 76 | # Copy the inputs to avoid modifying them 77 | f0s = f0s.copy() 78 | SPEC = SPEC.copy() 79 | if not NM is None: NM = NM.copy() 80 | else: NM = np.zeros(SPEC.shape) 81 | 82 | NM = np.clip(NM, 0.0, 1.0) # The noise mask is supposed to be in [0,1] 83 | 84 | # Check the size of the inputs 85 | if f0s.shape[0]!=SPEC.shape[0]: 86 | raise ValueError('F0 size {} and spectrogram size {} do not match'.format(f0s.shape[0], SPEC.shape[0])) # pragma: no cover 87 | if not NM is None: 88 | if SPEC.shape!=NM.shape: 89 | raise ValueError('spectrogram size {} and NM size {} do not match.'.format(SPEC.shape, NM.shape)) # pragma: no cover 90 | 91 | if wavlen==None: wavlen = int(np.round(f0s[-1,0]*fs)) 92 | dftlen = (SPEC.shape[1]-1)*2 93 | shift = np.median(np.diff(f0s[:,0])) 94 | if verbose>0: 95 | print('PML Synthesis (dur={}s, fs={}Hz, f0 in [{:.0f},{:.0f}]Hz, shift={}s, dftlen={})'.format(wavlen/float(fs), fs, np.min(f0s[:,1]), np.max(f0s[:,1]), shift, dftlen)) 96 | 97 | # Prepare the features 98 | 99 | # Enforce continuous f0 100 | f0s[:,1] = np.interp(f0s[:,0], f0s[f0s[:,1]>0,0], f0s[f0s[:,1]>0,1]) 101 | # If asked, removes steps in the f0 curve 102 | if pp_f0_rmsteps: 103 | f0s = sp.f0s_rmsteps(f0s) 104 | # If asked, smooth the f0 curve using median and FIR filters 105 | if not pp_f0_smooth is None: 106 | print(' Smoothing f0 curve using {}[s] window'.format(pp_f0_smooth)) 107 | import scipy.signal as sig 108 | lf0 = np.log(f0s[:,1]) 109 | bcoefslen = int(0.5*pp_f0_smooth/shift)*2+1 110 | lf0 = sig.medfilt(lf0, bcoefslen) 111 | bcoefs = np.hamming(bcoefslen) 112 | bcoefs = bcoefs/sum(bcoefs) 113 | lf0 = sig.filtfilt(bcoefs, [1], lf0) 114 | f0s[:,1] = np.exp(lf0) 115 | 116 | winlenmax = getwinlen(np.min(f0s[:,1]), fs, winnbper) 117 | if winlenmax>dftlen: 118 | warnings.warn('\n\nWARNING: The maximum window length ({}) is bigger than the DFT length ({}). 
Please, increase the DFT length of your spectral features (the second dimension) or check if the f0 curve has extremly low values and try to clip them to higher values (at least higher than 50Hz). The f0 curve has been clipped to {}Hz.\n\n'.format(winlenmax, dftlen, winnbper*fs/float(dftlen))) # pragma: no cover 119 | f0s[:,1] = np.clip(f0s[:,1], winnbper*fs/float(dftlen-2), 1e6) 120 | 121 | if not NM is None: 122 | # Remove noise below f0, as it is supposed to be already the case 123 | for n in range(NM.shape[0]): 124 | NM[n,:int((float(dftlen)/fs)*2*f0s[n,1])] = 0.0 125 | 126 | if not nm_cont: 127 | print(' Forcing binary noise mask') 128 | NM[NM<=0.5] = 0.0 # To be sure that voiced segments are not hoarse 129 | NM[NM>0.5] = 1.0 # To be sure the noise segments are fully noisy 130 | 131 | # Generate the pulse positions [1](2) (i.e. the synthesis instants, the GCIs in voiced segments) 132 | ts = [0.0] 133 | while ts[-1] sp.mag2db(np.max(ener))-30)[0] # Get approx active frames # TODO Param 153 | enermed = sp.mag2db(np.median(ener[idxacs])) # Median energy [dB] 154 | ener = sp.mag2db(ener) 155 | 156 | # Resample the noise feature to the pulse positions 157 | # Smooth the frequency response of the mask in order to avoid Gibbs 158 | # (poor Gibbs nobody want to see him) 159 | nm_lowpasswin = np.hanning(nm_lowpasswinlen) 160 | nm_lowpasswin /= np.sum(nm_lowpasswin) 161 | NMR = np.zeros((f0s.shape[0], dftlen/2+1)) 162 | for n, t in enumerate(f0s[:,0]): 163 | idx = int(np.round(t/shift)) # Nearest is better for plosives 164 | idx = np.clip(idx, 0, NM.shape[0]-1) 165 | NMR[n,:] = NM[idx,:] 166 | if nm_lowpasswinlen>1: 167 | NMR[n,:] = scipy.signal.filtfilt(nm_lowpasswin, [1.0], NMR[n,:]) 168 | 169 | NMR = np.clip(NMR, 0.0, 1.0) 170 | 171 | # The complete waveform that we will fill with the pulses 172 | wav = np.zeros(wavlen) 173 | # Half window on the left of the synthesized segment to avoid pre-echo 174 | dampinhwin = np.hanning(1+2*int(np.round(antipreechohwindur*fs))) 
# 1ms forced dampingwindow 175 | dampinhwin = dampinhwin[:(len(dampinhwin)-1)/2+1] 176 | 177 | for n, t in enumerate(f0s[:,0]): 178 | f0 = f0s[n,1] 179 | 180 | if verbose>1: print "\rPM Synthesis (python) t={:4.3f}s f0={:3.3f}Hz ".format(t,f0), 181 | 182 | # Window's length 183 | # TODO It should be ensured that the beggining and end of the 184 | # noise is within the window. Nothing is doing this currently! 185 | winlen = getwinlen(f0, fs, winnbper) 186 | # TODO We also assume that the VTF's decay is shorter 187 | # than winnbper-1 periods (dangerous with high pitched and tense voice). 188 | if winlen>dftlen: raise ValueError('The window length ({}) is bigger than the DFT length ({}). Please, increase the dftlen of your spectral features or check if the f0 curve has extremly low values and try to clip them to higher values (at least higher than 50[Hz])'.format(winlen, dftlen)) # pragma: no cover 189 | 190 | # Set the rough position of the pulse in the window (the closest sample) 191 | # We keep a third of the window (1 period) on the left because the 192 | # pulse signal is minimum phase. And 2/3rd (remaining 2 periods) 193 | # on the right to let the VTF decay. 194 | pulseposinwin = int((1.0/winnbper)*winlen) 195 | 196 | # The sample indices of the current pulse wrt. the final waveform 197 | winidx = int(round(fs*t)) + np.arange(winlen)-pulseposinwin 198 | 199 | 200 | # Build the pulse spectrum 201 | 202 | # Let start with a Dirac 203 | S = np.ones(dftlen/2+1, dtype=np.complex64) 204 | 205 | # Add the delay to place the Dirac at the "GCI": exp(-j*2*pi*t_i) 206 | delay = -pulseposinwin - fs*(t-int(round(fs*t))/float(fs)) 207 | S *= np.exp((delay*2j*np.pi/dftlen)*np.arange(dftlen/2+1)) 208 | 209 | # Add the spectral envelope 210 | # Both amplitude and phase 211 | E = SPECR[n,:] # Take the amplitude from the given one 212 | if hp_f0coef!=None: 213 | # High-pass it to avoid any residual DC component. 
214 | fcut = hp_f0coef*f0 215 | if not pp_atten1stharminsilences is None and ener[n]-enermed0: leftbnd=int(np.round(fs*0.5*(f0s[n-1,0]+t))) 234 | else: leftbnd=int(np.round(fs*(t-0.5/f0s[n,1]))) # int(0) 235 | if n=wavlen: 266 | # The window is partly outside of the waveform ... 267 | # ... thus copy only the existing part 268 | itouse = np.logical_and(winidx>=0,winidx1: print '\r \r', 274 | 275 | if verbose>2: # pragma: no cover 276 | import matplotlib.pyplot as plt 277 | plt.ion() 278 | _, axs = plt.subplots(3, 1, sharex=True, sharey=False) 279 | times = np.arange(len(wav))/float(fs) 280 | axs[0].plot(times, wav, 'k') 281 | axs[0].set_ylabel('Waveform\nAmplitude') 282 | axs[0].grid() 283 | axs[1].plot(f0s[:,0], f0s[:,1], 'k') 284 | axs[1].set_ylabel('F0\nFrequency [Hz]') 285 | axs[1].grid() 286 | axs[2].imshow(sp.mag2db(SPEC).T, origin='lower', aspect='auto', interpolation='none', extent=(f0s[0,0], f0s[-1,0], 0, 0.5*fs)) 287 | axs[2].set_ylabel('Amp. Envelope\nFrequency [Hz]') 288 | 289 | from IPython.core.debugger import Pdb; Pdb().set_trace() 290 | 291 | return wav 292 | 293 | 294 | 295 | def synthesizef(fs, shift=0.005, dftlen=4096, ff0=None, flf0=None, fspec=None, flspec=None, ffwlspec=None, ffwcep=None, fmcep=None, fpdd=None, fmpdd=None, fnm=None, ffwnm=None, nm_cont=False, fsyn=None, verbose=1): 296 | ''' 297 | Call the synthesis from python using file inputs and outputs 298 | ''' 299 | if ff0: 300 | f0 = np.fromfile(ff0, dtype=np.float32) 301 | if flf0: 302 | f0 = np.fromfile(flf0, dtype=np.float32) 303 | f0[f0>0] = np.exp(f0[f0>0]) 304 | ts = (shift)*np.arange(len(f0)) 305 | f0s = np.vstack((ts, f0)).T 306 | 307 | if fspec: 308 | SPEC = np.fromfile(fspec, dtype=np.float32) 309 | SPEC = SPEC.reshape((len(f0), -1)) 310 | if flspec: 311 | SPEC = np.fromfile(flspec, dtype=np.float32) 312 | SPEC = np.exp(SPEC.reshape((len(f0), -1))) 313 | if ffwlspec: 314 | FWLSPEC = np.fromfile(ffwlspec, dtype=np.float32) 315 | FWLSPEC = FWLSPEC.reshape((len(f0), -1)) 316 | 
SPEC = np.exp(sp.fwbnd2linbnd(FWLSPEC, fs, dftlen, smooth=True)) 317 | if ffwcep: 318 | FWCEP = np.fromfile(ffwcep, dtype=np.float32) 319 | FWCEP = FWCEP.reshape((len(f0), -1)) 320 | SPEC = np.exp(sp.fwcep2loghspec(FWCEP, fs, dftlen)) 321 | if fmcep: # pragma: no cover 322 | # Cannot test this because it needs SPTK 323 | MCEP = np.fromfile(fmcep, dtype=np.float32) 324 | MCEP = MCEP.reshape((len(f0), -1)) 325 | SPEC = sp.mcep2spec(MCEP, sp.bark_alpha(fs), dftlen) 326 | 327 | NM = None 328 | pdd_thresh = 0.75 # For this value, see: 329 | # G. Degottex and D. Erro, "A uniform phase representation for the harmonic model in speech synthesis applications," EURASIP, Journal on Audio, Speech, and Music Processing - Special Issue: Models of Speech - In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014. 330 | if fpdd: 331 | PDD = np.fromfile(fpdd, dtype=np.float32) 332 | PDD = PDD.reshape((len(f0), -1)) 333 | NM = PDD.copy() 334 | NM[PDDpdd_thresh] = 1.0 336 | if fmpdd: # pragma: no cover 337 | # Cannot test this because it needs SPTK 338 | MPDD = np.fromfile(fmpdd, dtype=np.float32) 339 | MPDD = MPDD.reshape((len(f0), -1)) 340 | PDD = sp.mcep2spec(MPDD, sp.bark_alpha(fs), dftlen) 341 | NM = PDD.copy() 342 | NM[PDDpdd_thresh] = 1.0 344 | 345 | if fnm: 346 | NM = np.fromfile(fnm, dtype=np.float32) 347 | NM = NM.reshape((len(f0), -1)) 348 | if ffwnm: 349 | FWNM = np.fromfile(ffwnm, dtype=np.float32) 350 | FWNM = FWNM.reshape((len(f0), -1)) 351 | NM = sp.fwbnd2linbnd(FWNM, fs, dftlen) 352 | 353 | syn = synthesize(fs, f0s, SPEC, NM=NM, nm_cont=nm_cont, verbose=verbose) 354 | if fsyn: 355 | sp.wavwrite(fsyn, syn, fs, norm_max_ifneeded=True, verbose=verbose) 356 | 357 | return syn 358 | 359 | def main(argv): 360 | ''' 361 | Call the synthesis from the command line 362 | ''' 363 | 364 | argpar = argparse.ArgumentParser() 365 | argpar.add_argument("synth", help="Output synthesis file") 366 | argpar.add_argument("--f0", default=None, help="Input f0[Hz] file") 367 | 
argpar.add_argument("--logf0", default=None, help="Input f0[log Hz] file") 368 | argpar.add_argument("--spec", default=None, help="Input amplitude spectrogram [linear values]") 369 | argpar.add_argument("--lspec", default=None, help="Input amplitude spectrogram [log spectral values on linear frequency scale]") 370 | argpar.add_argument("--fwlspec", default=None, help="Input amplitude spectrogram [frequency warped log spectral values]") 371 | argpar.add_argument("--fwcep", default=None, help="Input amplitude spectrogram [frequency warped cepstrum values]") 372 | argpar.add_argument("--mcep", default=None, help="Input amplitude spectrogram [mel-cepstrum values]") 373 | argpar.add_argument("--pdd", default=None, help="Input Phase Distortion Deviation file [linear values]") 374 | argpar.add_argument("--mpdd", default=None, help="Input Phase Distortion Deviation file [mel-cepstrum values]") 375 | argpar.add_argument("--nm", default=None, help="Output Noise Mask [linear values in [0,1] ]") 376 | argpar.add_argument("--fwnm", default=None, help="Output Noise Mask [compressed in bands with values still in [0,1] ]") 377 | argpar.add_argument("--nm_cont", action='store_true', help="Allow continuous values for the noisemask (def. 
False)") 378 | argpar.add_argument("--fs", default=16000, type=int, help="Sampling frequency[Hz]") 379 | argpar.add_argument("--shift", default=0.005, type=float, help="Time step[s] between the frames") 380 | #argpar.add_argument("--dftlen", dftlen=4096, type=float, help="Size of the DFT for extracting the features") 381 | argpar.add_argument("--verbose", default=1, help="Output some information") 382 | args = argpar.parse_args(argv) 383 | args.dftlen = 4096 384 | 385 | synthesizef(args.fs, shift=args.shift, dftlen=args.dftlen, ff0=args.f0, flf0=args.logf0, fspec=args.spec, flspec=args.lspec, ffwlspec=args.fwlspec, ffwcep=args.fwcep, fmcep=args.mcep, fnm=args.nm, ffwnm=args.fwnm, nm_cont=args.nm_cont, fpdd=args.pdd, fmpdd=args.mpdd, fsyn=args.synth, verbose=args.verbose) 386 | 387 | if __name__ == "__main__" : # pragma: no cover 388 | main(sys.argv[1:]) 389 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | # Prepare or clean data for tests 2 | # 3 | # Usage 4 | # $ make 5 | # 6 | # Copyright(C) 2016 Engineering Department, University of Cambridge, UK. 7 | # 8 | # License 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | # 21 | # Author 22 | # Gilles Degottex 23 | 24 | all: 25 | $(MAKE) samples.scp 26 | 27 | samples.scp: 28 | wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_slt_arctic/wav/arctic_a0010.wav -O slt_arctic_a0010.wav 29 | wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_bdl_arctic/wav/arctic_a0020.wav -O bdl_arctic_a0020.wav 30 | wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_clb_arctic/wav/arctic_a0030.wav -O clb_arctic_a0030.wav 31 | wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_awb_arctic/wav/arctic_a0040.wav -O awb_arctic_a0040.wav 32 | ls *.wav |grep -v '.resynth.wav' > samples.scp 33 | 34 | clean: 35 | rm -f *.resynth.wav *.f0 *.lf0 *.logf0 *.spec *.fwspec *.fwcep *.pdd *.mpdd *.nm *.bndnm *.fwnm *.diff 36 | 37 | distclean: 38 | rm -f samples.scp *.wav *.f0 *.lf0 *.f0txt *.logf0 *.spec* *.fwspec *.fwlspec *.fwcep *.pdd *.mpdd *.nm *.bndnm *.fwnm *.diff 39 | -------------------------------------------------------------------------------- /test/test_smoke.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)),'external/pyworld/pyworld')) 6 | 7 | import unittest 8 | 9 | import numpy as np 10 | np.random.seed(123) 11 | 12 | filenames = ['slt_arctic_a0010.wav', 'bdl_arctic_a0020.wav', 'clb_arctic_a0030.wav', 'awb_arctic_a0040.wav'] 13 | filename_totest = 0 14 | 15 | class TestSmoke(unittest.TestCase): 16 | 17 | @classmethod 18 | def test_smoke_cmd_analysis(cls): 19 | fname = filenames[filename_totest] # Just with one file for smoke test 20 | 21 | import analysis 22 | analysis.main(['test/'+fname]) 23 | analysis.main(['test/'+fname, '--f0', 'test/'+fname.replace('.wav','.f0')]) 24 | analysis.main(['test/'+fname, '--f0', 
'test/'+fname.replace('.wav','.f0'), '--preproc_fs', '8000']) 25 | analysis.main(['test/'+fname, '--f0_min', '75', '--f0', 'test/'+fname.replace('.wav','.f0')]) 26 | analysis.main(['test/'+fname, '--f0_max', '200', '--f0', 'test/'+fname.replace('.wav','.f0')]) 27 | analysis.main(['test/'+fname, '--f0_min', '81', '--f0_max', '220', '--f0', 'test/'+fname.replace('.wav','.f0')]) 28 | 29 | f0s = np.fromfile('test/'+fname.replace('.wav','.f0'), dtype=np.float32) 30 | f0s = f0s.reshape((-1, 1)) 31 | np.savetxt('test/'+fname.replace('.wav','.f0txt'), f0s) 32 | 33 | analysis.main(['test/'+fname, '--inf0txt', 'test/'+fname.replace('.wav','.f0txt'), '--spec', 'test/'+fname.replace('.wav','.spec')]) 34 | analysis.main(['test/'+fname, '--inf0bin', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec')]) 35 | analysis.main(['test/'+fname, '--f0_log', '--f0', 'test/'+fname.replace('.wav','.lf0')]) 36 | analysis.main(['test/'+fname, '--spec', 'test/'+fname.replace('.wav','.spec')]) 37 | # analysis.main(['test/'+fname, ' --spec_mceporder', '59', '--spec', 'test/'+fname.replace('.wav','.mcep')]) # Need SPTK for this one 38 | analysis.main(['test/'+fname, '--spec_nbfwbnds', '65', '--spec', 'test/'+fname.replace('.wav','.fwlspec')]) 39 | analysis.main(['test/'+fname, '--pdd', 'test/'+fname.replace('.wav','.pdd')]) 40 | # analysis.main(['test/'+fname, '--pdd_mceporder', '60', '--pdd', 'test/'+fname.replace('.wav','.pdd')]) # Need SPTK for this one 41 | analysis.main(['test/'+fname, '--nm', 'test/'+fname.replace('.wav','.nm')]) 42 | analysis.main(['test/'+fname, '--nm_nbfwbnds', '33', '--nm', 'test/'+fname.replace('.wav','.fwnm')]) 43 | 44 | # Test pre-processing 45 | analysis.main(['test/'+fname, '--inf0txt', 'test/'+fname.replace('.wav','.f0txt'), '--spec', 'test/'+fname.replace('.wav','.spec_resample16kHz'), '--preproc_fs', '16000']) 46 | analysis.main(['test/'+fname, '--inf0txt', 'test/'+fname.replace('.wav','.f0txt'), '--spec', 
'test/'+fname.replace('.wav','.spec_preproc_hp'), '--preproc_hp', '100.0']) 47 | 48 | # TODO Test various sampling fromats, encoding and sampling rates for wav files 49 | 50 | @classmethod 51 | def test_smoke_cmd_synthesis(cls): 52 | fname = filenames[filename_totest] # Just with one file for smoke test 53 | 54 | import analysis 55 | import synthesis 56 | 57 | analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--pdd', 'test/'+fname.replace('.wav','.pdd')]) 58 | synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--pdd', 'test/'+fname.replace('.wav','.pdd')]) 59 | 60 | analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--nm', 'test/'+fname.replace('.wav','.nm')]) 61 | synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec')]) 62 | synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--nm', 'test/'+fname.replace('.wav','.nm')]) 63 | 64 | analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '200', '--f0_log', '--f0', 'test/'+fname.replace('.wav','.lf0'), '--spec', 'test/'+fname.replace('.wav','.spec')]) 65 | synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--spec', 'test/'+fname.replace('.wav','.spec')]) 66 | 67 | analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec_nblinlogbnds', '129', '--spec', 'test/'+fname.replace('.wav','.lspec')]) 68 | 
synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--lspec', 'test/'+fname.replace('.wav','.lspec')]) 69 | 70 | analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec_fwceporder', '59', '--spec', 'test/'+fname.replace('.wav','.fwcep'), '--nm_nbfwbnds', '33', '--nm', 'test/'+fname.replace('.wav','.fwnm')]) 71 | synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--fwcep', 'test/'+fname.replace('.wav','.fwcep'), '--fwnm', 'test/'+fname.replace('.wav','.fwnm')]) 72 | 73 | 74 | # This one is the most used and thus should be the last one 75 | analysis.main(['test/'+fname, '--f0_log', '--f0', 'test/'+fname.replace('.wav','.lf0'), '--spec_nbfwbnds', '65', '--spec', 'test/'+fname.replace('.wav','.fwlspec'), '--nm_nbfwbnds', '33', '--nm', 'test/'+fname.replace('.wav','.fwnm')]) 76 | synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--fwlspec', 'test/'+fname.replace('.wav','.fwlspec'), '--fwnm', 'test/'+fname.replace('.wav','.fwnm')]) 77 | 78 | 79 | # def test_smoke_analysisf(self): 80 | # fname = filenames[0] # Just with one file for smoke test 81 | # import pulsemodel 82 | # 83 | # f0_min = 75 84 | # f0_max = 800 85 | # 86 | # pulsemodel.analysisf(fname, f0_min=f0_min, f0_max=f0_max, ff0=fname.replace('.wav','.lf0'), f0_log=True, 87 | # fspec='test/'+fname.replace('.wav','.fwlspec'), spec_nbfwbnds=65, fnm=fname.replace('.wav','.fwnm'), nm_nbfwbnds=33, verbose=1) 88 | 89 | @classmethod 90 | def test_smoke_analysis_synthesis(cls): 91 | fname = filenames[filename_totest] # Just with one file for smoke test 92 | 93 | f0_min = 75 94 | f0_max = 800 95 | shift = 0.010 96 | verbose = 1 97 | dftlen = 512 98 | 99 | import pulsemodel 100 | import sigproc as sp 101 | 102 | wav, fs, _ = 
sp.wavread('test/'+fname) 103 | 104 | f0s, SPEC, PDD, NM = pulsemodel.analysis(wav, fs) 105 | 106 | _ = pulsemodel.analysis_f0postproc(wav, fs, f0s=np.zeros(f0s[:,1].shape), f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose) 107 | 108 | _ = pulsemodel.analysis_f0postproc(wav, fs, f0s=f0s[:,1], f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose) 109 | 110 | nonunif0s = f0s.copy() 111 | nonunif0s[:,0] = np.random.rand(f0s.shape[0])*(f0s[-1,0]-f0s[0,0]) + f0s[0,0] 112 | nonunif0s[:,0] = np.sort(nonunif0s[:,0]) 113 | _ = pulsemodel.analysis_f0postproc(wav, fs, f0s=nonunif0s, f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose) 114 | 115 | f0s = pulsemodel.analysis_f0postproc(wav, fs, f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose) 116 | 117 | f0_min = 60 118 | f0_max = 600 119 | shift = 0.005 120 | dftlen = 4096 121 | f0s, SPEC, PDD, NM = pulsemodel.analysis(wav, fs, f0s=f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, dftlen=dftlen, verbose=verbose) 122 | 123 | 124 | _ = pulsemodel.synthesize(fs, f0s, SPEC, wavlen=len(wav)) 125 | 126 | _ = pulsemodel.synthesize(fs, f0s, SPEC, NM=NM, wavlen=len(wav)) 127 | 128 | NM = PDD.copy() 129 | NM[NM>0.75] = 1 130 | NM[NM<=0.75] = 0 131 | _ = pulsemodel.synthesize(fs, f0s, SPEC, NM=NM, wavlen=len(wav)) 132 | 133 | _ = pulsemodel.synthesize(fs, f0s, SPEC, NM=NM, wavlen=len(wav) 134 | , ener_multT0=True 135 | , nm_cont=True, nm_lowpasswinlen=13, hp_f0coef=0.25, antipreechohwindur=0.002 136 | , pp_f0_rmsteps=True, pp_f0_smooth=0.100, pp_atten1stharminsilences=-25 137 | , verbose=verbose) 138 | 139 | def test_repeatability(self): 140 | 141 | f0_min = 60 142 | f0_max = 600 143 | 144 | import pulsemodel 145 | # import pyworld 146 | # import sigproc as sp 147 | 148 | for fname in filenames: 149 | fname = 'test/'+fname 150 | lf0s_ref = None 151 | # pwf0_ref = None 152 | # SPEC_ref = None 153 | fwlspec_ref = None 154 | fwnm_ref = None 155 | for _ in xrange(2): 156 | print('Extracting features for: 
'+fname) 157 | pulsemodel.analysisf(fname, f0_min=f0_min, f0_max=f0_max, ff0=fname.replace('.wav','.lf0'), f0_log=True, 158 | fspec=fname.replace('.wav','.fwlspec'), spec_nbfwbnds=65, fnm=fname.replace('.wav','.fwnm'), nm_nbfwbnds=33, verbose=1) 159 | 160 | 161 | lf0s = np.fromfile(fname.replace('.wav','.lf0'), dtype=np.float32) 162 | lf0s = lf0s.reshape((-1, 1)) 163 | print('lf0 sum square: '+str(np.sum((lf0s)**2))) 164 | 165 | if lf0s_ref is None: 166 | lf0s_ref = lf0s 167 | else: 168 | diff = np.sum((lf0s_ref-lf0s)**2) 169 | print('lf0 diff: '+str(diff)) 170 | self.assertEqual(diff, 0.0) 171 | 172 | 173 | # #_f0, ts = pyworld.dio(x, fs, frame_period=shift*1000) # raw pitch extractor # Use REAPER instead 174 | # wav, fs, enc = sp.wavread(fname) 175 | # 176 | # pwts = 0.005*np.arange(len(lf0s)) 177 | # dftlen = 4096 178 | # # from IPython.core.debugger import Pdb; Pdb().set_trace() 179 | # dlf0s = lf0s.astype(np.float64) 180 | # pwf0 = pyworld.stonemask(wav, np.ascontiguousarray(np.exp(dlf0s[:,0])), pwts, fs) # pitch refinement 181 | # if pwf0_ref is None: 182 | # pwf0_ref = pwf0 183 | # else: 184 | # print('pwf0 diff: '+str(np.sum((pwf0_ref-pwf0)**2))) 185 | # 186 | # SPEC = pyworld.cheaptrick(wav, pwf0, pwts, fs, fft_size=dftlen) # extract smoothed spectrogram 187 | # if SPEC_ref is None: 188 | # SPEC_ref = SPEC 189 | # else: 190 | # print('SPEC diff: '+str(np.sum((SPEC_ref-SPEC)**2))) 191 | 192 | 193 | fwlspec = np.fromfile(fname.replace('.wav','.fwlspec'), dtype=np.float32) 194 | fwlspec = fwlspec.reshape((-1, 65)) 195 | print('fwlspec sum square: '+str(np.sum((fwlspec)**2))) 196 | 197 | if fwlspec_ref is None: 198 | fwlspec_ref = fwlspec 199 | else: 200 | diff = np.sum((fwlspec_ref-fwlspec)**2) 201 | print('fwlspec diff: '+str(diff)) 202 | self.assertEqual(diff, 0.0) 203 | 204 | 205 | fwnm = np.fromfile(fname.replace('.wav','.fwnm'), dtype=np.float32) 206 | fwnm = fwnm.reshape((-1, 33)) 207 | print('fwnm sum square: '+str(np.sum((fwnm)**2))) 208 | 209 | if 
fwnm_ref is None: 210 | fwnm_ref = fwnm 211 | else: 212 | diff = np.sum((fwnm_ref-fwnm)**2) 213 | print('fwnm diff: '+str(diff)) 214 | self.assertEqual(diff, 0.0) 215 | 216 | 217 | if __name__ == '__main__': 218 | unittest.main() 219 | --------------------------------------------------------------------------------