├── .gitignore
├── .gitmodules
├── .travis.yml
├── LICENSE.md
├── Makefile
├── README.md
├── __init__.py
├── analysis.py
├── synthesis.py
└── test
    ├── Makefile
    └── test_smoke.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "sigproc"]
 2 | 	path = sigproc
 3 | 	url = https://gitlab.com/gillesdegottex/sigproc.git
 4 | [submodule "external/REAPER"]
 5 | 	path = external/REAPER
 6 | 	url = https://github.com/google/REAPER.git
 7 | [submodule "external/pyworld"]
 8 | 	path = external/pyworld
 9 | 	url = https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder.git
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | cache:
 2 |   directories:
 3 |     - $HOME/.cache/pip
 4 | 
 5 | 
 6 | language: python
 7 | 
 8 | os:
 9 |     - linux
10 | 
11 | branches:
12 |     only:
13 |     - master
14 | 
15 | before_install:
16 |     - echo "before_install..."
17 |     - git submodule update --init --recursive
18 |     - git fetch --tags
19 |     - sleep 1
20 |     - git checkout $TRAVIS_BRANCH
21 |     - export PROJECTGITVERSION=`git describe --tags --always |sed 's/^v//'`
22 |     - echo $PROJECTGITVERSION
23 |     - echo $TRAVIS_OS_NAME
24 |     - echo $TRAVIS_COMMIT
25 |     - echo $TRAVIS_TAG
26 |     - export TRAVIS_TAG=`git describe --tags --exact-match HEAD 2>/dev/null`
27 |     - echo $TRAVIS_TAG
28 | 
29 |     - sudo apt-get -qq update
30 |     - sudo apt-get install -y cython
31 |     - sudo apt-get install -y python-numpy
32 |     - sudo apt-get install -y python-scipy
33 |     - sudo apt-get install -y sox
34 | 
35 | install:
36 |     - echo "install..."
37 |     - pip install codecov
38 |     - pip install cython
39 |     - pip install numpy
40 |     - pip install scipy
41 | 
42 | script:
43 |     - echo "script..."
44 | 
45 |     - export PACKAGENAME=pulsemodel.zip
46 |     - echo "Packaging sources of " $PACKAGENAME
47 |     - pwd
48 |     - ls -l
49 |     - mkdir ../pulsemodel_source_build
50 |     - cp -r * ../pulsemodel_source_build/
51 |     - echo $PROJECTGITVERSION > ../pulsemodel_source_build/Version
52 |     - rm -fr ../pulsemodel_source_build/.git*
53 |     - rm -fr ../pulsemodel_source_build/sigproc/.git*
54 |     - rm -fr ../pulsemodel_source_build/external/pyworld/.git*
55 |     - rm -fr ../pulsemodel_source_build/external/pyworld/lib/World/.git*
56 |     - cp -fr ../pulsemodel_source_build pulsemodel
57 | 
58 |     - zip -r $PACKAGENAME pulsemodel
59 |     - ls
60 |     - ls -l $PACKAGENAME
61 | 
62 |     - echo "Testing ..."
63 |     - make
64 |     - cd test
65 |     - make samples.scp
66 |     - cd ..
67 |     - pwd
68 |     - coverage run --source=. --omit="test/*","external/*" test/test_smoke.py
69 | 
70 | 
71 | after_success:
72 |     - codecov
73 | 
74 | 
75 | deploy:
76 |   skip_cleanup: true
77 |   provider: releases
78 |   api_key:
79 |     secure: BlKzFevP0b1d0vB3yfQU4D91agOQ3JA/2mX9gY+rrho0uNlVp+kZIUgPQ4+4EQROwttSdSLMzFLZdAhDmPnS8RP3mBmLdacZ4Z2B27FvhGh+Comfwobk2kT7R2WFg7jAAef9AK3/dJyRYe1bLOEdAGeNlEpa5ucjpLJ389zxKZXkm2T2UMzH+6n+YGND34utB9PKcQ9DluHA9H17FMmxFjQcYCLPiy5ytdOHNUusF9eBoYmgqJnfQ393vWpQbl88HJloPOtdCaCw1XUt6p6AwEsq9F9RP0tDL3hRIXOTk3Uwe1q94wYDR/LFt1MSL/IxQ1PptaszT/CgrHf8y2K+QUKUU2iEIeKywXjYM227+dC8E0VgruMNNWIdOAc6o/e2zJ0ORt8Exh+TYyMWIXIUe4smneLZWjZZVJSO371GWeCgQW4jLh18Ae6CN/FmFiMuhPQDzncoMCr+cJbeQCx3LdYLgaczk5euU5mL6MCJEwYa5Nk2zNKrMrDjx5BQu4cS2zqaGJcKYQ0WPSamDnlgqWFJUW91uscHifKvlkai8kDB8IpsUIRo1aWor9UAJuXPMyJiTT6D2gJxN07ZFmuOBHmGP5FuRSnDwMFN2mHxxbgnnThDu7dmG0bFMFsuCZoSAwezz3yvT15NCm+5a1l3ikZ198Ek6DKzfOLTDL+jbHQ=
80 |   file: $PACKAGENAME
81 |   on:
82 |     repo: gillesdegottex/pulsemodel
83 |     tags: true
84 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2016 Engineering Department, University of Cambridge, UK.
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: build submodule_init build_sigproc build_reaper build_pyworld
 2 | 
 3 | 
 4 | all: build
 5 | 
 6 | submodule_init:
 7 | 	git submodule update --init --recursive
 8 | 
 9 | 
10 | build: submodule_init build_sigproc build_reaper build_pyworld
11 | 
12 | build_sigproc: submodule_init
13 | 	cd sigproc; $(MAKE)
14 | 
15 | build_reaper: submodule_init
16 | 	cd external/REAPER; mkdir build; cd build; cmake ..; make
17 | 
18 | build_pyworld: submodule_init
19 | 	cd external/pyworld; python setup.py build_ext --inplace
20 | 
21 | 
22 | test: build
23 | 	cd test; $(MAKE)
24 | 	python test/test_smoke.py
25 | 
26 | distclean:
27 | 	rm -f *.pyc
28 | 	cd test; $(MAKE) distclean
29 | 	cd sigproc; $(MAKE) distclean
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Build Status](https://travis-ci.org/gillesdegottex/pulsemodel.svg?branch=master)](https://travis-ci.org/gillesdegottex/pulsemodel)
 2 | [![codecov](https://codecov.io/gh/gillesdegottex/pulsemodel/branch/master/graph/badge.svg)](https://codecov.io/gh/gillesdegottex/pulsemodel)
 3 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/c9fbe9dc053046349c7cca95b8ce6404)](https://www.codacy.com/app/gillesdegottex/pulsemodel?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=gillesdegottex/pulsemodel&amp;utm_campaign=Badge_Grade)
 4 | 
 5 | 
 6 | ## Pulse model analysis and synthesis
 7 | 
 8 | It is basically the vocoder described in:
 9 | > G. Degottex, P. Lanchantin and M. Gales, "A Log Domain Pulse Model for Parametric
10 | >    Speech Synthesis", IEEE Transactions on Audio, Speech, and Language Processing,
11 | >    26(1):57-70, 2018.
12 | 
13 | ### Documentation
14 | Please see the headers of analysis.py and synthesis.py files as well as the
15 | functions documentation for more details.
16 | 
17 | ### Testing/HowTo
18 | In the root directory, simply run:
19 | ```make
20 | $ make test
21 | ```
22 | 
23 | You can also have a look at the file test/test_smoke.py to see how the PML's scripts can be used.
24 | 
25 | ### Legal
26 | 
27 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK.
28 | 
29 | The code in this repository is released under the Apache License, Version 2.0.
30 | Please see LICENSE.md for more details.
31 | 
32 | Author: Gilles Degottex <gad27@cam.ac.uk>
33 | 
34 | ### External tools
35 | PML primarily aims at extracting a noise measure and synthesizing a waveform, assuming that the F0 curve and the amplitude spectral envelope are already given.
36 | 
37 | In order to make it a standalone vocoder, it was thus necessary to import an F0 estimator and a spectral envelope estimator.
38 | 
39 | #### For F0
40 | For F0, REAPER is used:
41 | > https://github.com/gillesdegottex/REAPER
42 | 
43 | #### For the amplitude spectral envelope
44 | For the amplitude spectral envelope, the estimator CheapTrick is used:
45 | 
46 | > Masanori Morise, CheapTrick, a spectral envelope estimator for high-quality speech synthesis, Speech Communication, Volume 67, 2015, Pages 1-7, ISSN 0167-6393, http://dx.doi.org/10.1016/j.specom.2014.09.003.
47 | 
48 | The python wrapper of the original implementation is used (without any modification) and can be found at:
49 | > https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder
50 | 
51 | Note that all the published results about PML have been done using the spectral envelope of the STRAIGHT vocoder, NOT using WORLD.
52 | For legal reasons, it is not possible to release any of the STRAIGHT vocoder's analysis code. CheapTrick is therefore used instead in this repository.
53 | It also means that, even though STRAIGHT's envelope and CheapTrick are quite similar, you might observe small differences in results between the two.
54 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK.
 3 | 
 4 | License
 5 |    Licensed under the Apache License, Version 2.0 (the "License");
 6 |    you may not use this file except in compliance with the License.
 7 |    You may obtain a copy of the License at
 8 | 
 9 |      http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 |    Unless required by applicable law or agreed to in writing, software
12 |    distributed under the License is distributed on an "AS IS" BASIS,
13 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |    See the License for the specific language governing permissions and
15 |    limitations under the License.
16 | 
17 | Author
18 |     Gilles Degottex <gad27@cam.ac.uk>
19 | '''
20 | 
21 | from analysis import *
22 | from synthesis import *
23 | 


--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 | 
  4 | References
  5 |     [1] G. Degottex, P. Lanchantin, and M. Gales, "A Pulse Model in Log-domain
  6 |         for a Uniform Synthesizer," in Proc. 9th Speech Synthesis Workshop
  7 |         (SSW9), 2016.
  8 |     [2] G. Degottex and D. Erro, "A uniform phase representation for the
  9 |         harmonic model in speech synthesis applications," EURASIP, Journal on
 10 |         Audio, Speech, and Music Processing - Special Issue: Models of Speech -
 11 |         In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014.
 12 |     [3] G. Degottex, P. Lanchantin and M. Gales, "A Log Domain Pulse Model for
 13 |         Parametric Speech Synthesis", IEEE Transactions on Audio, Speech, and
 14 |         Language Processing, 26(1):57-70, 2018.
 15 | 
 16 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK.
 17 | 
 18 | License
 19 |    Licensed under the Apache License, Version 2.0 (the "License");
 20 |    you may not use this file except in compliance with the License.
 21 |    You may obtain a copy of the License at
 22 | 
 23 |      http://www.apache.org/licenses/LICENSE-2.0
 24 | 
 25 |    Unless required by applicable law or agreed to in writing, software
 26 |    distributed under the License is distributed on an "AS IS" BASIS,
 27 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 28 |    See the License for the specific language governing permissions and
 29 |    limitations under the License.
 30 | 
 31 | Author
 32 |     Gilles Degottex <gad27@cam.ac.uk>
 33 | '''
 34 | 
 35 | import argparse
 36 | import sys
 37 | import os
 38 | import warnings
 39 | 
 40 | import numpy as np
 41 | np.random.seed(123) # Generate always the same "random" numbers, for debugging.
 42 | from scipy import signal as sig
 43 | 
 44 | import sigproc as sp
 45 | import sigproc.pystraight
 46 | import sigproc.interfaces
 47 | 
 48 | # Add the path for REAPER f0 estimator
 49 | os.environ["PATH"] += os.pathsep + os.path.join(os.path.split(os.path.realpath(__file__))[0],'external/REAPER/build')
 50 | # Add the path for WORLD vocoder's amplitude spectral envelope estimator
 51 | sys.path.insert(0, os.path.join(os.path.split(os.path.realpath(__file__))[0],'external/pyworld/pyworld'))
 52 | 
 53 | def analysis_f0postproc(wav, fs, f0s=None, f0_min=60, f0_max=600,
 54 |              shift=0.005,        # Usually 5ms
 55 |              f0estimator='REAPER',
 56 |              verbose=1):
 57 |     '''
 58 |     Post process the F0 estimate.
 59 |     If f0s==None, an F0 estimate is extracted using REAPER.
 60 |     '''
 61 |     if f0s is None:
 62 |         # TODO Switch f0 estimator using `f0estimator`
 63 |         f0s = sigproc.interfaces.reaper(wav, fs, shift, f0_min, f0_max)
 64 | 
 65 |     # If only values are given, make two column matrix [time[s], value[Hz]] (ljuvela)
 66 |     if len(f0s.shape)==1:
 67 |         ts = (shift)*np.arange(len(f0s))
 68 |         f0s = np.vstack((ts, f0s)).T
 69 | 
 70 |     if not (f0s[:,1]>0).any():
 71 |         warnings.warn('''\n\nWARNING: No F0 value can be estimated in this signal.
 72 |          It will be replaced by the constant f0_min value ({}Hz).
 73 |         '''.format(f0_min), RuntimeWarning)
 74 |         f0s[:,1] = f0_min
 75 | 
 76 | 
 77 |     # Build the continuous f0
 78 |     f0s[:,1] = np.interp(f0s[:,0], f0s[f0s[:,1]>0,0], f0s[f0s[:,1]>0,1])
 79 |      # Avoid erratic values outside of the given interval
 80 |     f0s[:,1] = np.clip(f0s[:,1], f0_min, f0_max)
 81 |     # Removes steps in the f0 curve (see sigproc.resampling.f0s_rmsteps(.) )
 82 |     f0s = sp.f0s_rmsteps(f0s)
 83 |     # Resample the given f0 to regular intervals
 84 |     if np.std(np.diff(f0s[:,0]))>2*np.finfo(f0s[0,0]).resolution:
 85 |         warnings.warn('''\n\nWARNING: F0 curve seems to be sampled non-uniformly (mean(F0)={}, std(F0s')={}).
 86 |          It will be resampled at {}s intervals.
 87 |         '''.format(np.std(f0s[:,0]), np.std(np.diff(f0s[:,0])), shift), RuntimeWarning)
 88 |         f0s = sp.f0s_resample_cst(f0s, shift)
 89 | 
 90 |     return f0s
 91 | 
 92 | def analysis_spec(wav, fs, f0s,
 93 |              shift=0.005,    # Usually 5ms
 94 |              dftlen=4096,    # You can adapt this one according to your pipeline
 95 |              verbose=1):
 96 |     '''
 97 |     Estimate the amplitude spectral envelope.
 98 |     '''
 99 | 
100 |     if sp.pystraight.isanalysiseavailable():   # pragma: no cover
101 |                                                # Cannot be tested since STRAIGHT
102 |                                                # is not openly available.
103 |         warnings.warn('''\n\nWARNING: straight_mcep is available,
104 |             STRAIGHT vocoder will thus be used instead of WORLD.
105 |             Note that PML-related publications present results using STRAIGHT vocoder.
106 |         ''', RuntimeWarning)
107 | 
108 |         # Use STRAIGHT's envelope if available (as in PML's publications)
109 |         SPEC = sigproc.pystraight.analysis_spec(wav, fs, f0s, shift, dftlen, keeplen=True)
110 | 
111 |     elif sigproc.interfaces.worldvocoder_is_available():
112 | 
113 |         # Then try WORLD vocoder
114 |         import pyworld
115 |         wav = np.ascontiguousarray(wav)
116 |         #_f0, ts = pyworld.dio(x, fs, frame_period=shift*1000)    # raw pitch extractor # Use REAPER instead
117 |         pwts = np.ascontiguousarray(f0s[:,0])
118 |         pwf0 = pyworld.stonemask(wav, np.ascontiguousarray(f0s[:,1]), pwts, fs)  # pitch refinement
119 |         SPEC = pyworld.cheaptrick(wav, pwf0, pwts, fs, fft_size=dftlen)  # extract smoothed spectrogram
120 |         SPEC = 10.0*np.sqrt(SPEC) # TODO Best gain correction I could find. Hard to find the good one between PML and WORLD different syntheses
121 | 
122 |     else:   # pragma: no cover
123 |         # This a safeguard that should never happend since WORLD is embeded in
124 |         # pulsemodel.
125 |         # Estimate the sinusoidal parameters at regular intervals in order
126 |         # to build the amplitude spectral envelope
127 |         sinsreg, _ = sp.sinusoidal.estimate_sinusoidal_params(wav, fs, f0s, nbper=3, quadraticfit=True, verbose=verbose-1)
128 | 
129 |         warnings.warn('''\n\nWARNING: Neither straight_mcep nor WORLD's cheaptrick spectral envelope estimators are available.
130 |          Thus, a SIMPLISTIC Linear interpolation will be used for the spectral envelope.
131 |          Do _NOT_ use this envelope for speech synthesis!
132 |          Please use a better one (e.g. STRAIGHT's or WORLD's).
133 |          If you use this simplistic envelope, the TTS quality will
134 |          be lower than that in the results reported.
135 |         ''', RuntimeWarning)
136 | 
137 |         SPEC = sp.multi_linear(sinsreg, fs, dftlen)
138 |         SPEC = np.exp(SPEC)*np.sqrt(float(dftlen))
139 | 
140 |     return SPEC
141 | 
def analysis_pdd(wav, fs, f0s,
             dftlen=4096,    # You can adapt this one according to your pipeline
             pdd_sin_nbperperiod=4, # 4 analysis instants per period [2]
             pdd_sin_winnbper=2.5,  # 2.5 is enough for phase measure
                                    # (it overestimates the amplitude but we
                                    #  don't use it anyway)
             verbose=1):
    '''
    Estimate the Phase Distortion Deviation (PDD).

    Arguments
        wav                 : Waveform samples.
        fs                  : Sampling frequency of wav [Hz].
        f0s                 : Two-column matrix [time[s], value[Hz]].
        dftlen              : DFT length used for the PDD feature.
        pdd_sin_nbperperiod : Number of analysis instants per period.
        pdd_sin_winnbper    : Analysis window length, in number of periods.
        verbose             : Verbosity level.

    Returns
        The PDD feature, resampled at the time instants of f0s.
    '''

    # PDD needs a pitch synchronous analysis: start by resampling the f0
    # curve at pitch synchronous instants
    ps_f0s = sp.f0s_resample_pitchsync(f0s, nbperperiod=pdd_sin_nbperperiod)

    # Sinusoidal parameters at those instants (the f0 matrix returned by
    # the estimator replaces the one given to it)
    ps_sins, ps_f0s = sp.sinusoidal.estimate_sinusoidal_params(
        wav, fs, ps_f0s,
        nbper=pdd_sin_winnbper, quadraticfit=True, verbose=verbose-1)

    # PDD is computed from the sinusoidal parameters.
    # No envelope estimate is provided, so the VTF's phase stays in the
    # computation. However, the VTF's phase is ~constant wrt time and thus
    # disappears in the variance measure.
    # (The only risk is that the VTF's variations add to PDD)
    ps_pdd = sp.sinusoidal.estimate_pdd(
        ps_sins, ps_f0s, fs, pdd_sin_nbperperiod, dftlen,
        outFscale=True, rmPDtrend=True, extrapDC=True)

    # Go back from the pitch synchronous instants to those of the given f0s
    return sp.featureresample(ps_f0s[:,0], ps_pdd, f0s[:,0])
170 | 
def analysis_nm(wav, fs,
             f0s,                # Has to be continuous (should use analysis_f0postproc)
             PDD,                # Phase Distortion Deviation [2]
                                 # Its length should match f0s'
             pdd_threshold=0.75, # 0.75 as in [2]
             nm_clean=True,      # Use morphological opening and closure to
                                 # clean the mask and avoid learning rubbish.
             verbose=1):
    '''
    Estimate the Noise Mask (NM) from the Phase Distortion Deviation (PDD).

    wav     : Not used by this function (kept in the signature for the callers).
    fs      : Sampling frequency [Hz].
    f0s     : 2-columns array [time, f0], one row per frame.
    PDD     : Array (nb frames x dftlen/2+1); the DFT length is deduced from
              its second dimension.
    pdd_threshold : PDD values strictly above it are marked as noise (1),
              the others as deterministic (0).
    nm_clean: If True, clean the mask using a morphological opening followed
              by a closing (scipy.ndimage).
    verbose : Not used by this function.

    Returns the noise mask, same shape as PDD, made of 0 and 1 values.
    Raises ValueError if f0s and PDD do not have the same number of frames.
    '''

    if f0s.shape[0]!=PDD.shape[0]:
        raise ValueError('f0s size and PDD size do not match!') # pragma: no cover

    dftlen = (PDD.shape[1]-1)*2 # Get the DFT len from the PDD feature

    # The Noise Mask is just a thresholded version of PDD
    NM = PDD.copy()
    NM[PDD<=pdd_threshold] = 0
    NM[PDD>pdd_threshold] = 1

    if nm_clean:
        # Clean the PDD mask to avoid learning rubbish details
        import scipy.ndimage
        shift = np.mean(np.diff(f0s[:,0])) # Get the time shift from the F0 times
        frq = 70.0 # [Hz]
        # Size of the structuring element: one 70Hz period along time and 70Hz
        # along frequency. Each dimension is forced to be at least 1, otherwise
        # scipy refuses the (empty) structure when dftlen is small or the
        # shift is big.
        nbframes = max(1, int(np.round((1.0/frq)/shift)))
        nbbins = max(1, int(np.round(frq*dftlen/float(fs))))
        morphstruct = np.ones((nbframes, nbbins))
        # The morphological operations are applied on the deterministic mask
        # (i.e. the inverse of the noise mask)
        # NOTE(review): scipy's default border_value=0 presumably affects the
        #               frames/bins on the edges of the mask -- to confirm.
        HARM = 1.0-NM
        HARM = scipy.ndimage.binary_opening(HARM, structure=morphstruct)
        HARM = scipy.ndimage.binary_closing(HARM, structure=morphstruct)
        NM = 1.0-HARM

    # Avoid noise in low-freqs (below 1.5*f0)
    for n, f0 in enumerate(f0s[:,1]):
        NM[n,:int(np.round(1.5*f0*dftlen/float(fs)))] = 0.0

    return NM
211 | 
def analysis(wav, fs, f0s=None, f0_min=60, f0_max=600, f0estimator='REAPER',
             shift=0.005,    # Usually 5ms
             dftlen=4096,    # You can adapt this one according to your pipeline
             verbose=1):
    '''
    Run the complete analysis on a waveform: f0 curve, amplitude spectral
    envelope, Phase Distortion Deviation (PDD) and Noise Mask (NM).

    The given f0s (if any) always go through analysis_f0postproc before being
    used by the other analysis steps.

    Returns the tuple (f0s, SPEC, PDD, NM).
    If verbose>2, the features are also plotted using plot_features.
    '''

    if verbose>0:
        duration = len(wav)/float(fs)
        print('PML Analysis (dur={:.3f}s, fs={}Hz, f0 in [{},{}]Hz, shift={}s, dftlen={})'.format(duration, fs, f0_min, f0_max, shift, dftlen))

    # The f0 curve comes first since all the other features depend on it
    f0s = analysis_f0postproc(wav, fs, f0s,
                              f0_min=f0_min, f0_max=f0_max, shift=shift,
                              f0estimator=f0estimator, verbose=verbose)

    # Amplitude spectral envelope
    SPEC = analysis_spec(wav, fs, f0s, shift=shift, dftlen=dftlen, verbose=verbose)

    # Phase Distortion Deviation, and the noise mask derived from it
    PDD = analysis_pdd(wav, fs, f0s, dftlen=dftlen, verbose=verbose)
    NM = analysis_nm(wav, fs, f0s, PDD, verbose=verbose)

    if verbose>2:
        plot_features(wav=wav, fs=fs, f0s=f0s, SPEC=SPEC, PDD=PDD, NM=NM) # pragma: no cover

    return f0s, SPEC, PDD, NM
231 | 
def plot_features(wav=None, fs=None, f0s=None, SPEC=None, PDD=None, NM=None): # pragma: no cover
    '''
    Plot the given features in vertically stacked views sharing the time axis
    (one view per feature that is not None), then stop in an IPython debugger.

    The time extent of the SPEC, PDD and NM images is taken from the first and
    last times of f0s when f0s is given, otherwise [0.0, 1.0] is used.
    '''
    # TODO Could test this by writing in a picture
    import matplotlib.pyplot as plt

    timespan = [0.0, 1.0] # [start, end] of the image views

    # One view per given feature
    nbview = sum(1 for feat in (wav, f0s, SPEC, PDD, NM) if feat is not None)

    plt.ion()
    _, axs = plt.subplots(nbview, 1, sharex=True, sharey=False)
    if not isinstance(axs, np.ndarray):
        # With a single view, matplotlib doesn't return an array
        axs = np.array([axs])
    panels = iter(axs) # The views are filled from top to bottom

    def show_image(img, ylabel, **kwargs):
        # Show a time-frequency feature (frames on rows) in the next free view
        ax = next(panels)
        ax.imshow(img.T, origin='lower', aspect='auto', interpolation='none', extent=(timespan[0], timespan[1], 0, 0.5*fs), **kwargs)
        ax.set_ylabel(ylabel)

    if wav is not None:
        ax = next(panels)
        times = np.arange(len(wav))/float(fs)
        ax.plot(times, wav, 'k')
        ax.set_ylabel('Waveform\nAmplitude')
        ax.grid()
        ax.set_xlim((0.0, times[-1]))

    if f0s is not None:
        ax = next(panels)
        # The f0 times define the time extent of the images below
        timespan[0] = f0s[0,0]
        timespan[1] = f0s[-1,0]
        ax.plot(f0s[:,0], f0s[:,1], 'k')
        ax.set_ylabel('F0\nFrequency [Hz]')
        ax.grid()

    if SPEC is not None:
        show_image(sp.mag2db(SPEC), 'Amp. Envelope\nFrequency [Hz]', cmap='jet')

    if PDD is not None:
        show_image(PDD, 'PDD\nFrequency [Hz]', cmap='jet', vmin=0.0, vmax=2.0)

    if NM is not None:
        show_image(NM, 'Noise Mask \nFrequency [Hz]', cmap='Greys', vmin=0.0, vmax=1.0)

    axs[-1].set_xlabel('Time [s]')

    # Stop here, in the debugger, to let the user look at the figure
    from IPython.core.debugger import  Pdb; Pdb().set_trace()
275 | 
def _write_float32_file(values, fpath):
    # Write `values` as raw float32 in `fpath`, creating its directory if needed.
    dpath = os.path.dirname(fpath)
    if dpath!='' and (not os.path.isdir(dpath)):
        try:
            os.makedirs(dpath) # Works also for nested directories
        except OSError:
            # Another process might have created it in the meantime
            if not os.path.isdir(dpath): raise
    values.astype(np.float32).tofile(fpath)

def analysisf(fwav,
        shift=0.005,
        dftlen=4096,
        finf0txt=None, f0estimator='REAPER', f0_min=60, f0_max=600, ff0=None, f0_log=False,
        finf0bin=None, # input f0 file in binary
        fspec=None,
        spec_mceporder=None, # Mel-cepstral order for compressing the spectrogram (typically 59; None: no compression)
        spec_fwceporder=None,# Frequency warped cepstral order (very similar to above, just faster and less precise) (typically 59; None: no compression)
        spec_nbfwbnds=None,  # Number of mel-bands in the compressed half log spectrogram (None: no compression)
        spec_nblinlogbnds=None,  # Number of linear-bands in the compressed half log spectrogram (None: no compression)
        fpdd=None, pdd_mceporder=None, # Mel-cepstral order for compressing PDD spectrogram (typically 59; None: no compression)
        fnm=None, nm_nbfwbnds=None,    # Number of mel-bands in the compressed noise mask (None: no compression)
        preproc_fs=None, # Resample the waveform
        preproc_hp=None, # Cut-off of high-pass filter (e.g. 20Hz)
        verbose=1):
    '''
    Analyse the waveform file fwav and write the requested features in files.

    The waveform is optionally resampled (preproc_fs) and high-passed
    (preproc_hp) before the analysis. An input f0 curve can be given as a text
    file (finf0txt) or a raw float32 file (finf0bin), the latter having
    priority if both are given.

    A feature is computed and written only if its output file is given:
    ff0 (f0 values, log values if f0_log), fspec (spectral envelope),
    fpdd (Phase Distortion Deviation), fnm (noise mask).
    All the outputs are raw float32 files; the directory of an output file is
    created if it doesn't exist.

    Nothing is returned. If verbose>2, the uncompressed features are plotted.
    Raises ValueError if the waveform is empty.
    '''

    wav, fs, _ = sp.wavread(fwav)

    if len(wav)==0: raise ValueError('The waveform in {} is empty.'.format(fwav))

    if verbose>0: print('PML Analysis (dur={:.3f}s, fs={}Hz, f0 in [{},{}]Hz, shift={}s, dftlen={})'.format(len(wav)/float(fs), fs, f0_min, f0_max, shift, dftlen))

    if (not preproc_fs is None) and (preproc_fs!=fs):
        if verbose>0: print('    Resampling the waveform (new fs={}Hz)'.format(preproc_fs))
        wav = sp.resample(wav, fs, preproc_fs, method=2, deterministic=True)
        fs = preproc_fs

    if not preproc_hp is None:
        if verbose>0: print('    High-pass filter the waveform (cutt-off={}Hz)'.format(preproc_hp))
        # The cut-off given to butter has to be normalized by the Nyquist
        # frequency (0.5*fs), so that preproc_hp is really expressed in Hz.
        b, a = sig.butter(4, preproc_hp/(0.5*fs), btype='high')
        wav = sig.filtfilt(b, a, wav)

    f0s = None
    if finf0txt:
        f0s = np.loadtxt(finf0txt)

    # read input f0 file in float32 (ljuvela)
    if finf0bin:
        f0s = np.fromfile(finf0bin, dtype=np.float32)

    f0s = analysis_f0postproc(wav, fs, f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, f0estimator=f0estimator, verbose=verbose)

    # Copies of the uncompressed features, used only for plotting (verbose>2).
    # They are all initialised, so that plotting works whatever the requested outputs.
    f0sori = None
    SPECori = None
    PDDori = None
    NMori = None
    if verbose>2: f0sori=f0s.copy()

    if ff0:
        f0_values = f0s[:,1]
        if verbose>0: print('    Output F0 {} in: {}'.format(f0_values.shape, ff0))
        if f0_log: f0_values = np.log(f0_values)
        _write_float32_file(f0_values, ff0)

    SPEC = None
    if fspec:
        SPEC = analysis_spec(wav, fs, f0s, shift=shift, dftlen=dftlen, verbose=verbose)
        if verbose>2: SPECori=SPEC.copy()
        if not spec_mceporder is None: # pragma: no cover
                                       # Cannot test this because it needs SPTK
            SPEC = sp.spec2mcep(SPEC, sp.bark_alpha(fs), order=spec_mceporder)
        if not spec_fwceporder is None:
            SPEC = sp.loghspec2fwcep(np.log(abs(SPEC)), fs, order=spec_fwceporder)
        if not spec_nbfwbnds is None:
            SPEC = sp.linbnd2fwbnd(np.log(abs(SPEC)), fs, dftlen, spec_nbfwbnds)
        if not spec_nblinlogbnds is None:
            # NOTE(review): only the log is taken here, the number of bands
            #               itself is not used -- to confirm.
            SPEC = np.log(abs(SPEC))
        if verbose>0: print('    Output Spectrogram size={} in: {}'.format(SPEC.shape, fspec))
        _write_float32_file(SPEC, fspec)

    # The PDD is needed for both the PDD and the noise mask outputs
    PDD = None
    if fpdd or fnm:
        PDD = analysis_pdd(wav, fs, f0s, dftlen=dftlen, verbose=verbose)
        if verbose>2: PDDori=PDD.copy()

    if fpdd:
        if not pdd_mceporder is None:  # pragma: no cover
                                       # Cannot test this because it needs SPTK
            # If asked, compress PDD
            PDD[PDD<0.001] = 0.001 # From COVAREP
            PDD = sp.spec2mcep(PDD, sp.bark_alpha(fs), pdd_mceporder)
        if verbose>0: print('    Output PDD size={} in: {}'.format(PDD.shape, fpdd))
        _write_float32_file(PDD, fpdd)

    NM = None
    if fnm:
        NM = analysis_nm(wav, fs, f0s, PDD, verbose=verbose)
        if verbose>2: NMori=NM.copy()
        # If asked, compress NM
        if nm_nbfwbnds:
            # If asked, compress the noise mask using a number of mel bands
            NM = sp.linbnd2fwbnd(NM, fs, dftlen, nm_nbfwbnds)
        if verbose>0: print('    Output Noise Mask size={} in: {}'.format(NM.shape, fnm))
        _write_float32_file(NM, fnm)

    if verbose>2:
        plot_features(wav=wav, fs=fs, f0s=f0sori, SPEC=SPECori, PDD=PDDori, NM=NMori) # pragma: no cover
373 | 
def main(argv):
    '''
    Call the analysis from the command line.

    argv is the list of command line arguments (without the program name).
    The parsed options are forwarded to analysisf.
    '''

    # The command line options, in the order they appear in the help message
    options = [
        ("wavfile", dict(help="Input wav file")),
        ("--shift", dict(default=0.005, type=float, help="time step[s] between the input frames (def. 0.005s)")),
        ("--dftlen", dict(default=4096, type=int, help="Number of bins in the DFT (def. 4096)")),
        ("--inf0txt", dict(default=None, help="Given f0 file")),
        ("--inf0bin", dict(default=None, help="Given f0 file (single precision float binary)")),
        ("--f0_min", dict(default=60, type=float, help="Minimal possible f0[Hz] value (def. 60Hz)")),
        ("--f0_max", dict(default=600, type=float, help="Maximal possible f0[Hz] value (def. 600Hz)")),
        ("--f0", dict(default=None, help="Output f0 file")),
        ("--f0_log", dict(action='store_true', help="Output f0 file with log Hertz values instead of linear Hertz (def. False)")),
        ("--spec", dict(default=None, help="Output spectrum-related file")),
        ("--spec_mceporder", dict(default=None, type=int, help="Mel-cepstral order for the spectrogram (None:uncompressed; typically 59)")),
        ("--spec_fwceporder", dict(default=None, type=int, help="Frequency warped cepstral order (very similar to above, just faster and less precise) (typically 59)")),
        ("--spec_nbfwbnds", dict(default=None, type=int, help="Number of mel-bands in the compressed half log spectrogram (None:uncompressed; typically 129 (should be odd size as long as full spectrum size if power of 2 (even size)")),
        ("--spec_nblinlogbnds", dict(default=None, type=int, help="Number of frequency bands in the compressed half log spectrogram (None:uncompressed; typically 129 (should be odd size as long as full spectrum size if power of 2 (even size)")),
        ("--pdd", dict(default=None, help="Output Phase Distortion Deviation (PDD) file")),
        ("--pdd_mceporder", dict(default=None, type=int, help="Cepstral order for PDD (None:uncompressed; typically 59)")),
        ("--nm", dict(default=None, help="Output noise mask")),
        ("--nm_nbfwbnds", dict(default=None, type=int, help="Number of mel-bands in the compressed noise mask (None:uncompressed; typically 33)")),
        ("--preproc_fs", dict(default=None, type=float, help="[Hz] Resample the waveform before analysis.")),
        ("--preproc_hp", dict(default=None, type=float, help="[Hz] High-pass the waveform before analysis.")),
        ("--verbose", dict(default=1, type=int, help="Output some information")),
    ]

    parser = argparse.ArgumentParser()
    for optname, optargs in options:
        parser.add_argument(optname, **optargs)
    args = parser.parse_args(argv)

    # Map the command line options onto the arguments of analysisf
    analysisf(
        args.wavfile,
        shift=args.shift,
        dftlen=args.dftlen,
        finf0txt=args.inf0txt,
        f0_min=args.f0_min,
        f0_max=args.f0_max,
        ff0=args.f0,
        f0_log=args.f0_log,
        finf0bin=args.inf0bin,
        fspec=args.spec,
        spec_mceporder=args.spec_mceporder,
        spec_fwceporder=args.spec_fwceporder,
        spec_nbfwbnds=args.spec_nbfwbnds,
        spec_nblinlogbnds=args.spec_nblinlogbnds,
        fpdd=args.pdd,
        pdd_mceporder=args.pdd_mceporder,
        fnm=args.nm,
        nm_nbfwbnds=args.nm_nbfwbnds,
        preproc_fs=args.preproc_fs,
        preproc_hp=args.preproc_hp,
        verbose=args.verbose)
409 | 
if  __name__ == "__main__" :            # pragma: no cover
    # Script entry point: forward the command line arguments
    # (without the program name) to the command line interface.
    main(sys.argv[1:])
412 | 


--------------------------------------------------------------------------------
/synthesis.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 | 
  4 | Description
  5 | 
  6 | If using files, (call by command line or from python):
  7 |     all the inputs are raw float32 vectors files that are reshaped by the number
  8 |     of f0 values in ff0.
  9 | 
 10 | There are three safe patches that were not described in the publication[1]:
 11 |     (These are not critical, they might remove a few artifacts here and there).
 12 |     * The noise mask is slightly low-passed (smoothed) across frequency
 13 |         (def. 9 bins freq. window), in order to avoid cliffs in frequency domain
 14 |         that end up creating Gibbs phenomenon in the time domain.
 15 |     * High-pass filtering (def. 0.5*f0 cut-off)
 16 |         This centers each synthesized segment around zero, to avoid cutting
 17 |         any DC residual component (e.g. coming from the spectral envelope).
 18 |     * Short half-window  (def. 1ms (yes, one ms)) on the left of the pulse,
 19 |         in order to avoid any pre-echoes.
 20 | 
 21 | Reference
 22 |     [1] G. Degottex, P. Lanchantin, and M. Gales, "A Pulse Model in Log-domain
 23 |         for a Uniform Synthesizer," in Proc. 9th Speech Synthesis Workshop
 24 |         (SSW9), 2016.
 25 |     [2] G. Degottex, P. Lanchantin and M. Gales, "A Log Domain Pulse Model for
 26 |         Parametric Speech Synthesis", IEEE Transactions on Audio, Speech, and
 27 |         Language Processing, 26(1):57-70, 2018.
 28 | 
 29 | Copyright(C) 2016 Engineering Department, University of Cambridge, UK.
 30 | 
 31 | License
 32 |    Licensed under the Apache License, Version 2.0 (the "License");
 33 |    you may not use this file except in compliance with the License.
 34 |    You may obtain a copy of the License at
 35 | 
 36 |      http://www.apache.org/licenses/LICENSE-2.0
 37 | 
 38 |    Unless required by applicable law or agreed to in writing, software
 39 |    distributed under the License is distributed on an "AS IS" BASIS,
 40 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 41 |    See the License for the specific language governing permissions and
 42 |    limitations under the License.
 43 | 
 44 | Author
 45 |     Gilles Degottex <gad27@cam.ac.uk>
 46 | '''
 47 | 
 48 | import argparse
 49 | import sys
 50 | import warnings
 51 | 
 52 | import numpy as np
 53 | np.random.seed(123) # Generate always the same "random" numbers, for debugging.
 54 | import scipy
 55 | 
 56 | import sigproc as sp
 57 | 
 58 | def getwinlen(f0, fs, nbper):
 59 |     return int(np.max((0.050*fs, nbper*fs/f0))/2)*2+1   # Has to be odd
 60 | 
 61 | def synthesize(fs, f0s, SPEC, NM=None, wavlen=None
 62 |                 , ener_multT0=False
 63 |                 , nm_cont=False     # If False, force binary state of the noise mask (by thresholding at 0.5)
 64 |                 , nm_lowpasswinlen=9
 65 |                 , hp_f0coef=0.5     # factor of f0 for the cut-off of the high-pass filter (def. 0.5*f0)
 66 |                 , antipreechohwindur=0.001 # [s] Use to damp the signal at the beginning of the signal AND at the end of it
 67 |                 # Following options are for post-processing the features, after the generation/transformation and thus before waveform synthesis
 68 |                 , pp_f0_rmsteps=False # Removes steps in the f0 curve
 69 |                                       # (see sigproc.resampling.f0s_rmsteps(.) )
 70 |                 , pp_f0_smooth=None   # Smooth the f0 curve using median and FIR filters of given window duration [s]
 71 |                 , pp_atten1stharminsilences=None # Typical value is -25
 72 |                 , verbose=1):
 73 | 
 74 |     winnbper = 4    # Number of periods in a synthesis windows. It still contains only one single pulse, but leaves space for the VTF to decay without being cut abruptly.
 75 | 
 76 |     # Copy the inputs to avoid modifying them
 77 |     f0s = f0s.copy()
 78 |     SPEC = SPEC.copy()
 79 |     if not NM is None: NM = NM.copy()
 80 |     else:              NM = np.zeros(SPEC.shape)
 81 | 
 82 |     NM = np.clip(NM, 0.0, 1.0)  # The noise mask is supposed to be in [0,1]
 83 | 
 84 |     # Check the size of the inputs
 85 |     if f0s.shape[0]!=SPEC.shape[0]:
 86 |         raise ValueError('F0 size {} and spectrogram size {} do not match'.format(f0s.shape[0], SPEC.shape[0])) # pragma: no cover
 87 |     if not NM is None:
 88 |         if SPEC.shape!=NM.shape:
 89 |             raise ValueError('spectrogram size {} and NM size {} do not match.'.format(SPEC.shape, NM.shape)) # pragma: no cover
 90 | 
 91 |     if wavlen==None: wavlen = int(np.round(f0s[-1,0]*fs))
 92 |     dftlen = (SPEC.shape[1]-1)*2
 93 |     shift = np.median(np.diff(f0s[:,0]))
 94 |     if verbose>0:
 95 |         print('PML Synthesis (dur={}s, fs={}Hz, f0 in [{:.0f},{:.0f}]Hz, shift={}s, dftlen={})'.format(wavlen/float(fs), fs, np.min(f0s[:,1]), np.max(f0s[:,1]), shift, dftlen))
 96 | 
 97 |     # Prepare the features
 98 | 
 99 |     # Enforce continuous f0
100 |     f0s[:,1] = np.interp(f0s[:,0], f0s[f0s[:,1]>0,0], f0s[f0s[:,1]>0,1])
101 |     # If asked, removes steps in the f0 curve
102 |     if pp_f0_rmsteps:
103 |         f0s = sp.f0s_rmsteps(f0s)
104 |     # If asked, smooth the f0 curve using median and FIR filters
105 |     if not pp_f0_smooth is None:
106 |         print('    Smoothing f0 curve using {}[s] window'.format(pp_f0_smooth))
107 |         import scipy.signal as sig
108 |         lf0 = np.log(f0s[:,1])
109 |         bcoefslen = int(0.5*pp_f0_smooth/shift)*2+1
110 |         lf0 = sig.medfilt(lf0, bcoefslen)
111 |         bcoefs = np.hamming(bcoefslen)
112 |         bcoefs = bcoefs/sum(bcoefs)
113 |         lf0 = sig.filtfilt(bcoefs, [1], lf0)
114 |         f0s[:,1] = np.exp(lf0)
115 | 
116 |     winlenmax = getwinlen(np.min(f0s[:,1]), fs, winnbper)
117 |     if winlenmax>dftlen:
118 |         warnings.warn('\n\nWARNING: The maximum window length ({}) is bigger than the DFT length ({}). Please, increase the DFT length of your spectral features (the second dimension) or check if the f0 curve has extremly low values and try to clip them to higher values (at least higher than 50Hz). The f0 curve has been clipped to {}Hz.\n\n'.format(winlenmax, dftlen, winnbper*fs/float(dftlen))) # pragma: no cover
119 |         f0s[:,1] = np.clip(f0s[:,1], winnbper*fs/float(dftlen-2), 1e6)
120 | 
121 |     if not NM is None:
122 |         # Remove noise below f0, as it is supposed to be already the case
123 |         for n in range(NM.shape[0]):
124 |             NM[n,:int((float(dftlen)/fs)*2*f0s[n,1])] = 0.0
125 | 
126 |     if not nm_cont:
127 |         print('    Forcing binary noise mask')
128 |         NM[NM<=0.5] = 0.0 # To be sure that voiced segments are not hoarse
129 |         NM[NM>0.5] = 1.0  # To be sure the noise segments are fully noisy
130 | 
131 |     # Generate the pulse positions [1](2) (i.e. the synthesis instants, the GCIs in voiced segments)
132 |     ts = [0.0]
133 |     while ts[-1]<float(wavlen)/fs:
134 |         cf0 = np.interp(ts[-1], f0s[:,0], f0s[:,1])
135 |         if cf0<50.0: cf0 = 50
136 |         ts.append(ts[-1]+(1.0/cf0))
137 |     ts = np.array(ts)
138 |     f0s = np.vstack((ts, np.interp(ts, f0s[:,0], f0s[:,1]))).T
139 | 
140 | 
141 |     # Resample the features to the pulse positions
142 | 
143 |     # Spectral envelope uses the nearest, to avoid over-smoothing
144 |     SPECR = np.zeros((f0s.shape[0], dftlen/2+1))
145 |     for n, t in enumerate(f0s[:,0]): # Nearest: Way better for plosives
146 |         idx = int(np.round(t/shift))
147 |         idx = np.clip(idx, 0, SPEC.shape[0]-1)
148 |         SPECR[n,:] = SPEC[idx,:]
149 | 
150 |     # Keep trace of the median energy [dB] over the whole signal
151 |     ener = np.mean(SPECR, axis=1)
152 |     idxacs = np.where(sp.mag2db(ener) > sp.mag2db(np.max(ener))-30)[0] # Get approx active frames # TODO Param
153 |     enermed = sp.mag2db(np.median(ener[idxacs])) # Median energy [dB]
154 |     ener = sp.mag2db(ener)
155 | 
156 |     # Resample the noise feature to the pulse positions
157 |     # Smooth the frequency response of the mask in order to avoid Gibbs
158 |     # (poor Gibbs nobody want to see him)
159 |     nm_lowpasswin = np.hanning(nm_lowpasswinlen)
160 |     nm_lowpasswin /= np.sum(nm_lowpasswin)
161 |     NMR = np.zeros((f0s.shape[0], dftlen/2+1))
162 |     for n, t in enumerate(f0s[:,0]):
163 |         idx = int(np.round(t/shift)) # Nearest is better for plosives
164 |         idx = np.clip(idx, 0, NM.shape[0]-1)
165 |         NMR[n,:] = NM[idx,:]
166 |         if nm_lowpasswinlen>1:
167 |             NMR[n,:] = scipy.signal.filtfilt(nm_lowpasswin, [1.0], NMR[n,:])
168 | 
169 |     NMR = np.clip(NMR, 0.0, 1.0)
170 | 
171 |     # The complete waveform that we will fill with the pulses
172 |     wav = np.zeros(wavlen)
173 |     # Half window on the left of the synthesized segment to avoid pre-echo
174 |     dampinhwin = np.hanning(1+2*int(np.round(antipreechohwindur*fs))) # 1ms forced dampingwindow
175 |     dampinhwin = dampinhwin[:(len(dampinhwin)-1)/2+1]
176 | 
177 |     for n, t in enumerate(f0s[:,0]):
178 |         f0 = f0s[n,1]
179 | 
180 |         if verbose>1: print "\rPM Synthesis (python) t={:4.3f}s f0={:3.3f}Hz               ".format(t,f0),
181 | 
182 |         # Window's length
183 |         # TODO It should be ensured that the beggining and end of the
184 |         #      noise is within the window. Nothing is doing this currently!
185 |         winlen = getwinlen(f0, fs, winnbper)
186 |         # TODO We also assume that the VTF's decay is shorter
187 |         #      than winnbper-1 periods (dangerous with high pitched and tense voice).
188 |         if winlen>dftlen: raise ValueError('The window length ({}) is bigger than the DFT length ({}). Please, increase the dftlen of your spectral features or check if the f0 curve has extremly low values and try to clip them to higher values (at least higher than 50[Hz])'.format(winlen, dftlen)) # pragma: no cover
189 | 
190 |         # Set the rough position of the pulse in the window (the closest sample)
191 |         # We keep a third of the window (1 period) on the left because the
192 |         # pulse signal is minimum phase. And 2/3rd (remaining 2 periods)
193 |         # on the right to let the VTF decay.
194 |         pulseposinwin = int((1.0/winnbper)*winlen)
195 | 
196 |         # The sample indices of the current pulse wrt. the final waveform
197 |         winidx = int(round(fs*t)) + np.arange(winlen)-pulseposinwin
198 | 
199 | 
200 |         # Build the pulse spectrum
201 | 
202 |         # Let start with a Dirac
203 |         S = np.ones(dftlen/2+1, dtype=np.complex64)
204 | 
205 |         # Add the delay to place the Dirac at the "GCI": exp(-j*2*pi*t_i)
206 |         delay = -pulseposinwin - fs*(t-int(round(fs*t))/float(fs))
207 |         S *= np.exp((delay*2j*np.pi/dftlen)*np.arange(dftlen/2+1))
208 | 
209 |         # Add the spectral envelope
210 |         # Both amplitude and phase
211 |         E = SPECR[n,:] # Take the amplitude from the given one
212 |         if hp_f0coef!=None:
213 |             # High-pass it to avoid any residual DC component.
214 |             fcut = hp_f0coef*f0
215 |             if not pp_atten1stharminsilences is None and ener[n]-enermed<pp_atten1stharminsilences:
216 |                 fcut = 1.5*f0 # Try to cut between first and second harm
217 |             HP = sp.butter2hspec(fcut, 4, fs, dftlen, high=True)
218 |             E *= HP
219 |             # Not necessarily good as it is non-causal, so make it causal...
220 |             # ... together with the VTF response below.
221 |         # Build the phase of the envelope from the amplitude
222 |         E = sp.hspec2minphasehspec(E, replacezero=True) # We spend 2 FFT here!
223 |         S *= E # Add it to the current pulse
224 | 
225 |         # Add energy correction wrt f0.
226 |         # STRAIGHT and AHOCODER vocoders do it.
227 |         # (why ? to equalize the energy when changing the pulse's duration ?)
228 |         if ener_multT0:
229 |             S *= np.sqrt(fs/f0)
230 | 
231 |         # Generate the segment of Gaussian noise
232 |         # Use mid-points before/after pulse position
233 |         if n>0: leftbnd=int(np.round(fs*0.5*(f0s[n-1,0]+t)))
234 |         else:   leftbnd=int(np.round(fs*(t-0.5/f0s[n,1]))) # int(0)
235 |         if n<f0s.shape[0]-1: rightbnd=int(np.round(fs*0.5*(t+f0s[n+1,0])))-1
236 |         else:                rightbnd=int(np.round(fs*(t+0.5/f0s[n,1])))   #rightbnd=int(wavlen-1)
237 |         gausswinlen = rightbnd-leftbnd # The length of the noise segment
238 |         gaussnoise4win = np.random.normal(size=(gausswinlen)) # The noise
239 | 
240 |         GN = np.fft.rfft(gaussnoise4win, dftlen) # Move the noise to freq domain
241 |         # Normalize it by its energy (@Yannis, That's your answer at SSW9!)
242 |         GN /= np.sqrt(np.mean(np.abs(GN)**2))
243 |         # Place the noise within the pulse's window
244 |         delay = (pulseposinwin-(leftbnd-winidx[0]))
245 |         GN *= np.exp((delay*2j*np.pi/dftlen)*np.arange(dftlen/2+1))
246 | 
247 |         # Add it to the pulse spectrum, under the condition of the mask
248 |         S *= GN**NMR[n,:]
249 | 
250 |         # That's it! the pulse spectrum is ready!
251 | 
252 |         # Move it to time domain
253 |         deter = np.fft.irfft(S)[0:winlen]
254 | 
255 |         # Add half window on the left of the synthesized segment
256 |         # to avoid any possible pre-echo
257 |         deter[:leftbnd-winidx[0]-len(dampinhwin)] = 0.0
258 |         deter[leftbnd-winidx[0]-len(dampinhwin):leftbnd-winidx[0]] *= dampinhwin
259 | 
260 |         # Add half window on the right
261 |         # to avoid cutting the VTF response abruptly
262 |         deter[-len(dampinhwin):] *= dampinhwin[::-1]
263 | 
264 |         # Write the synthesized segment in the final waveform
265 |         if winidx[0]<0 or winidx[-1]>=wavlen:
266 |             # The window is partly outside of the waveform ...
267 |             # ... thus copy only the existing part
268 |             itouse = np.logical_and(winidx>=0,winidx<wavlen)
269 |             wav[winidx[itouse]] += deter[itouse]
270 |         else:
271 |             wav[winidx] += deter
272 | 
273 |     if verbose>1: print '\r                                                               \r',
274 | 
275 |     if verbose>2:                                             # pragma: no cover
276 |         import matplotlib.pyplot as plt
277 |         plt.ion()
278 |         _, axs = plt.subplots(3, 1, sharex=True, sharey=False)
279 |         times = np.arange(len(wav))/float(fs)
280 |         axs[0].plot(times, wav, 'k')
281 |         axs[0].set_ylabel('Waveform\nAmplitude')
282 |         axs[0].grid()
283 |         axs[1].plot(f0s[:,0], f0s[:,1], 'k')
284 |         axs[1].set_ylabel('F0\nFrequency [Hz]')
285 |         axs[1].grid()
286 |         axs[2].imshow(sp.mag2db(SPEC).T, origin='lower', aspect='auto', interpolation='none', extent=(f0s[0,0], f0s[-1,0], 0, 0.5*fs))
287 |         axs[2].set_ylabel('Amp. Envelope\nFrequency [Hz]')
288 | 
289 |         from IPython.core.debugger import  Pdb; Pdb().set_trace()
290 | 
291 |     return wav
292 | 
293 | 
294 | 
def _read_float32_frames(fpath, nbframes):
    # Load a raw float32 file and reshape it with one row per frame.
    return np.fromfile(fpath, dtype=np.float32).reshape((nbframes, -1))

def _pdd2nm(PDD, pdd_thresh):
    # Threshold the PDD to obtain a binary noise mask (1 strictly above the threshold, 0 otherwise).
    NM = PDD.copy()
    NM[PDD<=pdd_thresh] = 0.0
    NM[PDD>pdd_thresh] = 1.0
    return NM

def synthesizef(fs, shift=0.005, dftlen=4096, ff0=None, flf0=None, fspec=None, flspec=None, ffwlspec=None, ffwcep=None, fmcep=None, fpdd=None, fmpdd=None, fnm=None, ffwnm=None, nm_cont=False, fsyn=None, verbose=1):
    '''
    Call the synthesis from python using file inputs and outputs

    All the input files are raw float32 files, reshaped using the number of
    f0 values (one row per frame).

    f0 input (one is mandatory):
        ff0 (f0 values [Hz]) or flf0 (log f0 values; only the values >0 are
        exponentiated).
    Spectral envelope input (one is mandatory):
        fspec (linear amplitudes), flspec (log amplitudes), ffwlspec
        (frequency warped log amplitudes), ffwcep (frequency warped cepstrum)
        or fmcep (mel-cepstrum).
    Noise input (optional):
        fpdd or fmpdd (PDD, thresholded at 0.75 to obtain the mask),
        fnm (noise mask) or ffwnm (frequency warped noise mask).
    When multiple inputs of the same kind are given, the last one of the
    lists above is used.

    The frame times are shift*[0,1,2,...].
    If fsyn is given, the synthesized waveform is written in this file.

    Returns the synthesized waveform.
    Raises ValueError if the f0 or the spectral envelope input is missing.
    '''
    f0 = None
    if ff0:
        f0 = np.fromfile(ff0, dtype=np.float32)
    if flf0:
        f0 = np.fromfile(flf0, dtype=np.float32)
        f0[f0>0] = np.exp(f0[f0>0])
    if f0 is None:
        raise ValueError('An f0 input file is necessary (either ff0 or flf0)')
    nbframes = len(f0)
    ts = (shift)*np.arange(nbframes)
    f0s = np.vstack((ts, f0)).T

    SPEC = None
    if fspec:
        SPEC = _read_float32_frames(fspec, nbframes)
    if flspec:
        SPEC = np.exp(_read_float32_frames(flspec, nbframes))
    if ffwlspec:
        FWLSPEC = _read_float32_frames(ffwlspec, nbframes)
        SPEC = np.exp(sp.fwbnd2linbnd(FWLSPEC, fs, dftlen, smooth=True))
    if ffwcep:
        FWCEP = _read_float32_frames(ffwcep, nbframes)
        SPEC = np.exp(sp.fwcep2loghspec(FWCEP, fs, dftlen))
    if fmcep:                           # pragma: no cover
                                        # Cannot test this because it needs SPTK
        MCEP = _read_float32_frames(fmcep, nbframes)
        SPEC = sp.mcep2spec(MCEP, sp.bark_alpha(fs), dftlen)
    if SPEC is None:
        raise ValueError('A spectral envelope input file is necessary (fspec, flspec, ffwlspec, ffwcep or fmcep)')

    NM = None
    pdd_thresh = 0.75 # For this value, see:
        # G. Degottex and D. Erro, "A uniform phase representation for the harmonic model in speech synthesis applications," EURASIP, Journal on Audio, Speech, and Music Processing - Special Issue: Models of Speech - In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014.
    if fpdd:
        PDD = _read_float32_frames(fpdd, nbframes)
        NM = _pdd2nm(PDD, pdd_thresh)
    if fmpdd:                      # pragma: no cover
                                   # Cannot test this because it needs SPTK
        MPDD = _read_float32_frames(fmpdd, nbframes)
        PDD = sp.mcep2spec(MPDD, sp.bark_alpha(fs), dftlen)
        NM = _pdd2nm(PDD, pdd_thresh)

    if fnm:
        NM = _read_float32_frames(fnm, nbframes)
    if ffwnm:
        FWNM = _read_float32_frames(ffwnm, nbframes)
        NM = sp.fwbnd2linbnd(FWNM, fs, dftlen)

    syn = synthesize(fs, f0s, SPEC, NM=NM, nm_cont=nm_cont, verbose=verbose)
    if fsyn:
        sp.wavwrite(fsyn, syn, fs, norm_max_ifneeded=True, verbose=verbose)

    return syn
358 | 
def main(argv):
    '''
    Call the synthesis from the command line.

    Parse the command-line arguments given in `argv` (the list of argument
    strings, without the program name) and forward them to synthesizef().

    The positional argument is the path of the waveform to write. All the
    other file options are *inputs* of the synthesis: each one is only used
    when its path is provided (they all default to None).

    The value returned by synthesizef() is discarded; nothing is returned.
    '''

    argpar = argparse.ArgumentParser()
    argpar.add_argument("synth", help="Output synthesis file")
    argpar.add_argument("--f0", default=None, help="Input f0[Hz] file")
    argpar.add_argument("--logf0", default=None, help="Input f0[log Hz] file")
    argpar.add_argument("--spec", default=None, help="Input amplitude spectrogram [linear values]")
    argpar.add_argument("--lspec", default=None, help="Input amplitude spectrogram [log spectral values on linear frequency scale]")
    argpar.add_argument("--fwlspec", default=None, help="Input amplitude spectrogram [frequency warped log spectral values]")
    argpar.add_argument("--fwcep", default=None, help="Input amplitude spectrogram [frequency warped cepstrum values]")
    argpar.add_argument("--mcep", default=None, help="Input amplitude spectrogram [mel-cepstrum values]")
    argpar.add_argument("--pdd", default=None, help="Input Phase Distortion Deviation file [linear values]")
    argpar.add_argument("--mpdd", default=None, help="Input Phase Distortion Deviation file [mel-cepstrum values]")
    # The noise masks are read by the synthesis, they are not written by it.
    argpar.add_argument("--nm", default=None, help="Input Noise Mask [linear values in [0,1] ]")
    argpar.add_argument("--fwnm", default=None, help="Input Noise Mask [compressed in bands with values still in [0,1] ]")
    argpar.add_argument("--nm_cont", action='store_true', help="Allow continuous values for the noisemask (def. False)")
    argpar.add_argument("--fs", default=16000, type=int, help="Sampling frequency[Hz]")
    argpar.add_argument("--shift", default=0.005, type=float, help="Time step[s] between the frames")
    # Without type=int, a value given on the command line would be forwarded
    # as a string whereas the default value is the integer 1.
    argpar.add_argument("--verbose", default=1, type=int, help="Output some information")
    args = argpar.parse_args(argv)

    # The DFT length is not exposed on the command line; it is fixed to 4096.
    args.dftlen = 4096

    synthesizef(args.fs, shift=args.shift, dftlen=args.dftlen, ff0=args.f0, flf0=args.logf0, fspec=args.spec, flspec=args.lspec, ffwlspec=args.fwlspec, ffwcep=args.fwcep, fmcep=args.mcep, fnm=args.nm, ffwnm=args.fwnm, nm_cont=args.nm_cont, fpdd=args.pdd, fmpdd=args.mpdd, fsyn=args.synth, verbose=args.verbose)
386 | 
if __name__ == "__main__":  # pragma: no cover
    # Executed as a script: hand over the command-line arguments, without
    # the program name, to the command-line entry point.
    main(sys.argv[1:])
389 | 


--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
 1 | # Prepare or clean data for tests
 2 | #
 3 | # Usage
 4 | # 	$ make
 5 | #
 6 | # Copyright(C) 2016 Engineering Department, University of Cambridge, UK.
 7 | #
 8 | # License
 9 | #    Licensed under the Apache License, Version 2.0 (the "License");
10 | #    you may not use this file except in compliance with the License.
11 | #    You may obtain a copy of the License at
12 | #
13 | #      http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | #    Unless required by applicable law or agreed to in writing, software
16 | #    distributed under the License is distributed on an "AS IS" BASIS,
17 | #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | #    See the License for the specific language governing permissions and
19 | #    limitations under the License.
20 | #
21 | # Author
22 | #     Gilles Degottex <gad27@cam.ac.uk>
23 | 
# None of these targets produces a file bearing its name.
.PHONY: all clean distclean

# Feature and intermediate files written by the tests next to the samples.
# Shared by `clean` and `distclean` so that both lists stay in sync.
GENERATED_FEATURES = *.f0 *.lf0 *.f0txt *.logf0 *.spec* *.lspec *.fwspec *.fwlspec *.fwcep *.pdd *.mpdd *.nm *.bndnm *.fwnm *.diff

all:
	$(MAKE) samples.scp

# Download the test samples and list them in samples.scp
# (the re-synthesized waveforms are excluded from the list).
samples.scp:
	wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_slt_arctic/wav/arctic_a0010.wav -O slt_arctic_a0010.wav
	wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_bdl_arctic/wav/arctic_a0020.wav -O bdl_arctic_a0020.wav
	wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_clb_arctic/wav/arctic_a0030.wav -O clb_arctic_a0030.wav
	wget http://festvox.org/cmu_arctic/cmu_arctic/cmu_us_awb_arctic/wav/arctic_a0040.wav -O awb_arctic_a0040.wav
	ls *.wav |grep -v '\.resynth\.wav$$' > samples.scp

# Remove what the tests generated, but keep the downloaded samples.
clean:
	rm -f *.resynth.wav $(GENERATED_FEATURES)

# Remove everything, including the downloaded samples and their list.
distclean:
	rm -f samples.scp *.wav $(GENERATED_FEATURES)
39 | 


--------------------------------------------------------------------------------
/test/test_smoke.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))
  4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
  5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)),'external/pyworld/pyworld'))
  6 | 
  7 | import unittest
  8 | 
  9 | import numpy as np
 10 | np.random.seed(123)
 11 | 
 12 | filenames = ['slt_arctic_a0010.wav', 'bdl_arctic_a0020.wav', 'clb_arctic_a0030.wav', 'awb_arctic_a0040.wav']
 13 | filename_totest = 0
 14 | 
 15 | class TestSmoke(unittest.TestCase):
 16 | 
 17 |     @classmethod
 18 |     def test_smoke_cmd_analysis(cls):
 19 |         fname = filenames[filename_totest] # Just with one file for smoke test
 20 | 
 21 |         import analysis
 22 |         analysis.main(['test/'+fname])
 23 |         analysis.main(['test/'+fname, '--f0', 'test/'+fname.replace('.wav','.f0')])
 24 |         analysis.main(['test/'+fname, '--f0', 'test/'+fname.replace('.wav','.f0'), '--preproc_fs', '8000'])
 25 |         analysis.main(['test/'+fname, '--f0_min', '75', '--f0', 'test/'+fname.replace('.wav','.f0')])
 26 |         analysis.main(['test/'+fname, '--f0_max', '200', '--f0', 'test/'+fname.replace('.wav','.f0')])
 27 |         analysis.main(['test/'+fname, '--f0_min', '81', '--f0_max', '220', '--f0', 'test/'+fname.replace('.wav','.f0')])
 28 | 
 29 |         f0s = np.fromfile('test/'+fname.replace('.wav','.f0'), dtype=np.float32)
 30 |         f0s = f0s.reshape((-1, 1))
 31 |         np.savetxt('test/'+fname.replace('.wav','.f0txt'), f0s)
 32 | 
 33 |         analysis.main(['test/'+fname, '--inf0txt', 'test/'+fname.replace('.wav','.f0txt'), '--spec', 'test/'+fname.replace('.wav','.spec')])
 34 |         analysis.main(['test/'+fname, '--inf0bin', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec')])
 35 |         analysis.main(['test/'+fname, '--f0_log', '--f0', 'test/'+fname.replace('.wav','.lf0')])
 36 |         analysis.main(['test/'+fname, '--spec', 'test/'+fname.replace('.wav','.spec')])
 37 |         # analysis.main(['test/'+fname, ' --spec_mceporder', '59', '--spec', 'test/'+fname.replace('.wav','.mcep')]) # Need SPTK for this one
 38 |         analysis.main(['test/'+fname, '--spec_nbfwbnds', '65', '--spec', 'test/'+fname.replace('.wav','.fwlspec')])
 39 |         analysis.main(['test/'+fname, '--pdd', 'test/'+fname.replace('.wav','.pdd')])
 40 |         # analysis.main(['test/'+fname, '--pdd_mceporder', '60', '--pdd', 'test/'+fname.replace('.wav','.pdd')])  # Need SPTK for this one
 41 |         analysis.main(['test/'+fname, '--nm', 'test/'+fname.replace('.wav','.nm')])
 42 |         analysis.main(['test/'+fname, '--nm_nbfwbnds', '33', '--nm', 'test/'+fname.replace('.wav','.fwnm')])
 43 | 
 44 |         # Test pre-processing
 45 |         analysis.main(['test/'+fname, '--inf0txt', 'test/'+fname.replace('.wav','.f0txt'), '--spec', 'test/'+fname.replace('.wav','.spec_resample16kHz'), '--preproc_fs', '16000'])
 46 |         analysis.main(['test/'+fname, '--inf0txt', 'test/'+fname.replace('.wav','.f0txt'), '--spec', 'test/'+fname.replace('.wav','.spec_preproc_hp'), '--preproc_hp', '100.0'])
 47 | 
 48 |         # TODO Test various sampling fromats, encoding and sampling rates for wav files
 49 | 
 50 |     @classmethod
 51 |     def test_smoke_cmd_synthesis(cls):
 52 |         fname = filenames[filename_totest] # Just with one file for smoke test
 53 | 
 54 |         import analysis
 55 |         import synthesis
 56 | 
 57 |         analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--pdd', 'test/'+fname.replace('.wav','.pdd')])
 58 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--pdd', 'test/'+fname.replace('.wav','.pdd')])
 59 | 
 60 |         analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--nm', 'test/'+fname.replace('.wav','.nm')])
 61 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec')])
 62 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--nm', 'test/'+fname.replace('.wav','.nm')])
 63 | 
 64 |         analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '200', '--f0_log', '--f0', 'test/'+fname.replace('.wav','.lf0'), '--spec', 'test/'+fname.replace('.wav','.spec')])
 65 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--spec', 'test/'+fname.replace('.wav','.spec')])
 66 | 
 67 |         analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec_nblinlogbnds', '129', '--spec', 'test/'+fname.replace('.wav','.lspec')])
 68 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--lspec', 'test/'+fname.replace('.wav','.lspec')])
 69 | 
 70 |         analysis.main(['test/'+fname, '--f0_min', '75', '--f0_max', '500', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec_fwceporder', '59', '--spec', 'test/'+fname.replace('.wav','.fwcep'), '--nm_nbfwbnds', '33', '--nm', 'test/'+fname.replace('.wav','.fwnm')])
 71 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--fwcep', 'test/'+fname.replace('.wav','.fwcep'), '--fwnm', 'test/'+fname.replace('.wav','.fwnm')])
 72 | 
 73 | 
 74 |         # This one is the most used and thus should be the last one
 75 |         analysis.main(['test/'+fname, '--f0_log', '--f0', 'test/'+fname.replace('.wav','.lf0'), '--spec_nbfwbnds', '65', '--spec', 'test/'+fname.replace('.wav','.fwlspec'), '--nm_nbfwbnds', '33', '--nm', 'test/'+fname.replace('.wav','.fwnm')])
 76 |         synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--logf0', 'test/'+fname.replace('.wav','.lf0'), '--fwlspec', 'test/'+fname.replace('.wav','.fwlspec'), '--fwnm', 'test/'+fname.replace('.wav','.fwnm')])
 77 | 
 78 | 
 79 |     # def test_smoke_analysisf(self):
 80 |     #     fname = filenames[0] # Just with one file for smoke test
 81 |     #     import pulsemodel
 82 |     #
 83 |     #     f0_min = 75
 84 |     #     f0_max = 800
 85 |     #
 86 |     #     pulsemodel.analysisf(fname, f0_min=f0_min, f0_max=f0_max, ff0=fname.replace('.wav','.lf0'), f0_log=True,
 87 |     #     fspec='test/'+fname.replace('.wav','.fwlspec'), spec_nbfwbnds=65, fnm=fname.replace('.wav','.fwnm'), nm_nbfwbnds=33, verbose=1)
 88 | 
 89 |     @classmethod
 90 |     def test_smoke_analysis_synthesis(cls):
 91 |         fname = filenames[filename_totest] # Just with one file for smoke test
 92 | 
 93 |         f0_min = 75
 94 |         f0_max = 800
 95 |         shift = 0.010
 96 |         verbose = 1
 97 |         dftlen = 512
 98 | 
 99 |         import pulsemodel
100 |         import sigproc as sp
101 | 
102 |         wav, fs, _ = sp.wavread('test/'+fname)
103 | 
104 |         f0s, SPEC, PDD, NM = pulsemodel.analysis(wav, fs)
105 | 
106 |         _ = pulsemodel.analysis_f0postproc(wav, fs, f0s=np.zeros(f0s[:,1].shape), f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose)
107 | 
108 |         _ = pulsemodel.analysis_f0postproc(wav, fs, f0s=f0s[:,1], f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose)
109 | 
110 |         nonunif0s = f0s.copy()
111 |         nonunif0s[:,0] = np.random.rand(f0s.shape[0])*(f0s[-1,0]-f0s[0,0]) + f0s[0,0]
112 |         nonunif0s[:,0] = np.sort(nonunif0s[:,0])
113 |         _ = pulsemodel.analysis_f0postproc(wav, fs, f0s=nonunif0s, f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose)
114 | 
115 |         f0s = pulsemodel.analysis_f0postproc(wav, fs, f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose)
116 | 
117 |         f0_min = 60
118 |         f0_max = 600
119 |         shift = 0.005
120 |         dftlen = 4096
121 |         f0s, SPEC, PDD, NM = pulsemodel.analysis(wav, fs, f0s=f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, dftlen=dftlen, verbose=verbose)
122 | 
123 | 
124 |         _ = pulsemodel.synthesize(fs, f0s, SPEC, wavlen=len(wav))
125 | 
126 |         _ = pulsemodel.synthesize(fs, f0s, SPEC, NM=NM, wavlen=len(wav))
127 | 
128 |         NM = PDD.copy()
129 |         NM[NM>0.75] = 1
130 |         NM[NM<=0.75] = 0
131 |         _ = pulsemodel.synthesize(fs, f0s, SPEC, NM=NM, wavlen=len(wav))
132 | 
133 |         _ = pulsemodel.synthesize(fs, f0s, SPEC, NM=NM, wavlen=len(wav)
134 |                         , ener_multT0=True
135 |                         , nm_cont=True, nm_lowpasswinlen=13, hp_f0coef=0.25, antipreechohwindur=0.002
136 |                         , pp_f0_rmsteps=True, pp_f0_smooth=0.100, pp_atten1stharminsilences=-25
137 |                         , verbose=verbose)
138 | 
139 |     def test_repeatability(self):
140 | 
141 |         f0_min = 60
142 |         f0_max = 600
143 | 
144 |         import pulsemodel
145 |         # import pyworld
146 |         # import sigproc as sp
147 | 
148 |         for fname in filenames:
149 |             fname = 'test/'+fname
150 |             lf0s_ref = None
151 |             # pwf0_ref = None
152 |             # SPEC_ref = None
153 |             fwlspec_ref = None
154 |             fwnm_ref = None
155 |             for _ in xrange(2):
156 |                 print('Extracting features for: '+fname)
157 |                 pulsemodel.analysisf(fname, f0_min=f0_min, f0_max=f0_max, ff0=fname.replace('.wav','.lf0'), f0_log=True,
158 |                 fspec=fname.replace('.wav','.fwlspec'), spec_nbfwbnds=65, fnm=fname.replace('.wav','.fwnm'), nm_nbfwbnds=33, verbose=1)
159 | 
160 | 
161 |                 lf0s = np.fromfile(fname.replace('.wav','.lf0'), dtype=np.float32)
162 |                 lf0s = lf0s.reshape((-1, 1))
163 |                 print('lf0 sum square: '+str(np.sum((lf0s)**2)))
164 | 
165 |                 if lf0s_ref is None:
166 |                     lf0s_ref = lf0s
167 |                 else:
168 |                     diff = np.sum((lf0s_ref-lf0s)**2)
169 |                     print('lf0 diff: '+str(diff))
170 |                     self.assertEqual(diff, 0.0)
171 | 
172 | 
173 |                 # #_f0, ts = pyworld.dio(x, fs, frame_period=shift*1000)    # raw pitch extractor # Use REAPER instead
174 |                 # wav, fs, enc = sp.wavread(fname)
175 |                 #
176 |                 # pwts = 0.005*np.arange(len(lf0s))
177 |                 # dftlen = 4096
178 |                 # # from IPython.core.debugger import  Pdb; Pdb().set_trace()
179 |                 # dlf0s = lf0s.astype(np.float64)
180 |                 # pwf0 = pyworld.stonemask(wav, np.ascontiguousarray(np.exp(dlf0s[:,0])), pwts, fs)  # pitch refinement
181 |                 # if pwf0_ref is None:
182 |                 #     pwf0_ref = pwf0
183 |                 # else:
184 |                 #     print('pwf0 diff: '+str(np.sum((pwf0_ref-pwf0)**2)))
185 |                 #
186 |                 # SPEC = pyworld.cheaptrick(wav, pwf0, pwts, fs, fft_size=dftlen)  # extract smoothed spectrogram
187 |                 # if SPEC_ref is None:
188 |                 #     SPEC_ref = SPEC
189 |                 # else:
190 |                 #     print('SPEC diff: '+str(np.sum((SPEC_ref-SPEC)**2)))
191 | 
192 | 
193 |                 fwlspec = np.fromfile(fname.replace('.wav','.fwlspec'), dtype=np.float32)
194 |                 fwlspec = fwlspec.reshape((-1, 65))
195 |                 print('fwlspec sum square: '+str(np.sum((fwlspec)**2)))
196 | 
197 |                 if fwlspec_ref is None:
198 |                     fwlspec_ref = fwlspec
199 |                 else:
200 |                     diff = np.sum((fwlspec_ref-fwlspec)**2)
201 |                     print('fwlspec diff: '+str(diff))
202 |                     self.assertEqual(diff, 0.0)
203 | 
204 | 
205 |                 fwnm = np.fromfile(fname.replace('.wav','.fwnm'), dtype=np.float32)
206 |                 fwnm = fwnm.reshape((-1, 33))
207 |                 print('fwnm sum square: '+str(np.sum((fwnm)**2)))
208 | 
209 |                 if fwnm_ref is None:
210 |                     fwnm_ref = fwnm
211 |                 else:
212 |                     diff = np.sum((fwnm_ref-fwnm)**2)
213 |                     print('fwnm diff: '+str(diff))
214 |                     self.assertEqual(diff, 0.0)
215 | 
216 | 
if __name__ == "__main__":
    # Executed as a script: let unittest find and run the tests of this file
    unittest.main()
219 | 


--------------------------------------------------------------------------------